cacode committed on
Commit
9def03a
·
verified ·
1 Parent(s): 8385899

Harden Selenium login flow and browser stability

Browse files
Files changed (2) hide show
  1. core/course_bot.py +158 -24
  2. webdriver_utils.py +73 -44
core/course_bot.py CHANGED
@@ -1,4 +1,4 @@
1
- from __future__ import annotations
2
 
3
  import base64
4
  import json
@@ -11,6 +11,7 @@ from typing import Callable
11
  from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
12
  from selenium.webdriver.common.by import By
13
  from selenium.webdriver.remote.webdriver import WebDriver
 
14
  from selenium.webdriver.support.wait import WebDriverWait
15
 
16
  import onnx_inference
@@ -252,8 +253,7 @@ class CourseBot:
252
 
253
  def _login(self, driver: WebDriver, wait: WebDriverWait) -> None:
254
  for attempt in range(1, self.config.login_retry_limit + 1):
255
- driver.get(URL_LOGIN)
256
- webdriver_utils.wait_for_ready(wait)
257
 
258
  std_id_box = self._find_first_visible(driver, LOGIN_STUDENT_SELECTORS, "登录学号输入框", timeout=6)
259
  password_box = self._find_first_visible(driver, LOGIN_PASSWORD_SELECTORS, "登录密码输入框", timeout=6)
@@ -261,7 +261,8 @@ class CourseBot:
261
  login_button = self._find_first_visible(driver, LOGIN_BUTTON_SELECTORS, "登录按钮", timeout=6)
262
  captcha_image = self._find_first_visible(driver, LOGIN_CAPTCHA_IMAGE_SELECTORS, "登录验证码图片", timeout=6)
263
 
264
- captcha_text = self.captcha_solver.classification(self._extract_image_bytes(captcha_image))
 
265
 
266
  std_id_box.clear()
267
  std_id_box.send_keys(self.user["student_id"])
@@ -270,26 +271,33 @@ class CourseBot:
270
  captcha_box.clear()
271
  captcha_box.send_keys(captcha_text)
272
  login_button.click()
273
- time.sleep(1.2)
274
 
275
- if any(driver.current_url.startswith(prefix) for prefix in LOGIN_SUCCESS_PREFIXES):
 
276
  self.logger("INFO", f"登录成功,耗费 {attempt} 次尝试。")
277
  return
278
 
279
- error_message = self._read_login_error(driver)
280
  if any(token in error_message for token in ("用户名或密码错误", "密码错误", "账号或密码错误", "用户不存在")):
281
  raise FatalCredentialsError("学号或密码错误,任务已停止,请在面板中更新后重新启动。")
 
282
  if error_message:
283
  self.logger("WARNING", f"登录失败,第 {attempt} 次尝试: {error_message}")
284
  else:
285
- self.logger("WARNING", f"登录失败,第 {attempt} 次尝试,未读取到明确错误提示。")
 
 
 
 
 
286
 
287
- raise RecoverableAutomationError("连续多次登录失败,系统将稍后自动重试。")
288
 
289
  def _goto_select_course(self, driver: WebDriver, wait: WebDriverWait) -> bool:
290
- driver.get(URL_SELECT_COURSE)
291
- webdriver_utils.wait_for_ready(wait)
292
- body_text = self._find(driver, By.TAG_NAME, "body").text
 
 
293
  if "非选课" in body_text or "未到选课时间" in body_text:
294
  self.logger("INFO", body_text.strip() or "当前不是选课时段。")
295
  return False
@@ -319,12 +327,17 @@ class CourseBot:
319
  )
320
  )
321
  except TimeoutException as exc:
322
- raise RecoverableAutomationError("课程查询超时,页面可能暂时无响应。") from exc
 
 
323
  finally:
324
  driver.switch_to.default_content()
325
 
326
  time.sleep(0.2)
327
- found_target = driver.execute_script(self.select_course_js, course_key) == "yes"
 
 
 
328
  if not found_target:
329
  self.logger("INFO", f"本轮未找到目标课程 {course_key}。")
330
  return False
@@ -412,7 +425,7 @@ class CourseBot:
412
  if not image or not input_box or not button:
413
  return False
414
 
415
- captcha_text = self.captcha_solver.classification(self._extract_image_bytes(image))
416
  self.logger("INFO", f"提交验证码 OCR 输出: {captcha_text}")
417
  input_box.clear()
418
  input_box.send_keys(captcha_text)
@@ -424,12 +437,12 @@ class CourseBot:
424
  tab_id = CATEGORY_META[category]["tab_id"]
425
  tab = self._find(driver, By.ID, tab_id)
426
  tab.click()
427
- webdriver_utils.wait_for_ready(wait)
428
  time.sleep(0.2)
429
 
430
  def _read_result_page(self, driver: WebDriver, wait: WebDriverWait) -> list[dict]:
431
  try:
432
- webdriver_utils.wait_for_ready(wait)
433
  wait.until(
434
  lambda current_driver: current_driver.execute_script(
435
  """
@@ -440,24 +453,145 @@ class CourseBot:
440
  )
441
  return json.loads(driver.execute_script(self.check_result_js))
442
  except Exception as exc:
443
- raise RecoverableAutomationError("读取选课结果失败,页面结构可能发生变化。") from exc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
  def _read_login_error(self, driver: WebDriver) -> str:
446
  script = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  const nodes = Array.from(document.querySelectorAll('body span, body div'));
448
  for (const node of nodes) {
449
  const text = (node.innerText || '').trim();
450
- if (!text) continue;
451
  if (/验证码|密码|用户|账号/.test(text)) return text;
452
  }
453
  return '';
454
  """
455
  raw_message = driver.execute_script(script) or ""
456
- message = re.sub(r"\s+", " ", str(raw_message)).strip()
457
- return message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
 
459
- @staticmethod
460
- def _extract_image_bytes(image_element) -> bytes:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
  source = image_element.get_attribute("src") or ""
462
  if "base64," in source:
463
  return base64.b64decode(source.split("base64,", 1)[1])
@@ -466,7 +600,7 @@ class CourseBot:
466
  def _find_first_visible(self, driver: WebDriver, selectors: list[tuple[str, str]], label: str, timeout: int = 0):
467
  element = self._find_first_visible_optional(driver, selectors, timeout=timeout)
468
  if element is None:
469
- raise RecoverableAutomationError(f"页面元素未找到: {label}")
470
  return element
471
 
472
  @staticmethod
 
1
+ from __future__ import annotations
2
 
3
  import base64
4
  import json
 
11
  from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
12
  from selenium.webdriver.common.by import By
13
  from selenium.webdriver.remote.webdriver import WebDriver
14
+ from selenium.webdriver.remote.webelement import WebElement
15
  from selenium.webdriver.support.wait import WebDriverWait
16
 
17
  import onnx_inference
 
253
 
254
  def _login(self, driver: WebDriver, wait: WebDriverWait) -> None:
255
  for attempt in range(1, self.config.login_retry_limit + 1):
256
+ self._open_page(driver, wait, URL_LOGIN, f"登录页(第 {attempt} 次)", log_on_success=True)
 
257
 
258
  std_id_box = self._find_first_visible(driver, LOGIN_STUDENT_SELECTORS, "登录学号输入框", timeout=6)
259
  password_box = self._find_first_visible(driver, LOGIN_PASSWORD_SELECTORS, "登录密码输入框", timeout=6)
 
261
  login_button = self._find_first_visible(driver, LOGIN_BUTTON_SELECTORS, "登录按钮", timeout=6)
262
  captcha_image = self._find_first_visible(driver, LOGIN_CAPTCHA_IMAGE_SELECTORS, "登录验证码图片", timeout=6)
263
 
264
+ captcha_text = self._solve_captcha_text(captcha_image, scene="登录")
265
+ self.logger("INFO", f"登录尝试 {attempt}/{self.config.login_retry_limit},验证码 OCR 输出: {captcha_text}")
266
 
267
  std_id_box.clear()
268
  std_id_box.send_keys(self.user["student_id"])
 
271
  captcha_box.clear()
272
  captcha_box.send_keys(captcha_text)
273
  login_button.click()
 
274
 
275
+ state, error_message = self._wait_for_login_outcome(driver, timeout_seconds=10)
276
+ if state == "success":
277
  self.logger("INFO", f"登录成功,耗费 {attempt} 次尝试。")
278
  return
279
 
 
280
  if any(token in error_message for token in ("用户名或密码错误", "密码错误", "账号或密码错误", "用户不存在")):
281
  raise FatalCredentialsError("学号或密码错误,任务已停止,请在面板中更新后重新启动。")
282
+
283
  if error_message:
284
  self.logger("WARNING", f"登录失败,第 {attempt} 次尝试: {error_message}")
285
  else:
286
+ self.logger(
287
+ "WARNING",
288
+ f"登录失败,第 {attempt} 次尝试,未读取到明确错误提示。{self._page_snapshot(driver, include_body=True)}",
289
+ )
290
+
291
+ time.sleep(0.6)
292
 
293
+ raise RecoverableAutomationError("连续多次登录失败,可能是验证码识别失败、页面异常或系统暂时不可用。")
294
 
295
  def _goto_select_course(self, driver: WebDriver, wait: WebDriverWait) -> bool:
296
+ self._open_page(driver, wait, URL_SELECT_COURSE, "选课页")
297
+ if self._is_session_expired(driver):
298
+ raise RecoverableAutomationError("检测到登录会话已失效,准备重新登录。")
299
+
300
+ body_text = self._safe_body_text(driver)
301
  if "非选课" in body_text or "未到选课时间" in body_text:
302
  self.logger("INFO", body_text.strip() or "当前不是选课时段。")
303
  return False
 
327
  )
328
  )
329
  except TimeoutException as exc:
330
+ raise RecoverableAutomationError(
331
+ f"课程查询超时,页面可能暂时无响应。{self._page_snapshot(driver, include_body=True)}"
332
+ ) from exc
333
  finally:
334
  driver.switch_to.default_content()
335
 
336
  time.sleep(0.2)
337
+ try:
338
+ found_target = driver.execute_script(self.select_course_js, course_key) == "yes"
339
+ except WebDriverException as exc:
340
+ raise RecoverableAutomationError(f"执行课程勾选脚本失败。{self._page_snapshot(driver, include_body=True)}") from exc
341
  if not found_target:
342
  self.logger("INFO", f"本轮未找到目标课程 {course_key}。")
343
  return False
 
425
  if not image or not input_box or not button:
426
  return False
427
 
428
+ captcha_text = self._solve_captcha_text(image, scene="提交")
429
  self.logger("INFO", f"提交验证码 OCR 输出: {captcha_text}")
430
  input_box.clear()
431
  input_box.send_keys(captcha_text)
 
437
  tab_id = CATEGORY_META[category]["tab_id"]
438
  tab = self._find(driver, By.ID, tab_id)
439
  tab.click()
440
+ webdriver_utils.wait_for_ready(wait, allow_interactive=True)
441
  time.sleep(0.2)
442
 
443
  def _read_result_page(self, driver: WebDriver, wait: WebDriverWait) -> list[dict]:
444
  try:
445
+ webdriver_utils.wait_for_ready(wait, allow_interactive=True)
446
  wait.until(
447
  lambda current_driver: current_driver.execute_script(
448
  """
 
453
  )
454
  return json.loads(driver.execute_script(self.check_result_js))
455
  except Exception as exc:
456
+ raise RecoverableAutomationError(
457
+ f"读取选课结果失败,页面结构可能发生变化。{self._page_snapshot(driver, include_body=True)}"
458
+ ) from exc
459
+
460
+ def _open_page(
461
+ self,
462
+ driver: WebDriver,
463
+ wait: WebDriverWait,
464
+ url: str,
465
+ label: str,
466
+ *,
467
+ allow_interactive: bool = True,
468
+ log_on_success: bool = False,
469
+ ) -> None:
470
+ timed_out = webdriver_utils.open_with_recovery(driver, url)
471
+ if timed_out:
472
+ self.logger("WARNING", f"{label} 页面加载超时,尝试停止页面并继续执行。")
473
+ try:
474
+ ready_state = webdriver_utils.wait_for_ready(wait, allow_interactive=allow_interactive)
475
+ except TimeoutException as exc:
476
+ raise RecoverableAutomationError(f"{label} 页面加载失败。{self._page_snapshot(driver, include_body=True)}") from exc
477
+ if log_on_success or timed_out:
478
+ self.logger("INFO", f"{label} 页面已打开,readyState={ready_state}。{self._page_snapshot(driver, include_body=timed_out)}")
479
+
480
+ def _wait_for_login_outcome(self, driver: WebDriver, timeout_seconds: int = 10) -> tuple[str, str]:
481
+ deadline = time.monotonic() + max(1, timeout_seconds)
482
+ last_error = ""
483
+ while time.monotonic() < deadline:
484
+ current_url = driver.current_url or ""
485
+ if self._is_login_success_url(current_url):
486
+ return "success", ""
487
+ last_error = self._read_login_error(driver)
488
+ if last_error:
489
+ return "error", last_error
490
+ time.sleep(0.4)
491
+ return "unknown", self._read_login_error(driver)
492
 
493
  def _read_login_error(self, driver: WebDriver) -> str:
494
  script = """
495
+ const visible = (node) => {
496
+ if (!node) return false;
497
+ const style = window.getComputedStyle(node);
498
+ const rect = node.getBoundingClientRect();
499
+ return style.display !== 'none' && style.visibility !== 'hidden' && rect.width > 0 && rect.height > 0;
500
+ };
501
+ const selectors = [
502
+ '.el-message',
503
+ '[role="alert"]',
504
+ '.message',
505
+ '.toast',
506
+ '.el-form-item__error',
507
+ '.error'
508
+ ];
509
+ for (const selector of selectors) {
510
+ for (const node of Array.from(document.querySelectorAll(selector))) {
511
+ const text = (node.innerText || '').trim();
512
+ if (visible(node) && text) {
513
+ return text;
514
+ }
515
+ }
516
+ }
517
  const nodes = Array.from(document.querySelectorAll('body span, body div'));
518
  for (const node of nodes) {
519
  const text = (node.innerText || '').trim();
520
+ if (!text || !visible(node)) continue;
521
  if (/验证码|密码|用户|账号/.test(text)) return text;
522
  }
523
  return '';
524
  """
525
  raw_message = driver.execute_script(script) or ""
526
+ return re.sub(r"\s+", " ", str(raw_message)).strip()
527
+
528
+ def _solve_captcha_text(self, image_element: WebElement, *, scene: str) -> str:
529
+ last_candidate = ""
530
+ for attempt in range(1, 3):
531
+ raw_text = self.captcha_solver.classification(self._extract_image_bytes(image_element))
532
+ normalized = re.sub(r"[^0-9A-Za-z]", "", str(raw_text or "")).strip()
533
+ if len(normalized) >= 4:
534
+ return normalized[:4]
535
+ last_candidate = normalized
536
+ self.logger("WARNING", f"{scene}验证码 OCR 输出异常,第 {attempt} 次结果: {raw_text!r}")
537
+ try:
538
+ image_element.click()
539
+ time.sleep(0.4)
540
+ except Exception:
541
+ pass
542
+ if len(last_candidate) >= 3:
543
+ return last_candidate[:4]
544
+ raise RecoverableAutomationError(f"{scene}验证码 OCR 未能识别出有效内容。")
545
+
546
+ def _is_login_success_url(self, url: str) -> bool:
547
+ if not url:
548
+ return False
549
+ if any(url.startswith(prefix) for prefix in LOGIN_SUCCESS_PREFIXES):
550
+ return True
551
+ return "zhjw.scu.edu.cn" in url and "id.scu.edu.cn" not in url
552
 
553
+ def _is_session_expired(self, driver: WebDriver) -> bool:
554
+ current_url = driver.current_url or ""
555
+ if "id.scu.edu.cn" in current_url:
556
+ return True
557
+ password_box = self._find_first_visible_optional(driver, LOGIN_PASSWORD_SELECTORS, timeout=1)
558
+ return password_box is not None
559
+
560
+ def _safe_body_text(self, driver: WebDriver) -> str:
561
+ try:
562
+ body = self._find(driver, By.TAG_NAME, "body")
563
+ return body.text or ""
564
+ except RecoverableAutomationError:
565
+ return ""
566
+
567
+ def _page_snapshot(self, driver: WebDriver, *, include_body: bool = False) -> str:
568
+ script = """
569
+ const body = document.body ? (document.body.innerText || '') : '';
570
+ return JSON.stringify({
571
+ url: window.location.href || '',
572
+ title: document.title || '',
573
+ readyState: document.readyState || '',
574
+ body: body.replace(/\\s+/g, ' ').trim().slice(0, 180)
575
+ });
576
+ """
577
+ try:
578
+ raw = driver.execute_script(script)
579
+ data = json.loads(raw) if isinstance(raw, str) else raw
580
+ except Exception:
581
+ current_url = getattr(driver, "current_url", "") or ""
582
+ return f" url={current_url or '-'}"
583
+
584
+ parts = [
585
+ f"url={data.get('url') or '-'}",
586
+ f"title={data.get('title') or '-'}",
587
+ f"readyState={data.get('readyState') or '-'}",
588
+ ]
589
+ body_excerpt = (data.get("body") or "").strip()
590
+ if include_body and body_excerpt:
591
+ parts.append(f"body={body_excerpt}")
592
+ return " " + " | ".join(parts)
593
+
594
+ def _extract_image_bytes(self, image_element: WebElement) -> bytes:
595
  source = image_element.get_attribute("src") or ""
596
  if "base64," in source:
597
  return base64.b64decode(source.split("base64,", 1)[1])
 
600
  def _find_first_visible(self, driver: WebDriver, selectors: list[tuple[str, str]], label: str, timeout: int = 0):
601
  element = self._find_first_visible_optional(driver, selectors, timeout=timeout)
602
  if element is None:
603
+ raise RecoverableAutomationError(f"页面元素未找到: {label}。{self._page_snapshot(driver, include_body=True)}")
604
  return element
605
 
606
  @staticmethod
webdriver_utils.py CHANGED
@@ -1,44 +1,73 @@
1
- from __future__ import annotations
2
-
3
- from selenium import webdriver
4
- from selenium.webdriver.chrome.service import Service as ChromeService
5
- from selenium.webdriver.remote.webdriver import WebDriver
6
- from selenium.webdriver.support.wait import WebDriverWait
7
-
8
-
9
- DEFAULT_USER_AGENT = (
10
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
11
- "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
12
- )
13
-
14
-
15
- def configure_browser(*, chrome_binary: str, chromedriver_path: str, page_timeout: int = 40) -> WebDriver:
16
- options = webdriver.ChromeOptions()
17
- options.binary_location = chrome_binary
18
- options.add_argument("--headless=new")
19
- options.add_argument("--disable-gpu")
20
- options.add_argument("--no-sandbox")
21
- options.add_argument("--disable-dev-shm-usage")
22
- options.add_argument("--disable-blink-features=AutomationControlled")
23
- options.add_argument("--window-size=1440,1280")
24
- options.add_argument("--lang=zh-CN")
25
- options.add_argument(f"--user-agent={DEFAULT_USER_AGENT}")
26
- options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
27
- options.add_experimental_option("useAutomationExtension", False)
28
-
29
- service = ChromeService(executable_path=chromedriver_path)
30
- driver = webdriver.Chrome(service=service, options=options)
31
- driver.set_page_load_timeout(page_timeout)
32
- driver.implicitly_wait(8)
33
- driver.execute_cdp_cmd(
34
- "Page.addScriptToEvaluateOnNewDocument",
35
- {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"},
36
- )
37
- return driver
38
-
39
-
40
- def wait_for_ready(driver_wait: WebDriverWait) -> None:
41
- driver_wait.until(
42
- lambda web_driver: web_driver.execute_script("return document.readyState") == "complete",
43
- "The target page did not finish loading in time.",
44
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from selenium import webdriver
4
+ from selenium.common.exceptions import TimeoutException, WebDriverException
5
+ from selenium.webdriver.chrome.service import Service as ChromeService
6
+ from selenium.webdriver.remote.webdriver import WebDriver
7
+ from selenium.webdriver.support.wait import WebDriverWait
8
+
9
+
10
+ DEFAULT_USER_AGENT = (
11
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
12
+ "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
13
+ )
14
+
15
+
16
+ def configure_browser(*, chrome_binary: str, chromedriver_path: str, page_timeout: int = 40) -> WebDriver:
17
+ options = webdriver.ChromeOptions()
18
+ options.binary_location = chrome_binary
19
+ options.page_load_strategy = "eager"
20
+ options.add_argument("--headless=new")
21
+ options.add_argument("--disable-gpu")
22
+ options.add_argument("--no-sandbox")
23
+ options.add_argument("--disable-dev-shm-usage")
24
+ options.add_argument("--disable-blink-features=AutomationControlled")
25
+ options.add_argument("--disable-background-networking")
26
+ options.add_argument("--disable-background-timer-throttling")
27
+ options.add_argument("--disable-backgrounding-occluded-windows")
28
+ options.add_argument("--disable-renderer-backgrounding")
29
+ options.add_argument("--disable-extensions")
30
+ options.add_argument("--disable-default-apps")
31
+ options.add_argument("--no-first-run")
32
+ options.add_argument("--no-default-browser-check")
33
+ options.add_argument("--mute-audio")
34
+ options.add_argument("--window-size=1440,1280")
35
+ options.add_argument("--lang=zh-CN")
36
+ options.add_argument(f"--user-agent={DEFAULT_USER_AGENT}")
37
+ options.add_argument("--remote-debugging-pipe")
38
+ options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
39
+ options.add_experimental_option("useAutomationExtension", False)
40
+
41
+ service = ChromeService(executable_path=chromedriver_path)
42
+ driver = webdriver.Chrome(service=service, options=options)
43
+ driver.set_page_load_timeout(page_timeout)
44
+ driver.set_script_timeout(min(page_timeout, 20))
45
+ driver.implicitly_wait(6)
46
+ driver.execute_cdp_cmd(
47
+ "Page.addScriptToEvaluateOnNewDocument",
48
+ {
49
+ "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
50
+ },
51
+ )
52
+ return driver
53
+
54
+
55
+ def open_with_recovery(driver: WebDriver, url: str) -> bool:
56
+ try:
57
+ driver.get(url)
58
+ return False
59
+ except TimeoutException:
60
+ try:
61
+ driver.execute_script("window.stop();")
62
+ except WebDriverException:
63
+ pass
64
+ return True
65
+
66
+
67
+ def wait_for_ready(driver_wait: WebDriverWait, *, allow_interactive: bool = True) -> str:
68
+ acceptable_states = {"complete", "interactive"} if allow_interactive else {"complete"}
69
+ driver_wait.until(
70
+ lambda web_driver: web_driver.execute_script("return document.readyState") in acceptable_states,
71
+ "The target page did not finish loading in time.",
72
+ )
73
+ return str(driver_wait._driver.execute_script("return document.readyState"))