Spaces:

cacode
/

SACC

Paused

App Files Files Community

cacode committed 28 days ago

Commit

9def03a

verified ·

1 Parent(s): 8385899

Harden Selenium login flow and browser stability

Browse files

Files changed (2) hide show

core/course_bot.py +158 -24
webdriver_utils.py +73 -44

core/course_bot.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from __future__ import annotations
 import base64
 import json
@@ -11,6 +11,7 @@ from typing import Callable
 from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.remote.webdriver import WebDriver
 from selenium.webdriver.support.wait import WebDriverWait
 import onnx_inference
@@ -252,8 +253,7 @@ class CourseBot:
     def _login(self, driver: WebDriver, wait: WebDriverWait) -> None:
         for attempt in range(1, self.config.login_retry_limit + 1):
-            driver.get(URL_LOGIN)
-            webdriver_utils.wait_for_ready(wait)
             std_id_box = self._find_first_visible(driver, LOGIN_STUDENT_SELECTORS, "登录学号输入框", timeout=6)
             password_box = self._find_first_visible(driver, LOGIN_PASSWORD_SELECTORS, "登录密码输入框", timeout=6)
@@ -261,7 +261,8 @@ class CourseBot:
             login_button = self._find_first_visible(driver, LOGIN_BUTTON_SELECTORS, "登录按钮", timeout=6)
             captcha_image = self._find_first_visible(driver, LOGIN_CAPTCHA_IMAGE_SELECTORS, "登录验证码图片", timeout=6)
-            captcha_text = self.captcha_solver.classification(self._extract_image_bytes(captcha_image))
             std_id_box.clear()
             std_id_box.send_keys(self.user["student_id"])
@@ -270,26 +271,33 @@ class CourseBot:
             captcha_box.clear()
             captcha_box.send_keys(captcha_text)
             login_button.click()
-            time.sleep(1.2)
-            if any(driver.current_url.startswith(prefix) for prefix in LOGIN_SUCCESS_PREFIXES):
                 self.logger("INFO", f"登录成功，耗费 {attempt} 次尝试。")
                 return
-            error_message = self._read_login_error(driver)
             if any(token in error_message for token in ("用户名或密码错误", "密码错误", "账号或密码错误", "用户不存在")):
                 raise FatalCredentialsError("学号或密码错误，任务已停止，请在面板中更新后重新启动。")
             if error_message:
                 self.logger("WARNING", f"登录失败，第 {attempt} 次尝试: {error_message}")
             else:
-                self.logger("WARNING", f"登录失败，第 {attempt} 次尝试，未读取到明确错误提示。")
-        raise RecoverableAutomationError("连续多次登录失败，系统将稍后自动重试。")
     def _goto_select_course(self, driver: WebDriver, wait: WebDriverWait) -> bool:
-        driver.get(URL_SELECT_COURSE)
-        webdriver_utils.wait_for_ready(wait)
-        body_text = self._find(driver, By.TAG_NAME, "body").text
         if "非选课" in body_text or "未到选课时间" in body_text:
             self.logger("INFO", body_text.strip() or "当前不是选课时段。")
             return False
@@ -319,12 +327,17 @@ class CourseBot:
                 )
             )
         except TimeoutException as exc:
-            raise RecoverableAutomationError("课程查询超时，页面可能暂时无响应。") from exc
         finally:
             driver.switch_to.default_content()
         time.sleep(0.2)
-        found_target = driver.execute_script(self.select_course_js, course_key) == "yes"
         if not found_target:
             self.logger("INFO", f"本轮未找到目标课程 {course_key}。")
             return False
@@ -412,7 +425,7 @@ class CourseBot:
         if not image or not input_box or not button:
             return False
-        captcha_text = self.captcha_solver.classification(self._extract_image_bytes(image))
         self.logger("INFO", f"提交验证码 OCR 输出: {captcha_text}")
         input_box.clear()
         input_box.send_keys(captcha_text)
@@ -424,12 +437,12 @@ class CourseBot:
         tab_id = CATEGORY_META[category]["tab_id"]
         tab = self._find(driver, By.ID, tab_id)
         tab.click()
-        webdriver_utils.wait_for_ready(wait)
         time.sleep(0.2)
     def _read_result_page(self, driver: WebDriver, wait: WebDriverWait) -> list[dict]:
         try:
-            webdriver_utils.wait_for_ready(wait)
             wait.until(
                 lambda current_driver: current_driver.execute_script(
                     """
@@ -440,24 +453,145 @@ class CourseBot:
             )
             return json.loads(driver.execute_script(self.check_result_js))
         except Exception as exc:
-            raise RecoverableAutomationError("读取选课结果失败，页面结构可能发生变化。") from exc
     def _read_login_error(self, driver: WebDriver) -> str:
         script = """
             const nodes = Array.from(document.querySelectorAll('body span, body div'));
             for (const node of nodes) {
                 const text = (node.innerText || '').trim();
-                if (!text) continue;
                 if (/验证码|密码|用户|账号/.test(text)) return text;
             }
             return '';
         """
         raw_message = driver.execute_script(script) or ""
-        message = re.sub(r"\s+", " ", str(raw_message)).strip()
-        return message
-    @staticmethod
-    def _extract_image_bytes(image_element) -> bytes:
         source = image_element.get_attribute("src") or ""
         if "base64," in source:
             return base64.b64decode(source.split("base64,", 1)[1])
@@ -466,7 +600,7 @@ class CourseBot:
     def _find_first_visible(self, driver: WebDriver, selectors: list[tuple[str, str]], label: str, timeout: int = 0):
         element = self._find_first_visible_optional(driver, selectors, timeout=timeout)
         if element is None:
-            raise RecoverableAutomationError(f"页面元素未找到: {label}")
         return element
     @staticmethod

+from __future__ import annotations
 import base64
 import json
 from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.remote.webdriver import WebDriver
+from selenium.webdriver.remote.webelement import WebElement
 from selenium.webdriver.support.wait import WebDriverWait
 import onnx_inference
     def _login(self, driver: WebDriver, wait: WebDriverWait) -> None:
         for attempt in range(1, self.config.login_retry_limit + 1):
+            self._open_page(driver, wait, URL_LOGIN, f"登录页（第 {attempt} 次）", log_on_success=True)
             std_id_box = self._find_first_visible(driver, LOGIN_STUDENT_SELECTORS, "登录学号输入框", timeout=6)
             password_box = self._find_first_visible(driver, LOGIN_PASSWORD_SELECTORS, "登录密码输入框", timeout=6)
             login_button = self._find_first_visible(driver, LOGIN_BUTTON_SELECTORS, "登录按钮", timeout=6)
             captcha_image = self._find_first_visible(driver, LOGIN_CAPTCHA_IMAGE_SELECTORS, "登录验证码图片", timeout=6)
+            captcha_text = self._solve_captcha_text(captcha_image, scene="登录")
+            self.logger("INFO", f"登录尝试 {attempt}/{self.config.login_retry_limit}，验证码 OCR 输出: {captcha_text}")
             std_id_box.clear()
             std_id_box.send_keys(self.user["student_id"])
             captcha_box.clear()
             captcha_box.send_keys(captcha_text)
             login_button.click()
+            state, error_message = self._wait_for_login_outcome(driver, timeout_seconds=10)
+            if state == "success":
                 self.logger("INFO", f"登录成功，耗费 {attempt} 次尝试。")
                 return
             if any(token in error_message for token in ("用户名或密码错误", "密码错误", "账号或密码错误", "用户不存在")):
                 raise FatalCredentialsError("学号或密码错误，任务已停止，请在面板中更新后重新启动。")
             if error_message:
                 self.logger("WARNING", f"登录失败，第 {attempt} 次尝试: {error_message}")
             else:
+                self.logger(
+                    "WARNING",
+                    f"登录失败，第 {attempt} 次尝试，未读取到明确错误提示。{self._page_snapshot(driver, include_body=True)}",
+                )
+            time.sleep(0.6)
+        raise RecoverableAutomationError("连续多次登录失败，可能是验证码识别失败、页面异常或系统暂时不可用。")
     def _goto_select_course(self, driver: WebDriver, wait: WebDriverWait) -> bool:
+        self._open_page(driver, wait, URL_SELECT_COURSE, "选课页")
+        if self._is_session_expired(driver):
+            raise RecoverableAutomationError("检测到登录会话已失效，准备重新登录。")
+        body_text = self._safe_body_text(driver)
         if "非选课" in body_text or "未到选课时间" in body_text:
             self.logger("INFO", body_text.strip() or "当前不是选课时段。")
             return False
                 )
             )
         except TimeoutException as exc:
+            raise RecoverableAutomationError(
+                f"课程查询超时，页面可能暂时无响应。{self._page_snapshot(driver, include_body=True)}"
+            ) from exc
         finally:
             driver.switch_to.default_content()
         time.sleep(0.2)
+        try:
+            found_target = driver.execute_script(self.select_course_js, course_key) == "yes"
+        except WebDriverException as exc:
+            raise RecoverableAutomationError(f"执行课程勾选脚本失败。{self._page_snapshot(driver, include_body=True)}") from exc
         if not found_target:
             self.logger("INFO", f"本轮未找到目标课程 {course_key}。")
             return False
         if not image or not input_box or not button:
             return False
+        captcha_text = self._solve_captcha_text(image, scene="提交")
         self.logger("INFO", f"提交验证码 OCR 输出: {captcha_text}")
         input_box.clear()
         input_box.send_keys(captcha_text)
         tab_id = CATEGORY_META[category]["tab_id"]
         tab = self._find(driver, By.ID, tab_id)
         tab.click()
+        webdriver_utils.wait_for_ready(wait, allow_interactive=True)
         time.sleep(0.2)
     def _read_result_page(self, driver: WebDriver, wait: WebDriverWait) -> list[dict]:
         try:
+            webdriver_utils.wait_for_ready(wait, allow_interactive=True)
             wait.until(
                 lambda current_driver: current_driver.execute_script(
                     """
             )
             return json.loads(driver.execute_script(self.check_result_js))
         except Exception as exc:
+            raise RecoverableAutomationError(
+                f"读取选课结果失败，页面结构可能发生变化。{self._page_snapshot(driver, include_body=True)}"
+            ) from exc
+    def _open_page(
+        self,
+        driver: WebDriver,
+        wait: WebDriverWait,
+        url: str,
+        label: str,
+        *,
+        allow_interactive: bool = True,
+        log_on_success: bool = False,
+    ) -> None:
+        """Navigate to ``url`` and block until the document reports it is ready.
+
+        A page-load timeout reported by ``open_with_recovery`` is logged as a
+        warning and does not abort the call. A timeout while waiting for the
+        ready state is re-raised as ``RecoverableAutomationError``. ``label``
+        is used only in log and error text.
+        """
+        load_timed_out = webdriver_utils.open_with_recovery(driver, url)
+        if load_timed_out:
+            self.logger("WARNING", f"{label} 页面加载超时，尝试停止页面并继续执行。")
+        try:
+            state = webdriver_utils.wait_for_ready(wait, allow_interactive=allow_interactive)
+        except TimeoutException as exc:
+            snapshot = self._page_snapshot(driver, include_body=True)
+            raise RecoverableAutomationError(f"{label} 页面加载失败。{snapshot}") from exc
+        # Stay quiet on an uneventful load unless the caller asked for a log line.
+        if not (log_on_success or load_timed_out):
+            return
+        # The body excerpt is included only when the load had timed out.
+        snapshot = self._page_snapshot(driver, include_body=load_timed_out)
+        self.logger("INFO", f"{label} 页面已打开，readyState={state}。{snapshot}")
+    def _wait_for_login_outcome(self, driver: WebDriver, timeout_seconds: int = 10) -> tuple[str, str]:
+        """Poll the page after a login submit until an outcome becomes visible.
+
+        Returns a ``(state, message)`` pair:
+        ``("success", "")`` once the current URL passes ``_is_login_success_url``;
+        ``("error", text)`` as soon as ``_read_login_error`` yields text;
+        ``("unknown", text)`` after the deadline, where ``text`` is one final
+        error read and may be empty.
+        """
+        # Monotonic clock; the wait is at least one second even if timeout_seconds <= 0.
+        deadline = time.monotonic() + max(1, timeout_seconds)
+        last_error = ""
+        while time.monotonic() < deadline:
+            current_url = driver.current_url or ""
+            # The URL check comes first, so a redirect wins over any leftover error text.
+            if self._is_login_success_url(current_url):
+                return "success", ""
+            last_error = self._read_login_error(driver)
+            if last_error:
+                return "error", last_error
+            # Pause between polls.
+            time.sleep(0.4)
+        # Deadline passed without a verdict: read the error text one last time.
+        return "unknown", self._read_login_error(driver)
     def _read_login_error(self, driver: WebDriver) -> str:
+        """Return the first visible login error text on the page, or ``""``.
+
+        The injected script first checks a fixed list of message/alert/error
+        CSS selectors. It then falls back to any visible ``span`` or ``div``
+        under ``body`` whose text matches the keyword pattern in the script
+        (captcha, password, user, account). Whitespace runs in the result are
+        collapsed to single spaces.
+        """
         script = """
+            const visible = (node) => {
+                if (!node) return false;
+                const style = window.getComputedStyle(node);
+                const rect = node.getBoundingClientRect();
+                return style.display !== 'none' && style.visibility !== 'hidden' && rect.width > 0 && rect.height > 0;
+            };
+            const selectors = [
+                '.el-message',
+                '[role="alert"]',
+                '.message',
+                '.toast',
+                '.el-form-item__error',
+                '.error'
+            ];
+            for (const selector of selectors) {
+                for (const node of Array.from(document.querySelectorAll(selector))) {
+                    const text = (node.innerText || '').trim();
+                    if (visible(node) && text) {
+                        return text;
+                    }
+                }
+            }
             const nodes = Array.from(document.querySelectorAll('body span, body div'));
             for (const node of nodes) {
                 const text = (node.innerText || '').trim();
+                if (!text || !visible(node)) continue;
                 if (/验证码|密码|用户|账号/.test(text)) return text;
             }
             return '';
         """
+        # A falsy script result (e.g. None) is treated as "no error shown".
         raw_message = driver.execute_script(script) or ""
+        return re.sub(r"\s+", " ", str(raw_message)).strip()
+    def _solve_captcha_text(self, image_element: WebElement, *, scene: str) -> str:
+        """Run OCR on a captcha image and return a cleaned-up guess.
+
+        Up to two OCR passes are made. Non-alphanumeric characters are
+        stripped from each result, and a result of four or more characters is
+        returned truncated to four. Between passes the image is clicked
+        (presumably this makes the page load a fresh captcha; confirm against
+        the site). If both passes fall short, a three-character result from
+        the last pass is still returned as a best effort.
+
+        ``scene`` is a label used only in log and error text.
+
+        Raises:
+            RecoverableAutomationError: no pass produced at least three
+                usable characters.
+        """
+        max_attempts = 2
+        last_candidate = ""
+        for attempt in range(1, max_attempts + 1):
+            raw_text = self.captcha_solver.classification(self._extract_image_bytes(image_element))
+            normalized = re.sub(r"[^0-9A-Za-z]", "", str(raw_text or ""))
+            if len(normalized) >= 4:
+                return normalized[:4]
+            last_candidate = normalized
+            self.logger("WARNING", f"{scene}验证码 OCR 输出异常，第 {attempt} 次结果: {raw_text!r}")
+            if attempt == max_attempts:
+                # No click after the final pass: the fallback below returns text
+                # read from the image as it is now, and a click could replace
+                # that image and make the returned answer stale.
+                break
+            try:
+                image_element.click()
+                time.sleep(0.4)
+            except Exception:
+                # Best effort by design: if the click fails, the next pass
+                # simply re-reads whatever image is currently shown.
+                pass
+        if len(last_candidate) >= 3:
+            return last_candidate[:4]
+        raise RecoverableAutomationError(f"{scene}验证码 OCR 未能识别出有效内容。")
+    def _is_login_success_url(self, url: str) -> bool:
+        """Tell whether ``url`` indicates that the login went through.
+
+        True when ``url`` starts with one of ``LOGIN_SUCCESS_PREFIXES``, or
+        when it contains ``zhjw.scu.edu.cn`` and does not contain
+        ``id.scu.edu.cn``. An empty URL is never a success.
+        """
+        if not url:
+            return False
+        for known_prefix in LOGIN_SUCCESS_PREFIXES:
+            if url.startswith(known_prefix):
+                return True
+        # NOTE(review): these are substring checks on the whole URL, not host
+        # comparisons; confirm that is precise enough for the target site.
+        mentions_portal = "zhjw.scu.edu.cn" in url
+        mentions_sso = "id.scu.edu.cn" in url
+        return mentions_portal and not mentions_sso
+    def _is_session_expired(self, driver: WebDriver) -> bool:
+        """Report whether the browser appears to be back on a login page.
+
+        True when the current URL contains ``id.scu.edu.cn``, or when a
+        visible element matching ``LOGIN_PASSWORD_SELECTORS`` is found within
+        the one-second lookup.
+        """
+        if "id.scu.edu.cn" in (driver.current_url or ""):
+            return True
+        # Otherwise a visible password field is taken as the sign of a login form.
+        return self._find_first_visible_optional(driver, LOGIN_PASSWORD_SELECTORS, timeout=1) is not None
+    def _safe_body_text(self, driver: WebDriver) -> str:
+        """Return the text of ``<body>``, or ``""`` if the lookup fails.
+
+        Only ``RecoverableAutomationError`` is absorbed; any other exception
+        propagates to the caller.
+        """
+        try:
+            page_text = self._find(driver, By.TAG_NAME, "body").text
+        except RecoverableAutomationError:
+            return ""
+        return page_text or ""
+    def _page_snapshot(self, driver: WebDriver, *, include_body: bool = False) -> str:
+        """Build a short diagnostic string describing the current page.
+
+        The result starts with a space so it can be appended directly to a
+        log or error message. It lists the URL, title and ready state joined
+        by `` | ``, plus a body-text excerpt (at most 180 characters, cut by
+        the script) when ``include_body`` is true and the excerpt is non-empty.
+
+        This helper is called while building error messages, so it never
+        raises. If the script fails or returns something other than a JSON
+        object, only the URL is reported, and ``-`` stands in for a URL that
+        cannot be read.
+        """
+        script = """
+            const body = document.body ? (document.body.innerText || '') : '';
+            return JSON.stringify({
+                url: window.location.href || '',
+                title: document.title || '',
+                readyState: document.readyState || '',
+                body: body.replace(/\\s+/g, ' ').trim().slice(0, 180)
+            });
+        """
+        try:
+            raw = driver.execute_script(script)
+            data = json.loads(raw) if isinstance(raw, str) else raw
+        except Exception:
+            data = None
+        if not isinstance(data, dict):
+            # Reading current_url talks to the browser and can fail as well;
+            # a snapshot failure must not mask the error being reported.
+            try:
+                current_url = driver.current_url or ""
+            except Exception:
+                current_url = ""
+            return f" url={current_url or '-'}"
+        parts = [
+            f"url={data.get('url') or '-'}",
+            f"title={data.get('title') or '-'}",
+            f"readyState={data.get('readyState') or '-'}",
+        ]
+        body_excerpt = str(data.get("body") or "").strip()
+        if include_body and body_excerpt:
+            parts.append(f"body={body_excerpt}")
+        return " " + " | ".join(parts)
+    def _extract_image_bytes(self, image_element: WebElement) -> bytes:
         source = image_element.get_attribute("src") or ""
         if "base64," in source:
             return base64.b64decode(source.split("base64,", 1)[1])
     def _find_first_visible(self, driver: WebDriver, selectors: list[tuple[str, str]], label: str, timeout: int = 0):
+        """Return the element found by ``_find_first_visible_optional`` or raise.
+
+        ``label`` names the element in the error text. When the lookup yields
+        ``None``, a ``RecoverableAutomationError`` is raised with a page
+        snapshot (including a body excerpt) appended to the message.
+        """
         element = self._find_first_visible_optional(driver, selectors, timeout=timeout)
         if element is None:
+            raise RecoverableAutomationError(f"页面元素未找到: {label}。{self._page_snapshot(driver, include_body=True)}")
         return element
     @staticmethod

webdriver_utils.py CHANGED Viewed

@@ -1,44 +1,73 @@
-from __future__ import annotations
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service as ChromeService
-from selenium.webdriver.remote.webdriver import WebDriver
-from selenium.webdriver.support.wait import WebDriverWait
-DEFAULT_USER_AGENT = (
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
-    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
-)
-def configure_browser(*, chrome_binary: str, chromedriver_path: str, page_timeout: int = 40) -> WebDriver:
-    options = webdriver.ChromeOptions()
-    options.binary_location = chrome_binary
-    options.add_argument("--headless=new")
-    options.add_argument("--disable-gpu")
-    options.add_argument("--no-sandbox")
-    options.add_argument("--disable-dev-shm-usage")
-    options.add_argument("--disable-blink-features=AutomationControlled")
-    options.add_argument("--window-size=1440,1280")
-    options.add_argument("--lang=zh-CN")
-    options.add_argument(f"--user-agent={DEFAULT_USER_AGENT}")
-    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
-    options.add_experimental_option("useAutomationExtension", False)
-    service = ChromeService(executable_path=chromedriver_path)
-    driver = webdriver.Chrome(service=service, options=options)
-    driver.set_page_load_timeout(page_timeout)
-    driver.implicitly_wait(8)
-    driver.execute_cdp_cmd(
-        "Page.addScriptToEvaluateOnNewDocument",
-        {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"},
-    )
-    return driver
-def wait_for_ready(driver_wait: WebDriverWait) -> None:
-    driver_wait.until(
-        lambda web_driver: web_driver.execute_script("return document.readyState") == "complete",
-        "The target page did not finish loading in time.",
-    )

+from __future__ import annotations
+from selenium import webdriver
+from selenium.common.exceptions import TimeoutException, WebDriverException
+from selenium.webdriver.chrome.service import Service as ChromeService
+from selenium.webdriver.remote.webdriver import WebDriver
+from selenium.webdriver.support.wait import WebDriverWait
+DEFAULT_USER_AGENT = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+)
+def configure_browser(*, chrome_binary: str, chromedriver_path: str, page_timeout: int = 40) -> WebDriver:
+    """Start a headless Chrome session and return its WebDriver.
+
+    ``chrome_binary`` and ``chromedriver_path`` locate the browser and driver
+    executables. ``page_timeout`` is passed to ``set_page_load_timeout``; the
+    script timeout is the smaller of ``page_timeout`` and 20.
+    """
+    options = webdriver.ChromeOptions()
+    options.binary_location = chrome_binary
+    # Per the Selenium docs, "eager" lets driver.get() return before
+    # subresources finish loading, so callers should expect readyState
+    # "interactive" as well as "complete".
+    options.page_load_strategy = "eager"
+    options.add_argument("--headless=new")
+    options.add_argument("--disable-gpu")
+    options.add_argument("--no-sandbox")
+    options.add_argument("--disable-dev-shm-usage")
+    options.add_argument("--disable-blink-features=AutomationControlled")
+    options.add_argument("--disable-background-networking")
+    options.add_argument("--disable-background-timer-throttling")
+    options.add_argument("--disable-backgrounding-occluded-windows")
+    options.add_argument("--disable-renderer-backgrounding")
+    options.add_argument("--disable-extensions")
+    options.add_argument("--disable-default-apps")
+    options.add_argument("--no-first-run")
+    options.add_argument("--no-default-browser-check")
+    options.add_argument("--mute-audio")
+    options.add_argument("--window-size=1440,1280")
+    options.add_argument("--lang=zh-CN")
+    options.add_argument(f"--user-agent={DEFAULT_USER_AGENT}")
+    options.add_argument("--remote-debugging-pipe")
+    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
+    options.add_experimental_option("useAutomationExtension", False)
+    service = ChromeService(executable_path=chromedriver_path)
+    driver = webdriver.Chrome(service=service, options=options)
+    driver.set_page_load_timeout(page_timeout)
+    driver.set_script_timeout(min(page_timeout, 20))
+    driver.implicitly_wait(6)
+    # Registered for every new document: makes navigator.webdriver read as undefined.
+    driver.execute_cdp_cmd(
+        "Page.addScriptToEvaluateOnNewDocument",
+        {
+            "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
+        },
+    )
+    return driver
+def open_with_recovery(driver: WebDriver, url: str) -> bool:
+    """Load ``url`` and report whether the page load timed out.
+
+    On ``TimeoutException`` the function asks the page to stop loading via
+    ``window.stop()`` (errors from that call are ignored) and returns
+    ``True``. A load that finishes in time returns ``False``. Any other
+    exception from ``driver.get`` propagates.
+    """
+    load_timed_out = False
+    try:
+        driver.get(url)
+    except TimeoutException:
+        load_timed_out = True
+        try:
+            driver.execute_script("window.stop();")
+        except WebDriverException:
+            # Stopping the page is best effort; the caller still learns about the timeout.
+            pass
+    return load_timed_out
+def wait_for_ready(driver_wait: WebDriverWait, *, allow_interactive: bool = True) -> str:
+    """Wait until ``document.readyState`` is acceptable and return that state.
+
+    ``"complete"`` is always accepted; ``"interactive"`` is accepted as well
+    unless ``allow_interactive`` is false. The returned value is the state
+    observed by the poll that satisfied the wait.
+
+    Whatever ``driver_wait.until`` raises on timeout (``TimeoutException``
+    for a Selenium ``WebDriverWait``) propagates to the caller.
+    """
+    acceptable_states = ("complete", "interactive") if allow_interactive else ("complete",)
+    def _state_if_ready(web_driver):
+        state = web_driver.execute_script("return document.readyState")
+        return state if state in acceptable_states else False
+    # ``until`` hands back the condition's truthy result, so the state needs no
+    # second script call and no access to the wait object's private driver.
+    return str(driver_wait.until(_state_if_ready, "The target page did not finish loading in time."))