Harden Selenium login flow and browser stability
Browse files
- core/course_bot.py +158 -24
- webdriver_utils.py +73 -44
core/course_bot.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
|
| 3 |
import base64
|
| 4 |
import json
|
|
@@ -11,6 +11,7 @@ from typing import Callable
|
|
| 11 |
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
|
| 12 |
from selenium.webdriver.common.by import By
|
| 13 |
from selenium.webdriver.remote.webdriver import WebDriver
|
|
|
|
| 14 |
from selenium.webdriver.support.wait import WebDriverWait
|
| 15 |
|
| 16 |
import onnx_inference
|
|
@@ -252,8 +253,7 @@ class CourseBot:
|
|
| 252 |
|
| 253 |
def _login(self, driver: WebDriver, wait: WebDriverWait) -> None:
|
| 254 |
for attempt in range(1, self.config.login_retry_limit + 1):
|
| 255 |
-
|
| 256 |
-
webdriver_utils.wait_for_ready(wait)
|
| 257 |
|
| 258 |
std_id_box = self._find_first_visible(driver, LOGIN_STUDENT_SELECTORS, "登录学号输入框", timeout=6)
|
| 259 |
password_box = self._find_first_visible(driver, LOGIN_PASSWORD_SELECTORS, "登录密码输入框", timeout=6)
|
|
@@ -261,7 +261,8 @@ class CourseBot:
|
|
| 261 |
login_button = self._find_first_visible(driver, LOGIN_BUTTON_SELECTORS, "登录按钮", timeout=6)
|
| 262 |
captcha_image = self._find_first_visible(driver, LOGIN_CAPTCHA_IMAGE_SELECTORS, "登录验证码图片", timeout=6)
|
| 263 |
|
| 264 |
-
captcha_text = self.
|
|
|
|
| 265 |
|
| 266 |
std_id_box.clear()
|
| 267 |
std_id_box.send_keys(self.user["student_id"])
|
|
@@ -270,26 +271,33 @@ class CourseBot:
|
|
| 270 |
captcha_box.clear()
|
| 271 |
captcha_box.send_keys(captcha_text)
|
| 272 |
login_button.click()
|
| 273 |
-
time.sleep(1.2)
|
| 274 |
|
| 275 |
-
|
|
|
|
| 276 |
self.logger("INFO", f"登录成功,耗费 {attempt} 次尝试。")
|
| 277 |
return
|
| 278 |
|
| 279 |
-
error_message = self._read_login_error(driver)
|
| 280 |
if any(token in error_message for token in ("用户名或密码错误", "密码错误", "账号或密码错误", "用户不存在")):
|
| 281 |
raise FatalCredentialsError("学号或密码错误,任务已停止,请在面板中更新后重新启动。")
|
|
|
|
| 282 |
if error_message:
|
| 283 |
self.logger("WARNING", f"登录失败,第 {attempt} 次尝试: {error_message}")
|
| 284 |
else:
|
| 285 |
-
self.logger(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
-
raise RecoverableAutomationError("连续多次登录失败,系统
|
| 288 |
|
| 289 |
def _goto_select_course(self, driver: WebDriver, wait: WebDriverWait) -> bool:
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
|
|
|
|
|
|
| 293 |
if "非选课" in body_text or "未到选课时间" in body_text:
|
| 294 |
self.logger("INFO", body_text.strip() or "当前不是选课时段。")
|
| 295 |
return False
|
|
@@ -319,12 +327,17 @@ class CourseBot:
|
|
| 319 |
)
|
| 320 |
)
|
| 321 |
except TimeoutException as exc:
|
| 322 |
-
raise RecoverableAutomationError(
|
|
|
|
|
|
|
| 323 |
finally:
|
| 324 |
driver.switch_to.default_content()
|
| 325 |
|
| 326 |
time.sleep(0.2)
|
| 327 |
-
|
|
|
|
|
|
|
|
|
|
| 328 |
if not found_target:
|
| 329 |
self.logger("INFO", f"本轮未找到目标课程 {course_key}。")
|
| 330 |
return False
|
|
@@ -412,7 +425,7 @@ class CourseBot:
|
|
| 412 |
if not image or not input_box or not button:
|
| 413 |
return False
|
| 414 |
|
| 415 |
-
captcha_text = self.
|
| 416 |
self.logger("INFO", f"提交验证码 OCR 输出: {captcha_text}")
|
| 417 |
input_box.clear()
|
| 418 |
input_box.send_keys(captcha_text)
|
|
@@ -424,12 +437,12 @@ class CourseBot:
|
|
| 424 |
tab_id = CATEGORY_META[category]["tab_id"]
|
| 425 |
tab = self._find(driver, By.ID, tab_id)
|
| 426 |
tab.click()
|
| 427 |
-
webdriver_utils.wait_for_ready(wait)
|
| 428 |
time.sleep(0.2)
|
| 429 |
|
| 430 |
def _read_result_page(self, driver: WebDriver, wait: WebDriverWait) -> list[dict]:
|
| 431 |
try:
|
| 432 |
-
webdriver_utils.wait_for_ready(wait)
|
| 433 |
wait.until(
|
| 434 |
lambda current_driver: current_driver.execute_script(
|
| 435 |
"""
|
|
@@ -440,24 +453,145 @@ class CourseBot:
|
|
| 440 |
)
|
| 441 |
return json.loads(driver.execute_script(self.check_result_js))
|
| 442 |
except Exception as exc:
|
| 443 |
-
raise RecoverableAutomationError(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
|
| 445 |
def _read_login_error(self, driver: WebDriver) -> str:
|
| 446 |
script = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
const nodes = Array.from(document.querySelectorAll('body span, body div'));
|
| 448 |
for (const node of nodes) {
|
| 449 |
const text = (node.innerText || '').trim();
|
| 450 |
-
if (!text) continue;
|
| 451 |
if (/验证码|密码|用户|账号/.test(text)) return text;
|
| 452 |
}
|
| 453 |
return '';
|
| 454 |
"""
|
| 455 |
raw_message = driver.execute_script(script) or ""
|
| 456 |
-
|
| 457 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
source = image_element.get_attribute("src") or ""
|
| 462 |
if "base64," in source:
|
| 463 |
return base64.b64decode(source.split("base64,", 1)[1])
|
|
@@ -466,7 +600,7 @@ class CourseBot:
|
|
| 466 |
def _find_first_visible(self, driver: WebDriver, selectors: list[tuple[str, str]], label: str, timeout: int = 0):
|
| 467 |
element = self._find_first_visible_optional(driver, selectors, timeout=timeout)
|
| 468 |
if element is None:
|
| 469 |
-
raise RecoverableAutomationError(f"页面元素未找到: {label}")
|
| 470 |
return element
|
| 471 |
|
| 472 |
@staticmethod
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
|
| 3 |
import base64
|
| 4 |
import json
|
|
|
|
| 11 |
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
|
| 12 |
from selenium.webdriver.common.by import By
|
| 13 |
from selenium.webdriver.remote.webdriver import WebDriver
|
| 14 |
+
from selenium.webdriver.remote.webelement import WebElement
|
| 15 |
from selenium.webdriver.support.wait import WebDriverWait
|
| 16 |
|
| 17 |
import onnx_inference
|
|
|
|
| 253 |
|
| 254 |
def _login(self, driver: WebDriver, wait: WebDriverWait) -> None:
|
| 255 |
for attempt in range(1, self.config.login_retry_limit + 1):
|
| 256 |
+
self._open_page(driver, wait, URL_LOGIN, f"登录页(第 {attempt} 次)", log_on_success=True)
|
|
|
|
| 257 |
|
| 258 |
std_id_box = self._find_first_visible(driver, LOGIN_STUDENT_SELECTORS, "登录学号输入框", timeout=6)
|
| 259 |
password_box = self._find_first_visible(driver, LOGIN_PASSWORD_SELECTORS, "登录密码输入框", timeout=6)
|
|
|
|
| 261 |
login_button = self._find_first_visible(driver, LOGIN_BUTTON_SELECTORS, "登录按钮", timeout=6)
|
| 262 |
captcha_image = self._find_first_visible(driver, LOGIN_CAPTCHA_IMAGE_SELECTORS, "登录验证码图片", timeout=6)
|
| 263 |
|
| 264 |
+
captcha_text = self._solve_captcha_text(captcha_image, scene="登录")
|
| 265 |
+
self.logger("INFO", f"登录尝试 {attempt}/{self.config.login_retry_limit},验证码 OCR 输出: {captcha_text}")
|
| 266 |
|
| 267 |
std_id_box.clear()
|
| 268 |
std_id_box.send_keys(self.user["student_id"])
|
|
|
|
| 271 |
captcha_box.clear()
|
| 272 |
captcha_box.send_keys(captcha_text)
|
| 273 |
login_button.click()
|
|
|
|
| 274 |
|
| 275 |
+
state, error_message = self._wait_for_login_outcome(driver, timeout_seconds=10)
|
| 276 |
+
if state == "success":
|
| 277 |
self.logger("INFO", f"登录成功,耗费 {attempt} 次尝试。")
|
| 278 |
return
|
| 279 |
|
|
|
|
| 280 |
if any(token in error_message for token in ("用户名或密码错误", "密码错误", "账号或密码错误", "用户不存在")):
|
| 281 |
raise FatalCredentialsError("学号或密码错误,任务已停止,请在面板中更新后重新启动。")
|
| 282 |
+
|
| 283 |
if error_message:
|
| 284 |
self.logger("WARNING", f"登录失败,第 {attempt} 次尝试: {error_message}")
|
| 285 |
else:
|
| 286 |
+
self.logger(
|
| 287 |
+
"WARNING",
|
| 288 |
+
f"登录失败,第 {attempt} 次尝试,未读取到明确错误提示。{self._page_snapshot(driver, include_body=True)}",
|
| 289 |
+
)
|
| 290 |
+
|
| 291 |
+
time.sleep(0.6)
|
| 292 |
|
| 293 |
+
raise RecoverableAutomationError("连续多次登录失败,可能是验证码识别失败、页面异常或系统暂时不可用。")
|
| 294 |
|
| 295 |
def _goto_select_course(self, driver: WebDriver, wait: WebDriverWait) -> bool:
|
| 296 |
+
self._open_page(driver, wait, URL_SELECT_COURSE, "选课页")
|
| 297 |
+
if self._is_session_expired(driver):
|
| 298 |
+
raise RecoverableAutomationError("检测到登录会话已失效,准备重新登录。")
|
| 299 |
+
|
| 300 |
+
body_text = self._safe_body_text(driver)
|
| 301 |
if "非选课" in body_text or "未到选课时间" in body_text:
|
| 302 |
self.logger("INFO", body_text.strip() or "当前不是选课时段。")
|
| 303 |
return False
|
|
|
|
| 327 |
)
|
| 328 |
)
|
| 329 |
except TimeoutException as exc:
|
| 330 |
+
raise RecoverableAutomationError(
|
| 331 |
+
f"课程查询超时,页面可能暂时无响应。{self._page_snapshot(driver, include_body=True)}"
|
| 332 |
+
) from exc
|
| 333 |
finally:
|
| 334 |
driver.switch_to.default_content()
|
| 335 |
|
| 336 |
time.sleep(0.2)
|
| 337 |
+
try:
|
| 338 |
+
found_target = driver.execute_script(self.select_course_js, course_key) == "yes"
|
| 339 |
+
except WebDriverException as exc:
|
| 340 |
+
raise RecoverableAutomationError(f"执行课程勾选脚本失败。{self._page_snapshot(driver, include_body=True)}") from exc
|
| 341 |
if not found_target:
|
| 342 |
self.logger("INFO", f"本轮未找到目标课程 {course_key}。")
|
| 343 |
return False
|
|
|
|
| 425 |
if not image or not input_box or not button:
|
| 426 |
return False
|
| 427 |
|
| 428 |
+
captcha_text = self._solve_captcha_text(image, scene="提交")
|
| 429 |
self.logger("INFO", f"提交验证码 OCR 输出: {captcha_text}")
|
| 430 |
input_box.clear()
|
| 431 |
input_box.send_keys(captcha_text)
|
|
|
|
| 437 |
tab_id = CATEGORY_META[category]["tab_id"]
|
| 438 |
tab = self._find(driver, By.ID, tab_id)
|
| 439 |
tab.click()
|
| 440 |
+
webdriver_utils.wait_for_ready(wait, allow_interactive=True)
|
| 441 |
time.sleep(0.2)
|
| 442 |
|
| 443 |
def _read_result_page(self, driver: WebDriver, wait: WebDriverWait) -> list[dict]:
|
| 444 |
try:
|
| 445 |
+
webdriver_utils.wait_for_ready(wait, allow_interactive=True)
|
| 446 |
wait.until(
|
| 447 |
lambda current_driver: current_driver.execute_script(
|
| 448 |
"""
|
|
|
|
| 453 |
)
|
| 454 |
return json.loads(driver.execute_script(self.check_result_js))
|
| 455 |
except Exception as exc:
|
| 456 |
+
raise RecoverableAutomationError(
|
| 457 |
+
f"读取选课结果失败,页面结构可能发生变化。{self._page_snapshot(driver, include_body=True)}"
|
| 458 |
+
) from exc
|
| 459 |
+
|
| 460 |
+
def _open_page(
    self,
    driver: WebDriver,
    wait: WebDriverWait,
    url: str,
    label: str,
    *,
    allow_interactive: bool = True,
    log_on_success: bool = False,
) -> None:
    """Navigate to ``url`` and wait until the document reports a ready state.

    Args:
        driver: Browser session used for navigation and diagnostics.
        wait: Explicit wait handed to ``webdriver_utils.wait_for_ready``.
        url: Address to open.
        label: Page name interpolated into log and error messages.
        allow_interactive: Forwarded to ``webdriver_utils.wait_for_ready``.
        log_on_success: Emit the INFO line even when the load did not time out.

    Raises:
        RecoverableAutomationError: If waiting for the ready state times out.
    """
    load_timed_out = webdriver_utils.open_with_recovery(driver, url)
    if load_timed_out:
        self.logger("WARNING", f"{label} 页面加载超时,尝试停止页面并继续执行。")

    try:
        ready_state = webdriver_utils.wait_for_ready(wait, allow_interactive=allow_interactive)
    except TimeoutException as exc:
        snapshot = self._page_snapshot(driver, include_body=True)
        raise RecoverableAutomationError(f"{label} 页面加载失败。{snapshot}") from exc

    if not (log_on_success or load_timed_out):
        return
    # The body excerpt is only included when the load had timed out.
    snapshot = self._page_snapshot(driver, include_body=load_timed_out)
    self.logger("INFO", f"{label} 页面已打开,readyState={ready_state}。{snapshot}")
|
| 479 |
+
|
| 480 |
+
def _wait_for_login_outcome(self, driver: WebDriver, timeout_seconds: int = 10) -> tuple[str, str]:
|
| 481 |
+
deadline = time.monotonic() + max(1, timeout_seconds)
|
| 482 |
+
last_error = ""
|
| 483 |
+
while time.monotonic() < deadline:
|
| 484 |
+
current_url = driver.current_url or ""
|
| 485 |
+
if self._is_login_success_url(current_url):
|
| 486 |
+
return "success", ""
|
| 487 |
+
last_error = self._read_login_error(driver)
|
| 488 |
+
if last_error:
|
| 489 |
+
return "error", last_error
|
| 490 |
+
time.sleep(0.4)
|
| 491 |
+
return "unknown", self._read_login_error(driver)
|
| 492 |
|
| 493 |
def _read_login_error(self, driver: WebDriver) -> str:
|
| 494 |
script = """
|
| 495 |
+
const visible = (node) => {
|
| 496 |
+
if (!node) return false;
|
| 497 |
+
const style = window.getComputedStyle(node);
|
| 498 |
+
const rect = node.getBoundingClientRect();
|
| 499 |
+
return style.display !== 'none' && style.visibility !== 'hidden' && rect.width > 0 && rect.height > 0;
|
| 500 |
+
};
|
| 501 |
+
const selectors = [
|
| 502 |
+
'.el-message',
|
| 503 |
+
'[role="alert"]',
|
| 504 |
+
'.message',
|
| 505 |
+
'.toast',
|
| 506 |
+
'.el-form-item__error',
|
| 507 |
+
'.error'
|
| 508 |
+
];
|
| 509 |
+
for (const selector of selectors) {
|
| 510 |
+
for (const node of Array.from(document.querySelectorAll(selector))) {
|
| 511 |
+
const text = (node.innerText || '').trim();
|
| 512 |
+
if (visible(node) && text) {
|
| 513 |
+
return text;
|
| 514 |
+
}
|
| 515 |
+
}
|
| 516 |
+
}
|
| 517 |
const nodes = Array.from(document.querySelectorAll('body span, body div'));
|
| 518 |
for (const node of nodes) {
|
| 519 |
const text = (node.innerText || '').trim();
|
| 520 |
+
if (!text || !visible(node)) continue;
|
| 521 |
if (/验证码|密码|用户|账号/.test(text)) return text;
|
| 522 |
}
|
| 523 |
return '';
|
| 524 |
"""
|
| 525 |
raw_message = driver.execute_script(script) or ""
|
| 526 |
+
return re.sub(r"\s+", " ", str(raw_message)).strip()
|
| 527 |
+
|
| 528 |
+
def _solve_captcha_text(self, image_element: WebElement, *, scene: str) -> str:
|
| 529 |
+
last_candidate = ""
|
| 530 |
+
for attempt in range(1, 3):
|
| 531 |
+
raw_text = self.captcha_solver.classification(self._extract_image_bytes(image_element))
|
| 532 |
+
normalized = re.sub(r"[^0-9A-Za-z]", "", str(raw_text or "")).strip()
|
| 533 |
+
if len(normalized) >= 4:
|
| 534 |
+
return normalized[:4]
|
| 535 |
+
last_candidate = normalized
|
| 536 |
+
self.logger("WARNING", f"{scene}验证码 OCR 输出异常,第 {attempt} 次结果: {raw_text!r}")
|
| 537 |
+
try:
|
| 538 |
+
image_element.click()
|
| 539 |
+
time.sleep(0.4)
|
| 540 |
+
except Exception:
|
| 541 |
+
pass
|
| 542 |
+
if len(last_candidate) >= 3:
|
| 543 |
+
return last_candidate[:4]
|
| 544 |
+
raise RecoverableAutomationError(f"{scene}验证码 OCR 未能识别出有效内容。")
|
| 545 |
+
|
| 546 |
+
def _is_login_success_url(self, url: str) -> bool:
|
| 547 |
+
if not url:
|
| 548 |
+
return False
|
| 549 |
+
if any(url.startswith(prefix) for prefix in LOGIN_SUCCESS_PREFIXES):
|
| 550 |
+
return True
|
| 551 |
+
return "zhjw.scu.edu.cn" in url and "id.scu.edu.cn" not in url
|
| 552 |
|
| 553 |
+
def _is_session_expired(self, driver: WebDriver) -> bool:
|
| 554 |
+
current_url = driver.current_url or ""
|
| 555 |
+
if "id.scu.edu.cn" in current_url:
|
| 556 |
+
return True
|
| 557 |
+
password_box = self._find_first_visible_optional(driver, LOGIN_PASSWORD_SELECTORS, timeout=1)
|
| 558 |
+
return password_box is not None
|
| 559 |
+
|
| 560 |
+
def _safe_body_text(self, driver: WebDriver) -> str:
    """Return the text of the page ``<body>``, or ``""`` when it cannot be read.

    Both the element lookup and the subsequent ``.text`` read talk to the
    browser. A failure in either one (for example the element going stale
    while the page reloads) yields an empty string rather than an exception.
    """
    try:
        return self._find(driver, By.TAG_NAME, "body").text or ""
    except (RecoverableAutomationError, WebDriverException):
        # WebDriverException covers failures of the `.text` read itself.
        return ""
|
| 566 |
+
|
| 567 |
+
def _page_snapshot(self, driver: WebDriver, *, include_body: bool = False) -> str:
|
| 568 |
+
script = """
|
| 569 |
+
const body = document.body ? (document.body.innerText || '') : '';
|
| 570 |
+
return JSON.stringify({
|
| 571 |
+
url: window.location.href || '',
|
| 572 |
+
title: document.title || '',
|
| 573 |
+
readyState: document.readyState || '',
|
| 574 |
+
body: body.replace(/\\s+/g, ' ').trim().slice(0, 180)
|
| 575 |
+
});
|
| 576 |
+
"""
|
| 577 |
+
try:
|
| 578 |
+
raw = driver.execute_script(script)
|
| 579 |
+
data = json.loads(raw) if isinstance(raw, str) else raw
|
| 580 |
+
except Exception:
|
| 581 |
+
current_url = getattr(driver, "current_url", "") or ""
|
| 582 |
+
return f" url={current_url or '-'}"
|
| 583 |
+
|
| 584 |
+
parts = [
|
| 585 |
+
f"url={data.get('url') or '-'}",
|
| 586 |
+
f"title={data.get('title') or '-'}",
|
| 587 |
+
f"readyState={data.get('readyState') or '-'}",
|
| 588 |
+
]
|
| 589 |
+
body_excerpt = (data.get("body") or "").strip()
|
| 590 |
+
if include_body and body_excerpt:
|
| 591 |
+
parts.append(f"body={body_excerpt}")
|
| 592 |
+
return " " + " | ".join(parts)
|
| 593 |
+
|
| 594 |
+
def _extract_image_bytes(self, image_element: WebElement) -> bytes:
|
| 595 |
source = image_element.get_attribute("src") or ""
|
| 596 |
if "base64," in source:
|
| 597 |
return base64.b64decode(source.split("base64,", 1)[1])
|
|
|
|
| 600 |
def _find_first_visible(self, driver: WebDriver, selectors: list[tuple[str, str]], label: str, timeout: int = 0):
|
| 601 |
element = self._find_first_visible_optional(driver, selectors, timeout=timeout)
|
| 602 |
if element is None:
|
| 603 |
+
raise RecoverableAutomationError(f"页面元素未找到: {label}。{self._page_snapshot(driver, include_body=True)}")
|
| 604 |
return element
|
| 605 |
|
| 606 |
@staticmethod
|
webdriver_utils.py
CHANGED
|
@@ -1,44 +1,73 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
from selenium import webdriver
|
| 4 |
-
from selenium.
|
| 5 |
-
from selenium.webdriver.
|
| 6 |
-
from selenium.webdriver.
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
"(
|
| 12 |
-
)
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
options
|
| 18 |
-
options.
|
| 19 |
-
options.
|
| 20 |
-
options.add_argument("--
|
| 21 |
-
options.add_argument("--disable-
|
| 22 |
-
options.add_argument("--
|
| 23 |
-
options.add_argument("--
|
| 24 |
-
options.add_argument("--
|
| 25 |
-
options.add_argument(
|
| 26 |
-
options.
|
| 27 |
-
options.
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
)
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from selenium import webdriver
|
| 4 |
+
from selenium.common.exceptions import TimeoutException, WebDriverException
|
| 5 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 6 |
+
from selenium.webdriver.remote.webdriver import WebDriver
|
| 7 |
+
from selenium.webdriver.support.wait import WebDriverWait
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# User-Agent string passed to Chrome through the --user-agent flag.
DEFAULT_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    )
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def configure_browser(*, chrome_binary: str, chromedriver_path: str, page_timeout: int = 40) -> WebDriver:
    """Launch a headless Chrome session and apply the project's timeouts.

    Args:
        chrome_binary: Path assigned to ``options.binary_location``.
        chromedriver_path: Path handed to the chromedriver ``Service``.
        page_timeout: Page-load timeout in seconds; the script timeout is the
            smaller of this value and 20.

    Returns:
        The configured driver.

    Raises:
        Exception: Whatever the post-launch configuration raised. The browser
            is quit first so a half-configured process is not left running.
    """
    options = webdriver.ChromeOptions()
    options.binary_location = chrome_binary
    options.page_load_strategy = "eager"

    # Order is preserved exactly as the flags were originally added.
    chrome_flags = (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "--disable-background-networking",
        "--disable-background-timer-throttling",
        "--disable-backgrounding-occluded-windows",
        "--disable-renderer-backgrounding",
        "--disable-extensions",
        "--disable-default-apps",
        "--no-first-run",
        "--no-default-browser-check",
        "--mute-audio",
        "--window-size=1440,1280",
        "--lang=zh-CN",
        f"--user-agent={DEFAULT_USER_AGENT}",
        "--remote-debugging-pipe",
    )
    for flag in chrome_flags:
        options.add_argument(flag)
    options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    options.add_experimental_option("useAutomationExtension", False)

    service = ChromeService(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.set_page_load_timeout(page_timeout)
        driver.set_script_timeout(min(page_timeout, 20))
        driver.implicitly_wait(6)
        # Installed for every new document: makes navigator.webdriver read
        # as undefined.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {
                "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
            },
        )
    except Exception:
        # The caller never receives the driver on failure, so nobody else
        # could shut the browser down; do it here before re-raising.
        driver.quit()
        raise
    return driver
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def open_with_recovery(driver: WebDriver, url: str) -> bool:
    """Open ``url`` and report whether the page load timed out.

    On a load timeout the function asks the page to stop loading
    (``window.stop()``), ignoring a ``WebDriverException`` from that request,
    and returns ``True``. A load that completes returns ``False``.
    """
    try:
        driver.get(url)
    except TimeoutException:
        try:
            driver.execute_script("window.stop();")
        except WebDriverException:
            pass  # stopping the page is best effort
        return True
    else:
        return False
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def wait_for_ready(driver_wait: WebDriverWait, *, allow_interactive: bool = True) -> str:
    """Wait until ``document.readyState`` reaches an acceptable value.

    Args:
        driver_wait: Explicit wait whose ``until`` drives the polling.
        allow_interactive: Accept ``"interactive"`` in addition to
            ``"complete"``.

    Returns:
        The ready state that satisfied the wait.

    Raises:
        TimeoutException: Raised by ``driver_wait.until`` when no acceptable
            state is observed in time.
    """
    acceptable_states = {"complete", "interactive"} if allow_interactive else {"complete"}

    def _acceptable_state(web_driver):
        state = web_driver.execute_script("return document.readyState")
        # A truthy return value ends the wait and is handed back by until().
        return state if state in acceptable_states else False

    # Return the state that ended the wait, rather than re-querying through
    # the wait object's private `_driver` attribute.
    return str(driver_wait.until(_acceptable_state, "The target page did not finish loading in time."))
|