|
|
@@ -14,6 +14,9 @@ from typing import Dict, List, Optional, Any, Callable
|
|
|
from curl_cffi import requests, const
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
+# DrissionPage 核心
|
|
|
+from DrissionPage import ChromiumPage, ChromiumOptions
|
|
|
+
|
|
|
from cryptography.hazmat.primitives import serialization, hashes
|
|
|
from cryptography.hazmat.primitives.asymmetric import padding
|
|
|
from cryptography.hazmat.backends import default_backend
|
|
|
@@ -21,7 +24,8 @@ from cryptography.hazmat.backends import default_backend
|
|
|
# 框架依赖
|
|
|
from vs_plg import IVSPlg
|
|
|
from vs_types import VSPlgConfig, VSQueryResult, VSBookResult, DateAvailability, TimeSlot, AvailabilityStatus, NotFoundError, PermissionDeniedError, RateLimiteddError, SessionExpiredOrInvalidError, BizLogicError
|
|
|
-from toolkit.vs_cloud_api import VSCloudApi
|
|
|
+from toolkit.vs_cloud_api import VSCloudApi
|
|
|
+from toolkit.ocr_engine import PyTorchEngine
|
|
|
|
|
|
class BlsPlugin(IVSPlg):
|
|
|
"""
|
|
|
@@ -40,6 +44,19 @@ class BlsPlugin(IVSPlg):
|
|
|
self.book_params: Dict = {}
|
|
|
self.is_healthy: bool = True
|
|
|
|
|
|
# Browser instance; stays None until create_session() starts Chromium.
self.page: Optional[ChromiumPage] = None

# --- Concurrency isolation and resource management ---
# Short random id that makes this instance's workspace directory unique.
self.instance_id = uuid.uuid4().hex[:8]
self.root_workspace = os.path.abspath(
    os.path.join("temp_browser_data", f"{self.group_id}_{self.instance_id}")
)
# Sub-directory of the workspace used as the browser's user-data directory.
self.user_data_path = os.path.join(self.root_workspace, "user_data")

# Character-recognition engine; stays None until create_session() builds it.
# This must be an annotation (":"), not a chained assignment ("= ... ="):
# the chained form tries to assign into Optional[...] and fails at runtime.
self.ocr_engine: Optional[PyTorchEngine] = None
|
|
|
+
|
|
|
# OCR 服务地址默认值
|
|
|
self.local_service_url: str = ""
|
|
|
self.session_create_time: float = 0
|
|
|
@@ -49,6 +66,12 @@ class BlsPlugin(IVSPlg):
|
|
|
|
|
|
def set_log(self, logger: Callable[[str], None]) -> None:
    """Store the callable that will receive this plugin's log messages.

    Args:
        logger: Callable invoked with a single message string.
    """
    self.logger = logger
|
|
|
+
|
|
|
def _log(self, message):
    """Emit one log line tagged with the plugin name and the group id.

    The line is passed to the callable stored in ``self.logger`` when one is
    set; otherwise it is printed to stdout.

    Args:
        message: Text to append after the ``[BlsPlugin] [<group_id>]`` tag.
    """
    # The tag names this class (BlsPlugin) so the lines are attributed to the
    # right plugin.
    line = f'[BlsPlugin] [{self.group_id}] {message}'
    # getattr: this method can be reached (via cleanup) before a logger has
    # ever been assigned.
    logger = getattr(self, 'logger', None)
    if logger:
        logger(line)
    else:
        print(line)
|
|
|
|
|
|
def set_config(self, config: VSPlgConfig):
|
|
|
self.config = config
|
|
|
@@ -66,11 +89,58 @@ class BlsPlugin(IVSPlg):
|
|
|
current_time = time.time()
|
|
|
elapsed_time = current_time - self.session_create_time
|
|
|
if elapsed_time > self.config.session_max_life * 60:
|
|
|
- self._log(f"Session Life ({int(elapsed_time)}s) out of max life limit ({self.config.session_max_life * 60}s), mark as unhealth session")
|
|
|
+ self._log(f"Session expired.")
|
|
|
return False
|
|
|
return True
|
|
|
|
|
|
def create_session(self):
|
|
|
self._log(f"Initializing Session (ID: {self.instance_id})...")
co = ChromiumOptions()

# Workaround for the 'not enough values to unpack' failure (per the original
# author's note): co.auto_port() depends on parsing browser stdout, which
# DBus error output can disturb, so a debug port is picked here and set
# explicitly instead.
import socket

def get_free_port():
    """Return a TCP port number the OS reports as currently unused."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('', 0))
        return s.getsockname()[1]

# NOTE: the probe socket is closed before Chromium binds the port, so another
# process could in principle take it in between.
debug_port = get_free_port()
self._log(f"Assigned Debug Port: {debug_port}")

# With an explicit port DrissionPage connects directly (per the original
# author's note) instead of parsing the browser's log output.
co.set_local_port(debug_port)

# Dedicated user-data directory per instance: cache, cookies and local
# storage are isolated, and concurrent instances do not fight over one
# shared profile folder.
co.set_user_data_path(self.user_data_path)

# Browser binary path, taken from CHROME_BIN when it points to an existing
# file (used for Docker deployments).
chrome_path = os.getenv("CHROME_BIN")
if chrome_path and os.path.exists(chrome_path):
    co.set_paths(browser_path=chrome_path)

co.headless(False)
co.set_argument('--no-sandbox')
co.set_argument('--disable-gpu')
# Docker's default /dev/shm is only 64MB, which Chromium exhausts easily.
co.set_argument('--disable-dev-shm-usage')
co.set_argument('--window-size=1920,1080')
co.set_argument('--disable-blink-features=AutomationControlled')

try:
    self.page = ChromiumPage(co)
except Exception as e:
    self._log(f"Session Create Error: {e}")
    self.cleanup()
    # Bare raise keeps the original traceback intact.
    raise

self.ocr_engine = PyTorchEngine(self.free_config.get('ocr_model'))
|
|
|
self.session = requests.Session(
|
|
|
proxy=self._get_proxy_url(),
|
|
|
impersonate="chrome124",
|
|
|
@@ -86,12 +156,7 @@ class BlsPlugin(IVSPlg):
|
|
|
# 1.1 获取登录页 & 解析参数
|
|
|
login_url = f"https://{domain}/Global/account/login"
|
|
|
|
|
|
- headers = {
|
|
|
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/131.0.0.0 Safari/537.36',
|
|
|
- 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
|
|
|
- }
|
|
|
-
|
|
|
- resp = self._perform_request('GET', login_url, headers=headers)
|
|
|
+ resp = self._perform_request('GET', login_url)
|
|
|
if self.config.debug:
|
|
|
self._save_debug_html(resp.text, prefix="Bls_Login_Page")
|
|
|
soup = BeautifulSoup(resp.text, 'html.parser')
|
|
|
@@ -139,7 +204,6 @@ class BlsPlugin(IVSPlg):
|
|
|
# 2.1 签证类型验证
|
|
|
url_vtv = f"https://{domain}/Global/bls/visatypeverification"
|
|
|
headers = {
|
|
|
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/131.0.0.0 Safari/537.36',
|
|
|
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
|
|
|
}
|
|
|
resp = self._perform_request('GET', url_vtv, headers=headers)
|
|
|
@@ -228,7 +292,6 @@ class BlsPlugin(IVSPlg):
|
|
|
domain = self.free_config.get("domain")
|
|
|
|
|
|
headers = {
|
|
|
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/131.0.0.0 Safari/537.36',
|
|
|
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
|
|
|
}
|
|
|
# 3.1 获取 Manage Page (为了 Token 和 JS 变量)
|
|
|
@@ -345,10 +408,6 @@ class BlsPlugin(IVSPlg):
|
|
|
self._log(f"Book Success. Liveness URL: {res.payment_link}")
|
|
|
return res
|
|
|
|
|
|
- def _log(self, message):
|
|
|
- if self.logger:
|
|
|
- self.logger(f'[BlsPlugin] [{self.group_id}] {message}')
|
|
|
-
|
|
|
def _get_proxy_url(self):
|
|
|
# 构造代理
|
|
|
proxy_url = ""
|
|
|
@@ -392,6 +451,62 @@ class BlsPlugin(IVSPlg):
|
|
|
raise RateLimiteddError()
|
|
|
else:
|
|
|
raise BizLogicError(message=f"HTTP Error {resp.status_code}: {resp.text[:100]}")
|
|
|
+
|
|
|
def _extract_captcha_data(self, tmp_file):
    """Open a saved captcha page in the browser and read what is visible.

    The page is rendered so that decoy elements (zero-sized or covered) can
    be told apart from the caption and images a user would actually see.

    Args:
        tmp_file: Path of the HTML file containing the captcha page.

    Returns:
        A dict with two keys: ``"number"`` is the first run of digits found
        in the visible caption (as a string); ``"image_ids"`` is the list of
        ``id`` attributes of the parent elements of every visible
        ``captcha-img`` image whose ``src`` is an inline ``data:image`` URL.

    Raises:
        BizLogicError: If the browser has not been started, the captcha
            container is missing, or the caption contains no digits.
    """
    if self.page is None:
        raise BizLogicError(message='Browser page is not initialized')

    # 1. Load the file. as_uri() builds a well-formed, percent-encoded
    #    file:// URL (plain string concatenation breaks on Windows paths and
    #    on paths containing spaces or other special characters).
    self.page.get(Path(tmp_file).resolve().as_uri())

    # 2. Locate the main container; every later lookup is relative to it.
    main_div = self.page.ele('#captcha-main-div', timeout=5)
    if not main_div:
        raise BizLogicError(message='Captcha main container not found')

    # 3. Caption: assumed layout is main -> div -> first div (header).
    header_ele = main_div.ele('xpath:./div/div[1]')
    caption_text = ""
    if header_ele:
        for child in header_ele.children():
            states = child.states
            # Displayed and not covered by another element.
            if states.is_displayed and not states.is_covered:
                caption_text = child.text or ""
                if caption_text:
                    break

    number_match = re.search(r'\d+', caption_text)
    if not number_match:
        raise BizLogicError(message="No number found in caption")
    number = number_match.group()

    # 4. Image ids: the id lives on the parent of each visible inline image.
    images_ids = []
    for img in main_div.eles('tag:img@@class:captcha-img'):
        states = img.states
        if not states.is_displayed or states.is_covered:
            continue
        src = img.attr('src')
        if not (src and src.startswith('data:image')):
            continue
        parent_id = img.parent().attr('id')
        if parent_id:
            images_ids.append(parent_id)

    return {
        "number": number,
        "image_ids": images_ids,
    }
|
|
|
|
|
|
def _solve_bls_captcha(self, data='') -> Optional[str]:
|
|
|
"""
|
|
|
@@ -399,29 +514,22 @@ class BlsPlugin(IVSPlg):
|
|
|
"""
|
|
|
domain = self.free_config.get("domain")
|
|
|
url = f"https://{domain}/Global/NewCaptcha/GenerateCaptcha"
|
|
|
- if data: url = f"https://{domain}/Global/CaptchaPublic/GenerateCaptcha?data={data}"
|
|
|
- headers = {
|
|
|
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/131.0.0.0 Safari/537.36',
|
|
|
- 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
|
|
|
- }
|
|
|
if data:
    url = f"https://{domain}/Global/CaptchaPublic/GenerateCaptcha?data={data}"
# The request below still passes `headers`, so the dict must exist here; the
# User-Agent entry is left out, matching the other requests in this class.
headers = {
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
}
resp = self._perform_request("GET", url, headers=headers)
if self.config.debug:
    self._save_debug_html(resp.text, prefix="Bls_Captcha_Page")
self._check_resp_is_session_expired_or_invalid('Please select all boxes with number', resp)

# Save the captcha page so the browser can render it from disk.
# resp.text is a str, so the file is opened in text mode with an explicit
# encoding (binary mode would reject a str).
os.makedirs(self.root_workspace, exist_ok=True)
tmpfile = os.path.join(self.root_workspace, "tmp.html")
with open(tmpfile, 'w', encoding='utf-8') as tfp:
    tfp.write(resp.text)
|
|
|
+
|
|
|
soup = BeautifulSoup(resp.text, 'html.parser')
|
|
|
- resp = requests.post(
|
|
|
- f'{self.local_service_url}/browser/visable_captchas',
|
|
|
- data=resp.text,
|
|
|
- headers={"Content-Type": "text/plain"},
|
|
|
- timeout=10
|
|
|
- )
|
|
|
- result = resp.json()
|
|
|
- if result.get('status') != 'success':
|
|
|
- raise BizLogicError(message='Broswer task failed')
|
|
|
+ extract_data = self._extract_captcha_data(tmpfile)
|
|
|
|
|
|
- numbers = result['data']['number']
|
|
|
- image_ids = result['data']['image_ids']
|
|
|
+ numbers = extract_data['number']
|
|
|
+ image_ids = extract_data['image_ids']
|
|
|
selected_ids = []
|
|
|
for sid in image_ids:
|
|
|
div = soup.find("div", id=sid)
|
|
|
@@ -429,20 +537,12 @@ class BlsPlugin(IVSPlg):
|
|
|
src = img.get("src")
|
|
|
base64_data = src.split("base64,", 1)[1]
|
|
|
img_bytes = base64.b64decode(base64_data)
|
|
|
- ocr_resp = requests.post(
|
|
|
- f'{self.local_service_url}/predict/bls?model=pytorch',
|
|
|
- data=img_bytes,
|
|
|
- headers={"Content-Type": "application/octet-stream"},
|
|
|
- timeout=5
|
|
|
- )
|
|
|
- if ocr_resp.status_code == 200:
|
|
|
- res_json = ocr_resp.json()
|
|
|
- ocr_res = res_json.get('data', '').replace('$', '')[:3]
|
|
|
- self._log(f'ocr captcha id={sid} result={ocr_res}, target={numbers}')
|
|
|
- if ocr_res == numbers:
|
|
|
- selected_ids.append(sid)
|
|
|
- else:
|
|
|
- raise BizLogicError(message='Captcha server response error')
|
|
|
+
|
|
|
+ ocr_output = self.ocr_engine.inference_bytes(img_bytes)
|
|
|
+ ocr_res = ocr_output.replace('$', '')[:3]
|
|
|
+ self._log(f'ocr captcha id={sid} result={ocr_res}, target={numbers}')
|
|
|
+ if ocr_res == numbers:
|
|
|
+ selected_ids.append(sid)
|
|
|
if not selected_ids:
|
|
|
raise BizLogicError(message='Captcha selected ids is empty')
|
|
|
|
|
|
@@ -488,6 +588,32 @@ class BlsPlugin(IVSPlg):
|
|
|
match = re.search(pattern, html)
|
|
|
if match: return match.group(1)
|
|
|
return ""
|
|
|
+
|
|
|
def _find_id_by_label(self, label_text, input_index=1, timeout=1):
    """Return the number embedded in the id of an input that follows a label.

    Searches the page currently loaded in ``self.page`` for ``<label>``
    elements whose text contains ``label_text``, skips labels that have no
    rendered rectangle, and inspects the ``input_index``-th ``<input>`` that
    comes after each remaining label.

    Args:
        label_text: Text the label must contain.
        input_index: Which input after the label to use (1 = the first).
        timeout: Seconds to wait for the labels to be found (default 1).

    Returns:
        The first run of digits in the matching input's ``id`` as an int, or
        None when no visible label yields an input id containing digits.
    """
    # Locator syntax: tag:label @@ text:<contained text>
    labels = self.page.eles(f'tag:label@@text:{label_text}', timeout=timeout)

    for label in labels:
        # has_rect is falsy for elements that occupy no area on the page.
        if not label.states.has_rect:
            continue

        target_input = label.after('tag:input', index=input_index)
        if not target_input:
            continue

        eid = target_input.attr('id')
        if not eid:
            continue

        match = re.search(r'\d+', eid)
        if match:
            return int(match.group())
    return None
|
|
|
|
|
|
def _construct_visatype_payload(self, html: str, soup: BeautifulSoup) -> Optional[Dict]:
|
|
|
"""
|
|
|
@@ -522,21 +648,30 @@ class BlsPlugin(IVSPlg):
|
|
|
subtype_value = None
|
|
|
cat_value = None
|
|
|
|
|
|
- resp = requests.post(
|
|
|
- f'{self.local_service_url}/browser/visatype_visable',
|
|
|
- data=html,
|
|
|
- headers={"Content-Type": "text/plain"},
|
|
|
- timeout=10
|
|
|
- )
|
|
|
- result = resp.json()
|
|
|
- if result.get('status') != 'success':
|
|
|
- raise BizLogicError(message='Broswer task failed')
|
|
|
-
|
|
|
- jur_id = result['data']['jur_id']
|
|
|
- loc_id = result['data']['loc_id']
|
|
|
- type_id = result['data']['type_id']
|
|
|
- subtype_id = result['data']['subtype_id']
|
|
|
- cat_id = result['data']['cat_id']
|
|
|
# Save the page HTML (the `html` argument of this method) and open it in the
# browser, so the label lookups below run against this page rather than
# whatever the browser had loaded before.
os.makedirs(self.root_workspace, exist_ok=True)
tmpfile = os.path.join(self.root_workspace, "tmp.html")
with open(tmpfile, 'w', encoding='utf-8') as tfp:
    tfp.write(html)
self.page.get(Path(tmpfile).resolve().as_uri())

# Mapping: {result field: (label text, index of the input after the label)}.
# Location uses the 2nd input after its label; every other field the 1st.
field_config = {
    "cat_id": ("Appointment Category", 1),
    "jur_id": ("Jurisdiction", 1),
    "loc_id": ("Location", 2),
    "type_id": ("Visa Type", 1),
    "subtype_id": ("Visa Sub Type", 1),
}

# Resolve each field through the instance method (it needs self.page).
data = {
    key: self._find_id_by_label(text, idx)
    for key, (text, idx) in field_config.items()
}

jur_id = data['jur_id']
loc_id = data['loc_id']
type_id = data['type_id']
subtype_id = data['subtype_id']
cat_id = data['cat_id']
|
|
|
|
|
|
jurisdiction_list = get_js_data("jurisdictionData")
|
|
|
location_list = get_js_data("locationData")
|
|
|
@@ -652,7 +787,6 @@ class BlsPlugin(IVSPlg):
|
|
|
"""
|
|
|
domain = self.free_config.get("domain")
|
|
|
headers = {
|
|
|
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/131.0.0.0 Safari/537.36',
|
|
|
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
|
|
|
}
|
|
|
# 1. 获取表单页面 (为了提取 JS 变量映射表)
|
|
|
@@ -802,7 +936,6 @@ class BlsPlugin(IVSPlg):
|
|
|
|
|
|
# Headers 需要 Token
|
|
|
headers = {
|
|
|
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/131.0.0.0 Safari/537.36',
|
|
|
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
|
|
"Referer": f"https://{domain}/{referer}",
|
|
|
"X-Requested-With": "XMLHttpRequest",
|
|
|
@@ -924,4 +1057,40 @@ class BlsPlugin(IVSPlg):
|
|
|
|
|
|
return VSCloudApi.Instance().create_http_session(
|
|
|
sid, cookies_str, "", ua_str, proxy_str, page_url
|
|
|
- )
|
|
|
+ )
|
|
|
+
|
|
|
+ # --- 资源清理核心方法 ---
|
|
|
def cleanup(self):
    """Close the browser and delete this instance's temporary workspace.

    Safe to call more than once and on an object whose browser was never
    started. Failures are logged rather than raised.
    """
    # 1. Close the browser. getattr: this can be reached from __del__ on an
    #    object whose attributes were never fully assigned.
    page = getattr(self, 'page', None)
    browser_was_open = bool(page)
    if page:
        try:
            page.quit()  # terminates the Chrome process
        except Exception as e:
            # Best effort: the browser may already be gone.
            self._log(f"Browser quit failed (ignored): {e}")
    self.page = None

    # 2. Delete the workspace directory.
    workspace = getattr(self, 'root_workspace', None)
    if not workspace or not os.path.exists(workspace):
        return

    if browser_was_open:
        # A browser that was just closed may need a moment to release its
        # files before they can be deleted.
        time.sleep(0.2)

    # rmtree(ignore_errors=True) never raises, so success is judged by
    # whether the directory still exists, and the removal is retried.
    attempts = 3
    for attempt in range(attempts):
        shutil.rmtree(workspace, ignore_errors=True)
        if not os.path.exists(workspace):
            return
        if attempt < attempts - 1:
            self._log(f"Cleanup retry: workspace still present: {workspace}")
            time.sleep(0.5)

    self._log(f"[WARN] Failed to fully remove workspace: {workspace}")
|
|
|
+
|
|
|
def __del__(self):
    """Finalizer: release the browser and workspace when the object is collected."""
    try:
        self.cleanup()
    except Exception:
        # A finalizer cannot report errors to a caller, and it may run on a
        # partially constructed object or during interpreter shutdown, when
        # attributes and modules may already be gone. Cleanup here is strictly
        # best effort.
        pass
|