|
|
@@ -24,6 +24,7 @@ from cryptography.hazmat.backends import default_backend
|
|
|
from vs_plg import IVSPlg
|
|
|
from vs_types import VSPlgConfig, VSQueryResult, VSBookResult, DateAvailability, AvailabilityStatus, NotFoundError, PermissionDeniedError, RateLimiteddError, SessionExpiredOrInvalidError, BizLogicError
|
|
|
from toolkit.vs_cloud_api import VSCloudApi
|
|
|
+from toolkit.proxy_tunnel import ProxyTunnel
|
|
|
from utils.cloudflare_bypass_for_scraping import CloudflareBypasser
|
|
|
|
|
|
|
|
|
@@ -81,75 +82,6 @@ def to_yyyymmdd(data_str: str, date_str_format: str, target_format: str="%Y-%m-%
|
|
|
except:
|
|
|
return data_str
|
|
|
|
|
|
-def create_proxy_auth_extension(ip, port, username, password, plugin_path):
|
|
|
- """
|
|
|
- 创建一个 Chrome 插件来自动处理代理认证
|
|
|
- """
|
|
|
- if not os.path.exists(plugin_path):
|
|
|
- os.makedirs(plugin_path)
|
|
|
-
|
|
|
- # 1. manifest.json
|
|
|
- manifest_json = """
|
|
|
- {
|
|
|
- "version": "1.0.0",
|
|
|
- "manifest_version": 2,
|
|
|
- "name": "Chrome Proxy Auth Extension",
|
|
|
- "permissions": [
|
|
|
- "proxy",
|
|
|
- "tabs",
|
|
|
- "unlimitedStorage",
|
|
|
- "storage",
|
|
|
- "<all_urls>",
|
|
|
- "webRequest",
|
|
|
- "webRequestBlocking"
|
|
|
- ],
|
|
|
- "background": {
|
|
|
- "scripts": ["background.js"]
|
|
|
- },
|
|
|
- "minimum_chrome_version": "22.0.0"
|
|
|
- }
|
|
|
- """
|
|
|
-
|
|
|
- # 2. background.js
|
|
|
- background_js = f"""
|
|
|
- var config = {{
|
|
|
- mode: "fixed_servers",
|
|
|
- rules: {{
|
|
|
- singleProxy: {{
|
|
|
- scheme: "http",
|
|
|
- host: "{ip}",
|
|
|
- port: parseInt({port})
|
|
|
- }},
|
|
|
- bypassList: ["localhost"]
|
|
|
- }}
|
|
|
- }};
|
|
|
-
|
|
|
- chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});
|
|
|
-
|
|
|
- function callbackFn(details) {{
|
|
|
- return {{
|
|
|
- authCredentials: {{
|
|
|
- username: "{username}",
|
|
|
- password: "{password}"
|
|
|
- }}
|
|
|
- }};
|
|
|
- }}
|
|
|
-
|
|
|
- chrome.webRequest.onAuthRequired.addListener(
|
|
|
- callbackFn,
|
|
|
- {{urls: ["<all_urls>"]}},
|
|
|
- ['blocking']
|
|
|
- );
|
|
|
- """
|
|
|
-
|
|
|
- with open(os.path.join(plugin_path, "manifest.json"), "w") as f:
|
|
|
- f.write(manifest_json)
|
|
|
-
|
|
|
- with open(os.path.join(plugin_path, "background.js"), "w") as f:
|
|
|
- f.write(background_js)
|
|
|
-
|
|
|
- return os.path.abspath(plugin_path)
|
|
|
-
|
|
|
# --- 模拟 Requests Response 对象 ---
|
|
|
class BrowserResponse:
|
|
|
def __init__(self, result_dict):
|
|
|
@@ -201,10 +133,11 @@ class VfsPlugin2(IVSPlg):
|
|
|
# 生成唯一实例 ID
|
|
|
self.instance_id = uuid.uuid4().hex[:8]
|
|
|
self.root_workspace = os.path.abspath(os.path.join("temp_browser_data", f"{self.group_id}_{self.instance_id}"))
|
|
|
- # 定义子目录:代理插件目录 & 浏览器用户数据目录
|
|
|
- self.proxy_ext_path = os.path.join(self.root_workspace, "proxy_ext")
|
|
|
self.user_data_path = os.path.join(self.root_workspace, "user_data")
|
|
|
|
|
|
+ # 持有隧道实例
|
|
|
+ self.tunnel = None
|
|
|
+
|
|
|
# 确保根目录存在 (子目录由具体逻辑创建)
|
|
|
if not os.path.exists(self.root_workspace):
|
|
|
os.makedirs(self.root_workspace)
|
|
|
@@ -259,31 +192,65 @@ class VfsPlugin2(IVSPlg):
|
|
|
|
|
|
# 0. 配置浏览器
|
|
|
co = ChromiumOptions()
|
|
|
- co.auto_port()
|
|
|
+ # -------------------------------------------------------------
|
|
|
+ # [核心修复] 解决 'not enough values to unpack'
|
|
|
+ # -------------------------------------------------------------
|
|
|
+ # 1. 不要用 co.auto_port(),因为它依赖解析 stdout,会被 DBus 报错干扰
|
|
|
+        # 2. Instead, ask the OS for a free port (bind to port 0 and read the assigned port back)
|
|
|
+ import random
|
|
|
+ import socket
|
|
|
+
|
|
|
+        def get_free_port():  # returns a TCP port number picked by the OS
|
|
|
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
|
+                s.bind(('', 0))  # port 0 lets the OS choose any currently unused port
|
|
|
+                return s.getsockname()[1]  # socket is closed on leaving the with-block, so the port is released before the caller uses it
|
|
|
+
|
|
|
+ debug_port = get_free_port()
|
|
|
+ self._log(f"Assigned Debug Port: {debug_port}")
|
|
|
+
|
|
|
+ # 3. 强制指定端口,DrissionPage 就会直接连接,不再解析日志
|
|
|
+ co.set_local_port(debug_port)
|
|
|
|
|
|
# --- [关键配置] 设置独立的用户数据目录 ---
|
|
|
# 这样每个实例的 Cache, Cookies, LocalStorage 都是完全隔离的
|
|
|
# 同时也防止了多进程争抢同一个 Default 文件夹导致的崩溃
|
|
|
co.set_user_data_path(self.user_data_path)
|
|
|
|
|
|
- # 代理配置
|
|
|
+ # --- 1. 指定浏览器路径 (适配 Docker) ---
|
|
|
+ chrome_path = os.getenv("CHROME_BIN")
|
|
|
+ if chrome_path and os.path.exists(chrome_path):
|
|
|
+ co.set_paths(browser_path=chrome_path)
|
|
|
+
|
|
|
+ # --- [核心修改] 代理配置 ---
|
|
|
if self.config.proxy and self.config.proxy.ip:
|
|
|
p = self.config.proxy
|
|
|
+
|
|
|
if p.username and p.password:
|
|
|
- self._log(f"Configuring authenticated proxy: {p.ip}:{p.port}")
|
|
|
- # [关键调用] 生成该实例独享的插件
|
|
|
- plugin_path = create_proxy_auth_extension(
|
|
|
- p.ip, p.port, p.username, p.password,
|
|
|
- self.proxy_ext_path # 传入唯一路径
|
|
|
- )
|
|
|
- co.add_extension(plugin_path)
|
|
|
+ self._log(f"Starting Proxy Tunnel for {p.ip}...")
|
|
|
+
|
|
|
+ # 1. 启动本地隧道
|
|
|
+ self.tunnel = ProxyTunnel(p.ip, p.port, p.username, p.password)
|
|
|
+ local_proxy = self.tunnel.start()
|
|
|
+
|
|
|
+ self._log(f"Tunnel started at {local_proxy}")
|
|
|
+
|
|
|
+ # 2. Chrome 连接本地免密端口
|
|
|
+ # 必须使用 --proxy-server 强制指定,绝对稳健
|
|
|
+ co.set_argument(f'--proxy-server={local_proxy}')
|
|
|
+
|
|
|
else:
|
|
|
- self._log(f"Configuring standard proxy: {p.ip}:{p.port}")
|
|
|
- co.set_proxy(f"{p.scheme}://{p.ip}:{p.port}")
|
|
|
+ # 无密码代理,直接用
|
|
|
+ proxy_str = f"{p.scheme}://{p.ip}:{p.port}"
|
|
|
+ co.set_argument(f'--proxy-server={proxy_str}')
|
|
|
+ else:
|
|
|
+ self._log("[WARN] No proxy configured!")
|
|
|
|
|
|
co.headless(False)
|
|
|
co.set_argument('--no-sandbox')
|
|
|
co.set_argument('--disable-gpu')
|
|
|
+ # Docker 默认 /dev/shm 只有 64MB,Chromium 很容易爆内存崩溃
|
|
|
+ co.set_argument('--disable-dev-shm-usage')
|
|
|
+
|
|
|
co.set_argument('--window-size=1920,1080')
|
|
|
co.set_argument('--disable-blink-features=AutomationControlled')
|
|
|
|
|
|
@@ -454,7 +421,7 @@ class VfsPlugin2(IVSPlg):
|
|
|
|
|
|
return result
|
|
|
|
|
|
- def _perform_request(self, method, url, headers=None, data=None, json_data=None, params=None):
|
|
|
+ def _perform_request(self, method, url, headers=None, data=None, json_data=None, params=None, retry_count=0):
|
|
|
"""
|
|
|
核心方法:在 DrissionPage 浏览器上下文中注入 JS 执行 fetch
|
|
|
"""
|
|
|
@@ -534,7 +501,16 @@ class VfsPlugin2(IVSPlg):
|
|
|
self.is_healthy = False
|
|
|
raise SessionExpiredOrInvalidError(f"401 Unauthorized: {resp.text[:100]}")
|
|
|
elif resp.status_code == 403:
|
|
|
- raise PermissionDeniedError(f"403 Forbidden: {resp.text[:100]}")
|
|
|
+ if "Just a moment" in resp.text or "cloudflare" in resp.text.lower():
|
|
|
+ self._log(f"HTTP 403 (Cloudflare) detected. Re-verifying (Try {retry_count+1}/3)...")
|
|
|
+ if retry_count < 3:
|
|
|
+ new_token = self._refresh_turnstile_token()
|
|
|
+ if new_token:
|
|
|
+ self._log("In-page verification success. Retrying...")
|
|
|
+ if json_data and "captcha_api_key" in json_data:
|
|
|
+ json_data["captcha_api_key"] = new_token
|
|
|
+ return self._perform_request(method, url, headers, data, json_data, params, retry_count+1)
|
|
|
+ raise PermissionDeniedError(f"HTTP 403 Forbidden: {resp.text[:100]}")
|
|
|
elif resp.status_code == 429:
|
|
|
self.is_healthy = False
|
|
|
raise RateLimiteddError(f"429 Rate Limit: {resp.text[:100]}")
|
|
|
@@ -682,7 +658,7 @@ class VfsPlugin2(IVSPlg):
|
|
|
# fetch 不需要显式 content-type application/json,json_data会自动处理
|
|
|
|
|
|
# DrissionPage 不需要手动处理 403 绕盾,因为浏览器本身就在盾后面
|
|
|
- resp = self._perform_request("POST", url, headers=headers, json_data=data)
|
|
|
+ resp = self._perform_request("POST", url, headers=headers, json_data=data, retry_count=2)
|
|
|
|
|
|
if "WaitList" in resp.text:
|
|
|
return "WaitList"
|
|
|
@@ -802,7 +778,7 @@ class VfsPlugin2(IVSPlg):
|
|
|
"countrycode": country,
|
|
|
"languageCode": "en-US",
|
|
|
"captcha_version": "cloudflare-v1",
|
|
|
- "captcha_api_key": new_cf_token, # <--- 使用新 Token
|
|
|
+ "captcha_api_key": new_cf_token,
|
|
|
"otp": otp
|
|
|
}
|
|
|
|
|
|
@@ -1469,12 +1445,18 @@ class VfsPlugin2(IVSPlg):
|
|
|
break
|
|
|
except Exception as e:
|
|
|
# 如果删除失败(通常是Windows文件占用),重试
|
|
|
- if self.logger: self.logger(f"Cleanup retry: {e}")
|
|
|
+ self._log(f"Cleanup retry: {e}")
|
|
|
time.sleep(0.5)
|
|
|
|
|
|
# 如果依然存在,打印警告(虽然 ignore_errors=True 会掩盖报错,但可以 check exists)
|
|
|
- if os.path.exists(self.root_workspace) and self.logger:
|
|
|
- self.logger(f"[WARN] Failed to fully remove workspace: {self.root_workspace}")
|
|
|
+ if os.path.exists(self.root_workspace):
|
|
|
+ self._log(f"[WARN] Failed to fully remove workspace: {self.root_workspace}")
|
|
|
+
|
|
|
+ # 3. [新增] 关闭代理隧道
|
|
|
+ if self.tunnel:
|
|
|
+ try: self.tunnel.stop()
|
|
|
+ except: pass
|
|
|
+ self.tunnel = None
|
|
|
|
|
|
def __del__(self):
|
|
|
"""
|