welin 2 viikkoa sitten
vanhempi
sitoutus
0cad2f8cbe
2 muutettua tiedostoa jossa 283 lisäystä ja 172 poistoa
  1. 264 171
      plugins/tls_plugin2.py
  2. 19 1
      utils/cloudflare_bypass_for_scraping2.py

+ 264 - 171
plugins/tls_plugin2.py

@@ -9,7 +9,7 @@ import queue
 import threading
 from datetime import datetime
 from typing import List, Dict, Optional, Any, Callable
-from urllib.parse import urljoin, urlparse, urlencode
+from urllib.parse import urljoin, urlparse, urlencode, parse_qs
 
 from camoufox import NewBrowser
 from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError, Page, BrowserContext
@@ -227,7 +227,7 @@ class TlsPlugin(IVSPlg):
 
     def _create_session_inner(self):
         """
-        全浏览器会话创建:过盾 -> JS注入登录 -> 原生跳转
+        全浏览器会话创建:过盾 -> JS注入登录 -> 状态机自动路由导航 -> 到达目标页
         必须在同一条 Camoufox/Playwright 工作线程中执行(Playwright 非线程安全)。
         """
         self._log(f"Initializing Session (ID: {self.instance_id})...")
@@ -258,158 +258,194 @@ class TlsPlugin(IVSPlg):
                 window=(1920, 1080),
             )
             self.page = self.browser_ctx.pages[0] if self.browser_ctx.pages else self.browser_ctx.new_page()
+            
+            # --- 初始化访问与过盾 ---
             tls_url = self.free_config.get('tls_url', '')
             self._log(f"Navigating: {tls_url}")
             self.page.goto(tls_url, wait_until="domcontentloaded")
             time.sleep(5)
+            
             cf_bypasser = CloudflareBypasser(self.page, log=True)
             if not cf_bypasser.bypass(max_retry=15):
                 raise BizLogicError("Cloudflare bypass timeout")
             time.sleep(3)
+            cf_bypasser.handle_waiting_room()
             
-            btn_selector = "button:has-text('Login')"
-            if not self._is_selector_visible(btn_selector, timeout=3000):
-                self.page.locator("a[href*='login']").first.click(timeout=5000)
-                time.sleep(3)
-            if not self._is_selector_visible(btn_selector, timeout=10000):
-                raise BizLogicError(message=f"Can't find selector={btn_selector}")
-            time.sleep(random.uniform(0.5, 1))
-
-            # recaptchav2_token = ""
-            # if self.page.ele('.g-recaptcha') or self.page.ele('xpath://iframe[contains(@src, "recaptcha")]'):
-            #     self._log("Solving ReCaptcha...")
-            #     rc_params = {
-            #         "type": "ReCaptchaV2TaskProxyLess",
-            #         "page": self.page.url,
-            #         "siteKey": "6LcDpXcfAAAAAM7wOEsF_38DNsL20tTvPTKxpyn0", 
-            #         "apiToken": self.free_config.get("capsolver_key", "")
-            #     }
-            #     recaptchav2_token = self._solve_recaptcha(rc_params)
-
-            username = self.config.account.username
-            password = self.config.account.password
-
-            self._type_into_first_visible(
-                selectors=[
-                    "input[name='email']",
-                    "input[type='email']",
-                    "input#email",
-                    "input[autocomplete='username']",
-                    "label:has-text('Email') + input",
-                ],
-                text=username,
-                field_name="Email",
-            )
-            
-            time.sleep(random.uniform(0.5, 1.2)) 
-        
-            self._type_into_first_visible(
-                selectors=[
-                    "input[name='password']",
-                    "input[type='password']",
-                    "input#password",
-                    "input[autocomplete='current-password']",
-                    "label:has-text('Password') + input",
-                ],
-                text=password,
-                field_name="Password",
-            )
-            
-            # if recaptchav2_token:
-            #     inject_recaptchav2_token_js = f"""
-            #     var g = document.getElementById('g-recaptcha-response');
-            #     if(g) {{ g.value = "{recaptchav2_token}"; }}
-            #     """
-            #     self._log("Inject ReCaptchaV2 Token via JS...")
-            #     self.page.run_js(inject_recaptchav2_token_js)
-            #     time.sleep(random.uniform(0.5, 1.0))
-            
-            self._log("Submitting Login...")
-            time.sleep(random.uniform(0.3, 0.8))
-            self.page.locator("button:has-text('Login')").first.click(timeout=10000)
-
-            self._log("Waiting for redirect...")
-            self.page.wait_for_function(
-                "() => !window.location.href.includes('login-actions')",
-                timeout=45000,
-            )
-            
-            time.sleep(3)
-            if "login-actions" in self.page.url or "auth" in self.page.url:
-                raise BizLogicError(message="Login Failed! Invalid credentials or Captcha rejected.")
-            
-            self.page.wait_for_load_state("domcontentloaded", timeout=15000)
-            time.sleep(5)
-            
-            # groups = self._parse_travel_groups(self.page.html)
-            # location = self.free_config.get('location')
-            # for g in groups:
-            #     if g['location'] == location:
-            #         self.travel_group = g
-            #         break
-            
-            # if not self.travel_group:
-            #     self._save_screenshot("group_not_found")
-            #     raise NotFoundError(f"Group not found for {location}")
-        
-            # formgroup_id = self.travel_group.get('group_number')
-
-            # btn_selector = f'tag:button@@name=formGroupId@@value={formgroup_id}'
-            # self._log(f"Waiting for visible button to render: {formgroup_id}...")
-            
-            # self.page.wait.eles_loaded(btn_selector, timeout=15)
-            
-            # buttons = self.page.eles(btn_selector)
-            # select_btn = None
-            # for btn in reversed(buttons):
-            #     try:
-            #         w, h = btn.rect.size
-            #         if w > 0 and h > 0:
-            #             select_btn = btn
-            #             break
-            #     except Exception:
-            #         continue
-            # if not select_btn:
-            #     self._save_screenshot("visible_button_not_found")
-            #     raise BizLogicError(f"Can't find any visible Select button for group {formgroup_id}")
-            # time.sleep(random.uniform(0.5, 1.2))
-            # self.mouse.human_click_ele(select_btn)
-            
-            # self._log("Waiting for url redirect...")
-            # self.page.wait.url_change('travel-groups', exclude=True, timeout=45)
-            # time.sleep(2)
-
-            # if "travel-groups" in self.page.url or "auth" in self.page.url:
-            #     raise BizLogicError(message="Redirect to service-level Failed!")
-            
-            # no_applicant_indicators = [
-            #     "Add a new applicant" in self.page.html,
-            #     "You have not yet added an applicant. Please click the button below to add one." in self.page.html,
-            #     "applicants-information" in self.page.url
-            # ]
-            # if any(no_applicant_indicators):
-            #     raise BizLogicError(message=f"No applicant added")
+            # --- 状态机导航循环 ---
+            max_steps = 20
+            session_created = False
+            has_submitted_login = False
             
-            btn_selector = '#book-appointment-btn'
-            self._log(f"Waiting for selector={btn_selector} to render...")
-            if not self._is_selector_visible(btn_selector, timeout=15000):
-                raise BizLogicError(message=f"Waiting for selector={btn_selector} timeout")
-            self.page.locator(btn_selector).first.click(timeout=10000)
-
-            time.sleep(3)
-            # self._log("Waiting for url redirect...")
-            # self.page.wait.url_change('service-level', exclude=True, timeout=45)
-            # time.sleep(2)
-
-            # if "service-level" in self.page.url or "auth" in self.page.url:
-            #     raise BizLogicError(message="Redirect to appointment-booking Failed!")
-
-            btn_selector = "button:has-text('Book your appointment')"
-            if not self._is_selector_visible(btn_selector, timeout=10000):
-                raise BizLogicError(message=f"Waiting for selector={btn_selector} timeout")
+            for step in range(max_steps):
+                current_url = self.page.url
+                self._log(f"--- [Router Step {step+1}] Current URL: {current_url} ---")
+                
+                # 状态 1:到达终极目标页面 (成功退出条件)
+                if "appointment-booking" in current_url or self.page.locator("button:has-text('Book your appointment')").first.count():
+                    btn_selector = "button:has-text('Book your appointment')"
+                    if self._is_selector_visible(btn_selector, timeout=10000):
+                        self.session_create_time = time.time()
+                        self._log("✅ Login & Navigation Success! Reached appointment-booking.")
+                        session_created = True
+                        break
+                
+                # 状态 2:遇到没有申请人的拦截页 (致命错误退出条件)
+                page_content = self.page.content()
+                no_applicant_indicators = [
+                    "Add a new applicant" in page_content,
+                    "You have not yet added an applicant" in page_content,
+                    "applicants-information" in current_url
+                ]
+                if any(no_applicant_indicators):
+                    raise BizLogicError(message="No applicant added. Cannot proceed to booking.")
+                
+                # 状态 3:首页/登录入口页 -> 需要点击进入登录
+                if self.page.locator("a[href*='login']").first.count() and not self.page.locator("label:has-text('Email')").first.count():
+                    self._log("State: Login Portal. Clicking login link...")
+                    try:
+                        self.page.locator("a[href*='login']").first.click(timeout=5000)
+                        time.sleep(3)
+                        continue
+                    except Exception:
+                        pass
                 
-            self.session_create_time = time.time()
-            self._log(f"✅ Login & Navigation Success!")
+                # 状态 4:真正的登录表单页
+                if self.page.locator("label:has-text('Email')").first.count() and not has_submitted_login:
+                    self._log("State: Login Form. Processing credentials and Captcha...")
+                    
+                    recaptchav2_token = ""
+                    if self.page.locator(".g-recaptcha").first.count() or self.page.locator("//iframe[contains(@src, 'recaptcha')]").first.count():
+                        try:
+                            rec_iframe = self.page.locator("//iframe[contains(@src, 'recaptcha')]").first
+                            rec_iframe_src = rec_iframe.get_attribute('src') or ""
+                            rec_parsed = urlparse(rec_iframe_src)
+                            rec_params = parse_qs(rec_parsed.query)
+                            rec_sitekey = rec_params.get("k", [None])[0]
+                            rec_size = rec_params.get("size", [None])[0]
+                            
+                            if 'normal' == rec_size and rec_sitekey:
+                                self._log(f"Solving ReCaptcha sitekey={rec_sitekey}...")
+                                rc_params = {
+                                    "type": "ReCaptchaV2TaskProxyLess",
+                                    "page": current_url,
+                                    "siteKey": rec_sitekey,
+                                    "apiToken": self.free_config.get("capsolver_key", "")
+                                }
+                                recaptchav2_token = self._solve_recaptcha(rc_params)
+                        except Exception as e:
+                            self._log(f"ReCaptcha extraction failed: {e}")
+                    
+                    username = self.config.account.username
+                    password = self.config.account.password
+                    
+                    self._type_into_first_visible(
+                        selectors=[
+                            "input[name='email']",
+                            "input[type='email']",
+                            "input#email",
+                            "input[autocomplete='username']",
+                            "label:has-text('Email') + input",
+                        ],
+                        text=username,
+                        field_name="Email",
+                    )
+                    time.sleep(random.uniform(0.5, 1.2))
+                    
+                    self._type_into_first_visible(
+                        selectors=[
+                            "input[name='password']",
+                            "input[type='password']",
+                            "input#password",
+                            "input[autocomplete='current-password']",
+                            "label:has-text('Password') + input",
+                        ],
+                        text=password,
+                        field_name="Password",
+                    )
+                    
+                    # 注入 Token
+                    if recaptchav2_token:
+                        inject_js = f"var g = document.getElementById('g-recaptcha-response'); if(g) {{ g.value = '{recaptchav2_token}'; }}"
+                        try:
+                            self.page.evaluate(f"() => {{ {inject_js} }}")
+                            self._log("ReCaptcha token injected")
+                        except Exception:
+                            pass
+                        time.sleep(random.uniform(0.5, 1.0))
+                    
+                    self._log("Submitting Login...")
+                    time.sleep(random.uniform(0.3, 0.8))
+                    self.page.locator("button:has-text('Login')").first.click(timeout=10000)
+                    has_submitted_login = True
+                    time.sleep(3)
+                    continue
+                
+                # 状态 5:Travel Groups 页面
+                if "travel-groups" in current_url:
+                    self._log("State: Travel Groups. Selecting targeted group...")
+                    groups = self._parse_travel_groups(self.page.content())
+                    location = self.free_config.get('location')
+                    self.travel_group = next((g for g in groups if location in g['location']), None)
+                    
+                    if not self.travel_group:
+                        self._save_screenshot("group_not_found")
+                        raise NotFoundError(f"Group not found for {location}")
+                    
+                    formgroup_id = self.travel_group.get('group_number')
+                    btn_selector = f'button[name="formGroupId"][value="{formgroup_id}"]'
+                    
+                    select_buttons = self.page.locator(btn_selector)
+                    if select_buttons.count():
+                        # 取最后一个可见的按钮
+                        select_btn = None
+                        for i in range(select_buttons.count() - 1, -1, -1):
+                            btn = select_buttons.nth(i)
+                            try:
+                                if btn.is_visible(timeout=1000):
+                                    select_btn = btn
+                                    break
+                            except Exception:
+                                continue
+                        
+                        if select_btn:
+                            time.sleep(random.uniform(0.5, 1.2))
+                            select_btn.click(timeout=10000)
+                            self._log(f"Clicked select button for group {formgroup_id}")
+                            time.sleep(3)
+                            continue
+                        else:
+                            self._log("[WARN] Select button found but not visible.")
+                    else:
+                        self._log(f"[WARN] Wait timeout for group button {formgroup_id}")
+                
+                # 状态 6:中间过渡页,需点击 "Book Appointment" 继续往下走
+                if self.page.locator('#book-appointment-btn').first.count():
+                    self._log("State: Intermediate Dashboard. Clicking Book Appointment button...")
+                    try:
+                        self.page.locator('#book-appointment-btn').first.click(timeout=10000)
+                        time.sleep(3)
+                        continue
+                    except Exception:
+                        pass
+                
+                # 状态 7:登录失败校验 或 未知加载状态
+                if "login-actions" in current_url and has_submitted_login:
+                    self._log("Waiting on login-actions... (Might be authenticating or invalid credentials)")
+                    time.sleep(2)
+                    try:
+                        if self.page.locator("text='Invalid username or password'").first.count():
+                            raise BizLogicError(message="Login Failed! Invalid credentials or Captcha rejected.")
+                    except Exception:
+                        pass
+                    continue
+                
+                # 兜底:未匹配到明确状态,等待页面渲染或重定向
+                self._log("State: Transitioning or Unknown. Waiting 2 seconds...")
+                time.sleep(2)
+            
+            # 如果循环耗尽还没到达目标
+            if not session_created:
+                raise BizLogicError(f"Failed to reach appointment-booking after {max_steps} navigation steps. Stuck at: {self.page.url}")
 
         except Exception as e:
             self._log(f"Session Create Error: {e}")
@@ -428,7 +464,7 @@ class TlsPlugin(IVSPlg):
         )
         # 结构略变:任意后代 button 带 slot
         yield self.page.locator(
-            "xpath=//div[./p and .//button[contains(@data-testid, 'slot')]]"
+            "xpath=//div[./p and .//button[contains(@data-testid, 'slot                                                                                                           ')]]"
         )
         # 仅要求有 p 与 slot 类按钮
         yield self.page.locator(
@@ -464,7 +500,7 @@ class TlsPlugin(IVSPlg):
 
         for i in range(day_blocks.count()):
             block = day_blocks.nth(i)
-            p_ele = block.locator("p").first
+            p_ele = block.locator("p").first 
             if not p_ele.count():
                 continue
             day_match = re.search(r"\d+", p_ele.inner_text())
@@ -559,7 +595,6 @@ class TlsPlugin(IVSPlg):
         if not is_on_target_month:
             self._log(f"Current is '{current_month_text}', navigating to '{target_month_text}'...")
             for _ in range(12):
-                target_btn_xpath = f'xpath://a[contains(@href, "month={interest_month}")]'
                 target_btn = self.page.locator(f"a[href*='month={interest_month}']").first
                 
                 if target_btn.count():
@@ -580,6 +615,7 @@ class TlsPlugin(IVSPlg):
             except PlaywrightTimeoutError:
                 try:
                     self.page.wait_for_load_state("domcontentloaded", timeout=10000)
+                    self.save_screenshot("query_load_timeout")
                 except PlaywrightTimeoutError:
                     pass
             time.sleep(0.8)
@@ -683,36 +719,93 @@ class TlsPlugin(IVSPlg):
 
         self._log(f"Found {len(all_possible_slots)} valid slots. selected slot: {selected_date} {selected_time.time} {selected_label}")
         
-        js_inject_and_click = f"""
-        try {{
-            const form = document.querySelector('form');
-            if (!form) return 'Form not found';
-
-            function setReactValue(input, value) {{
-                if (!input) return;
-                input.value = value;
-                input.dispatchEvent(new Event('input', {{ bubbles: true }}));
-                input.dispatchEvent(new Event('change', {{ bubbles: true }}));
-            }}
-            setReactValue(form.querySelector('input[name="date"]'), '{selected_date}');
-            setReactValue(form.querySelector('input[name="time"]'), '{selected_time.time}');
-            setReactValue(form.querySelector('input[name="appointmentLabel"]'), '{selected_label}');
-            const submitBtn = form.querySelector('button[type="submit"]');
-            if (submitBtn) {{
+        # 随机选择预订模式 - Mode 1 (鼠标移动 + JS更新 + 点击) 或 Mode 2 (直接 JS 更新 + 点击)
+        book_mode = random.choice([1, 2])
+        self._log(f"Using booking mode: {book_mode}")
+        
+        if book_mode == 1:
+            # Mode 1: 模拟真实用户行为 - 先移动鼠标到随机位置
+            rand_x = random.randint(300, 800)
+            rand_y = random.randint(400, 700)
+            self._log(f"Mode 1: Moving mouse to ({rand_x}, {rand_y}) and clicking")
+            # Playwright 中不直接支持 HumanMouse,但可以通过 hover 和 click 来实现
+            dummy_locator = self.page.locator(f"xpath=//*[@id='dummy_{random.randint(1000, 9999)}']")
+            try:
+                # 如果虚拟定位器存在就点击(通常不会存在),否则只是触发 mousemove 事件
+                dummy_locator.first.click(timeout=500)
+            except Exception:
+                pass
+            
+            js_update_form = f"""
+            try {{
+                const buttons = Array.from(document.querySelectorAll('button[type="submit"]'));
+                const submitBtn = buttons.find(btn => {{
+                    return btn.textContent.trim().toLowerCase().includes('book your appointment');
+                }});
+                if (!submitBtn) return 'Submit button not found';
+                const form = submitBtn.closest('form');
+                if (!form) return 'Correct form not found';
+                function setReactValue(input, value) {{
+                    if (!input) return;
+                    input.value = value;
+                    input.dispatchEvent(new Event('input', {{ bubbles: true }}));
+                    input.dispatchEvent(new Event('change', {{ bubbles: true }}));
+                }}
+                setReactValue(form.querySelector('input[name="date"]'), '{selected_date}');
+                setReactValue(form.querySelector('input[name="time"]'), '{selected_time.time}');
+                setReactValue(form.querySelector('input[name="appointmentLabel"]'), '{selected_label}');
                 submitBtn.removeAttribute('disabled');
                 submitBtn.classList.remove('opacity-50', 'cursor-not-allowed'); 
+                return 'form_updated';
+            }} catch (e) {{
+                return e.toString();
+            }}
+            """
+            update_res = self.page.evaluate(f"() => {{ {js_update_form} }}")
+            self._log(f"Mode 1: Form update triggered: {update_res}")
+            
+            if update_res != 'form_updated':
+                raise BizLogicError(message=f"Failed to update form in Mode 1: {update_res}")
+            
+            # 通过按钮定位器点击
+            submit_btn = self.page.locator("button:has-text('Book your appointment')").first
+            if not submit_btn.count():
+                raise BizLogicError(message="Submit button not found for Mode 1")
+            
+            self._log("Mode 1: Moving mouse to submit button and clicking")
+            time.sleep(random.uniform(0.2, 0.5))
+            submit_btn.click(timeout=10000)
+            inject_res = 'clicked'
+        
+        else:
+            # Mode 2: 直接 JS 注入和点击 (更快但可能被检测)
+            js_inject_and_click = f"""
+            try {{
+                const buttons = Array.from(document.querySelectorAll('button[type="submit"]'));
+                const submitBtn = buttons.find(btn => {{
+                    return btn.textContent.trim().toLowerCase().includes('book your appointment');
+                }});
+                if (!submitBtn) return 'Submit button not found';
+                const form = submitBtn.closest('form');
+                if (!form) return 'Correct form not found';
+                function setReactValue(input, value) {{
+                    if (!input) return;
+                    input.value = value;
+                    input.dispatchEvent(new Event('input', {{ bubbles: true }}));
+                    input.dispatchEvent(new Event('change', {{ bubbles: true }}));
+                }}
+                setReactValue(form.querySelector('input[name="date"]'), '{selected_date}');
+                setReactValue(form.querySelector('input[name="time"]'), '{selected_time.time}');
+                setReactValue(form.querySelector('input[name="appointmentLabel"]'), '{selected_label}');
+                submitBtn.removeAttribute('disabled');
                 submitBtn.click();
                 return 'clicked';
-            }} else {{
-                return 'Submit button not found';
+            }} catch (e) {{
+                return e.toString();
             }}
-        }} catch (e) {{
-            return e.toString();
-        }}
-        """
-        
-        inject_res = self.page.evaluate(f"() => {{ {js_inject_and_click} }}")
-        self._log(f"Form submission triggered: {inject_res}")
+            """
+            inject_res = self.page.evaluate(f"() => {{ {js_inject_and_click} }}")
+            self._log(f"Mode 2: Form submission triggered: {inject_res}")
         
         if inject_res != 'clicked':
             raise BizLogicError(message="Failed to inject form or click the submit button")

+ 19 - 1
utils/cloudflare_bypass_for_scraping2.py

@@ -242,4 +242,22 @@ class CloudflareBypasser:
         if not final_ok:
             self.log_message(f"Bypass failed after retries. final_state: {self._collect_page_state()}")
         return final_ok
-            
+
+    def handle_waiting_room(self, timeout_seconds=6 * 60, poll_seconds=10):
+        """
+        Block while the page shows a queue / waiting-room interstitial.
+
+        The page HTML is re-read every ``poll_seconds`` until neither marker
+        ("file d'attente" / "waiting room") is present, or until
+        ``timeout_seconds`` has elapsed.
+
+        Args:
+            timeout_seconds: Maximum total time to keep waiting.
+            poll_seconds: Delay between two consecutive page checks.
+
+        Returns:
+            True when the last check found no waiting-room marker;
+            False when the wait timed out or the page could not be read.
+        """
+        # Monotonic clock: a wall-clock adjustment during the wait must not
+        # shorten or extend the timeout.
+        wait_start = time.monotonic()
+        while True:
+            try:
+                page = self._normalize_page()
+                html = (page.content() or "").lower()
+            except Exception as e:
+                # Best effort: log and stop waiting instead of propagating.
+                self.log_message(f"Waiting room check failed: {e}")
+                return False
+
+            if "file d'attente" not in html and "waiting room" not in html:
+                return True
+
+            remaining = timeout_seconds - (time.monotonic() - wait_start)
+            if remaining < 0:
+                self.log_message("Waiting room timeout reached.")
+                return False
+            self.log_message("In Waiting Room... waiting for auto-refresh.")
+            # Never sleep past the deadline.
+            time.sleep(max(0, min(poll_seconds, remaining)))