|
@@ -34,6 +34,7 @@ class OrderBookerGCO:
|
|
|
self.queue_backoff = ExponentialBackoff(base_delay=1*60.0, max_delay=10*60.0, factor=2.0)
|
|
self.queue_backoff = ExponentialBackoff(base_delay=1*60.0, max_delay=10*60.0, factor=2.0)
|
|
|
self.account_backoff = ExponentialBackoff(base_delay=5*60.0, max_delay=2*60*60.0, factor=2.0)
|
|
self.account_backoff = ExponentialBackoff(base_delay=5*60.0, max_delay=2*60*60.0, factor=2.0)
|
|
|
self.m_last_spawn_time = 0.0
|
|
self.m_last_spawn_time = 0.0
|
|
|
|
|
+ self.heartbeat_ttl = 300
|
|
|
|
|
|
|
|
def _log(self, message):
|
|
def _log(self, message):
|
|
|
if self.m_logger:
|
|
if self.m_logger:
|
|
@@ -58,66 +59,70 @@ class OrderBookerGCO:
|
|
|
|
|
|
|
|
def _get_redis_key(self, routing_key: str) -> str:
|
|
def _get_redis_key(self, routing_key: str) -> str:
|
|
|
return f"vs:signal:{routing_key}"
|
|
return f"vs:signal:{routing_key}"
|
|
|
-
|
|
|
|
|
- def _safe_return_task(self, task_id: int, reason: str = ""):
|
|
|
|
|
- if not task_id:
|
|
|
|
|
- return
|
|
|
|
|
- try:
|
|
|
|
|
- task_data = VSCloudApi.Instance().get_vas_task(task_id)
|
|
|
|
|
- if not task_data:
|
|
|
|
|
- self.redis_client.zrem(self.m_tracker_key, task_id)
|
|
|
|
|
- return
|
|
|
|
|
-
|
|
|
|
|
- current_status = task_data.get('status', '')
|
|
|
|
|
- if current_status in['pending', 'grabbed', 'cancelled', 'success']:
|
|
|
|
|
- self.redis_client.zrem(self.m_tracker_key, task_id)
|
|
|
|
|
- return
|
|
|
|
|
-
|
|
|
|
|
- self._log(f"Returning task={task_id} to queue. Reason: {reason}")
|
|
|
|
|
- VSCloudApi.Instance().return_vas_task_to_queue(task_id)
|
|
|
|
|
-
|
|
|
|
|
- # 归还成功,核销防丢记录
|
|
|
|
|
- self.redis_client.zrem(self.m_tracker_key, task_id)
|
|
|
|
|
-
|
|
|
|
|
- except Exception as ex:
|
|
|
|
|
- self._log(f"Failed to safely return task for task_id {task_id}: {ex}")
|
|
|
|
|
|
|
|
|
|
def _maintain_loop(self):
|
|
def _maintain_loop(self):
|
|
|
self._log("Maintain loop started.")
|
|
self._log("Maintain loop started.")
|
|
|
- rng = random.Random()
|
|
|
|
|
|
|
+ heartbeat_interval = 60
|
|
|
while not self.m_stop_event.is_set():
|
|
while not self.m_stop_event.is_set():
|
|
|
- wait_seconds = rng.randint(180, 300)
|
|
|
|
|
- for _ in range(wait_seconds):
|
|
|
|
|
|
|
+ for _ in range(heartbeat_interval):
|
|
|
if self.m_stop_event.is_set():
|
|
if self.m_stop_event.is_set():
|
|
|
return
|
|
return
|
|
|
time.sleep(1.0)
|
|
time.sleep(1.0)
|
|
|
|
|
|
|
|
with self.m_lock:
|
|
with self.m_lock:
|
|
|
tasks_to_check = list(self.m_tasks)
|
|
tasks_to_check = list(self.m_tasks)
|
|
|
|
|
+
|
|
|
|
|
+ if not tasks_to_check:
|
|
|
|
|
+ continue
|
|
|
|
|
|
|
|
healthy_tasks = []
|
|
healthy_tasks = []
|
|
|
dead_tasks = []
|
|
dead_tasks = []
|
|
|
|
|
+ now = time.time()
|
|
|
|
|
|
|
|
for t in tasks_to_check:
|
|
for t in tasks_to_check:
|
|
|
- try:
|
|
|
|
|
- t.instance.keep_alive()
|
|
|
|
|
- if t.instance.health_check():
|
|
|
|
|
- healthy_tasks.append(t)
|
|
|
|
|
- else:
|
|
|
|
|
|
|
+ if now >= t.next_remote_ping:
|
|
|
|
|
+ try:
|
|
|
|
|
+ t.instance.keep_alive()
|
|
|
|
|
+ if t.instance.health_check():
|
|
|
|
|
+ healthy_tasks.append(t)
|
|
|
|
|
+ next_delay = random.randint(180, 300)
|
|
|
|
|
+ t.next_remote_ping = now + next_delay
|
|
|
|
|
+ self._log(f"🛡️ Task={t.task_ref} keep-alive success. Next ping in {next_delay}s.")
|
|
|
|
|
+ else:
|
|
|
|
|
+ dead_tasks.append(t)
|
|
|
|
|
+ self._log(f"♻️ Instance for task={t.task_ref} unhealthy.")
|
|
|
|
|
+ except Exception as e:
|
|
|
dead_tasks.append(t)
|
|
dead_tasks.append(t)
|
|
|
- self._log(f"♻️ Instance for task={t.task_ref} unhealthy, marking for removal.")
|
|
|
|
|
|
|
+ self._log(f"♻️ Instance for task={t.task_ref} keep-alive failed: {e}.")
|
|
|
|
|
+ else:
|
|
|
|
|
+ healthy_tasks.append(t)
|
|
|
|
|
+
|
|
|
|
|
+ if healthy_tasks:
|
|
|
|
|
+ try:
|
|
|
|
|
+ pipeline = self.redis_client.pipeline()
|
|
|
|
|
+ new_deadline = time.time() + self.heartbeat_ttl
|
|
|
|
|
+ for t in healthy_tasks:
|
|
|
|
|
+ if t.task_ref is not None:
|
|
|
|
|
+ pipeline.zadd(self.m_tracker_key, {str(t.task_ref): new_deadline})
|
|
|
|
|
+ pipeline.execute()
|
|
|
|
|
+ self._log(f"💓 Heartbeat sent. Renewed {len(healthy_tasks)} tasks.")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ self._log(f"Redis Heartbeat update failed: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ if dead_tasks:
|
|
|
|
|
+ try:
|
|
|
|
|
+ pipeline = self.redis_client.pipeline()
|
|
|
|
|
+ for t in dead_tasks:
|
|
|
|
|
+ if t.task_ref is not None:
|
|
|
|
|
+ pipeline.zadd(self.m_tracker_key, {str(t.task_ref): 0})
|
|
|
|
|
+ pipeline.execute()
|
|
|
|
|
+ self._log(f"🗑️ Handed over {len(dead_tasks)} dead tasks to Sweeper.")
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
- dead_tasks.append(t)
|
|
|
|
|
- self._log(f"♻️ Instance for task={t.task_ref} keep-alive failed: {e}, marking for removal.")
|
|
|
|
|
|
|
+ pass
|
|
|
|
|
|
|
|
with self.m_lock:
|
|
with self.m_lock:
|
|
|
self.m_tasks = [t for t in self.m_tasks if t in healthy_tasks]
|
|
self.m_tasks = [t for t in self.m_tasks if t in healthy_tasks]
|
|
|
|
|
|
|
|
- # 实例死亡,调用安全归还函数
|
|
|
|
|
- for t in dead_tasks:
|
|
|
|
|
- if t.task_ref is not None:
|
|
|
|
|
- self._safe_return_task(t.task_ref, reason="Instance died during maintain_loop")
|
|
|
|
|
-
|
|
|
|
|
def _booking_trigger_loop(self):
|
|
def _booking_trigger_loop(self):
|
|
|
self._log("Trigger loop started.")
|
|
self._log("Trigger loop started.")
|
|
|
while not self.m_stop_event.is_set():
|
|
while not self.m_stop_event.is_set():
|
|
@@ -170,17 +175,16 @@ class OrderBookerGCO:
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
task_data = VSCloudApi.Instance().get_vas_task(task_id)
|
|
task_data = VSCloudApi.Instance().get_vas_task(task_id)
|
|
|
- if not task_data or task_data.get('status') in ['grabbed', 'cancelled']:
|
|
|
|
|
|
|
+ if not task_data or task_data.get('status') in ['grabbed', 'pause', 'completed', 'cancelled']:
|
|
|
self._log(f"Bound Task={task_id} is no longer valid or already processed. Removing instance.")
|
|
self._log(f"Bound Task={task_id} is no longer valid or already processed. Removing instance.")
|
|
|
with self.m_lock:
|
|
with self.m_lock:
|
|
|
if task in self.m_tasks:
|
|
if task in self.m_tasks:
|
|
|
self.m_tasks.remove(task)
|
|
self.m_tasks.remove(task)
|
|
|
|
|
+ self.redis_client.zrem(self.m_tracker_key, task_id)
|
|
|
return
|
|
return
|
|
|
|
|
|
|
|
order_id = task_data.get('order_id')
|
|
order_id = task_data.get('order_id')
|
|
|
user_input = task_data.get('user_inputs', {})
|
|
user_input = task_data.get('user_inputs', {})
|
|
|
- self.redis_client.zadd(self.m_tracker_key, {str(task_id): time.time()})
|
|
|
|
|
-
|
|
|
|
|
book_res = task.instance.book(query_result, user_input)
|
|
book_res = task.instance.book(query_result, user_input)
|
|
|
|
|
|
|
|
if book_res.success:
|
|
if book_res.success:
|
|
@@ -238,13 +242,7 @@ class OrderBookerGCO:
|
|
|
|
|
|
|
|
t_cd = self.task_backoff.calculate(t_fails)
|
|
t_cd = self.task_backoff.calculate(t_fails)
|
|
|
self._log(f"⏳ Task={task_id} (Booking Attempt {t_fails}) suspended for {t_cd:.1f}s.")
|
|
self._log(f"⏳ Task={task_id} (Booking Attempt {t_fails}) suspended for {t_cd:.1f}s.")
|
|
|
-
|
|
|
|
|
- def delayed_return(tid, wait_sec, reason):
|
|
|
|
|
- self.m_stop_event.wait(wait_sec)
|
|
|
|
|
- self._safe_return_task(tid, reason=reason)
|
|
|
|
|
-
|
|
|
|
|
- t = threading.Thread(target=delayed_return, args=(task_id, t_cd, f"Booking failed: rate limited (fails={t_fails})"), daemon=True)
|
|
|
|
|
- t.start()
|
|
|
|
|
|
|
+ self.redis_client.zadd(self.m_tracker_key, {str(task_id): time.time() + t_cd})
|
|
|
|
|
|
|
|
def _creator_loop(self):
|
|
def _creator_loop(self):
|
|
|
self._log("Creator loop started.")
|
|
self._log("Creator loop started.")
|
|
@@ -267,7 +265,6 @@ class OrderBookerGCO:
|
|
|
now = time.time()
|
|
now = time.time()
|
|
|
if now - self.m_last_spawn_time >= spawn_interval:
|
|
if now - self.m_last_spawn_time >= spawn_interval:
|
|
|
self.m_last_spawn_time = now
|
|
self.m_last_spawn_time = now
|
|
|
- self._log(f"Staggered: Spawning booker for [{r_key}]. Next in {spawn_interval}s.")
|
|
|
|
|
self._spawn_worker(r_key)
|
|
self._spawn_worker(r_key)
|
|
|
break
|
|
break
|
|
|
|
|
|
|
@@ -278,6 +275,8 @@ class OrderBookerGCO:
|
|
|
def _job():
|
|
def _job():
|
|
|
success = False
|
|
success = False
|
|
|
task_id = None
|
|
task_id = None
|
|
|
|
|
+ is_rate_limited = False
|
|
|
|
|
+
|
|
|
try:
|
|
try:
|
|
|
queue_name = f"auto.{target_routing_key}"
|
|
queue_name = f"auto.{target_routing_key}"
|
|
|
task_data = VSCloudApi.Instance().get_vas_task_pop(queue_name)
|
|
task_data = VSCloudApi.Instance().get_vas_task_pop(queue_name)
|
|
@@ -286,7 +285,7 @@ class OrderBookerGCO:
|
|
|
|
|
|
|
|
task_id = task_data['id']
|
|
task_id = task_data['id']
|
|
|
|
|
|
|
|
- self.redis_client.zadd(self.m_tracker_key, {str(task_id): time.time()})
|
|
|
|
|
|
|
+ self.redis_client.zadd(self.m_tracker_key, {str(task_id): time.time() + self.heartbeat_ttl})
|
|
|
user_inputs = task_data.get('user_inputs', {})
|
|
user_inputs = task_data.get('user_inputs', {})
|
|
|
|
|
|
|
|
plg_cfg = VSPlgConfig()
|
|
plg_cfg = VSPlgConfig()
|
|
@@ -322,7 +321,8 @@ class OrderBookerGCO:
|
|
|
task_ref=task_id,
|
|
task_ref=task_id,
|
|
|
acceptable_routing_keys=acceptable_keys,
|
|
acceptable_routing_keys=acceptable_keys,
|
|
|
source_queue=target_routing_key,
|
|
source_queue=target_routing_key,
|
|
|
- book_allowed=True
|
|
|
|
|
|
|
+ book_allowed=True,
|
|
|
|
|
+ next_remote_ping=time.time() + random.randint(180, 300)
|
|
|
)
|
|
)
|
|
|
)
|
|
)
|
|
|
queue_fail_key = f"vs:queue:failures:{target_routing_key}"
|
|
queue_fail_key = f"vs:queue:failures:{target_routing_key}"
|
|
@@ -346,6 +346,7 @@ class OrderBookerGCO:
|
|
|
"Rate limited" in err_str
|
|
"Rate limited" in err_str
|
|
|
]
|
|
]
|
|
|
if any(rate_limited_indicators):
|
|
if any(rate_limited_indicators):
|
|
|
|
|
+ is_rate_limited = True
|
|
|
queue_fail_key = f"vs:queue:failures:{target_routing_key}"
|
|
queue_fail_key = f"vs:queue:failures:{target_routing_key}"
|
|
|
queue_cd_key = f"vs:queue:cooldown:{target_routing_key}"
|
|
queue_cd_key = f"vs:queue:cooldown:{target_routing_key}"
|
|
|
q_fails = self.redis_client.incr(queue_fail_key)
|
|
q_fails = self.redis_client.incr(queue_fail_key)
|
|
@@ -364,22 +365,13 @@ class OrderBookerGCO:
|
|
|
|
|
|
|
|
t_cd = self.account_backoff.calculate(t_fails)
|
|
t_cd = self.account_backoff.calculate(t_fails)
|
|
|
self._log(f"⏳ Task={task_id} (Attempt {t_fails}) suspended for {t_cd:.1f}s.")
|
|
self._log(f"⏳ Task={task_id} (Attempt {t_fails}) suspended for {t_cd:.1f}s.")
|
|
|
-
|
|
|
|
|
- def delayed_return(tid, wait_sec, reason):
|
|
|
|
|
- self.m_stop_event.wait(wait_sec)
|
|
|
|
|
- self._safe_return_task(tid, reason=reason)
|
|
|
|
|
-
|
|
|
|
|
- t = threading.Thread(target=delayed_return, args=(task_id, t_cd, f"Spawn failed: rate limited (fails={t_fails})"), daemon=True)
|
|
|
|
|
- t.start()
|
|
|
|
|
-
|
|
|
|
|
- task_id = None
|
|
|
|
|
-
|
|
|
|
|
|
|
+ self.redis_client.zadd(self.m_tracker_key, {str(task_id): time.time() + t_cd})
|
|
|
finally:
|
|
finally:
|
|
|
with self.m_lock:
|
|
with self.m_lock:
|
|
|
self.m_pending_order_by_queue[target_routing_key] = max(0, self.m_pending_order_by_queue[target_routing_key] - 1)
|
|
self.m_pending_order_by_queue[target_routing_key] = max(0, self.m_pending_order_by_queue[target_routing_key] - 1)
|
|
|
|
|
|
|
|
# 创建/登录失败,调用安全归还函数
|
|
# 创建/登录失败,调用安全归还函数
|
|
|
- if not success and task_id is not None:
|
|
|
|
|
- self._safe_return_task(task_id, reason="Instance spawn/login failed")
|
|
|
|
|
-
|
|
|
|
|
|
|
+ if not success and task_id is not None and not is_rate_limited:
|
|
|
|
|
+ self.redis_client.zadd(self.m_tracker_key, {str(task_id): 0})
|
|
|
|
|
+ self._log(f"♻️ Task={task_id} failed normal spawn. Instantly handed over to Sweeper.")
|
|
|
ThreadPool.getInstance().enqueue(_job)
|
|
ThreadPool.getInstance().enqueue(_job)
|