File size: 4,751 Bytes
d6d843f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
"""
Background Scheduler for API Health Checks
Runs periodic health checks with APScheduler
"""
import asyncio
import logging
from datetime import datetime
from apscheduler.schedulers.background import BackgroundScheduler as APScheduler
from apscheduler.triggers.interval import IntervalTrigger
from typing import Optional
logger = logging.getLogger(__name__)
class BackgroundScheduler:
"""Background scheduler for periodic health checks"""
def __init__(self, monitor, database, interval_minutes: int = 5):
"""
Initialize the scheduler
Args:
monitor: APIMonitor instance
database: Database instance
interval_minutes: Interval between health checks
"""
self.monitor = monitor
self.database = database
self.interval_minutes = interval_minutes
self.scheduler = APScheduler()
self.last_run_time: Optional[datetime] = None
self._running = False
def _run_health_check(self):
"""Run health check and save results"""
try:
logger.info("Running scheduled health check...")
self.last_run_time = datetime.now()
# Run async health check
results = asyncio.run(self.monitor.check_all())
# Save to database
self.database.save_health_checks(results)
# Check for incidents (offline Tier 1 providers)
for result in results:
if result.status.value == "offline":
# Check if provider is Tier 1
resources = self.monitor.config.get_all_resources()
resource = next((r for r in resources if r.get('name') == result.provider_name), None)
if resource and resource.get('tier', 3) == 1:
# Create incident for Tier 1 outage
self.database.create_incident(
provider_name=result.provider_name,
category=result.category,
incident_type="service_offline",
description=f"Tier 1 provider offline: {result.error_message}",
severity="high"
)
# Create alert
self.database.create_alert(
provider_name=result.provider_name,
alert_type="tier1_offline",
message=f"Critical: Tier 1 provider {result.provider_name} is offline"
)
logger.info(f"Health check completed. Checked {len(results)} providers.")
# Cleanup old data (older than 7 days)
self.database.cleanup_old_data(days=7)
# Aggregate response times
self.database.aggregate_response_times(period_hours=1)
except Exception as e:
logger.error(f"Error in scheduled health check: {e}")
def start(self):
"""Start the scheduler"""
if not self._running:
try:
# Add job with interval trigger
self.scheduler.add_job(
func=self._run_health_check,
trigger=IntervalTrigger(minutes=self.interval_minutes),
id='health_check_job',
name='API Health Check',
replace_existing=True
)
self.scheduler.start()
self._running = True
logger.info(f"Scheduler started. Running every {self.interval_minutes} minutes.")
# Run initial check
self._run_health_check()
except Exception as e:
logger.error(f"Error starting scheduler: {e}")
def stop(self):
"""Stop the scheduler"""
if self._running:
self.scheduler.shutdown()
self._running = False
logger.info("Scheduler stopped.")
def update_interval(self, interval_minutes: int):
"""Update the check interval"""
self.interval_minutes = interval_minutes
if self._running:
# Reschedule the job
self.scheduler.reschedule_job(
job_id='health_check_job',
trigger=IntervalTrigger(minutes=interval_minutes)
)
logger.info(f"Scheduler interval updated to {interval_minutes} minutes.")
def is_running(self) -> bool:
"""Check if scheduler is running"""
return self._running
def trigger_immediate_check(self):
"""Trigger an immediate health check"""
logger.info("Triggering immediate health check...")
self._run_health_check()
|