mirror of
https://github.com/asimonson1125/asimonson1125.github.io.git
synced 2026-02-24 21:09:49 -06:00
replace status history with db
This commit is contained in:
@@ -19,9 +19,9 @@ Server-side monitoring system that checks the availability of asimonson.com serv
|
|||||||
|
|
||||||
**Features**:
|
**Features**:
|
||||||
- Tracks response times and HTTP status codes
|
- Tracks response times and HTTP status codes
|
||||||
- Stores check history (up to 720 checks = 60 days of data)
|
|
||||||
- Calculates uptime percentages for multiple time periods (24h, 7d, 30d, all-time)
|
- Calculates uptime percentages for multiple time periods (24h, 7d, 30d, all-time)
|
||||||
- Persists data to `static/json/status_history.json`
|
- Persists data to PostgreSQL (`service_checks` table) via `DATABASE_URL` env var
|
||||||
|
- Gracefully degrades when no database is configured (local dev)
|
||||||
- Runs in a background thread
|
- Runs in a background thread
|
||||||
|
|
||||||
#### 2. `app.py` - Flask Integration
|
#### 2. `app.py` - Flask Integration
|
||||||
@@ -57,32 +57,22 @@ Server-side monitoring system that checks the availability of asimonson.com serv
|
|||||||
|
|
||||||
## Data Storage
|
## Data Storage
|
||||||
|
|
||||||
Status history is stored in `src/static/json/status_history.json`:
|
Check history is stored in a PostgreSQL `service_checks` table. The connection is configured via the `DATABASE_URL` environment variable (e.g. `postgresql://user:pass@host:5432/dbname`).
|
||||||
|
|
||||||
```json
|
```sql
|
||||||
{
|
CREATE TABLE service_checks (
|
||||||
"last_check": "2026-02-11T14:30:00",
|
id SERIAL PRIMARY KEY,
|
||||||
"services": {
|
service_id VARCHAR(50) NOT NULL,
|
||||||
"main": {
|
timestamp TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
"name": "asimonson.com",
|
status VARCHAR(20) NOT NULL,
|
||||||
"url": "https://asimonson.com",
|
response_time INTEGER,
|
||||||
"status": "online",
|
status_code INTEGER,
|
||||||
"response_time": 156,
|
error TEXT
|
||||||
"status_code": 200,
|
);
|
||||||
"last_online": "2026-02-11T14:30:00",
|
|
||||||
"checks": [
|
|
||||||
{
|
|
||||||
"timestamp": "2026-02-11T14:30:00",
|
|
||||||
"status": "online",
|
|
||||||
"response_time": 156,
|
|
||||||
"status_code": 200
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
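The table and index are created automatically on startup, and the table's columns are then reconciled with the expected schema: missing columns are added and unexpected columns are dropped. If `DATABASE_URL` is not set, or the database cannot be reached at startup, the monitor runs without persistence (useful for local development).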
|
||||||
|
|
||||||
## Status Types
|
## Status Types
|
||||||
|
|
||||||
- **online**: HTTP status 2xx-4xx, service responding
|
- **online**: HTTP status 2xx-4xx, service responding
|
||||||
@@ -142,8 +132,7 @@ SERVICES = [
|
|||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- First deployment will show limited uptime data until enough checks accumulate
|
- First deployment will show limited uptime data until enough checks accumulate
|
||||||
- Historical data is preserved across server restarts
|
- Historical data is preserved across server restarts when `DATABASE_URL` is configured (stored in PostgreSQL)
|
||||||
- Maximum 720 checks stored per service (60 days at 2-hour intervals)
|
|
||||||
- Page auto-refreshes every 5 minutes to show latest server data
|
- Page auto-refreshes every 5 minutes to show latest server data
|
||||||
- Manual refresh button available for immediate updates
|
- Manual refresh button available for immediate updates
|
||||||
- All checks performed server-side (no client-side CORS issues)
|
- All checks performed server-side (no client-side CORS issues)
|
||||||
|
|||||||
@@ -7,3 +7,26 @@ services:
|
|||||||
restart: 'no'
|
restart: 'no'
|
||||||
ports:
|
ports:
|
||||||
- 8080:8080
|
- 8080:8080
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql://portfolio:portfolio@db:5432/portfolio
|
||||||
|
depends_on:
|
||||||
|
db:
|
||||||
|
condition: service_healthy
|
||||||
|
|
||||||
|
db:
|
||||||
|
image: postgres:16-alpine
|
||||||
|
restart: 'no'
|
||||||
|
environment:
|
||||||
|
POSTGRES_USER: portfolio
|
||||||
|
POSTGRES_PASSWORD: portfolio
|
||||||
|
POSTGRES_DB: portfolio
|
||||||
|
volumes:
|
||||||
|
- pgdata:/var/lib/postgresql/data
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "pg_isready -U portfolio"]
|
||||||
|
interval: 5s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
pgdata:
|
||||||
|
|||||||
294
src/monitor.py
294
src/monitor.py
@@ -2,13 +2,14 @@
|
|||||||
Service monitoring module
|
Service monitoring module
|
||||||
Checks service availability and tracks uptime statistics
|
Checks service availability and tracks uptime statistics
|
||||||
"""
|
"""
|
||||||
|
import os
|
||||||
import requests
|
import requests
|
||||||
import time
|
import time
|
||||||
import json
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from threading import Thread, Lock
|
from threading import Thread, Lock
|
||||||
from pathlib import Path
|
|
||||||
|
import psycopg2
|
||||||
|
|
||||||
# Service configuration
|
# Service configuration
|
||||||
SERVICES = [
|
SERVICES = [
|
||||||
@@ -35,52 +36,138 @@ SERVICES = [
|
|||||||
# Check interval: 30 mins
|
# Check interval: 30 mins
|
||||||
CHECK_INTERVAL = 1800
|
CHECK_INTERVAL = 1800
|
||||||
|
|
||||||
# File to store status history
|
DATABASE_URL = os.environ.get('DATABASE_URL')
|
||||||
STATUS_FILE = Path(__file__).parent / 'static' / 'json' / 'status_history.json'
|
|
||||||
|
# Expected columns (besides id) — name: SQL type
|
||||||
|
_EXPECTED_COLUMNS = {
|
||||||
|
'service_id': 'VARCHAR(50) NOT NULL',
|
||||||
|
'timestamp': 'TIMESTAMPTZ NOT NULL DEFAULT NOW()',
|
||||||
|
'status': 'VARCHAR(20) NOT NULL',
|
||||||
|
'response_time': 'INTEGER',
|
||||||
|
'status_code': 'INTEGER',
|
||||||
|
'error': 'TEXT',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class ServiceMonitor:
|
class ServiceMonitor:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.status_data = {}
|
|
||||||
self.lock = Lock()
|
self.lock = Lock()
|
||||||
self.load_history()
|
# Lightweight in-memory cache of latest status per service
|
||||||
|
self._current = {}
|
||||||
def load_history(self):
|
|
||||||
"""Load status history from file"""
|
|
||||||
if STATUS_FILE.exists():
|
|
||||||
try:
|
|
||||||
with open(STATUS_FILE, 'r') as f:
|
|
||||||
self.status_data = json.load(f)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error loading status history: {e}")
|
|
||||||
self.initialize_status_data()
|
|
||||||
else:
|
|
||||||
self.initialize_status_data()
|
|
||||||
|
|
||||||
def initialize_status_data(self):
|
|
||||||
"""Initialize empty status data structure"""
|
|
||||||
self.status_data = {
|
|
||||||
'last_check': None,
|
|
||||||
'services': {}
|
|
||||||
}
|
|
||||||
for service in SERVICES:
|
for service in SERVICES:
|
||||||
self.status_data['services'][service['id']] = {
|
self._current[service['id']] = {
|
||||||
'name': service['name'],
|
'name': service['name'],
|
||||||
'url': service['url'],
|
'url': service['url'],
|
||||||
'status': 'unknown',
|
'status': 'unknown',
|
||||||
'response_time': None,
|
'response_time': None,
|
||||||
'status_code': None,
|
'status_code': None,
|
||||||
'last_online': None,
|
'last_online': None,
|
||||||
'checks': [] # List of check results
|
|
||||||
}
|
}
|
||||||
|
self._last_check = None
|
||||||
|
self._ensure_schema()
|
||||||
|
|
||||||
def save_history(self):
|
# ── database helpers ──────────────────────────────────────────
|
||||||
"""Save status history to file"""
|
|
||||||
|
@staticmethod
def _get_conn():
    """Return a new psycopg2 connection, or None when persistence is unavailable.

    Returns:
        A freshly opened connection for ``DATABASE_URL`` (callers close it
        themselves), or ``None`` if ``DATABASE_URL`` is unset or the
        database cannot be reached.
    """
    if not DATABASE_URL:
        return None
    try:
        return psycopg2.connect(DATABASE_URL)
    except psycopg2.OperationalError as exc:
        # _ensure_schema() falls back to "running without persistence" when
        # the database is unreachable; report the same state here instead of
        # raising into callers, which already treat None as "no database".
        print(f"Database unavailable — continuing without persistence: {exc}")
        return None
|
||||||
|
|
||||||
|
def _ensure_schema(self):
|
||||||
|
"""Create the service_checks table (and index) if needed, then
|
||||||
|
reconcile columns with _EXPECTED_COLUMNS."""
|
||||||
|
if not DATABASE_URL:
|
||||||
|
print("DATABASE_URL not set — running without persistence")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Retry connection in case DB is still starting (e.g. Docker)
|
||||||
|
conn = None
|
||||||
|
for attempt in range(5):
|
||||||
|
try:
|
||||||
|
conn = psycopg2.connect(DATABASE_URL)
|
||||||
|
break
|
||||||
|
except psycopg2.OperationalError:
|
||||||
|
if attempt < 4:
|
||||||
|
print(f"Database not ready, retrying in 2s (attempt {attempt + 1}/5)...")
|
||||||
|
time.sleep(2)
|
||||||
|
else:
|
||||||
|
print("Could not connect to database — running without persistence")
|
||||||
|
return
|
||||||
try:
|
try:
|
||||||
STATUS_FILE.parent.mkdir(parents=True, exist_ok=True)
|
with conn, conn.cursor() as cur:
|
||||||
with open(STATUS_FILE, 'w') as f:
|
cur.execute("""
|
||||||
json.dump(self.status_data, f, indent=2)
|
CREATE TABLE IF NOT EXISTS service_checks (
|
||||||
except Exception as e:
|
id SERIAL PRIMARY KEY,
|
||||||
print(f"Error saving status history: {e}")
|
service_id VARCHAR(50) NOT NULL,
|
||||||
|
timestamp TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
status VARCHAR(20) NOT NULL,
|
||||||
|
response_time INTEGER,
|
||||||
|
status_code INTEGER,
|
||||||
|
error TEXT
|
||||||
|
);
|
||||||
|
""")
|
||||||
|
cur.execute("""
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_service_checks_service_timestamp
|
||||||
|
ON service_checks (service_id, timestamp DESC);
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Introspect existing columns
|
||||||
|
cur.execute("""
|
||||||
|
SELECT column_name
|
||||||
|
FROM information_schema.columns
|
||||||
|
WHERE table_name = 'service_checks'
|
||||||
|
""")
|
||||||
|
existing = {row[0] for row in cur.fetchall()}
|
||||||
|
|
||||||
|
# Add missing columns
|
||||||
|
for col, col_type in _EXPECTED_COLUMNS.items():
|
||||||
|
if col not in existing:
|
||||||
|
# Strip NOT NULL / DEFAULT for ALTER ADD (can't enforce
|
||||||
|
# NOT NULL on existing rows without a default)
|
||||||
|
bare_type = col_type.split('NOT NULL')[0].split('DEFAULT')[0].strip()
|
||||||
|
cur.execute(
|
||||||
|
f'ALTER TABLE service_checks ADD COLUMN {col} {bare_type}'
|
||||||
|
)
|
||||||
|
print(f"Added column {col} to service_checks")
|
||||||
|
|
||||||
|
# Drop unexpected columns (besides 'id')
|
||||||
|
expected_names = set(_EXPECTED_COLUMNS) | {'id'}
|
||||||
|
for col in existing - expected_names:
|
||||||
|
cur.execute(
|
||||||
|
f'ALTER TABLE service_checks DROP COLUMN {col}'
|
||||||
|
)
|
||||||
|
print(f"Dropped column {col} from service_checks")
|
||||||
|
|
||||||
|
print("Database schema OK")
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
def _insert_check(self, service_id, result):
    """Insert a single check result into the database (best effort).

    Args:
        service_id: Identifier of the service the check belongs to.
        result: Check result dict. ``timestamp`` and ``status`` are
            required; ``response_time``, ``status_code`` and ``error`` are
            optional and stored as NULL when absent.

    Database errors are reported and swallowed so that a failed write does
    not stop the caller from updating the in-memory status cache.
    """
    conn = None
    try:
        conn = self._get_conn()
        if conn is None:
            # No database configured: nothing to persist.
            return
        # `with conn` commits on success and rolls back on error; it does
        # not close the connection, hence the explicit close() below.
        with conn, conn.cursor() as cur:
            cur.execute(
                """INSERT INTO service_checks
                       (service_id, timestamp, status, response_time, status_code, error)
                   VALUES (%s, %s, %s, %s, %s, %s)""",
                (
                    service_id,
                    result['timestamp'],
                    result['status'],
                    result.get('response_time'),
                    result.get('status_code'),
                    result.get('error'),
                ),
            )
    except psycopg2.Error as exc:
        print(f"Failed to persist check for {service_id}: {exc}")
    finally:
        if conn is not None:
            conn.close()
|
||||||
|
|
||||||
|
# ── service checks ────────────────────────────────────────────
|
||||||
|
|
||||||
def check_service(self, service):
|
def check_service(self, service):
|
||||||
"""Check a single service and return status"""
|
"""Check a single service and return status"""
|
||||||
@@ -136,107 +223,124 @@ class ServiceMonitor:
|
|||||||
results[service['id']] = result
|
results[service['id']] = result
|
||||||
print(f" {service['name']}: {result['status']} ({result['response_time']}ms)")
|
print(f" {service['name']}: {result['status']} ({result['response_time']}ms)")
|
||||||
|
|
||||||
# Only acquire lock when updating the shared data structure
|
# Persist to database (outside lock — DB has its own concurrency)
|
||||||
|
for service_id, result in results.items():
|
||||||
|
self._insert_check(service_id, result)
|
||||||
|
|
||||||
|
# Update lightweight in-memory cache under lock
|
||||||
with self.lock:
|
with self.lock:
|
||||||
for service in SERVICES:
|
for service in SERVICES:
|
||||||
result = results[service['id']]
|
result = results[service['id']]
|
||||||
service_data = self.status_data['services'][service['id']]
|
cached = self._current[service['id']]
|
||||||
|
cached['status'] = result['status']
|
||||||
# Update current status
|
cached['response_time'] = result['response_time']
|
||||||
service_data['status'] = result['status']
|
cached['status_code'] = result['status_code']
|
||||||
service_data['response_time'] = result['response_time']
|
|
||||||
service_data['status_code'] = result['status_code']
|
|
||||||
|
|
||||||
if result['status'] == 'online':
|
if result['status'] == 'online':
|
||||||
service_data['last_online'] = result['timestamp']
|
cached['last_online'] = result['timestamp']
|
||||||
|
self._last_check = datetime.now().isoformat()
|
||||||
|
|
||||||
# Add to check history (keep last 2880 checks = 60 days at 2hr intervals)
|
# ── uptime calculations ───────────────────────────────────────
|
||||||
service_data['checks'].append(result)
|
|
||||||
if len(service_data['checks']) > 2880:
|
|
||||||
service_data['checks'] = service_data['checks'][-2880:]
|
|
||||||
|
|
||||||
self.status_data['last_check'] = datetime.now().isoformat()
|
|
||||||
self.save_history()
|
|
||||||
|
|
||||||
def _calculate_uptime_unlocked(self, service_id, hours=None):
|
def _calculate_uptime_unlocked(self, service_id, hours=None):
|
||||||
"""Calculate uptime percentage for a service (assumes lock is held)"""
|
"""Calculate uptime percentage for a service by querying the DB."""
|
||||||
service_data = self.status_data['services'].get(service_id)
|
conn = self._get_conn()
|
||||||
if not service_data or not service_data['checks']:
|
if conn is None:
|
||||||
return None
|
return None
|
||||||
|
try:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
if hours:
|
||||||
|
cutoff = datetime.now() - timedelta(hours=hours)
|
||||||
|
cur.execute(
|
||||||
|
"""SELECT
|
||||||
|
COUNT(*) FILTER (WHERE status = 'online'),
|
||||||
|
COUNT(*)
|
||||||
|
FROM service_checks
|
||||||
|
WHERE service_id = %s AND timestamp > %s""",
|
||||||
|
(service_id, cutoff),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
cur.execute(
|
||||||
|
"""SELECT
|
||||||
|
COUNT(*) FILTER (WHERE status = 'online'),
|
||||||
|
COUNT(*)
|
||||||
|
FROM service_checks
|
||||||
|
WHERE service_id = %s""",
|
||||||
|
(service_id,),
|
||||||
|
)
|
||||||
|
|
||||||
checks = service_data['checks']
|
online_count, total_count = cur.fetchone()
|
||||||
|
|
||||||
# Filter by time period if specified
|
if total_count == 0:
|
||||||
if hours:
|
return None
|
||||||
cutoff = datetime.now() - timedelta(hours=hours)
|
|
||||||
checks = [
|
|
||||||
c for c in checks
|
|
||||||
if datetime.fromisoformat(c['timestamp']) > cutoff
|
|
||||||
]
|
|
||||||
|
|
||||||
if not checks:
|
# Minimum-data thresholds
|
||||||
return None
|
if hours:
|
||||||
|
expected_checks = (hours * 3600) / CHECK_INTERVAL
|
||||||
|
minimum_checks = max(3, expected_checks * 0.5)
|
||||||
|
if total_count < minimum_checks:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
if total_count < 3:
|
||||||
|
return None
|
||||||
|
|
||||||
# Require minimum data coverage for the time period
|
return round((online_count / total_count) * 100, 2)
|
||||||
# Calculate expected number of checks for this period
|
finally:
|
||||||
expected_checks = (hours * 3600) / CHECK_INTERVAL
|
conn.close()
|
||||||
|
|
||||||
# Require at least 50% of expected checks to show this metric
|
|
||||||
minimum_checks = max(3, expected_checks * 0.5)
|
|
||||||
|
|
||||||
if len(checks) < minimum_checks:
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
# For all-time, require at least 3 checks
|
|
||||||
if len(checks) < 3:
|
|
||||||
return None
|
|
||||||
|
|
||||||
online_count = sum(1 for c in checks if c['status'] == 'online')
|
|
||||||
uptime = (online_count / len(checks)) * 100
|
|
||||||
|
|
||||||
return round(uptime, 2)
|
|
||||||
|
|
||||||
def calculate_uptime(self, service_id, hours=None):
|
def calculate_uptime(self, service_id, hours=None):
|
||||||
"""Calculate uptime percentage for a service"""
|
"""Calculate uptime percentage for a service"""
|
||||||
with self.lock:
|
return self._calculate_uptime_unlocked(service_id, hours)
|
||||||
return self._calculate_uptime_unlocked(service_id, hours)
|
|
||||||
|
|
||||||
def get_status_summary(self):
|
def get_status_summary(self):
|
||||||
"""Get current status summary with uptime statistics"""
|
"""Get current status summary with uptime statistics"""
|
||||||
with self.lock:
|
with self.lock:
|
||||||
summary = {
|
summary = {
|
||||||
'last_check': self.status_data['last_check'],
|
'last_check': self._last_check,
|
||||||
'next_check': None,
|
'next_check': None,
|
||||||
'services': []
|
'services': []
|
||||||
}
|
}
|
||||||
|
|
||||||
# Calculate next check time
|
if self._last_check:
|
||||||
if self.status_data['last_check']:
|
last_check = datetime.fromisoformat(self._last_check)
|
||||||
last_check = datetime.fromisoformat(self.status_data['last_check'])
|
|
||||||
next_check = last_check + timedelta(seconds=CHECK_INTERVAL)
|
next_check = last_check + timedelta(seconds=CHECK_INTERVAL)
|
||||||
summary['next_check'] = next_check.isoformat()
|
summary['next_check'] = next_check.isoformat()
|
||||||
|
|
||||||
for service_id, service_data in self.status_data['services'].items():
|
for service_id, cached in self._current.items():
|
||||||
service_summary = {
|
service_summary = {
|
||||||
'id': service_id,
|
'id': service_id,
|
||||||
'name': service_data['name'],
|
'name': cached['name'],
|
||||||
'url': service_data['url'],
|
'url': cached['url'],
|
||||||
'status': service_data['status'],
|
'status': cached['status'],
|
||||||
'response_time': service_data['response_time'],
|
'response_time': cached['response_time'],
|
||||||
'status_code': service_data['status_code'],
|
'status_code': cached['status_code'],
|
||||||
'last_online': service_data['last_online'],
|
'last_online': cached['last_online'],
|
||||||
'uptime': {
|
'uptime': {
|
||||||
'24h': self._calculate_uptime_unlocked(service_id, 24),
|
'24h': self._calculate_uptime_unlocked(service_id, 24),
|
||||||
'7d': self._calculate_uptime_unlocked(service_id, 24 * 7),
|
'7d': self._calculate_uptime_unlocked(service_id, 24 * 7),
|
||||||
'30d': self._calculate_uptime_unlocked(service_id, 24 * 30),
|
'30d': self._calculate_uptime_unlocked(service_id, 24 * 30),
|
||||||
'all_time': self._calculate_uptime_unlocked(service_id)
|
'all_time': self._calculate_uptime_unlocked(service_id)
|
||||||
},
|
},
|
||||||
'total_checks': len(service_data['checks'])
|
'total_checks': self._get_total_checks(service_id),
|
||||||
}
|
}
|
||||||
summary['services'].append(service_summary)
|
summary['services'].append(service_summary)
|
||||||
|
|
||||||
return summary
|
return summary
|
||||||
|
|
||||||
|
def _get_total_checks(self, service_id):
    """Return the total number of stored checks for a service.

    Args:
        service_id: Identifier of the service to count checks for.

    Returns:
        The row count from ``service_checks`` for ``service_id``, or 0 when
        no database is configured or the query fails.
    """
    conn = None
    try:
        conn = self._get_conn()
        if conn is None:
            return 0
        with conn.cursor() as cur:
            cur.execute(
                'SELECT COUNT(*) FROM service_checks WHERE service_id = %s',
                (service_id,),
            )
            return cur.fetchone()[0]
    except psycopg2.Error as exc:
        # A database hiccup should not break the status summary; report it
        # and fall back to the same value used when there is no database.
        print(f"Could not count checks for {service_id}: {exc}")
        return 0
    finally:
        if conn is not None:
            conn.close()
|
||||||
|
|
||||||
def start_monitoring(self):
|
def start_monitoring(self):
|
||||||
"""Start background monitoring thread"""
|
"""Start background monitoring thread"""
|
||||||
def monitor_loop():
|
def monitor_loop():
|
||||||
|
|||||||
@@ -20,3 +20,4 @@ six==1.16.0
|
|||||||
urllib3==2.2.2
|
urllib3==2.2.2
|
||||||
Werkzeug==3.0.3
|
Werkzeug==3.0.3
|
||||||
xxhash==3.4.1
|
xxhash==3.4.1
|
||||||
|
psycopg2-binary==2.9.9
|
||||||
|
|||||||
Reference in New Issue
Block a user