replace status history with db

This commit is contained in:
2026-02-12 21:00:47 -06:00
parent 9553e77b2f
commit b59842899b
4 changed files with 239 additions and 122 deletions

View File

@@ -19,9 +19,9 @@ Server-side monitoring system that checks the availability of asimonson.com serv
**Features**: **Features**:
- Tracks response times and HTTP status codes - Tracks response times and HTTP status codes
- Stores check history (up to 720 checks = 60 days of data)
- Calculates uptime percentages for multiple time periods (24h, 7d, 30d, all-time) - Calculates uptime percentages for multiple time periods (24h, 7d, 30d, all-time)
- Persists data to `static/json/status_history.json` - Persists data to PostgreSQL (`service_checks` table) via `DATABASE_URL` env var
- Gracefully degrades when no database is configured (local dev)
- Runs in a background thread - Runs in a background thread
#### 2. `app.py` - Flask Integration #### 2. `app.py` - Flask Integration
@@ -57,32 +57,22 @@ Server-side monitoring system that checks the availability of asimonson.com serv
## Data Storage ## Data Storage
Status history is stored in `src/static/json/status_history.json`: Check history is stored in a PostgreSQL `service_checks` table. The connection is configured via the `DATABASE_URL` environment variable (e.g. `postgresql://user:pass@host:5432/dbname`).
```json ```sql
{ CREATE TABLE service_checks (
"last_check": "2026-02-11T14:30:00", id SERIAL PRIMARY KEY,
"services": { service_id VARCHAR(50) NOT NULL,
"main": { timestamp TIMESTAMPTZ NOT NULL DEFAULT NOW(),
"name": "asimonson.com", status VARCHAR(20) NOT NULL,
"url": "https://asimonson.com", response_time INTEGER,
"status": "online", status_code INTEGER,
"response_time": 156, error TEXT
"status_code": 200, );
"last_online": "2026-02-11T14:30:00",
"checks": [
{
"timestamp": "2026-02-11T14:30:00",
"status": "online",
"response_time": 156,
"status_code": 200
}
]
}
}
}
``` ```
The table and index are created automatically on startup. If `DATABASE_URL` is not set, the monitor runs without persistence (useful for local development).
## Status Types ## Status Types
- **online**: HTTP status 2xx-4xx, service responding - **online**: HTTP status 2xx-4xx, service responding
@@ -142,8 +132,7 @@ SERVICES = [
## Notes ## Notes
- First deployment will show limited uptime data until enough checks accumulate - First deployment will show limited uptime data until enough checks accumulate
- Historical data is preserved across server restarts - Historical data is preserved across server restarts (stored in PostgreSQL)
- Maximum 720 checks stored per service (60 days at 2-hour intervals)
- Page auto-refreshes every 5 minutes to show latest server data - Page auto-refreshes every 5 minutes to show latest server data
- Manual refresh button available for immediate updates - Manual refresh button available for immediate updates
- All checks performed server-side (no client-side CORS issues) - All checks performed server-side (no client-side CORS issues)

View File

@@ -7,3 +7,26 @@ services:
restart: 'no' restart: 'no'
ports: ports:
- 8080:8080 - 8080:8080
environment:
DATABASE_URL: postgresql://portfolio:portfolio@db:5432/portfolio
depends_on:
db:
condition: service_healthy
db:
image: postgres:16-alpine
restart: 'no'
environment:
POSTGRES_USER: portfolio
POSTGRES_PASSWORD: portfolio
POSTGRES_DB: portfolio
volumes:
- pgdata:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U portfolio"]
interval: 5s
timeout: 3s
retries: 5
volumes:
pgdata:

View File

@@ -2,13 +2,14 @@
Service monitoring module Service monitoring module
Checks service availability and tracks uptime statistics Checks service availability and tracks uptime statistics
""" """
import os
import requests import requests
import time import time
import json
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta from datetime import datetime, timedelta
from threading import Thread, Lock from threading import Thread, Lock
from pathlib import Path
import psycopg2
# Service configuration # Service configuration
SERVICES = [ SERVICES = [
@@ -35,52 +36,138 @@ SERVICES = [
# Check interval: 30 mins # Check interval: 30 mins
CHECK_INTERVAL = 1800 CHECK_INTERVAL = 1800
# File to store status history DATABASE_URL = os.environ.get('DATABASE_URL')
STATUS_FILE = Path(__file__).parent / 'static' / 'json' / 'status_history.json'
# Expected columns (besides id) — name: SQL type
_EXPECTED_COLUMNS = {
'service_id': 'VARCHAR(50) NOT NULL',
'timestamp': 'TIMESTAMPTZ NOT NULL DEFAULT NOW()',
'status': 'VARCHAR(20) NOT NULL',
'response_time': 'INTEGER',
'status_code': 'INTEGER',
'error': 'TEXT',
}
class ServiceMonitor: class ServiceMonitor:
def __init__(self):
    """Create the lock and the in-memory status cache, then prepare the DB schema.

    The cache holds only the latest known state of each configured service;
    it starts out as 'unknown' for every entry in SERVICES.
    """
    self.lock = Lock()
    # Lightweight in-memory cache of latest status per service
    self._current = {
        svc['id']: {
            'name': svc['name'],
            'url': svc['url'],
            'status': 'unknown',
            'response_time': None,
            'status_code': None,
            'last_online': None,
        }
        for svc in SERVICES
    }
    # ISO timestamp of the most recent completed check round (None until one runs)
    self._last_check = None
    self._ensure_schema()
def save_history(self): # ── database helpers ──────────────────────────────────────────
"""Save status history to file"""
@staticmethod
def _get_conn():
    """Open a fresh psycopg2 connection to DATABASE_URL.

    Returns:
        A new connection, or None when DATABASE_URL is unset/empty
        (persistence disabled).
    """
    return psycopg2.connect(DATABASE_URL) if DATABASE_URL else None
def _ensure_schema(self):
    """Create the service_checks table (and index) if needed, then
    reconcile its columns with _EXPECTED_COLUMNS.

    Does nothing (beyond printing a notice) when DATABASE_URL is unset or
    when the database cannot be reached after 5 connection attempts.
    Columns missing from the table are added; columns not listed in
    _EXPECTED_COLUMNS (other than 'id') are dropped.
    """
    # Local import: psycopg2.sql builds statements with properly quoted
    # identifiers, which plain string formatting cannot guarantee.
    from psycopg2 import sql

    if not DATABASE_URL:
        print("DATABASE_URL not set — running without persistence")
        return

    # Retry connection in case DB is still starting (e.g. Docker)
    conn = None
    for attempt in range(5):
        try:
            conn = psycopg2.connect(DATABASE_URL)
            break
        except psycopg2.OperationalError:
            if attempt < 4:
                print(f"Database not ready, retrying in 2s (attempt {attempt + 1}/5)...")
                time.sleep(2)
            else:
                print("Could not connect to database — running without persistence")
                return

    try:
        # `with conn` wraps the DDL in one transaction: commit on success,
        # rollback if any statement raises.
        with conn, conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS service_checks (
                    id SERIAL PRIMARY KEY,
                    service_id VARCHAR(50) NOT NULL,
                    timestamp TIMESTAMPTZ NOT NULL DEFAULT NOW(),
                    status VARCHAR(20) NOT NULL,
                    response_time INTEGER,
                    status_code INTEGER,
                    error TEXT
                );
            """)
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_service_checks_service_timestamp
                ON service_checks (service_id, timestamp DESC);
            """)

            # Introspect existing columns. Restrict to the current schema so
            # a same-named table in another schema cannot leak columns into
            # the comparison and trigger bogus ADD/DROP statements.
            cur.execute("""
                SELECT column_name
                FROM information_schema.columns
                WHERE table_name = 'service_checks'
                  AND table_schema = current_schema()
            """)
            existing = {row[0] for row in cur.fetchall()}

            # Add missing columns
            for col, col_type in _EXPECTED_COLUMNS.items():
                if col not in existing:
                    # Strip NOT NULL / DEFAULT for ALTER ADD (can't enforce
                    # NOT NULL on existing rows without a default)
                    bare_type = col_type.split('NOT NULL')[0].split('DEFAULT')[0].strip()
                    cur.execute(
                        sql.SQL('ALTER TABLE service_checks ADD COLUMN {} {}').format(
                            sql.Identifier(col), sql.SQL(bare_type)
                        )
                    )
                    print(f"Added column {col} to service_checks")

            # Drop unexpected columns (besides 'id'). Names come from the
            # catalog, so quote them: mixed-case or reserved-word columns
            # would otherwise be folded/misparsed and the DROP would fail.
            expected_names = set(_EXPECTED_COLUMNS) | {'id'}
            for col in existing - expected_names:
                cur.execute(
                    sql.SQL('ALTER TABLE service_checks DROP COLUMN {}').format(
                        sql.Identifier(col)
                    )
                )
                print(f"Dropped column {col} from service_checks")

        print("Database schema OK")
    finally:
        conn.close()
def _insert_check(self, service_id, result):
    """Insert a single check result into the database (best effort).

    Args:
        service_id: Identifier stored in the service_id column.
        result: Check result dict; must contain 'timestamp' and 'status',
            and may contain 'response_time', 'status_code' and 'error'.

    Database errors are printed and swallowed rather than raised, so a
    failed write cannot abort the caller's check round before it updates
    the in-memory status cache. Does nothing when no database is
    configured.
    """
    conn = None
    try:
        conn = self._get_conn()
        if conn is None:
            return
        # `with conn` commits on success and rolls back on error.
        with conn, conn.cursor() as cur:
            cur.execute(
                """INSERT INTO service_checks
                       (service_id, timestamp, status, response_time, status_code, error)
                   VALUES (%s, %s, %s, %s, %s, %s)""",
                (
                    service_id,
                    result['timestamp'],
                    result['status'],
                    result.get('response_time'),
                    result.get('status_code'),
                    result.get('error'),
                ),
            )
    except psycopg2.Error as exc:
        print(f"Error saving check for {service_id}: {exc}")
    finally:
        if conn is not None:
            conn.close()
# ── service checks ────────────────────────────────────────────
def check_service(self, service): def check_service(self, service):
"""Check a single service and return status""" """Check a single service and return status"""
@@ -136,107 +223,124 @@ class ServiceMonitor:
results[service['id']] = result results[service['id']] = result
print(f" {service['name']}: {result['status']} ({result['response_time']}ms)") print(f" {service['name']}: {result['status']} ({result['response_time']}ms)")
# Only acquire lock when updating the shared data structure # Persist to database (outside lock — DB has its own concurrency)
for service_id, result in results.items():
self._insert_check(service_id, result)
# Update lightweight in-memory cache under lock
with self.lock: with self.lock:
for service in SERVICES: for service in SERVICES:
result = results[service['id']] result = results[service['id']]
service_data = self.status_data['services'][service['id']] cached = self._current[service['id']]
cached['status'] = result['status']
# Update current status cached['response_time'] = result['response_time']
service_data['status'] = result['status'] cached['status_code'] = result['status_code']
service_data['response_time'] = result['response_time']
service_data['status_code'] = result['status_code']
if result['status'] == 'online': if result['status'] == 'online':
service_data['last_online'] = result['timestamp'] cached['last_online'] = result['timestamp']
self._last_check = datetime.now().isoformat()
# Add to check history (keep last 2880 checks = 60 days at 2hr intervals) # ── uptime calculations ───────────────────────────────────────
service_data['checks'].append(result)
if len(service_data['checks']) > 2880:
service_data['checks'] = service_data['checks'][-2880:]
self.status_data['last_check'] = datetime.now().isoformat()
self.save_history()
def _calculate_uptime_unlocked(self, service_id, hours=None):
    """Compute a service's uptime percentage from rows in service_checks.

    Args:
        service_id: Service whose checks are counted.
        hours: If truthy, only checks newer than now minus this many hours
            are counted; otherwise all recorded checks are used.

    Returns:
        Percentage of 'online' checks rounded to 2 decimals, or None when
        no database is configured or there is too little data (fewer than
        3 checks, or under half the checks expected for the window).
    """
    connection = self._get_conn()
    if connection is None:
        return None

    try:
        with connection.cursor() as cursor:
            if hours:
                window_start = datetime.now() - timedelta(hours=hours)
                cursor.execute(
                    """SELECT
                           COUNT(*) FILTER (WHERE status = 'online'),
                           COUNT(*)
                       FROM service_checks
                       WHERE service_id = %s AND timestamp > %s""",
                    (service_id, window_start),
                )
            else:
                cursor.execute(
                    """SELECT
                           COUNT(*) FILTER (WHERE status = 'online'),
                           COUNT(*)
                       FROM service_checks
                       WHERE service_id = %s""",
                    (service_id,),
                )
            row = cursor.fetchone()

        up_checks, all_checks = row
        if all_checks == 0:
            return None

        # Minimum-data thresholds: a windowed figure needs at least half of
        # the checks expected at CHECK_INTERVAL (never fewer than 3);
        # the all-time figure needs 3.
        if hours:
            required = max(3, ((hours * 3600) / CHECK_INTERVAL) * 0.5)
        else:
            required = 3
        if all_checks < required:
            return None

        return round((up_checks / all_checks) * 100, 2)
    finally:
        connection.close()
def calculate_uptime(self, service_id, hours=None): def calculate_uptime(self, service_id, hours=None):
"""Calculate uptime percentage for a service""" """Calculate uptime percentage for a service"""
with self.lock:
return self._calculate_uptime_unlocked(service_id, hours) return self._calculate_uptime_unlocked(service_id, hours)
def get_status_summary(self):
    """Get current status summary with uptime statistics.

    Returns:
        A dict with 'last_check' and 'next_check' (ISO strings or None) and
        'services', a list with one entry per cached service holding its
        latest status fields, uptime for 24h / 7d / 30d / all-time, and
        the total number of recorded checks.
    """
    # Copy the cached state under the lock, then release it: the uptime
    # and count lookups below each open a database connection, and holding
    # the lock across them would block the monitor thread's cache update.
    with self.lock:
        last_check = self._last_check
        snapshot = {
            service_id: dict(cached)
            for service_id, cached in self._current.items()
        }

    summary = {
        'last_check': last_check,
        'next_check': None,
        'services': []
    }

    if last_check:
        next_check = datetime.fromisoformat(last_check) + timedelta(seconds=CHECK_INTERVAL)
        summary['next_check'] = next_check.isoformat()

    for service_id, cached in snapshot.items():
        summary['services'].append({
            'id': service_id,
            'name': cached['name'],
            'url': cached['url'],
            'status': cached['status'],
            'response_time': cached['response_time'],
            'status_code': cached['status_code'],
            'last_online': cached['last_online'],
            'uptime': {
                '24h': self._calculate_uptime_unlocked(service_id, 24),
                '7d': self._calculate_uptime_unlocked(service_id, 24 * 7),
                '30d': self._calculate_uptime_unlocked(service_id, 24 * 30),
                'all_time': self._calculate_uptime_unlocked(service_id)
            },
            'total_checks': self._get_total_checks(service_id),
        })

    return summary
def _get_total_checks(self, service_id):
    """Count every check recorded for a service.

    Returns:
        The number of service_checks rows for service_id, or 0 when no
        database is configured.
    """
    connection = self._get_conn()
    if connection is None:
        return 0

    count_query = 'SELECT COUNT(*) FROM service_checks WHERE service_id = %s'
    try:
        with connection.cursor() as cursor:
            cursor.execute(count_query, (service_id,))
            (total,) = cursor.fetchone()
            return total
    finally:
        connection.close()
def start_monitoring(self): def start_monitoring(self):
"""Start background monitoring thread""" """Start background monitoring thread"""
def monitor_loop(): def monitor_loop():

View File

@@ -20,3 +20,4 @@ six==1.16.0
urllib3==2.2.2 urllib3==2.2.2
Werkzeug==3.0.3 Werkzeug==3.0.3
xxhash==3.4.1 xxhash==3.4.1
psycopg2-binary==2.9.9