QoL and metric value inverter

This commit is contained in:
2026-03-04 11:10:45 -06:00
parent 8dfe3607b1
commit f57ac7d6dc
30 changed files with 556 additions and 118 deletions

View File

@@ -146,13 +146,16 @@ def run(ctx, domain_name, passes, threshold, dimensions):
click.echo(f" Total combinations: {result.total_generated}")
click.echo(f" Pass 1 — valid: {result.pass1_valid}, "
f"conditional: {result.pass1_conditional}, "
f"blocked: {result.pass1_blocked}")
f"failed: {result.pass1_failed}")
if 2 in pass_list:
click.echo(f" Pass 2 — estimated: {result.pass2_estimated}")
click.echo(f" Pass 2 — estimated: {result.pass2_estimated}, "
f"failed: {result.pass2_failed}")
if 3 in pass_list:
click.echo(f" Pass 3 — above threshold: {result.pass3_above_threshold}")
click.echo(f" Pass 3 — above threshold: {result.pass3_above_threshold}, "
f"failed: {result.pass3_failed}")
if 4 in pass_list:
click.echo(f" Pass 4 — LLM reviewed: {result.pass4_reviewed}")
click.echo(f" Pass 4 — LLM reviewed: {result.pass4_reviewed}, "
f"failed: {result.pass4_failed}")
@main.command()

View File

@@ -193,6 +193,17 @@ class Repository:
self.conn.commit()
return row["id"]
def backfill_lower_is_better(self, domain_name: str, metric_name: str) -> None:
    """Flag one domain/metric weight row as lower-is-better.

    Only a row whose ``lower_is_better`` is still 0 is touched; the domain
    and the metric are both resolved by name inside the statement. The
    change is committed before returning.
    """
    lookup = (domain_name, metric_name)
    statement = """UPDATE domain_metric_weights SET lower_is_better = 1
        WHERE lower_is_better = 0
        AND domain_id = (SELECT id FROM domains WHERE name = ?)
        AND metric_id = (SELECT id FROM metrics WHERE name = ?)"""
    connection = self.conn
    connection.execute(statement, lookup)
    connection.commit()
def add_domain(self, domain: Domain) -> Domain:
cur = self.conn.execute(
"INSERT INTO domains (name, description) VALUES (?, ?)",
@@ -204,9 +215,10 @@ class Repository:
mb.metric_id = metric_id
self.conn.execute(
"""INSERT INTO domain_metric_weights
(domain_id, metric_id, weight, norm_min, norm_max)
VALUES (?, ?, ?, ?, ?)""",
(domain.id, metric_id, mb.weight, mb.norm_min, mb.norm_max),
(domain_id, metric_id, weight, norm_min, norm_max, lower_is_better)
VALUES (?, ?, ?, ?, ?, ?)""",
(domain.id, metric_id, mb.weight, mb.norm_min, mb.norm_max,
int(mb.lower_is_better)),
)
self.conn.commit()
return domain
@@ -216,7 +228,8 @@ class Repository:
if not row:
return None
weights = self.conn.execute(
"""SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, dmw.metric_id
"""SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max,
dmw.metric_id, dmw.lower_is_better
FROM domain_metric_weights dmw
JOIN metrics m ON dmw.metric_id = m.id
WHERE dmw.domain_id = ?""",
@@ -231,6 +244,7 @@ class Repository:
metric_name=w["name"], weight=w["weight"],
norm_min=w["norm_min"], norm_max=w["norm_max"],
metric_id=w["metric_id"], unit=w["unit"] or "",
lower_is_better=bool(w["lower_is_better"]),
)
for w in weights
],
@@ -245,7 +259,8 @@ class Repository:
if not row:
return None
weights = self.conn.execute(
"""SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, dmw.metric_id
"""SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max,
dmw.metric_id, dmw.lower_is_better
FROM domain_metric_weights dmw
JOIN metrics m ON dmw.metric_id = m.id
WHERE dmw.domain_id = ?""",
@@ -260,6 +275,7 @@ class Repository:
metric_name=w["name"], weight=w["weight"],
norm_min=w["norm_min"], norm_max=w["norm_max"],
metric_id=w["metric_id"], unit=w["unit"] or "",
lower_is_better=bool(w["lower_is_better"]),
)
for w in weights
],
@@ -277,21 +293,23 @@ class Repository:
mb.metric_id = metric_id
self.conn.execute(
"""INSERT OR REPLACE INTO domain_metric_weights
(domain_id, metric_id, weight, norm_min, norm_max)
VALUES (?, ?, ?, ?, ?)""",
(domain_id, metric_id, mb.weight, mb.norm_min, mb.norm_max),
(domain_id, metric_id, weight, norm_min, norm_max, lower_is_better)
VALUES (?, ?, ?, ?, ?, ?)""",
(domain_id, metric_id, mb.weight, mb.norm_min, mb.norm_max,
int(mb.lower_is_better)),
)
self.conn.commit()
return mb
def update_metric_bound(
self, domain_id: int, metric_id: int, weight: float, norm_min: float, norm_max: float, unit: str
self, domain_id: int, metric_id: int, weight: float, norm_min: float, norm_max: float,
unit: str, lower_is_better: bool = False,
) -> None:
self.conn.execute(
"""UPDATE domain_metric_weights
SET weight = ?, norm_min = ?, norm_max = ?
SET weight = ?, norm_min = ?, norm_max = ?, lower_is_better = ?
WHERE domain_id = ? AND metric_id = ?""",
(weight, norm_min, norm_max, domain_id, metric_id),
(weight, norm_min, norm_max, int(lower_is_better), domain_id, metric_id),
)
if unit:
self.conn.execute(
@@ -330,6 +348,21 @@ class Repository:
self.conn.execute("DELETE FROM combination_scores WHERE domain_id = ?", (domain.id,))
self.conn.execute("DELETE FROM combination_results WHERE domain_id = ?", (domain.id,))
self.conn.execute("DELETE FROM pipeline_runs WHERE domain_id = ?", (domain.id,))
# Delete orphaned combos (no results left in any domain) and all their
# related rows — scores, entity links — so FK constraints don't block.
orphan_sql = """SELECT c.id FROM combinations c
WHERE c.id NOT IN (
SELECT DISTINCT combination_id FROM combination_results
)"""
self.conn.execute(
f"DELETE FROM combination_scores WHERE combination_id IN ({orphan_sql})"
)
self.conn.execute(
f"DELETE FROM combination_entities WHERE combination_id IN ({orphan_sql})"
)
self.conn.execute(
f"DELETE FROM combinations WHERE id IN ({orphan_sql})"
)
self.conn.commit()
return count
@@ -371,12 +404,15 @@ class Repository:
self, combo_id: int, status: str, block_reason: str | None = None
) -> None:
# Don't downgrade from higher pass states — preserves human/LLM review data
if status in ("scored", "llm_reviewed"):
if status in ("scored", "llm_reviewed") or status.endswith("_fail"):
row = self.conn.execute(
"SELECT status FROM combinations WHERE id = ?", (combo_id,)
).fetchone()
if row:
cur = row["status"]
# Fail statuses should not overwrite llm_reviewed or reviewed
if status.endswith("_fail") and cur in ("llm_reviewed", "reviewed"):
return
if status == "scored" and cur in ("llm_reviewed", "reviewed"):
return
if status == "llm_reviewed" and cur == "reviewed":
@@ -550,12 +586,12 @@ class Repository:
).fetchone()
if not row or row["total"] == 0:
return None
blocked = self.conn.execute(
failed = self.conn.execute(
"""SELECT COUNT(*) as cnt
FROM combinations c
JOIN combination_results cr ON cr.combination_id = c.id
JOIN domains d ON cr.domain_id = d.id
WHERE c.status = 'blocked' AND d.name = ?""",
WHERE c.status LIKE '%\\_fail' ESCAPE '\\' AND d.name = ?""",
(domain_name,),
).fetchone()
return {
@@ -564,7 +600,7 @@ class Repository:
"max_score": row["max_score"],
"min_score": row["min_score"],
"last_pass": row["last_pass"],
"blocked": blocked["cnt"] if blocked else 0,
"failed": failed["cnt"] if failed else 0,
}
def get_result(self, combo_id: int, domain_id: int) -> dict | None:

View File

@@ -44,12 +44,13 @@ CREATE TABLE IF NOT EXISTS metrics (
);
CREATE TABLE IF NOT EXISTS domain_metric_weights (
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain_id INTEGER NOT NULL REFERENCES domains(id),
metric_id INTEGER NOT NULL REFERENCES metrics(id),
weight REAL NOT NULL,
norm_min REAL,
norm_max REAL,
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain_id INTEGER NOT NULL REFERENCES domains(id),
metric_id INTEGER NOT NULL REFERENCES metrics(id),
weight REAL NOT NULL,
norm_min REAL,
norm_max REAL,
lower_is_better INTEGER NOT NULL DEFAULT 0,
UNIQUE(domain_id, metric_id)
);
@@ -117,6 +118,23 @@ CREATE INDEX IF NOT EXISTS idx_pipeline_runs_domain ON pipeline_runs(domain_id);
"""
def _migrate(conn: sqlite3.Connection) -> None:
    """Bring an already-existing database up to the current schema.

    When ``domain_metric_weights`` has no ``lower_is_better`` column yet, the
    column is added (default 0) and every weight row that points at the
    ``cost_efficiency`` metric is flagged as lower-is-better. Pending work is
    committed before returning.
    """
    table_info = conn.execute("PRAGMA table_info(domain_metric_weights)").fetchall()
    # Index 1 of each PRAGMA table_info row holds the column name.
    known_columns = {column[1] for column in table_info}

    if "lower_is_better" not in known_columns:
        add_column = (
            "ALTER TABLE domain_metric_weights ADD COLUMN lower_is_better INTEGER NOT NULL DEFAULT 0"
        )
        # Backfill: cost_efficiency is lower-is-better in all domains
        flag_cost_rows = """UPDATE domain_metric_weights SET lower_is_better = 1
            WHERE lower_is_better = 0
            AND metric_id IN (SELECT id FROM metrics WHERE name = 'cost_efficiency')"""
        for statement in (add_column, flag_cost_rows):
            conn.execute(statement)

    conn.commit()
def init_db(db_path: str | Path) -> sqlite3.Connection:
"""Create/open the database and ensure all tables exist."""
db_path = Path(db_path)
@@ -125,5 +143,6 @@ def init_db(db_path: str | Path) -> sqlite3.Connection:
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA foreign_keys=ON")
conn.executescript(DDL)
_migrate(conn)
conn.commit()
return conn

View File

@@ -46,7 +46,7 @@ class ConstraintResolver:
self._check_unmet_requirements(all_deps, result)
if result.violations:
result.status = "blocked"
result.status = "p1_fail"
elif result.warnings:
result.status = "conditional"

View File

@@ -21,12 +21,15 @@ class PipelineResult:
total_generated: int = 0
pass1_valid: int = 0
pass1_blocked: int = 0
pass1_failed: int = 0
pass1_conditional: int = 0
pass2_estimated: int = 0
pass2_failed: int = 0
pass3_scored: int = 0
pass3_above_threshold: int = 0
pass3_failed: int = 0
pass4_reviewed: int = 0
pass4_failed: int = 0
pass5_human_reviewed: int = 0
top_results: list[dict] = field(default_factory=list)
@@ -77,7 +80,7 @@ class Pipeline:
run_id,
combos_pass1=result.pass1_valid
+ result.pass1_conditional
+ result.pass1_blocked,
+ result.pass1_failed,
combos_pass2=result.pass2_estimated,
combos_pass3=result.pass3_scored,
combos_pass4=result.pass4_reviewed,
@@ -142,22 +145,22 @@ class Pipeline:
# ── Pass 1: Constraint Resolution ────────────────
if 1 in passes and existing_pass < 1:
cr: ConstraintResult = self.resolver.resolve(combo)
if cr.status == "blocked":
combo.status = "blocked"
if cr.status == "p1_fail":
combo.status = "p1_fail"
combo.block_reason = "; ".join(cr.violations)
self.repo.update_combination_status(
combo.id, "blocked", combo.block_reason
combo.id, "p1_fail", combo.block_reason
)
# Save a result row so blocked combos appear in results
# Save a result row so failed combos appear in results
self.repo.save_result(
combo.id,
domain.id,
composite_score=0.0,
pass_reached=1,
)
result.pass1_blocked += 1
result.pass1_failed += 1
self._update_run_counters(run_id, result, current_pass=1)
continue # blocked — skip remaining passes
continue # p1_fail — skip remaining passes
else:
combo.status = "valid"
self.repo.update_combination_status(combo.id, "valid")
@@ -168,16 +171,16 @@ class Pipeline:
self._update_run_counters(run_id, result, current_pass=1)
elif 1 in passes:
# Already pass1'd — check if it was blocked
if combo.status == "blocked":
result.pass1_blocked += 1
# Already pass1'd — check if it failed
if combo.status.endswith("_fail"):
result.pass1_failed += 1
continue
else:
result.pass1_valid += 1
else:
# Pass 1 not requested; check if blocked from a prior run
if combo.status == "blocked":
result.pass1_blocked += 1
# Pass 1 not requested; check if failed from a prior run
if combo.status.endswith("_fail"):
result.pass1_failed += 1
continue
# ── Pass 2: Physics Estimation ───────────────────
@@ -207,6 +210,21 @@ class Pipeline:
combo.id, domain.id, estimate_dicts
)
# Check for all-zero estimates → p2_fail
if raw_metrics and all(v == 0.0 for v in raw_metrics.values()):
combo.status = "p2_fail"
combo.block_reason = "All metric estimates are zero"
self.repo.update_combination_status(
combo.id, "p2_fail", combo.block_reason
)
self.repo.save_result(
combo.id, domain.id,
composite_score=0.0, pass_reached=2,
)
result.pass2_failed += 1
self._update_run_counters(run_id, result, current_pass=2)
continue
result.pass2_estimated += 1
self._update_run_counters(run_id, result, current_pass=2)
elif 2 in passes:
@@ -249,6 +267,26 @@ class Pipeline:
existing_result["human_notes"] if existing_result else None
)
if sr.composite_score < score_threshold:
self.repo.save_result(
combo.id, domain.id,
sr.composite_score, pass_reached=3,
novelty_flag=novelty_flag,
human_notes=human_notes,
)
combo.status = "p3_fail"
combo.block_reason = (
f"Composite score {sr.composite_score:.4f} "
f"below threshold {score_threshold}"
)
self.repo.update_combination_status(
combo.id, "p3_fail", combo.block_reason
)
result.pass3_failed += 1
result.pass3_scored += 1
self._update_run_counters(run_id, result, current_pass=3)
continue
self.repo.save_result(
combo.id,
domain.id,
@@ -260,8 +298,7 @@ class Pipeline:
self.repo.update_combination_status(combo.id, "scored")
result.pass3_scored += 1
if sr.composite_score >= score_threshold:
result.pass3_above_threshold += 1
result.pass3_above_threshold += 1
self._update_run_counters(run_id, result, current_pass=3)
elif 3 in passes and existing_pass >= 3:
@@ -294,33 +331,49 @@ class Pipeline:
for s in db_scores
if s["normalized_score"] is not None
}
review: str | None = None
review_result: tuple[str, bool] | None = None
try:
review = self.llm.review_plausibility(
review_result = self.llm.review_plausibility(
description, score_dict
)
except LLMRateLimitError as exc:
self._wait_for_rate_limit(run_id, exc.retry_after)
try:
review = self.llm.review_plausibility(
review_result = self.llm.review_plausibility(
description, score_dict
)
except LLMRateLimitError:
pass # still limited; skip, retry next run
if review is not None:
self.repo.save_result(
combo.id,
domain.id,
cur_result["composite_score"],
pass_reached=4,
novelty_flag=cur_result.get("novelty_flag"),
llm_review=review,
human_notes=cur_result.get("human_notes"),
)
self.repo.update_combination_status(
combo.id, "llm_reviewed"
)
result.pass4_reviewed += 1
if review_result is not None:
review_text, plausible = review_result
if not plausible:
self.repo.save_result(
combo.id, domain.id,
cur_result["composite_score"],
pass_reached=4,
novelty_flag=cur_result.get("novelty_flag"),
llm_review=review_text,
human_notes=cur_result.get("human_notes"),
)
combo.status = "p4_fail"
combo.block_reason = "LLM deemed implausible"
self.repo.update_combination_status(
combo.id, "p4_fail", combo.block_reason
)
result.pass4_failed += 1
else:
self.repo.save_result(
combo.id, domain.id,
cur_result["composite_score"],
pass_reached=4,
novelty_flag=cur_result.get("novelty_flag"),
llm_review=review_text,
human_notes=cur_result.get("human_notes"),
)
self.repo.update_combination_status(
combo.id, "llm_reviewed"
)
result.pass4_reviewed += 1
self._update_run_counters(
run_id, result, current_pass=4
)

View File

@@ -69,6 +69,8 @@ class Scorer:
for mb in self.domain.metric_bounds:
raw = raw_metrics.get(mb.metric_name, 0.0)
normed = normalize(raw, mb.norm_min, mb.norm_max)
if mb.lower_is_better:
normed = 1.0 - normed
scores.append(Score(
metric_name=mb.metric_name,
raw_value=raw,

View File

@@ -31,7 +31,7 @@ class LLMProvider(ABC):
@abstractmethod
def review_plausibility(
self, combination_description: str, scores: dict[str, float]
) -> str:
"""Given a combination and its scores, return a natural-language
plausibility and novelty assessment."""
) -> tuple[str, bool]:
"""Given a combination and its scores, return a (text, is_plausible)
tuple: natural-language assessment and whether the concept is plausible."""
...

View File

@@ -33,5 +33,8 @@ Review this concept for:
3. Novelty — does anything similar already exist?
4. Overall plausibility — is this a genuinely interesting innovation or nonsense?
Provide a concise 2-4 sentence assessment.
Provide a concise 2-4 sentence assessment, then on a final line write exactly:
VERDICT: PLAUSIBLE
or
VERDICT: IMPLAUSIBLE
"""

View File

@@ -42,7 +42,7 @@ class GeminiLLMProvider(LLMProvider):
def review_plausibility(
self, combination_description: str, scores: dict[str, float]
) -> str:
) -> tuple[str, bool]:
scores_str = "\n".join(f"- {k}: {v:.3f}" for k, v in scores.items())
prompt = PLAUSIBILITY_REVIEW_PROMPT.format(
description=combination_description,
@@ -56,7 +56,16 @@ class GeminiLLMProvider(LLMProvider):
if "429" in str(exc) or "RESOURCE_EXHAUSTED" in str(exc):
raise LLMRateLimitError(str(exc), self._parse_retry_after(exc)) from exc
raise
return response.text.strip()
text = response.text.strip()
plausible = self._parse_verdict(text)
return (text, plausible)
def _parse_verdict(self, text: str) -> bool:
    """Return the plausibility verdict parsed from a review response.

    Looks for the keyword ``VERDICT`` followed by ``PLAUSIBLE`` or
    ``IMPLAUSIBLE``. Matching is case-insensitive and tolerates whitespace,
    colons, dashes and Markdown emphasis between the keyword and the verdict
    word (e.g. ``**VERDICT:** IMPLAUSIBLE``). This matters because an
    unparsed verdict falls back to ``True``, so a decorated IMPLAUSIBLE
    answer would otherwise be silently counted as plausible. When several
    verdicts appear, the last one wins, since the verdict is expected at the
    end of the response.

    Args:
        text: Raw review text returned by the model.

    Returns:
        ``False`` only when the last verdict found is IMPLAUSIBLE; ``True``
        for PLAUSIBLE or when no verdict can be found.
    """
    verdicts = re.findall(
        r"VERDICT[\s:*_`-]*(IMPLAUSIBLE|PLAUSIBLE)\b", text, re.IGNORECASE
    )
    if not verdicts:
        # No recognizable verdict: keep the permissive default.
        return True
    return verdicts[-1].upper() == "PLAUSIBLE"
def _parse_retry_after(self, exc: Exception) -> int:
"""Extract retry delay from the error message, with a safe default."""

View File

@@ -21,8 +21,8 @@ class MockLLMProvider(LLMProvider):
def review_plausibility(
self, combination_description: str, scores: dict[str, float]
) -> str:
) -> tuple[str, bool]:
avg = sum(scores.values()) / max(len(scores), 1)
if avg > 0.5:
return "This concept appears plausible and worth further investigation."
return "This concept has significant feasibility challenges."
return ("This concept appears plausible and worth further investigation.", True)
return ("This concept has significant feasibility challenges.", False)

View File

@@ -39,7 +39,7 @@ class Combination:
"""A generated combination of entities (one per dimension)."""
entities: list[Entity] = field(default_factory=list)
status: str = "pending" # pending → valid/blocked → scored → reviewed
status: str = "pending" # pending → valid/p1_fail/p2_fail/p3_fail/p4_fail → scored → reviewed
block_reason: str | None = None
hash: str | None = None
id: int | None = None

View File

@@ -11,9 +11,10 @@ class MetricBound:
metric_name: str
weight: float # 0.0–1.0
norm_min: float # Below this → score 0
norm_max: float # Above this → score 1
norm_min: float # Below this → score 0 (or 1 if lower_is_better)
norm_max: float # Above this → score 1 (or 0 if lower_is_better)
unit: str = ""
lower_is_better: bool = False # Invert scale (e.g., cost: lower = better)
metric_id: int | None = None

View File

@@ -254,7 +254,7 @@ URBAN_COMMUTING = Domain(
description="Daily travel within a city, 1-50km range",
metric_bounds=[
MetricBound("speed", weight=0.25, norm_min=5, norm_max=120, unit="km/h"),
MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0, unit="$/km"),
MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0, unit="$/km", lower_is_better=True),
MetricBound("safety", weight=0.25, norm_min=0.0, norm_max=1.0, unit="0-1"),
MetricBound("availability", weight=0.15, norm_min=0.0, norm_max=1.0, unit="0-1"),
MetricBound("range_fuel", weight=0.10, norm_min=5, norm_max=500, unit="km"),
@@ -268,7 +268,7 @@ INTERPLANETARY = Domain(
MetricBound("speed", weight=0.30, norm_min=1000, norm_max=300000, unit="km/s"),
MetricBound("range_fuel", weight=0.30, norm_min=1e6, norm_max=1e10, unit="km"),
MetricBound("safety", weight=0.20, norm_min=0.0, norm_max=1.0, unit="0-1"),
MetricBound("cost_efficiency", weight=0.10, norm_min=1e3, norm_max=1e9, unit="$/km"),
MetricBound("cost_efficiency", weight=0.10, norm_min=1e3, norm_max=1e9, unit="$/km", lower_is_better=True),
MetricBound("range_degradation", weight=0.10, norm_min=100, norm_max=36500, unit="days"),
],
)
@@ -302,8 +302,10 @@ def load_transport_seed(repo) -> dict:
counts["domains"] += 1
except sqlite3.IntegrityError:
pass
# Backfill metric units on existing DBs (ensure_metric is idempotent).
# Backfill metric units and lower_is_better on existing DBs.
for mb in domain.metric_bounds:
repo.ensure_metric(mb.metric_name, unit=mb.unit)
if mb.lower_is_better:
repo.backfill_lower_is_better(domain.name, mb.metric_name)
return counts

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
import math
import os
import secrets
from pathlib import Path
@@ -53,10 +54,54 @@ def close_db(exc: BaseException | None = None) -> None:
repo.conn.close()
# (threshold, suffix) pairs, largest first, so the first threshold a value
# reaches is the prefix used to display it.
_SI_PREFIXES = [
    (1e12, "T"),
    (1e9, "G"),
    (1e6, "M"),
    (1e3, "k"),
]


def _si_format(value: object) -> str:
    """Format a number with SI prefixes for readability.

    Strings are converted with ``float()`` first; a string that is not
    numeric is returned unchanged. Values that are neither ``str``, ``int``
    nor ``float`` are returned as ``str(value)``, as are NaN, infinities and
    integers too large to convert to ``float``.

    The value is rounded to 4 significant figures *before* the prefix is
    chosen, so a number that rounds up across a boundary gets the next
    prefix (999999 -> "1M" rather than "1000k"). Magnitudes beyond the
    largest prefix's range (>= 1000 T) are shown in plain ``.4g`` notation
    instead of mixing an exponent with a prefix.

    Args:
        value: Number, numeric string, or any other object.

    Returns:
        A short display string such as ``"5"``, ``"0.01"``, ``"1.5k"`` or
        ``"10G"``.
    """
    if isinstance(value, str):
        try:
            num = float(value)
        except (ValueError, TypeError):
            return value
    elif isinstance(value, (int, float)):
        try:
            num = float(value)
        except OverflowError:
            # int too large for a float: show it untouched rather than crash.
            return str(value)
    else:
        return str(value)

    if math.isnan(num) or math.isinf(num):
        return str(value)

    # Round to the displayed precision first so the prefix matches what is shown.
    num = float(f"{num:.4g}")
    abs_num = abs(num)

    if abs_num < 1000:
        # Small numbers: drop trailing zeros, cap at 4 significant figures
        if num == int(num) and abs_num < 100:
            return str(int(num))
        return f"{num:.4g}"

    largest_threshold = _SI_PREFIXES[0][0]
    if abs_num >= largest_threshold * 1000:
        # No prefix large enough; fall back to scientific notation.
        return f"{num:.4g}"

    for threshold, prefix in _SI_PREFIXES:
        if abs_num >= threshold:
            scaled = num / threshold
            return f"{scaled:.4g}{prefix}"

    # Only reachable if the prefix table no longer covers 1e3.
    return f"{num:.4g}"
def create_app() -> Flask:
app = Flask(__name__)
app.secret_key = _load_or_generate_secret_key()
app.jinja_env.filters["si"] = _si_format
app.teardown_appcontext(close_db)
# Register blueprints

View File

@@ -80,7 +80,11 @@ def metric_add(domain_id: int):
if not metric_name:
flash("Metric name is required.", "error")
else:
mb = MetricBound(metric_name=metric_name, weight=weight, norm_min=norm_min, norm_max=norm_max, unit=unit)
lower_is_better = bool(request.form.get("lower_is_better"))
mb = MetricBound(
metric_name=metric_name, weight=weight, norm_min=norm_min,
norm_max=norm_max, unit=unit, lower_is_better=lower_is_better,
)
repo.add_metric_bound(domain_id, mb)
flash("Metric added.", "success")
domain = repo.get_domain_by_id(domain_id)
@@ -99,7 +103,8 @@ def metric_edit(domain_id: int, metric_id: int):
domain = repo.get_domain_by_id(domain_id)
return render_template("domains/_metrics_table.html", domain=domain)
unit = request.form.get("unit", "").strip()
repo.update_metric_bound(domain_id, metric_id, weight, norm_min, norm_max, unit)
lower_is_better = bool(request.form.get("lower_is_better"))
repo.update_metric_bound(domain_id, metric_id, weight, norm_min, norm_max, unit, lower_is_better)
flash("Metric updated.", "success")
domain = repo.get_domain_by_id(domain_id)
return render_template("domains/_metrics_table.html", domain=domain)

View File

@@ -54,6 +54,9 @@ def result_detail(domain_name: str, combo_id: int):
return redirect(url_for("results.results_domain", domain_name=domain_name))
result = repo.get_result(combo_id, domain.id)
if not result:
flash("No results for this combination in this domain.", "error")
return redirect(url_for("results.results_domain", domain_name=domain_name))
scores = repo.get_combination_scores(combo_id, domain.id)
return render_template(

View File

@@ -54,6 +54,7 @@ h3 { font-size: 1rem; margin-bottom: 0.25rem; }
grid-template-columns: repeat(auto-fill, minmax(320px, 1fr));
gap: 1rem;
}
.card-grid > * { min-width: 0; overflow-x: auto; }
/* ── Tables ──────────────────────────────────────────────── */
table { width: 100%; border-collapse: collapse; font-size: 0.9rem; }
@@ -79,7 +80,10 @@ table.compact th, table.compact td { padding: 0.25rem 0.4rem; font-size: 0.85rem
.badge-range_min, .badge-range_max { background: #fef3c7; color: #92400e; }
.badge-excludes { background: #fee2e2; color: #991b1b; }
.badge-valid { background: #dcfce7; color: #166534; }
.badge-blocked { background: #fee2e2; color: #991b1b; }
.badge-p1_fail { background: #fee2e2; color: #991b1b; }
.badge-p2_fail { background: #fee2e2; color: #991b1b; }
.badge-p3_fail { background: #fee2e2; color: #991b1b; }
.badge-p4_fail { background: #fee2e2; color: #991b1b; }
.badge-scored { background: #dbeafe; color: #1e40af; }
.badge-llm_reviewed { background: #e0f2fe; color: #0369a1; }
.badge-reviewed { background: #f3e8ff; color: #6b21a8; }

View File

@@ -6,6 +6,7 @@
<th>Weight</th>
<th>Norm Min</th>
<th>Norm Max</th>
<th>Direction</th>
<th></th>
</tr>
</thead>
@@ -15,8 +16,9 @@
<td>{{ mb.metric_name }}</td>
<td>{{ mb.unit or '—' }}</td>
<td>{{ mb.weight }}</td>
<td>{{ mb.norm_min }}</td>
<td>{{ mb.norm_max }}</td>
<td>{{ mb.norm_min|si }}</td>
<td>{{ mb.norm_max|si }}</td>
<td>{{ '↓ lower' if mb.lower_is_better else '↑ higher' }}</td>
<td class="actions">
<button class="btn btn-sm"
onclick="this.closest('tr').nextElementSibling.style.display='table-row'; this.closest('tr').style.display='none'">
@@ -39,6 +41,7 @@
<td><input name="weight" type="number" step="any" value="{{ mb.weight }}" required></td>
<td><input name="norm_min" type="number" step="any" value="{{ mb.norm_min }}" required></td>
<td><input name="norm_max" type="number" step="any" value="{{ mb.norm_max }}" required></td>
<td><label><input type="checkbox" name="lower_is_better" value="1" {{ 'checked' if mb.lower_is_better }}> lower is better</label></td>
<td>
<button type="submit" class="btn btn-sm btn-primary">Save</button>
<button type="button" class="btn btn-sm"
@@ -63,6 +66,7 @@
<input name="weight" type="number" step="any" placeholder="weight" value="1.0" required>
<input name="norm_min" type="number" step="any" placeholder="norm min" value="0.0" required>
<input name="norm_max" type="number" step="any" placeholder="norm max" value="1.0" required>
<label><input type="checkbox" name="lower_is_better" value="1"> lower is better</label>
<button type="submit" class="btn btn-primary">Add</button>
</div>
</form>

View File

@@ -16,7 +16,7 @@
<p>{{ d.description }}</p>
<table>
<thead>
<tr><th>Metric</th><th>Unit</th><th>Weight</th><th>Norm Min</th><th>Norm Max</th></tr>
<tr><th>Metric</th><th>Unit</th><th>Weight</th><th>Norm Min</th><th>Norm Max</th><th>Direction</th></tr>
</thead>
<tbody>
{% for mb in d.metric_bounds %}
@@ -24,8 +24,9 @@
<td>{{ mb.metric_name }}</td>
<td>{{ mb.unit }}</td>
<td>{{ mb.weight }}</td>
<td>{{ mb.norm_min }}</td>
<td>{{ mb.norm_max }}</td>
<td>{{ mb.norm_min|si }}</td>
<td>{{ mb.norm_max|si }}</td>
<td>{{ '↓ lower' if mb.lower_is_better else '↑ higher' }}</td>
</tr>
{% endfor %}
</tbody>

View File

@@ -14,7 +14,7 @@
<tr>
<td>{{ dep.category }}</td>
<td>{{ dep.key }}</td>
<td>{{ dep.value }}</td>
<td>{{ dep.value|si }}</td>
<td>{{ dep.unit or '—' }}</td>
<td><span class="badge badge-{{ dep.constraint_type }}">{{ dep.constraint_type }}</span></td>
<td class="actions">

View File

@@ -34,7 +34,7 @@
<td>1 — Constraints</td>
<td>{{ run.combos_pass1 or 0 }} checked
{%- if (run.combos_pass2 or 0) > 0 and (run.combos_pass1 or 0) > (run.combos_pass2 or 0) %},
<span class="badge badge-blocked">{{ (run.combos_pass1 or 0) - (run.combos_pass2 or 0) }} blocked</span>
<span class="badge badge-p1_fail">{{ (run.combos_pass1 or 0) - (run.combos_pass2 or 0) }} failed</span>
{%- endif -%}
</td>
</tr>

View File

@@ -97,7 +97,7 @@
<th>Status</th>
<th>Total</th>
<th>P1 Checked</th>
<th>P1 Blocked</th>
<th>P1 Failed</th>
<th>P2 Estimated</th>
<th>P3 Scored</th>
<th>P4 Reviewed</th>
@@ -113,7 +113,7 @@
<td><span class="badge badge-{{ run.status }}">{{ run.status }}</span></td>
<td>{{ run.total_combos or '—' }}</td>
<td>{{ run.combos_pass1 or '—' }}</td>
<td>{% if blocked %}<span class="badge badge-blocked">{{ blocked }}</span>{% else %}—{% endif %}</td>
<td>{% if blocked %}<span class="badge badge-p1_fail">{{ blocked }}</span>{% else %}—{% endif %}</td>
<td>{{ run.combos_pass2 or '—' }}</td>
<td>{{ run.combos_pass3 or '—' }}</td>
<td>{{ run.combos_pass4 or '—' }}</td>
@@ -133,7 +133,7 @@
<h3>{{ d.name }} <span class="subtitle">{{ d.description }}</span></h3>
<dl class="summary-dl">
<dt>Results</dt><dd>{{ s.total_results }} scored combinations</dd>
<dt>Blocked</dt><dd>{{ s.blocked }} combinations</dd>
<dt>Failed</dt><dd>{{ s.failed }} combinations</dd>
<dt>Score range</dt><dd class="score-cell">{{ "%.4f"|format(s.min_score) }} — {{ "%.4f"|format(s.max_score) }}</dd>
<dt>Avg score</dt><dd class="score-cell">{{ "%.4f"|format(s.avg_score) }}</dd>
<dt>Last pass</dt><dd>{{ s.last_pass }}</dd>

View File

@@ -43,7 +43,7 @@
{% for dep in e.dependencies %}
<tr>
<td>{{ dep.key }}</td>
<td>{{ dep.value }}{{ ' ' + dep.unit if dep.unit else '' }}</td>
<td>{{ dep.value|si }}{{ ' ' + dep.unit if dep.unit else '' }}</td>
<td><span class="badge badge-{{ dep.constraint_type }}">{{ dep.constraint_type }}</span></td>
</tr>
{% endfor %}
@@ -77,10 +77,10 @@
<tr>
<td>{{ s.metric_name }}</td>
{% set unit = s.metric_unit or '' %}
<td class="score-cell">{{ "%.2f"|format(s.raw_value) if s.raw_value is not none else '—' }}{{ ' ' + unit if unit and s.raw_value is not none else '' }}</td>
<td class="score-cell">{{ s.raw_value|si if s.raw_value is not none else '—' }}{{ ' ' + unit if unit and s.raw_value is not none else '' }}</td>
<td>
{%- if mb -%}
{{ "%.2f"|format(mb.norm_min) }} — {{ "%.2f"|format(mb.norm_max) }}{{ ' ' + unit if unit else '' }}
{{ mb.norm_min|si }} — {{ mb.norm_max|si }}{{ ' ' + unit if unit else '' }}
{%- else -%}
{%- endif -%}
@@ -88,22 +88,22 @@
<td>
{%- if mb and s.raw_value is not none -%}
{%- if s.raw_value <= mb.norm_min -%}
<span class="badge badge-blocked">at/below min</span>
<span class="badge badge-{{ 'valid' if mb.lower_is_better else 'p1_fail' }}">at/below min{{ ' (best)' if mb.lower_is_better else '' }}</span>
{%- elif s.raw_value >= mb.norm_max -%}
<span class="badge badge-valid">at/above max</span>
<span class="badge badge-{{ 'p1_fail' if mb.lower_is_better else 'valid' }}">at/above max{{ ' (worst)' if mb.lower_is_better else '' }}</span>
{%- else -%}
{% set pct = ((s.raw_value - mb.norm_min) / (mb.norm_max - mb.norm_min) * 100) | int %}
<div class="metric-bar-container">
<div class="metric-bar" style="width: {{ pct }}%"></div>
</div>
<span class="metric-bar-label">~{{ pct }}%</span>
<span class="metric-bar-label">~{{ pct }}%{{ ' ↓' if mb.lower_is_better else '' }}</span>
{%- endif -%}
{%- else -%}
{%- endif -%}
</td>
<td class="score-cell">{{ "%.4f"|format(s.normalized_score) if s.normalized_score is not none else '—' }}</td>
<td>{{ "%.0f%%"|format(mb.weight * 100) if mb else '—' }}</td>
<td>{{ "%.0f%%"|format(mb.weight * 100) if mb else '—' }}{{ ' ↓' if mb and mb.lower_is_better else '' }}</td>
</tr>
{% endfor %}
</tbody>

View File

@@ -64,7 +64,7 @@
<td>{{ r.combination.entities|map(attribute='name')|join(' + ') }}</td>
<td><span class="badge badge-{{ r.combination.status }}">{{ r.combination.status }}</span></td>
<td class="block-reason-cell">
{%- if r.combination.status == 'blocked' and r.combination.block_reason -%}
{%- if r.combination.status.endswith('_fail') and r.combination.block_reason -%}
{{ r.combination.block_reason }}
{%- elif r.novelty_flag -%}
{{ r.novelty_flag }}