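Quality-of-life improvements (including per-pass `pN_fail` statuses and failure counters) and a metric value inverter for lower-is-better metrics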

This commit is contained in:
2026-03-04 11:10:45 -06:00
parent 8dfe3607b1
commit f57ac7d6dc
30 changed files with 556 additions and 118 deletions

View File

@@ -46,7 +46,7 @@ class ConstraintResolver:
self._check_unmet_requirements(all_deps, result)
if result.violations:
result.status = "blocked"
result.status = "p1_fail"
elif result.warnings:
result.status = "conditional"

View File

@@ -21,12 +21,15 @@ class PipelineResult:
total_generated: int = 0
pass1_valid: int = 0
pass1_blocked: int = 0
pass1_failed: int = 0
pass1_conditional: int = 0
pass2_estimated: int = 0
pass2_failed: int = 0
pass3_scored: int = 0
pass3_above_threshold: int = 0
pass3_failed: int = 0
pass4_reviewed: int = 0
pass4_failed: int = 0
pass5_human_reviewed: int = 0
top_results: list[dict] = field(default_factory=list)
@@ -77,7 +80,7 @@ class Pipeline:
run_id,
combos_pass1=result.pass1_valid
+ result.pass1_conditional
+ result.pass1_blocked,
+ result.pass1_failed,
combos_pass2=result.pass2_estimated,
combos_pass3=result.pass3_scored,
combos_pass4=result.pass4_reviewed,
@@ -142,22 +145,22 @@ class Pipeline:
# ── Pass 1: Constraint Resolution ────────────────
if 1 in passes and existing_pass < 1:
cr: ConstraintResult = self.resolver.resolve(combo)
if cr.status == "blocked":
combo.status = "blocked"
if cr.status == "p1_fail":
combo.status = "p1_fail"
combo.block_reason = "; ".join(cr.violations)
self.repo.update_combination_status(
combo.id, "blocked", combo.block_reason
combo.id, "p1_fail", combo.block_reason
)
# Save a result row so blocked combos appear in results
# Save a result row so failed combos appear in results
self.repo.save_result(
combo.id,
domain.id,
composite_score=0.0,
pass_reached=1,
)
result.pass1_blocked += 1
result.pass1_failed += 1
self._update_run_counters(run_id, result, current_pass=1)
continue # blocked — skip remaining passes
continue # p1_fail — skip remaining passes
else:
combo.status = "valid"
self.repo.update_combination_status(combo.id, "valid")
@@ -168,16 +171,16 @@ class Pipeline:
self._update_run_counters(run_id, result, current_pass=1)
elif 1 in passes:
# Already pass1'd — check if it was blocked
if combo.status == "blocked":
result.pass1_blocked += 1
# Already pass1'd — check if it failed
if combo.status.endswith("_fail"):
result.pass1_failed += 1
continue
else:
result.pass1_valid += 1
else:
# Pass 1 not requested; check if blocked from a prior run
if combo.status == "blocked":
result.pass1_blocked += 1
# Pass 1 not requested; check if failed from a prior run
if combo.status.endswith("_fail"):
result.pass1_failed += 1
continue
# ── Pass 2: Physics Estimation ───────────────────
@@ -207,6 +210,21 @@ class Pipeline:
combo.id, domain.id, estimate_dicts
)
# Check for all-zero estimates → p2_fail
if raw_metrics and all(v == 0.0 for v in raw_metrics.values()):
combo.status = "p2_fail"
combo.block_reason = "All metric estimates are zero"
self.repo.update_combination_status(
combo.id, "p2_fail", combo.block_reason
)
self.repo.save_result(
combo.id, domain.id,
composite_score=0.0, pass_reached=2,
)
result.pass2_failed += 1
self._update_run_counters(run_id, result, current_pass=2)
continue
result.pass2_estimated += 1
self._update_run_counters(run_id, result, current_pass=2)
elif 2 in passes:
@@ -249,6 +267,26 @@ class Pipeline:
existing_result["human_notes"] if existing_result else None
)
if sr.composite_score < score_threshold:
self.repo.save_result(
combo.id, domain.id,
sr.composite_score, pass_reached=3,
novelty_flag=novelty_flag,
human_notes=human_notes,
)
combo.status = "p3_fail"
combo.block_reason = (
f"Composite score {sr.composite_score:.4f} "
f"below threshold {score_threshold}"
)
self.repo.update_combination_status(
combo.id, "p3_fail", combo.block_reason
)
result.pass3_failed += 1
result.pass3_scored += 1
self._update_run_counters(run_id, result, current_pass=3)
continue
self.repo.save_result(
combo.id,
domain.id,
@@ -260,8 +298,7 @@ class Pipeline:
self.repo.update_combination_status(combo.id, "scored")
result.pass3_scored += 1
if sr.composite_score >= score_threshold:
result.pass3_above_threshold += 1
result.pass3_above_threshold += 1
self._update_run_counters(run_id, result, current_pass=3)
elif 3 in passes and existing_pass >= 3:
@@ -294,33 +331,49 @@ class Pipeline:
for s in db_scores
if s["normalized_score"] is not None
}
review: str | None = None
review_result: tuple[str, bool] | None = None
try:
review = self.llm.review_plausibility(
review_result = self.llm.review_plausibility(
description, score_dict
)
except LLMRateLimitError as exc:
self._wait_for_rate_limit(run_id, exc.retry_after)
try:
review = self.llm.review_plausibility(
review_result = self.llm.review_plausibility(
description, score_dict
)
except LLMRateLimitError:
pass # still limited; skip, retry next run
if review is not None:
self.repo.save_result(
combo.id,
domain.id,
cur_result["composite_score"],
pass_reached=4,
novelty_flag=cur_result.get("novelty_flag"),
llm_review=review,
human_notes=cur_result.get("human_notes"),
)
self.repo.update_combination_status(
combo.id, "llm_reviewed"
)
result.pass4_reviewed += 1
if review_result is not None:
review_text, plausible = review_result
if not plausible:
self.repo.save_result(
combo.id, domain.id,
cur_result["composite_score"],
pass_reached=4,
novelty_flag=cur_result.get("novelty_flag"),
llm_review=review_text,
human_notes=cur_result.get("human_notes"),
)
combo.status = "p4_fail"
combo.block_reason = "LLM deemed implausible"
self.repo.update_combination_status(
combo.id, "p4_fail", combo.block_reason
)
result.pass4_failed += 1
else:
self.repo.save_result(
combo.id, domain.id,
cur_result["composite_score"],
pass_reached=4,
novelty_flag=cur_result.get("novelty_flag"),
llm_review=review_text,
human_notes=cur_result.get("human_notes"),
)
self.repo.update_combination_status(
combo.id, "llm_reviewed"
)
result.pass4_reviewed += 1
self._update_run_counters(
run_id, result, current_pass=4
)

View File

@@ -69,6 +69,8 @@ class Scorer:
for mb in self.domain.metric_bounds:
raw = raw_metrics.get(mb.metric_name, 0.0)
normed = normalize(raw, mb.norm_min, mb.norm_max)
if mb.lower_is_better:
normed = 1.0 - normed
scores.append(Score(
metric_name=mb.metric_name,
raw_value=raw,