QoL and metric value inverter

This commit is contained in:
2026-03-04 11:10:45 -06:00
parent 8dfe3607b1
commit f57ac7d6dc
30 changed files with 556 additions and 118 deletions

View File

@@ -239,18 +239,18 @@ def test_blocked_combos_have_results(seeded_repo):
score_threshold=0.01, passes=[1, 2, 3],
)
assert result.pass1_blocked > 0
assert result.pass1_failed > 0
# All combos (blocked + scored) should have result rows
all_results = repo.get_all_results(domain.name)
total_with_results = len(all_results)
# blocked combos get pass_reached=1 results, non-blocked get pass_reached=3
assert total_with_results == result.pass1_blocked + result.pass3_scored
assert total_with_results == result.pass1_failed + result.pass3_scored
# Blocked combos should have pass_reached=1 and composite_score=0.0
blocked_results = [r for r in all_results if r["combination"].status == "blocked"]
assert len(blocked_results) == result.pass1_blocked
for br in blocked_results:
# Failed combos should have pass_reached=1 and composite_score=0.0
failed_results = [r for r in all_results if r["combination"].status == "p1_fail"]
assert len(failed_results) == result.pass1_failed
for br in failed_results:
assert br["pass_reached"] == 1
assert br["composite_score"] == 0.0
@@ -291,12 +291,96 @@ def test_save_combination_loads_existing_status(seeded_repo):
saved = repo.save_combination(combo)
assert saved.status == "pending"
# Mark it blocked in DB
repo.update_combination_status(saved.id, "blocked", "test reason")
# Mark it p1_fail in DB
repo.update_combination_status(saved.id, "p1_fail", "test reason")
# Re-saving should pick up the blocked status
# Re-saving should pick up the p1_fail status
combo2 = Combination(entities=entities)
reloaded = repo.save_combination(combo2)
assert reloaded.id == saved.id
assert reloaded.status == "blocked"
assert reloaded.status == "p1_fail"
assert reloaded.block_reason == "test reason"
def test_p3_fail_below_threshold(seeded_repo):
    """Combos scoring below threshold should get p3_fail status."""
    store = seeded_repo
    domain = store.get_domain("urban_commuting")
    pipeline = Pipeline(store, ConstraintResolver(), Scorer(domain))
    # A near-impossible threshold forces most combos to fail pass 3.
    outcome = pipeline.run(
        domain,
        ["platform", "power_source"],
        score_threshold=0.99,
        passes=[1, 2, 3],
    )
    assert outcome.pass3_failed > 0
    # At most all scored combos can clear the bar; never more.
    assert outcome.pass3_above_threshold <= outcome.pass3_scored
    # Every pass-3 failure must be persisted with the p3_fail status
    # and carry an explanatory block reason.
    failures = store.list_combinations(status="p3_fail")
    assert len(failures) == outcome.pass3_failed
    assert all(
        combo.block_reason is not None and "below threshold" in combo.block_reason
        for combo in failures
    )
def test_p4_fail_implausible(seeded_repo):
    """Combos deemed implausible by LLM should get p4_fail status."""
    from physcom.llm.providers.mock import MockLLMProvider

    store = seeded_repo
    domain = store.get_domain("urban_commuting")
    # Low estimates → normalized scores avg <= 0.5 → MockLLMProvider returns (text, False)
    # Use threshold=0.0 so no combo gets p3_fail and all reach pass 4
    uniform_low = dict.fromkeys(
        ["speed", "cost_efficiency", "safety", "availability", "range_fuel"],
        0.1,
    )
    mock_llm = MockLLMProvider(default_estimates=uniform_low)
    pipeline = Pipeline(store, ConstraintResolver(), Scorer(domain), llm=mock_llm)
    outcome = pipeline.run(
        domain,
        ["platform", "power_source"],
        score_threshold=0.0,
        passes=[1, 2, 3, 4],
    )
    # With low normalized scores (avg <= 0.5), reviewed combos should be p4_fail
    assert outcome.pass4_failed > 0
    assert outcome.pass4_reviewed == 0
    # Each rejection must be persisted with the p4_fail status and the
    # canonical rejection reason.
    rejected = store.list_combinations(status="p4_fail")
    assert len(rejected) == outcome.pass4_failed
    for combo in rejected:
        assert combo.block_reason == "LLM deemed implausible"
def test_p4_pass_plausible(seeded_repo):
"""Combos deemed plausible by LLM should get llm_reviewed status."""
from physcom.llm.providers.mock import MockLLMProvider
repo = seeded_repo
domain = repo.get_domain("urban_commuting")
resolver = ConstraintResolver()
scorer = Scorer(domain)
# High estimates → avg > 0.5 → MockLLMProvider returns (text, True)
mock_llm = MockLLMProvider(default_estimates={
"speed": 50.0, "cost_efficiency": 0.5, "safety": 0.6,
"availability": 0.7, "range_fuel": 200.0,
})
pipeline = Pipeline(repo, resolver, scorer, llm=mock_llm)
result = pipeline.run(
domain, ["platform", "power_source"],
score_threshold=0.01, passes=[1, 2, 3, 4],
)
assert result.pass4_reviewed > 0
assert result.pass4_failed == 0