QoL and metric value inverter
This commit is contained in:
@@ -148,7 +148,7 @@ def urban_domain():
|
||||
description="Daily city travel",
|
||||
metric_bounds=[
|
||||
MetricBound("speed", weight=0.25, norm_min=5, norm_max=120),
|
||||
MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0),
|
||||
MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0, lower_is_better=True),
|
||||
MetricBound("safety", weight=0.25, norm_min=0.0, norm_max=1.0),
|
||||
MetricBound("availability", weight=0.15, norm_min=0.0, norm_max=1.0),
|
||||
MetricBound("range_fuel", weight=0.10, norm_min=5, norm_max=500),
|
||||
|
||||
@@ -10,7 +10,7 @@ def test_compatible_ground_combo(bicycle, human_pedalling):
|
||||
resolver = ConstraintResolver()
|
||||
combo = Combination(entities=[bicycle, human_pedalling])
|
||||
result = resolver.resolve(combo)
|
||||
assert result.status != "blocked", f"Unexpected block: {result.violations}"
|
||||
assert result.status != "p1_fail", f"Unexpected block: {result.violations}"
|
||||
|
||||
|
||||
def test_solar_sail_blocks_with_walking(walking, solar_sail):
|
||||
@@ -18,7 +18,7 @@ def test_solar_sail_blocks_with_walking(walking, solar_sail):
|
||||
resolver = ConstraintResolver()
|
||||
combo = Combination(entities=[walking, solar_sail])
|
||||
result = resolver.resolve(combo)
|
||||
assert result.status == "blocked"
|
||||
assert result.status == "p1_fail"
|
||||
assert any("mutually exclusive" in v for v in result.violations)
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ def test_nuclear_reactor_blocks_with_bicycle(bicycle, nuclear_reactor):
|
||||
resolver = ConstraintResolver()
|
||||
combo = Combination(entities=[bicycle, nuclear_reactor])
|
||||
result = resolver.resolve(combo)
|
||||
assert result.status == "blocked"
|
||||
assert result.status == "p1_fail"
|
||||
assert any("mass" in v.lower() for v in result.violations)
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ def test_force_scale_mismatch_blocks():
|
||||
resolver = ConstraintResolver()
|
||||
combo = Combination(entities=[platform, power])
|
||||
result = resolver.resolve(combo)
|
||||
assert result.status == "blocked"
|
||||
assert result.status == "p1_fail"
|
||||
assert any("force deficit" in v for v in result.violations)
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@ def test_force_under_powered_warning():
|
||||
combo = Combination(entities=[platform, power])
|
||||
result = resolver.resolve(combo)
|
||||
# Under-powered but within 100x → warning, not block
|
||||
assert result.status != "blocked"
|
||||
assert result.status != "p1_fail"
|
||||
assert any("under-powered" in w for w in result.warnings)
|
||||
|
||||
|
||||
@@ -97,7 +97,7 @@ def test_requires_vs_excludes():
|
||||
resolver = ConstraintResolver()
|
||||
combo = Combination(entities=[a, b])
|
||||
result = resolver.resolve(combo)
|
||||
assert result.status == "blocked"
|
||||
assert result.status == "p1_fail"
|
||||
assert any("excludes" in v for v in result.violations)
|
||||
|
||||
|
||||
@@ -106,7 +106,7 @@ def test_ice_engine_blocks_with_spaceship(spaceship, ice_engine):
|
||||
resolver = ConstraintResolver()
|
||||
combo = Combination(entities=[spaceship, ice_engine])
|
||||
result = resolver.resolve(combo)
|
||||
assert result.status == "blocked"
|
||||
assert result.status == "p1_fail"
|
||||
assert any("atmosphere" in v for v in result.violations)
|
||||
|
||||
|
||||
@@ -138,7 +138,7 @@ def test_energy_density_deficit_blocks():
|
||||
resolver = ConstraintResolver()
|
||||
combo = Combination(entities=[platform, power])
|
||||
result = resolver.resolve(combo)
|
||||
assert result.status == "blocked"
|
||||
assert result.status == "p1_fail"
|
||||
assert any("energy density deficit" in v for v in result.violations)
|
||||
|
||||
|
||||
@@ -159,7 +159,7 @@ def test_energy_density_under_density_warning():
|
||||
resolver = ConstraintResolver()
|
||||
combo = Combination(entities=[platform, power])
|
||||
result = resolver.resolve(combo)
|
||||
assert result.status != "blocked"
|
||||
assert result.status != "p1_fail"
|
||||
assert any("under-density" in w for w in result.warnings)
|
||||
|
||||
|
||||
|
||||
@@ -16,8 +16,8 @@ def test_pass1_filters_impossible_combos(seeded_repo):
|
||||
result = pipeline.run(domain, ["platform", "power_source"], passes=[1])
|
||||
|
||||
assert result.total_generated == 81
|
||||
assert result.pass1_blocked > 0
|
||||
assert result.pass1_valid + result.pass1_conditional + result.pass1_blocked == 81
|
||||
assert result.pass1_failed > 0
|
||||
assert result.pass1_valid + result.pass1_conditional + result.pass1_failed == 81
|
||||
|
||||
|
||||
def test_pass123_produces_scored_results(seeded_repo):
|
||||
|
||||
@@ -239,18 +239,18 @@ def test_blocked_combos_have_results(seeded_repo):
|
||||
score_threshold=0.01, passes=[1, 2, 3],
|
||||
)
|
||||
|
||||
assert result.pass1_blocked > 0
|
||||
assert result.pass1_failed > 0
|
||||
|
||||
# All combos (p1_fail + scored) should have result rows
|
||||
all_results = repo.get_all_results(domain.name)
|
||||
total_with_results = len(all_results)
|
||||
# p1_fail combos get pass_reached=1 results, all other combos get pass_reached=3
|
||||
assert total_with_results == result.pass1_blocked + result.pass3_scored
|
||||
assert total_with_results == result.pass1_failed + result.pass3_scored
|
||||
|
||||
# Blocked combos should have pass_reached=1 and composite_score=0.0
|
||||
blocked_results = [r for r in all_results if r["combination"].status == "blocked"]
|
||||
assert len(blocked_results) == result.pass1_blocked
|
||||
for br in blocked_results:
|
||||
# Failed combos should have pass_reached=1 and composite_score=0.0
|
||||
failed_results = [r for r in all_results if r["combination"].status == "p1_fail"]
|
||||
assert len(failed_results) == result.pass1_failed
|
||||
for br in failed_results:
|
||||
assert br["pass_reached"] == 1
|
||||
assert br["composite_score"] == 0.0
|
||||
|
||||
@@ -291,12 +291,96 @@ def test_save_combination_loads_existing_status(seeded_repo):
|
||||
saved = repo.save_combination(combo)
|
||||
assert saved.status == "pending"
|
||||
|
||||
# Mark it blocked in DB
|
||||
repo.update_combination_status(saved.id, "blocked", "test reason")
|
||||
# Mark it p1_fail in DB
|
||||
repo.update_combination_status(saved.id, "p1_fail", "test reason")
|
||||
|
||||
# Re-saving should pick up the blocked status
|
||||
# Re-saving should pick up the p1_fail status
|
||||
combo2 = Combination(entities=entities)
|
||||
reloaded = repo.save_combination(combo2)
|
||||
assert reloaded.id == saved.id
|
||||
assert reloaded.status == "blocked"
|
||||
assert reloaded.status == "p1_fail"
|
||||
assert reloaded.block_reason == "test reason"
|
||||
|
||||
|
||||
def test_p3_fail_below_threshold(seeded_repo):
    """Verify that combos scoring under the threshold end up as ``p3_fail``.

    Runs passes 1-3 on the ``urban_commuting`` domain with a deliberately
    high score threshold (0.99), then checks both the run counters and the
    combinations persisted with status ``p3_fail``.
    """
    repo = seeded_repo
    domain = repo.get_domain("urban_commuting")

    pipeline = Pipeline(repo, ConstraintResolver(), Scorer(domain))

    # A very high threshold so that most combos fail pass 3.
    dimensions = ["platform", "power_source"]
    outcome = pipeline.run(
        domain,
        dimensions,
        score_threshold=0.99,
        passes=[1, 2, 3],
    )

    assert outcome.pass3_failed > 0
    # The above-threshold count can never exceed the number of scored combos.
    assert outcome.pass3_above_threshold <= outcome.pass3_scored

    # Every pass-3 failure must be persisted with an explanatory reason.
    persisted_failures = repo.list_combinations(status="p3_fail")
    assert len(persisted_failures) == outcome.pass3_failed
    for failed_combo in persisted_failures:
        assert failed_combo.block_reason is not None
        assert "below threshold" in failed_combo.block_reason
|
||||
|
||||
|
||||
def test_p4_fail_implausible(seeded_repo):
    """Verify that combos the LLM rejects as implausible end up as ``p4_fail``.

    Runs passes 1-4 with a mock LLM fed uniformly low estimates and a score
    threshold of 0.0, then checks the run counters and the block reason
    stored on every ``p4_fail`` combination.
    """
    from physcom.llm.providers.mock import MockLLMProvider

    repo = seeded_repo
    domain = repo.get_domain("urban_commuting")

    resolver = ConstraintResolver()
    scorer = Scorer(domain)

    # Low estimates -> normalized scores avg <= 0.5 -> the mock provider
    # answers (text, False), i.e. "implausible".
    metric_names = (
        "speed",
        "cost_efficiency",
        "safety",
        "availability",
        "range_fuel",
    )
    low_estimates = dict.fromkeys(metric_names, 0.1)
    mock_llm = MockLLMProvider(default_estimates=low_estimates)
    pipeline = Pipeline(repo, resolver, scorer, llm=mock_llm)

    # threshold=0.0 keeps every combo out of p3_fail so they all reach pass 4.
    outcome = pipeline.run(
        domain,
        ["platform", "power_source"],
        score_threshold=0.0,
        passes=[1, 2, 3, 4],
    )

    # With low normalized scores (avg <= 0.5) every reviewed combo is rejected.
    assert outcome.pass4_failed > 0
    assert outcome.pass4_reviewed == 0

    rejected = repo.list_combinations(status="p4_fail")
    assert len(rejected) == outcome.pass4_failed
    for rejected_combo in rejected:
        assert rejected_combo.block_reason == "LLM deemed implausible"
|
||||
|
||||
|
||||
def test_p4_pass_plausible(seeded_repo):
    """Verify that combos the LLM accepts are counted as reviewed, not failed.

    Runs passes 1-4 with a mock LLM fed high estimates and checks that at
    least one combo is reviewed in pass 4 and none fail it.
    """
    from physcom.llm.providers.mock import MockLLMProvider

    repo = seeded_repo
    domain = repo.get_domain("urban_commuting")

    resolver = ConstraintResolver()
    scorer = Scorer(domain)

    # High estimates -> avg > 0.5 -> the mock provider answers (text, True).
    high_estimates = {
        "speed": 50.0,
        "cost_efficiency": 0.5,
        "safety": 0.6,
        "availability": 0.7,
        "range_fuel": 200.0,
    }
    mock_llm = MockLLMProvider(default_estimates=high_estimates)
    pipeline = Pipeline(repo, resolver, scorer, llm=mock_llm)

    outcome = pipeline.run(
        domain,
        ["platform", "power_source"],
        score_threshold=0.01,
        passes=[1, 2, 3, 4],
    )

    assert outcome.pass4_reviewed > 0
    assert outcome.pass4_failed == 0
|
||||
|
||||
@@ -81,10 +81,34 @@ class TestScorer:
|
||||
assert len(result.scores) == 5
|
||||
|
||||
def test_scorer_zero_metric_kills_score(self, urban_domain):
    """A zero on a higher-is-better metric should drive composite to 0."""
    scorer = Scorer(urban_domain)
    combo = Combination(entities=[])
    combo.id = 1
    # Zero out ``safety`` (higher is better). ``cost_efficiency`` is declared
    # lower_is_better in the urban domain, so a zero there is no longer a
    # worst-case value and must not be used for this check.
    raw = {
        "speed": 60.0,
        "cost_efficiency": 0.5,
        "safety": 0.0,
        "availability": 0.8,
        "range_fuel": 400,
    }
    result = scorer.score_combination(combo, raw)
    assert result.composite_score == 0.0
|
||||
|
||||
def test_lower_is_better_inverts_score(self, urban_domain):
    """Check that a lower_is_better metric scores a low raw value highly.

    cost_efficiency uses norm_min=0.01, norm_max=2.0, lower_is_better=True:
    a low cost (0.02) should normalize near 1.0 and a high cost (1.9) near
    0.0. All other metrics are held constant between the two runs.
    """
    scorer = Scorer(urban_domain)
    combo = Combination(entities=[])
    combo.id = 1

    def raw_metrics(cost):
        # Identical inputs except for the cost_efficiency value under test.
        return {
            "speed": 60.0,
            "cost_efficiency": cost,
            "safety": 0.7,
            "availability": 0.8,
            "range_fuel": 400,
        }

    def cost_entry(scored):
        # Pick the cost_efficiency entry out of a scored result.
        return next(
            entry for entry in scored.scores
            if entry.metric_name == "cost_efficiency"
        )

    scored_cheap = scorer.score_combination(combo, raw_metrics(0.02))
    scored_expensive = scorer.score_combination(combo, raw_metrics(1.9))

    cheap = cost_entry(scored_cheap)
    expensive = cost_entry(scored_expensive)

    assert cheap.normalized_score > expensive.normalized_score
    assert cheap.normalized_score > 0.9  # near the best
    assert expensive.normalized_score < 0.1  # near the worst
|
||||
|
||||
Reference in New Issue
Block a user