diff --git a/STATUS.md b/STATUS.md new file mode 100644 index 0000000..ece929f --- /dev/null +++ b/STATUS.md @@ -0,0 +1,140 @@ +# PhysCom — Project Status + +## Architecture Overview + +``` +src/ +├── physcom/ Core engine (CLI + SQLite + 5-pass pipeline) +│ ├── cli.py Click CLI: init, seed, entity, domain, run, results, review, export +│ ├── db/ +│ │ ├── schema.py DDL — 9 tables, 4 indexes +│ │ └── repository.py Data-access layer (24 methods) +│ ├── engine/ +│ │ ├── combinator.py Cartesian product generator +│ │ ├── constraint_resolver.py Pass 1 — requires/provides/excludes/range matching +│ │ ├── scorer.py Pass 3 — weighted geometric mean normalizer +│ │ └── pipeline.py Orchestrator for passes 1–5 +│ ├── llm/ +│ │ ├── base.py Abstract LLMProvider interface (2 methods) +│ │ ├── prompts.py Prompt templates for passes 2 and 4 +│ │ └── providers/ +│ │ └── mock.py Deterministic stub for tests +│ ├── models/ Dataclasses: Entity, Dependency, Domain, MetricBound, Combination, Score +│ └── seed/ +│ └── transport_example.py 9 platforms + 9 power sources + 2 domains +│ +├── physcom_web/ Flask web UI +│ ├── app.py App factory, per-request DB connection +│ ├── routes/ +│ │ ├── entities.py Entity + dependency CRUD (9 routes, HTMX) +│ │ ├── domains.py Domain listing (1 route) +│ │ ├── pipeline.py Run form + execution (2 routes) +│ │ └── results.py Browse, detail, human review (4 routes) +│ ├── templates/ Jinja2 + HTMX — 11 templates +│ └── static/style.css +│ +tests/ 37 passing tests +Dockerfile Single-stage Python 3.13-slim +docker-compose.yml web + cli services, shared volume +``` + +## What Works + +| Area | Status | Notes | +|------|--------|-------| +| Database schema | Done | 9 tables, WAL mode, foreign keys | +| Entity/dependency CRUD | Done | CLI + web UI | +| Domain + metric weights | Done | CLI seed + web read-only | +| Pass 1 — constraint resolution | Done | requires/provides/excludes/range logic | +| Pass 2 — physics estimation | Done | Stub heuristic 
(force/mass-based); LLM path exists but no real provider | | Pass 3 — scoring + ranking | Done | Weighted geometric mean with min/max normalization | | Pass 4 — LLM plausibility review | Wired | Pipeline calls `self.llm.review_plausibility()` when `llm` is not None; only MockLLMProvider exists | | Pass 5 — human review | Done | CLI interactive + web HTMX form | | Web UI | Done | Entity CRUD, domain view, pipeline run, results browse + review | | Docker | Done | Compose with web + cli services, named volume | | Tests | 37/37 passing | Repository, combinator, constraints, scorer, pipeline | + +## What's Missing + +### LLM provider — no real implementation yet + +The `LLMProvider` abstract class defines two methods: + +```python +class LLMProvider(ABC): + def estimate_physics(self, combination_description: str, metrics: list[str]) -> dict[str, float]: ... + def review_plausibility(self, combination_description: str, scores: dict[str, float]) -> tuple[str, bool]: ... +``` + +**Pass 2** (`estimate_physics`) — given a combination description like *"platform: Bicycle + power_source: Hydrogen Combustion Engine"*, return estimated metric values (speed, cost_efficiency, safety, etc.) as floats. + +**Pass 4** (`review_plausibility`) — given a combination description and its normalized scores, return a 2–4 sentence plausibility assessment together with a boolean plausible/implausible verdict. + +Prompt templates already exist in `src/physcom/llm/prompts.py`. The pipeline already checks `if self.llm:` and skips gracefully when None. + +### To enable LLM reviews, you need to: + +1. **Create a real provider** at `src/physcom/llm/providers/<provider>.py` that subclasses `LLMProvider`.
For example, an Anthropic provider: + + ```python + # src/physcom/llm/providers/anthropic.py + import json + from anthropic import Anthropic + from physcom.llm.base import LLMProvider + from physcom.llm.prompts import PHYSICS_ESTIMATION_PROMPT, PLAUSIBILITY_REVIEW_PROMPT + + class AnthropicProvider(LLMProvider): + def __init__(self, model: str = "claude-sonnet-4-20250514"): + self.client = Anthropic() # reads ANTHROPIC_API_KEY from env + self.model = model + + def estimate_physics(self, description: str, metrics: list[str]) -> dict[str, float]: + prompt = PHYSICS_ESTIMATION_PROMPT.format( + description=description, + metrics=", ".join(metrics), + ) + resp = self.client.messages.create( + model=self.model, + max_tokens=256, + messages=[{"role": "user", "content": prompt}], + ) + return json.loads(resp.content[0].text) + + def review_plausibility(self, description: str, scores: dict[str, float]) -> tuple[str, bool]: + prompt = PLAUSIBILITY_REVIEW_PROMPT.format( + description=description, + scores=json.dumps(scores, indent=2), + ) + resp = self.client.messages.create( + model=self.model, + max_tokens=512, + messages=[{"role": "user", "content": prompt}], + ) + text = resp.content[0].text + # Prompt asks for a final "VERDICT: PLAUSIBLE/IMPLAUSIBLE" line + return (text, "VERDICT: IMPLAUSIBLE" not in text.upper()) + ``` + +2. **Add the dependency** to `pyproject.toml`: + ```toml + [project.optional-dependencies] + llm = ["anthropic>=0.40"] + ``` + +3. **Wire it into the CLI** — in `cli.py`'s `run` command, instantiate the provider when an `--llm` flag is passed and include pass 4 in the pass list. + +4. **Wire it into the web UI** — in `routes/pipeline.py`, same logic: read a config flag or env var (`PHYSCOM_LLM_PROVIDER`), instantiate the provider, pass it to `Pipeline(...)`. + +5. **Set the API key** — `ANTHROPIC_API_KEY` env var (or equivalent for your chosen provider).
In Docker, add it to `docker-compose.yml`: + ```yaml + environment: + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + ``` + +The same pattern works for OpenAI, Databricks, or any other provider — just subclass `LLMProvider` and implement the two methods. + +### Other future work + +- **Domain creation via web UI** — currently seed-only +- **Database upgrade** — SQLite → Postgres (docker-compose has a commented placeholder) +- **Async pipeline runs** — currently synchronous; fine for 81 combos, may need background tasks at scale +- **Export from web UI** — currently CLI-only (`physcom export`) +- **Authentication** — no auth on the web UI diff --git a/src/physcom/cli.py b/src/physcom/cli.py index 4efa3db..ce868c2 100644 --- a/src/physcom/cli.py +++ b/src/physcom/cli.py @@ -146,13 +146,16 @@ def run(ctx, domain_name, passes, threshold, dimensions): click.echo(f" Total combinations: {result.total_generated}") click.echo(f" Pass 1 — valid: {result.pass1_valid}, " f"conditional: {result.pass1_conditional}, " - f"blocked: {result.pass1_blocked}") + f"failed: {result.pass1_failed}") if 2 in pass_list: - click.echo(f" Pass 2 — estimated: {result.pass2_estimated}") + click.echo(f" Pass 2 — estimated: {result.pass2_estimated}, " + f"failed: {result.pass2_failed}") if 3 in pass_list: - click.echo(f" Pass 3 — above threshold: {result.pass3_above_threshold}") + click.echo(f" Pass 3 — above threshold: {result.pass3_above_threshold}, " + f"failed: {result.pass3_failed}") if 4 in pass_list: - click.echo(f" Pass 4 — LLM reviewed: {result.pass4_reviewed}") + click.echo(f" Pass 4 — LLM reviewed: {result.pass4_reviewed}, " + f"failed: {result.pass4_failed}") @main.command() diff --git a/src/physcom/db/repository.py b/src/physcom/db/repository.py index f23a243..1a33af0 100644 --- a/src/physcom/db/repository.py +++ b/src/physcom/db/repository.py @@ -193,6 +193,17 @@ class Repository: self.conn.commit() return row["id"] + def backfill_lower_is_better(self, domain_name: str, metric_name: str) 
-> None: + """Set lower_is_better=1 for an existing domain-metric row that still has the default 0.""" + self.conn.execute( + """UPDATE domain_metric_weights SET lower_is_better = 1 + WHERE lower_is_better = 0 + AND domain_id = (SELECT id FROM domains WHERE name = ?) + AND metric_id = (SELECT id FROM metrics WHERE name = ?)""", + (domain_name, metric_name), + ) + self.conn.commit() + def add_domain(self, domain: Domain) -> Domain: cur = self.conn.execute( "INSERT INTO domains (name, description) VALUES (?, ?)", @@ -204,9 +215,10 @@ class Repository: mb.metric_id = metric_id self.conn.execute( """INSERT INTO domain_metric_weights - (domain_id, metric_id, weight, norm_min, norm_max) - VALUES (?, ?, ?, ?, ?)""", - (domain.id, metric_id, mb.weight, mb.norm_min, mb.norm_max), + (domain_id, metric_id, weight, norm_min, norm_max, lower_is_better) + VALUES (?, ?, ?, ?, ?, ?)""", + (domain.id, metric_id, mb.weight, mb.norm_min, mb.norm_max, + int(mb.lower_is_better)), ) self.conn.commit() return domain @@ -216,7 +228,8 @@ class Repository: if not row: return None weights = self.conn.execute( - """SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, dmw.metric_id + """SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, + dmw.metric_id, dmw.lower_is_better FROM domain_metric_weights dmw JOIN metrics m ON dmw.metric_id = m.id WHERE dmw.domain_id = ?""", @@ -231,6 +244,7 @@ class Repository: metric_name=w["name"], weight=w["weight"], norm_min=w["norm_min"], norm_max=w["norm_max"], metric_id=w["metric_id"], unit=w["unit"] or "", + lower_is_better=bool(w["lower_is_better"]), ) for w in weights ], @@ -245,7 +259,8 @@ class Repository: if not row: return None weights = self.conn.execute( - """SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, dmw.metric_id + """SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, + dmw.metric_id, dmw.lower_is_better FROM domain_metric_weights dmw JOIN metrics m ON dmw.metric_id = m.id WHERE dmw.domain_id = 
?""", @@ -260,6 +275,7 @@ class Repository: metric_name=w["name"], weight=w["weight"], norm_min=w["norm_min"], norm_max=w["norm_max"], metric_id=w["metric_id"], unit=w["unit"] or "", + lower_is_better=bool(w["lower_is_better"]), ) for w in weights ], @@ -277,21 +293,23 @@ class Repository: mb.metric_id = metric_id self.conn.execute( """INSERT OR REPLACE INTO domain_metric_weights - (domain_id, metric_id, weight, norm_min, norm_max) - VALUES (?, ?, ?, ?, ?)""", - (domain_id, metric_id, mb.weight, mb.norm_min, mb.norm_max), + (domain_id, metric_id, weight, norm_min, norm_max, lower_is_better) + VALUES (?, ?, ?, ?, ?, ?)""", + (domain_id, metric_id, mb.weight, mb.norm_min, mb.norm_max, + int(mb.lower_is_better)), ) self.conn.commit() return mb def update_metric_bound( - self, domain_id: int, metric_id: int, weight: float, norm_min: float, norm_max: float, unit: str + self, domain_id: int, metric_id: int, weight: float, norm_min: float, norm_max: float, + unit: str, lower_is_better: bool = False, ) -> None: self.conn.execute( """UPDATE domain_metric_weights - SET weight = ?, norm_min = ?, norm_max = ? + SET weight = ?, norm_min = ?, norm_max = ?, lower_is_better = ? WHERE domain_id = ? AND metric_id = ?""", - (weight, norm_min, norm_max, domain_id, metric_id), + (weight, norm_min, norm_max, int(lower_is_better), domain_id, metric_id), ) if unit: self.conn.execute( @@ -330,6 +348,21 @@ class Repository: self.conn.execute("DELETE FROM combination_scores WHERE domain_id = ?", (domain.id,)) self.conn.execute("DELETE FROM combination_results WHERE domain_id = ?", (domain.id,)) self.conn.execute("DELETE FROM pipeline_runs WHERE domain_id = ?", (domain.id,)) + # Delete orphaned combos (no results left in any domain) and all their + # related rows — scores, entity links — so FK constraints don't block. 
+ orphan_sql = """SELECT c.id FROM combinations c + WHERE c.id NOT IN ( + SELECT DISTINCT combination_id FROM combination_results + )""" + self.conn.execute( + f"DELETE FROM combination_scores WHERE combination_id IN ({orphan_sql})" + ) + self.conn.execute( + f"DELETE FROM combination_entities WHERE combination_id IN ({orphan_sql})" + ) + self.conn.execute( + f"DELETE FROM combinations WHERE id IN ({orphan_sql})" + ) self.conn.commit() return count @@ -371,12 +404,15 @@ class Repository: self, combo_id: int, status: str, block_reason: str | None = None ) -> None: # Don't downgrade from higher pass states — preserves human/LLM review data - if status in ("scored", "llm_reviewed"): + if status in ("scored", "llm_reviewed") or status.endswith("_fail"): row = self.conn.execute( "SELECT status FROM combinations WHERE id = ?", (combo_id,) ).fetchone() if row: cur = row["status"] + # Fail statuses should not overwrite llm_reviewed or reviewed + if status.endswith("_fail") and cur in ("llm_reviewed", "reviewed"): + return if status == "scored" and cur in ("llm_reviewed", "reviewed"): return if status == "llm_reviewed" and cur == "reviewed": @@ -550,12 +586,12 @@ class Repository: ).fetchone() if not row or row["total"] == 0: return None - blocked = self.conn.execute( + failed = self.conn.execute( """SELECT COUNT(*) as cnt FROM combinations c JOIN combination_results cr ON cr.combination_id = c.id JOIN domains d ON cr.domain_id = d.id - WHERE c.status = 'blocked' AND d.name = ?""", + WHERE c.status LIKE '%\\_fail' ESCAPE '\\' AND d.name = ?""", (domain_name,), ).fetchone() return { @@ -564,7 +600,7 @@ class Repository: "max_score": row["max_score"], "min_score": row["min_score"], "last_pass": row["last_pass"], - "blocked": blocked["cnt"] if blocked else 0, + "failed": failed["cnt"] if failed else 0, } def get_result(self, combo_id: int, domain_id: int) -> dict | None: diff --git a/src/physcom/db/schema.py b/src/physcom/db/schema.py index 5e637bb..9bc5eee 100644 --- 
a/src/physcom/db/schema.py +++ b/src/physcom/db/schema.py @@ -44,12 +44,13 @@ CREATE TABLE IF NOT EXISTS metrics ( ); CREATE TABLE IF NOT EXISTS domain_metric_weights ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - domain_id INTEGER NOT NULL REFERENCES domains(id), - metric_id INTEGER NOT NULL REFERENCES metrics(id), - weight REAL NOT NULL, - norm_min REAL, - norm_max REAL, + id INTEGER PRIMARY KEY AUTOINCREMENT, + domain_id INTEGER NOT NULL REFERENCES domains(id), + metric_id INTEGER NOT NULL REFERENCES metrics(id), + weight REAL NOT NULL, + norm_min REAL, + norm_max REAL, + lower_is_better INTEGER NOT NULL DEFAULT 0, UNIQUE(domain_id, metric_id) ); @@ -117,6 +118,23 @@ CREATE INDEX IF NOT EXISTS idx_pipeline_runs_domain ON pipeline_runs(domain_id); """ +def _migrate(conn: sqlite3.Connection) -> None: + """Apply incremental migrations for existing databases.""" + cols = {r[1] for r in conn.execute("PRAGMA table_info(domain_metric_weights)").fetchall()} + if "lower_is_better" not in cols: + conn.execute( + "ALTER TABLE domain_metric_weights ADD COLUMN lower_is_better INTEGER NOT NULL DEFAULT 0" + ) + + # Backfill: cost_efficiency is lower-is-better in all domains + conn.execute( + """UPDATE domain_metric_weights SET lower_is_better = 1 + WHERE lower_is_better = 0 + AND metric_id IN (SELECT id FROM metrics WHERE name = 'cost_efficiency')""" + ) + conn.commit() + + def init_db(db_path: str | Path) -> sqlite3.Connection: """Create/open the database and ensure all tables exist.""" db_path = Path(db_path) @@ -125,5 +143,6 @@ def init_db(db_path: str | Path) -> sqlite3.Connection: conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA foreign_keys=ON") conn.executescript(DDL) + _migrate(conn) conn.commit() return conn diff --git a/src/physcom/engine/constraint_resolver.py b/src/physcom/engine/constraint_resolver.py index 77fa929..6eaef07 100644 --- a/src/physcom/engine/constraint_resolver.py +++ b/src/physcom/engine/constraint_resolver.py @@ -46,7 +46,7 @@ class 
ConstraintResolver: self._check_unmet_requirements(all_deps, result) if result.violations: - result.status = "blocked" + result.status = "p1_fail" elif result.warnings: result.status = "conditional" diff --git a/src/physcom/engine/pipeline.py b/src/physcom/engine/pipeline.py index 3a4b431..296a73e 100644 --- a/src/physcom/engine/pipeline.py +++ b/src/physcom/engine/pipeline.py @@ -21,12 +21,15 @@ class PipelineResult: total_generated: int = 0 pass1_valid: int = 0 - pass1_blocked: int = 0 + pass1_failed: int = 0 pass1_conditional: int = 0 pass2_estimated: int = 0 + pass2_failed: int = 0 pass3_scored: int = 0 pass3_above_threshold: int = 0 + pass3_failed: int = 0 pass4_reviewed: int = 0 + pass4_failed: int = 0 pass5_human_reviewed: int = 0 top_results: list[dict] = field(default_factory=list) @@ -77,7 +80,7 @@ class Pipeline: run_id, combos_pass1=result.pass1_valid + result.pass1_conditional - + result.pass1_blocked, + + result.pass1_failed, combos_pass2=result.pass2_estimated, combos_pass3=result.pass3_scored, combos_pass4=result.pass4_reviewed, @@ -142,22 +145,22 @@ class Pipeline: # ── Pass 1: Constraint Resolution ──────────────── if 1 in passes and existing_pass < 1: cr: ConstraintResult = self.resolver.resolve(combo) - if cr.status == "blocked": - combo.status = "blocked" + if cr.status == "p1_fail": + combo.status = "p1_fail" combo.block_reason = "; ".join(cr.violations) self.repo.update_combination_status( - combo.id, "blocked", combo.block_reason + combo.id, "p1_fail", combo.block_reason ) - # Save a result row so blocked combos appear in results + # Save a result row so failed combos appear in results self.repo.save_result( combo.id, domain.id, composite_score=0.0, pass_reached=1, ) - result.pass1_blocked += 1 + result.pass1_failed += 1 self._update_run_counters(run_id, result, current_pass=1) - continue # blocked — skip remaining passes + continue # p1_fail — skip remaining passes else: combo.status = "valid" self.repo.update_combination_status(combo.id, 
"valid") @@ -168,16 +171,16 @@ class Pipeline: self._update_run_counters(run_id, result, current_pass=1) elif 1 in passes: - # Already pass1'd — check if it was blocked - if combo.status == "blocked": - result.pass1_blocked += 1 + # Already pass1'd — check if it failed + if combo.status.endswith("_fail"): + result.pass1_failed += 1 continue else: result.pass1_valid += 1 else: - # Pass 1 not requested; check if blocked from a prior run - if combo.status == "blocked": - result.pass1_blocked += 1 + # Pass 1 not requested; check if failed from a prior run + if combo.status.endswith("_fail"): + result.pass1_failed += 1 continue # ── Pass 2: Physics Estimation ─────────────────── @@ -207,6 +210,21 @@ class Pipeline: combo.id, domain.id, estimate_dicts ) + # Check for all-zero estimates → p2_fail + if raw_metrics and all(v == 0.0 for v in raw_metrics.values()): + combo.status = "p2_fail" + combo.block_reason = "All metric estimates are zero" + self.repo.update_combination_status( + combo.id, "p2_fail", combo.block_reason + ) + self.repo.save_result( + combo.id, domain.id, + composite_score=0.0, pass_reached=2, + ) + result.pass2_failed += 1 + self._update_run_counters(run_id, result, current_pass=2) + continue + result.pass2_estimated += 1 self._update_run_counters(run_id, result, current_pass=2) elif 2 in passes: @@ -249,6 +267,26 @@ class Pipeline: existing_result["human_notes"] if existing_result else None ) + if sr.composite_score < score_threshold: + self.repo.save_result( + combo.id, domain.id, + sr.composite_score, pass_reached=3, + novelty_flag=novelty_flag, + human_notes=human_notes, + ) + combo.status = "p3_fail" + combo.block_reason = ( + f"Composite score {sr.composite_score:.4f} " + f"below threshold {score_threshold}" + ) + self.repo.update_combination_status( + combo.id, "p3_fail", combo.block_reason + ) + result.pass3_failed += 1 + result.pass3_scored += 1 + self._update_run_counters(run_id, result, current_pass=3) + continue + self.repo.save_result( 
combo.id, domain.id, @@ -260,8 +298,7 @@ class Pipeline: self.repo.update_combination_status(combo.id, "scored") result.pass3_scored += 1 - if sr.composite_score >= score_threshold: - result.pass3_above_threshold += 1 + result.pass3_above_threshold += 1 self._update_run_counters(run_id, result, current_pass=3) elif 3 in passes and existing_pass >= 3: @@ -294,33 +331,49 @@ class Pipeline: for s in db_scores if s["normalized_score"] is not None } - review: str | None = None + review_result: tuple[str, bool] | None = None try: - review = self.llm.review_plausibility( + review_result = self.llm.review_plausibility( description, score_dict ) except LLMRateLimitError as exc: self._wait_for_rate_limit(run_id, exc.retry_after) try: - review = self.llm.review_plausibility( + review_result = self.llm.review_plausibility( description, score_dict ) except LLMRateLimitError: pass # still limited; skip, retry next run - if review is not None: - self.repo.save_result( - combo.id, - domain.id, - cur_result["composite_score"], - pass_reached=4, - novelty_flag=cur_result.get("novelty_flag"), - llm_review=review, - human_notes=cur_result.get("human_notes"), - ) - self.repo.update_combination_status( - combo.id, "llm_reviewed" - ) - result.pass4_reviewed += 1 + if review_result is not None: + review_text, plausible = review_result + if not plausible: + self.repo.save_result( + combo.id, domain.id, + cur_result["composite_score"], + pass_reached=4, + novelty_flag=cur_result.get("novelty_flag"), + llm_review=review_text, + human_notes=cur_result.get("human_notes"), + ) + combo.status = "p4_fail" + combo.block_reason = "LLM deemed implausible" + self.repo.update_combination_status( + combo.id, "p4_fail", combo.block_reason + ) + result.pass4_failed += 1 + else: + self.repo.save_result( + combo.id, domain.id, + cur_result["composite_score"], + pass_reached=4, + novelty_flag=cur_result.get("novelty_flag"), + llm_review=review_text, + human_notes=cur_result.get("human_notes"), + ) + 
self.repo.update_combination_status( + combo.id, "llm_reviewed" + ) + result.pass4_reviewed += 1 self._update_run_counters( run_id, result, current_pass=4 ) diff --git a/src/physcom/engine/scorer.py b/src/physcom/engine/scorer.py index d9449ad..6727dce 100644 --- a/src/physcom/engine/scorer.py +++ b/src/physcom/engine/scorer.py @@ -69,6 +69,8 @@ class Scorer: for mb in self.domain.metric_bounds: raw = raw_metrics.get(mb.metric_name, 0.0) normed = normalize(raw, mb.norm_min, mb.norm_max) + if mb.lower_is_better: + normed = 1.0 - normed scores.append(Score( metric_name=mb.metric_name, raw_value=raw, diff --git a/src/physcom/llm/base.py b/src/physcom/llm/base.py index 2762134..ee072de 100644 --- a/src/physcom/llm/base.py +++ b/src/physcom/llm/base.py @@ -31,7 +31,7 @@ class LLMProvider(ABC): @abstractmethod def review_plausibility( self, combination_description: str, scores: dict[str, float] - ) -> str: - """Given a combination and its scores, return a natural-language - plausibility and novelty assessment.""" + ) -> tuple[str, bool]: + """Given a combination and its scores, return a (text, is_plausible) + tuple: natural-language assessment and whether the concept is plausible.""" ... diff --git a/src/physcom/llm/prompts.py b/src/physcom/llm/prompts.py index 68e679f..b7e03de 100644 --- a/src/physcom/llm/prompts.py +++ b/src/physcom/llm/prompts.py @@ -33,5 +33,8 @@ Review this concept for: 3. Novelty — does anything similar already exist? 4. Overall plausibility — is this a genuinely interesting innovation or nonsense? -Provide a concise 2-4 sentence assessment. 
+Provide a concise 2-4 sentence assessment, then on a final line write exactly: +VERDICT: PLAUSIBLE +or +VERDICT: IMPLAUSIBLE """ diff --git a/src/physcom/llm/providers/gemini.py b/src/physcom/llm/providers/gemini.py index 2b46575..ac1f9d5 100644 --- a/src/physcom/llm/providers/gemini.py +++ b/src/physcom/llm/providers/gemini.py @@ -42,7 +42,7 @@ class GeminiLLMProvider(LLMProvider): def review_plausibility( self, combination_description: str, scores: dict[str, float] - ) -> str: + ) -> tuple[str, bool]: scores_str = "\n".join(f"- {k}: {v:.3f}" for k, v in scores.items()) prompt = PLAUSIBILITY_REVIEW_PROMPT.format( description=combination_description, @@ -56,7 +56,16 @@ class GeminiLLMProvider(LLMProvider): if "429" in str(exc) or "RESOURCE_EXHAUSTED" in str(exc): raise LLMRateLimitError(str(exc), self._parse_retry_after(exc)) from exc raise - return response.text.strip() + text = response.text.strip() + plausible = self._parse_verdict(text) + return (text, plausible) + + def _parse_verdict(self, text: str) -> bool: + """Extract VERDICT: PLAUSIBLE/IMPLAUSIBLE from response; default to True.""" + m = re.search(r"VERDICT:\s*(PLAUSIBLE|IMPLAUSIBLE)", text, re.IGNORECASE) + if m: + return m.group(1).upper() == "PLAUSIBLE" + return True def _parse_retry_after(self, exc: Exception) -> int: """Extract retry delay from the error message, with a safe default.""" diff --git a/src/physcom/llm/providers/mock.py b/src/physcom/llm/providers/mock.py index 50a2f26..bb3d8b0 100644 --- a/src/physcom/llm/providers/mock.py +++ b/src/physcom/llm/providers/mock.py @@ -21,8 +21,8 @@ class MockLLMProvider(LLMProvider): def review_plausibility( self, combination_description: str, scores: dict[str, float] - ) -> str: + ) -> tuple[str, bool]: avg = sum(scores.values()) / max(len(scores), 1) if avg > 0.5: - return "This concept appears plausible and worth further investigation." - return "This concept has significant feasibility challenges." 
+ return ("This concept appears plausible and worth further investigation.", True) + return ("This concept has significant feasibility challenges.", False) diff --git a/src/physcom/models/combination.py b/src/physcom/models/combination.py index dd04e15..7785507 100644 --- a/src/physcom/models/combination.py +++ b/src/physcom/models/combination.py @@ -39,7 +39,7 @@ class Combination: """A generated combination of entities (one per dimension).""" entities: list[Entity] = field(default_factory=list) - status: str = "pending" # pending → valid/blocked → scored → reviewed + status: str = "pending" # pending → valid/p1_fail/p2_fail/p3_fail/p4_fail → scored → reviewed block_reason: str | None = None hash: str | None = None id: int | None = None diff --git a/src/physcom/models/domain.py b/src/physcom/models/domain.py index 347fc79..d02670f 100644 --- a/src/physcom/models/domain.py +++ b/src/physcom/models/domain.py @@ -11,9 +11,10 @@ class MetricBound: metric_name: str weight: float # 0.0–1.0 - norm_min: float # Below this → score 0 - norm_max: float # Above this → score 1 + norm_min: float # Below this → score 0 (or 1 if lower_is_better) + norm_max: float # Above this → score 1 (or 0 if lower_is_better) unit: str = "" + lower_is_better: bool = False # Invert scale (e.g., cost: lower = better) metric_id: int | None = None diff --git a/src/physcom/seed/transport_example.py b/src/physcom/seed/transport_example.py index aea3ccf..339c29f 100644 --- a/src/physcom/seed/transport_example.py +++ b/src/physcom/seed/transport_example.py @@ -254,7 +254,7 @@ URBAN_COMMUTING = Domain( description="Daily travel within a city, 1-50km range", metric_bounds=[ MetricBound("speed", weight=0.25, norm_min=5, norm_max=120, unit="km/h"), - MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0, unit="$/km"), + MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0, unit="$/km", lower_is_better=True), MetricBound("safety", weight=0.25, norm_min=0.0, norm_max=1.0, 
unit="0-1"), MetricBound("availability", weight=0.15, norm_min=0.0, norm_max=1.0, unit="0-1"), MetricBound("range_fuel", weight=0.10, norm_min=5, norm_max=500, unit="km"), @@ -268,7 +268,7 @@ INTERPLANETARY = Domain( MetricBound("speed", weight=0.30, norm_min=1000, norm_max=300000, unit="km/s"), MetricBound("range_fuel", weight=0.30, norm_min=1e6, norm_max=1e10, unit="km"), MetricBound("safety", weight=0.20, norm_min=0.0, norm_max=1.0, unit="0-1"), - MetricBound("cost_efficiency", weight=0.10, norm_min=1e3, norm_max=1e9, unit="$/km"), + MetricBound("cost_efficiency", weight=0.10, norm_min=1e3, norm_max=1e9, unit="$/km", lower_is_better=True), MetricBound("range_degradation", weight=0.10, norm_min=100, norm_max=36500, unit="days"), ], ) @@ -302,8 +302,10 @@ def load_transport_seed(repo) -> dict: counts["domains"] += 1 except sqlite3.IntegrityError: pass - # Backfill metric units on existing DBs (ensure_metric is idempotent). + # Backfill metric units and lower_is_better on existing DBs. for mb in domain.metric_bounds: repo.ensure_metric(mb.metric_name, unit=mb.unit) + if mb.lower_is_better: + repo.backfill_lower_is_better(domain.name, mb.metric_name) return counts diff --git a/src/physcom_web/app.py b/src/physcom_web/app.py index b56fce3..5bb717e 100644 --- a/src/physcom_web/app.py +++ b/src/physcom_web/app.py @@ -2,6 +2,7 @@ from __future__ import annotations +import math import os import secrets from pathlib import Path @@ -53,10 +54,54 @@ def close_db(exc: BaseException | None = None) -> None: repo.conn.close() +_SI_PREFIXES = [ + (1e12, "T"), + (1e9, "G"), + (1e6, "M"), + (1e3, "k"), +] + + +def _si_format(value: object) -> str: + """Format a number with SI prefixes for readability. + + Handles string inputs (like dep.value) by trying float conversion first. + Non-numeric values are returned as-is. 
+ """ + if isinstance(value, str): + try: + num = float(value) + except (ValueError, TypeError): + return value + elif isinstance(value, (int, float)): + num = float(value) + else: + return str(value) + + if math.isnan(num) or math.isinf(num): + return str(value) + + abs_num = abs(num) + if abs_num < 1000: + # Small numbers: drop trailing zeros, cap at 4 significant figures + if num == int(num) and abs_num < 100: + return str(int(num)) + return f"{num:.4g}" + + for threshold, prefix in _SI_PREFIXES: + if abs_num >= threshold: + scaled = num / threshold + return f"{scaled:.4g}{prefix}" + + return f"{num:.4g}" + + def create_app() -> Flask: app = Flask(__name__) app.secret_key = _load_or_generate_secret_key() + app.jinja_env.filters["si"] = _si_format + app.teardown_appcontext(close_db) # Register blueprints diff --git a/src/physcom_web/routes/domains.py b/src/physcom_web/routes/domains.py index 6f8b326..e7bb5e5 100644 --- a/src/physcom_web/routes/domains.py +++ b/src/physcom_web/routes/domains.py @@ -80,7 +80,11 @@ def metric_add(domain_id: int): if not metric_name: flash("Metric name is required.", "error") else: - mb = MetricBound(metric_name=metric_name, weight=weight, norm_min=norm_min, norm_max=norm_max, unit=unit) + lower_is_better = bool(request.form.get("lower_is_better")) + mb = MetricBound( + metric_name=metric_name, weight=weight, norm_min=norm_min, + norm_max=norm_max, unit=unit, lower_is_better=lower_is_better, + ) repo.add_metric_bound(domain_id, mb) flash("Metric added.", "success") domain = repo.get_domain_by_id(domain_id) @@ -99,7 +103,8 @@ def metric_edit(domain_id: int, metric_id: int): domain = repo.get_domain_by_id(domain_id) return render_template("domains/_metrics_table.html", domain=domain) unit = request.form.get("unit", "").strip() - repo.update_metric_bound(domain_id, metric_id, weight, norm_min, norm_max, unit) + lower_is_better = bool(request.form.get("lower_is_better")) + repo.update_metric_bound(domain_id, metric_id, weight, norm_min, 
norm_max, unit, lower_is_better) flash("Metric updated.", "success") domain = repo.get_domain_by_id(domain_id) return render_template("domains/_metrics_table.html", domain=domain) diff --git a/src/physcom_web/routes/results.py b/src/physcom_web/routes/results.py index a8e141f..675e9c7 100644 --- a/src/physcom_web/routes/results.py +++ b/src/physcom_web/routes/results.py @@ -54,6 +54,9 @@ def result_detail(domain_name: str, combo_id: int): return redirect(url_for("results.results_domain", domain_name=domain_name)) result = repo.get_result(combo_id, domain.id) + if not result: + flash("No results for this combination in this domain.", "error") + return redirect(url_for("results.results_domain", domain_name=domain_name)) scores = repo.get_combination_scores(combo_id, domain.id) return render_template( diff --git a/src/physcom_web/static/style.css b/src/physcom_web/static/style.css index 9aa71be..2a1a7ce 100644 --- a/src/physcom_web/static/style.css +++ b/src/physcom_web/static/style.css @@ -54,6 +54,7 @@ h3 { font-size: 1rem; margin-bottom: 0.25rem; } grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); gap: 1rem; } +.card-grid > * { min-width: 0; overflow-x: auto; } /* ── Tables ──────────────────────────────────────────────── */ table { width: 100%; border-collapse: collapse; font-size: 0.9rem; } @@ -79,7 +80,10 @@ table.compact th, table.compact td { padding: 0.25rem 0.4rem; font-size: 0.85rem .badge-range_min, .badge-range_max { background: #fef3c7; color: #92400e; } .badge-excludes { background: #fee2e2; color: #991b1b; } .badge-valid { background: #dcfce7; color: #166534; } -.badge-blocked { background: #fee2e2; color: #991b1b; } +.badge-p1_fail { background: #fee2e2; color: #991b1b; } +.badge-p2_fail { background: #fee2e2; color: #991b1b; } +.badge-p3_fail { background: #fee2e2; color: #991b1b; } +.badge-p4_fail { background: #fee2e2; color: #991b1b; } .badge-scored { background: #dbeafe; color: #1e40af; } .badge-llm_reviewed { background: #e0f2fe; 
color: #0369a1; } .badge-reviewed { background: #f3e8ff; color: #6b21a8; } diff --git a/src/physcom_web/templates/domains/_metrics_table.html b/src/physcom_web/templates/domains/_metrics_table.html index f6f33cb..6d830b2 100644 --- a/src/physcom_web/templates/domains/_metrics_table.html +++ b/src/physcom_web/templates/domains/_metrics_table.html @@ -6,6 +6,7 @@ Weight Norm Min Norm Max + Direction @@ -15,8 +16,9 @@ {{ mb.metric_name }} {{ mb.unit or '—' }} {{ mb.weight }} - {{ mb.norm_min }} - {{ mb.norm_max }} + {{ mb.norm_min|si }} + {{ mb.norm_max|si }} + {{ '↓ lower' if mb.lower_is_better else '↑ higher' }} diff --git a/src/physcom_web/templates/domains/list.html b/src/physcom_web/templates/domains/list.html index ceba19a..49b01bc 100644 --- a/src/physcom_web/templates/domains/list.html +++ b/src/physcom_web/templates/domains/list.html @@ -16,7 +16,7 @@

{{ d.description }}

- + {% for mb in d.metric_bounds %} @@ -24,8 +24,9 @@ - - + + + {% endfor %} diff --git a/src/physcom_web/templates/entities/_dep_table.html b/src/physcom_web/templates/entities/_dep_table.html index 4bb3560..2e07544 100644 --- a/src/physcom_web/templates/entities/_dep_table.html +++ b/src/physcom_web/templates/entities/_dep_table.html @@ -14,7 +14,7 @@ - + diff --git a/src/physcom_web/templates/pipeline/run.html b/src/physcom_web/templates/pipeline/run.html index 4544876..d2e02fc 100644 --- a/src/physcom_web/templates/pipeline/run.html +++ b/src/physcom_web/templates/pipeline/run.html @@ -97,7 +97,7 @@ - + @@ -113,7 +113,7 @@ - + @@ -133,7 +133,7 @@

{{ d.name }} {{ d.description }}

Results
{{ s.total_results }} scored combinations
-
Blocked
{{ s.blocked }} combinations
+
Failed
{{ s.failed }} combinations
Score range
{{ "%.4f"|format(s.min_score) }} — {{ "%.4f"|format(s.max_score) }}
Avg score
{{ "%.4f"|format(s.avg_score) }}
Last pass
{{ s.last_pass }}
diff --git a/src/physcom_web/templates/results/detail.html b/src/physcom_web/templates/results/detail.html index ac39b89..7c7f2de 100644 --- a/src/physcom_web/templates/results/detail.html +++ b/src/physcom_web/templates/results/detail.html @@ -43,7 +43,7 @@ {% for dep in e.dependencies %}
- + {% endfor %} @@ -77,10 +77,10 @@ {% set unit = s.metric_unit or '' %} - + - + {% endfor %} diff --git a/src/physcom_web/templates/results/list.html b/src/physcom_web/templates/results/list.html index aacf4c2..e5c7a87 100644 --- a/src/physcom_web/templates/results/list.html +++ b/src/physcom_web/templates/results/list.html @@ -64,7 +64,7 @@
MetricUnitWeightNorm MinNorm Max
MetricUnitWeightNorm MinNorm MaxDirection
{{ mb.metric_name }} {{ mb.unit }} {{ mb.weight }}{{ mb.norm_min }}{{ mb.norm_max }}{{ mb.norm_min|si }}{{ mb.norm_max|si }}{{ '↓ lower' if mb.lower_is_better else '↑ higher' }}
{{ dep.category }} {{ dep.key }}{{ dep.value }}{{ dep.value|si }} {{ dep.unit or '—' }} {{ dep.constraint_type }} diff --git a/src/physcom_web/templates/pipeline/_run_status.html b/src/physcom_web/templates/pipeline/_run_status.html index b7d8d28..f988069 100644 --- a/src/physcom_web/templates/pipeline/_run_status.html +++ b/src/physcom_web/templates/pipeline/_run_status.html @@ -34,7 +34,7 @@ 1 — Constraints {{ run.combos_pass1 or 0 }} checked {%- if (run.combos_pass2 or 0) > 0 and (run.combos_pass1 or 0) > (run.combos_pass2 or 0) %}, - {{ (run.combos_pass1 or 0) - (run.combos_pass2 or 0) }} blocked + {{ (run.combos_pass1 or 0) - (run.combos_pass2 or 0) }} failed {%- endif -%}
Status Total P1 CheckedP1 BlockedP1 Failed P2 Estimated P3 Scored P4 Reviewed{{ run.status }} {{ run.total_combos or '—' }} {{ run.combos_pass1 or '—' }}{% if blocked %}{{ blocked }}{% else %}—{% endif %}{% if blocked %}{{ blocked }}{% else %}—{% endif %} {{ run.combos_pass2 or '—' }} {{ run.combos_pass3 or '—' }} {{ run.combos_pass4 or '—' }}
{{ dep.key }}{{ dep.value }}{{ ' ' + dep.unit if dep.unit else '' }}{{ dep.value|si }}{{ ' ' + dep.unit if dep.unit else '' }} {{ dep.constraint_type }}
{{ s.metric_name }}{{ "%.2f"|format(s.raw_value) if s.raw_value is not none else '—' }}{{ ' ' + unit if unit and s.raw_value is not none else '' }}{{ s.raw_value|si if s.raw_value is not none else '—' }}{{ ' ' + unit if unit and s.raw_value is not none else '' }} {%- if mb -%} - {{ "%.2f"|format(mb.norm_min) }} — {{ "%.2f"|format(mb.norm_max) }}{{ ' ' + unit if unit else '' }} + {{ mb.norm_min|si }} — {{ mb.norm_max|si }}{{ ' ' + unit if unit else '' }} {%- else -%} — {%- endif -%} @@ -88,22 +88,22 @@ {%- if mb and s.raw_value is not none -%} {%- if s.raw_value <= mb.norm_min -%} - at/below min + at/below min{{ ' (best)' if mb.lower_is_better else '' }} {%- elif s.raw_value >= mb.norm_max -%} - at/above max + at/above max{{ ' (worst)' if mb.lower_is_better else '' }} {%- else -%} {% set pct = ((s.raw_value - mb.norm_min) / (mb.norm_max - mb.norm_min) * 100) | int %}
- ~{{ pct }}% + ~{{ pct }}%{{ ' ↓' if mb.lower_is_better else '' }} {%- endif -%} {%- else -%} — {%- endif -%}
{{ "%.4f"|format(s.normalized_score) if s.normalized_score is not none else '—' }}{{ "%.0f%%"|format(mb.weight * 100) if mb else '—' }}{{ "%.0f%%"|format(mb.weight * 100) if mb else '—' }}{{ ' ↓' if mb and mb.lower_is_better else '' }}
{{ r.combination.entities|map(attribute='name')|join(' + ') }} {{ r.combination.status }} - {%- if r.combination.status == 'blocked' and r.combination.block_reason -%} + {%- if r.combination.status.endswith('_fail') and r.combination.block_reason -%} {{ r.combination.block_reason }} {%- elif r.novelty_flag -%} {{ r.novelty_flag }} diff --git a/tests/conftest.py b/tests/conftest.py index 2a1b2a4..6ad09da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -148,7 +148,7 @@ def urban_domain(): description="Daily city travel", metric_bounds=[ MetricBound("speed", weight=0.25, norm_min=5, norm_max=120), - MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0), + MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0, lower_is_better=True), MetricBound("safety", weight=0.25, norm_min=0.0, norm_max=1.0), MetricBound("availability", weight=0.15, norm_min=0.0, norm_max=1.0), MetricBound("range_fuel", weight=0.10, norm_min=5, norm_max=500), diff --git a/tests/test_constraint_resolver.py b/tests/test_constraint_resolver.py index 0daca05..f3d0cfa 100644 --- a/tests/test_constraint_resolver.py +++ b/tests/test_constraint_resolver.py @@ -10,7 +10,7 @@ def test_compatible_ground_combo(bicycle, human_pedalling): resolver = ConstraintResolver() combo = Combination(entities=[bicycle, human_pedalling]) result = resolver.resolve(combo) - assert result.status != "blocked", f"Unexpected block: {result.violations}" + assert result.status != "p1_fail", f"Unexpected block: {result.violations}" def test_solar_sail_blocks_with_walking(walking, solar_sail): @@ -18,7 +18,7 @@ def test_solar_sail_blocks_with_walking(walking, solar_sail): resolver = ConstraintResolver() combo = Combination(entities=[walking, solar_sail]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("mutually exclusive" in v for v in result.violations) @@ -37,7 +37,7 @@ def 
test_nuclear_reactor_blocks_with_bicycle(bicycle, nuclear_reactor): resolver = ConstraintResolver() combo = Combination(entities=[bicycle, nuclear_reactor]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("mass" in v.lower() for v in result.violations) @@ -58,7 +58,7 @@ def test_force_scale_mismatch_blocks(): resolver = ConstraintResolver() combo = Combination(entities=[platform, power]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("force deficit" in v for v in result.violations) @@ -80,7 +80,7 @@ def test_force_under_powered_warning(): combo = Combination(entities=[platform, power]) result = resolver.resolve(combo) # Under-powered but within 100x → warning, not block - assert result.status != "blocked" + assert result.status != "p1_fail" assert any("under-powered" in w for w in result.warnings) @@ -97,7 +97,7 @@ def test_requires_vs_excludes(): resolver = ConstraintResolver() combo = Combination(entities=[a, b]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("excludes" in v for v in result.violations) @@ -106,7 +106,7 @@ def test_ice_engine_blocks_with_spaceship(spaceship, ice_engine): resolver = ConstraintResolver() combo = Combination(entities=[spaceship, ice_engine]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("atmosphere" in v for v in result.violations) @@ -138,7 +138,7 @@ def test_energy_density_deficit_blocks(): resolver = ConstraintResolver() combo = Combination(entities=[platform, power]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("energy density deficit" in v for v in result.violations) @@ -159,7 +159,7 @@ def test_energy_density_under_density_warning(): resolver = ConstraintResolver() combo = 
Combination(entities=[platform, power]) result = resolver.resolve(combo) - assert result.status != "blocked" + assert result.status != "p1_fail" assert any("under-density" in w for w in result.warnings) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index d785821..50cd22d 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -16,8 +16,8 @@ def test_pass1_filters_impossible_combos(seeded_repo): result = pipeline.run(domain, ["platform", "power_source"], passes=[1]) assert result.total_generated == 81 - assert result.pass1_blocked > 0 - assert result.pass1_valid + result.pass1_conditional + result.pass1_blocked == 81 + assert result.pass1_failed > 0 + assert result.pass1_valid + result.pass1_conditional + result.pass1_failed == 81 def test_pass123_produces_scored_results(seeded_repo): diff --git a/tests/test_pipeline_async.py b/tests/test_pipeline_async.py index 4e0616e..cd58144 100644 --- a/tests/test_pipeline_async.py +++ b/tests/test_pipeline_async.py @@ -239,18 +239,18 @@ def test_blocked_combos_have_results(seeded_repo): score_threshold=0.01, passes=[1, 2, 3], ) - assert result.pass1_blocked > 0 + assert result.pass1_failed > 0 # All combos (blocked + scored) should have result rows all_results = repo.get_all_results(domain.name) total_with_results = len(all_results) # blocked combos get pass_reached=1 results, non-blocked get pass_reached=3 - assert total_with_results == result.pass1_blocked + result.pass3_scored + assert total_with_results == result.pass1_failed + result.pass3_scored - # Blocked combos should have pass_reached=1 and composite_score=0.0 - blocked_results = [r for r in all_results if r["combination"].status == "blocked"] - assert len(blocked_results) == result.pass1_blocked - for br in blocked_results: + # Failed combos should have pass_reached=1 and composite_score=0.0 + failed_results = [r for r in all_results if r["combination"].status == "p1_fail"] + assert len(failed_results) == result.pass1_failed + for br in 
failed_results: assert br["pass_reached"] == 1 assert br["composite_score"] == 0.0 @@ -291,12 +291,96 @@ def test_save_combination_loads_existing_status(seeded_repo): saved = repo.save_combination(combo) assert saved.status == "pending" - # Mark it blocked in DB - repo.update_combination_status(saved.id, "blocked", "test reason") + # Mark it p1_fail in DB + repo.update_combination_status(saved.id, "p1_fail", "test reason") - # Re-saving should pick up the blocked status + # Re-saving should pick up the p1_fail status combo2 = Combination(entities=entities) reloaded = repo.save_combination(combo2) assert reloaded.id == saved.id - assert reloaded.status == "blocked" + assert reloaded.status == "p1_fail" assert reloaded.block_reason == "test reason" + + +def test_p3_fail_below_threshold(seeded_repo): + """Combos scoring below threshold should get p3_fail status.""" + repo = seeded_repo + domain = repo.get_domain("urban_commuting") + + resolver = ConstraintResolver() + scorer = Scorer(domain) + pipeline = Pipeline(repo, resolver, scorer) + + # Use a very high threshold so most combos fail pass 3 + result = pipeline.run( + domain, ["platform", "power_source"], + score_threshold=0.99, passes=[1, 2, 3], + ) + + assert result.pass3_failed > 0 + # above_threshold should be much smaller than scored + assert result.pass3_above_threshold <= result.pass3_scored + + # p3_fail combos should exist in DB + p3_fail_combos = repo.list_combinations(status="p3_fail") + assert len(p3_fail_combos) == result.pass3_failed + for c in p3_fail_combos: + assert c.block_reason is not None + assert "below threshold" in c.block_reason + + +def test_p4_fail_implausible(seeded_repo): + """Combos deemed implausible by LLM should get p4_fail status.""" + from physcom.llm.providers.mock import MockLLMProvider + + repo = seeded_repo + domain = repo.get_domain("urban_commuting") + + resolver = ConstraintResolver() + scorer = Scorer(domain) + # Low estimates → normalized scores avg <= 0.5 → 
MockLLMProvider returns (text, False) + # Use threshold=0.0 so no combo gets p3_fail and all reach pass 4 + mock_llm = MockLLMProvider(default_estimates={ + "speed": 0.1, "cost_efficiency": 0.1, "safety": 0.1, + "availability": 0.1, "range_fuel": 0.1, + }) + pipeline = Pipeline(repo, resolver, scorer, llm=mock_llm) + + result = pipeline.run( + domain, ["platform", "power_source"], + score_threshold=0.0, passes=[1, 2, 3, 4], + ) + + # With low normalized scores (avg <= 0.5), reviewed combos should be p4_fail + assert result.pass4_failed > 0 + assert result.pass4_reviewed == 0 + + p4_fail_combos = repo.list_combinations(status="p4_fail") + assert len(p4_fail_combos) == result.pass4_failed + for c in p4_fail_combos: + assert c.block_reason == "LLM deemed implausible" + + +def test_p4_pass_plausible(seeded_repo): + """Combos deemed plausible by LLM should get llm_reviewed status.""" + from physcom.llm.providers.mock import MockLLMProvider + + repo = seeded_repo + domain = repo.get_domain("urban_commuting") + + resolver = ConstraintResolver() + scorer = Scorer(domain) + # High estimates → avg > 0.5 → MockLLMProvider returns (text, True) + mock_llm = MockLLMProvider(default_estimates={ + "speed": 50.0, "cost_efficiency": 0.5, "safety": 0.6, + "availability": 0.7, "range_fuel": 200.0, + }) + pipeline = Pipeline(repo, resolver, scorer, llm=mock_llm) + + result = pipeline.run( + domain, ["platform", "power_source"], + score_threshold=0.01, passes=[1, 2, 3, 4], + ) + + assert result.pass4_reviewed > 0 + assert result.pass4_failed == 0 diff --git a/tests/test_scorer.py b/tests/test_scorer.py index 0565f10..c728a5e 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -81,10 +81,34 @@ class TestScorer: assert len(result.scores) == 5 def test_scorer_zero_metric_kills_score(self, urban_domain): + """A zero on a higher-is-better metric should drive composite to 0.""" scorer = Scorer(urban_domain) combo = Combination(entities=[]) combo.id = 1 - raw = {"speed": 60.0, 
"cost_efficiency": 0.0, "safety": 0.7, + raw = {"speed": 60.0, "cost_efficiency": 0.5, "safety": 0.0, "availability": 0.8, "range_fuel": 400} result = scorer.score_combination(combo, raw) assert result.composite_score == 0.0 + + def test_lower_is_better_inverts_score(self, urban_domain): + """cost_efficiency is lower_is_better: low raw value should score high.""" + scorer = Scorer(urban_domain) + combo = Combination(entities=[]) + combo.id = 1 + # cost_efficiency: norm_min=0.01, norm_max=2.0, lower_is_better=True + # A low cost (0.02) should get a HIGH normalized score (near 1.0) + # A high cost (1.9) should get a LOW normalized score (near 0.0) + raw_cheap = {"speed": 60.0, "cost_efficiency": 0.02, "safety": 0.7, + "availability": 0.8, "range_fuel": 400} + raw_expensive = {"speed": 60.0, "cost_efficiency": 1.9, "safety": 0.7, + "availability": 0.8, "range_fuel": 400} + result_cheap = scorer.score_combination(combo, raw_cheap) + result_expensive = scorer.score_combination(combo, raw_expensive) + + # Find the cost_efficiency score in each + cost_cheap = next(s for s in result_cheap.scores if s.metric_name == "cost_efficiency") + cost_expensive = next(s for s in result_expensive.scores if s.metric_name == "cost_efficiency") + + assert cost_cheap.normalized_score > cost_expensive.normalized_score + assert cost_cheap.normalized_score > 0.9 # near the best + assert cost_expensive.normalized_score < 0.1 # near the worst