diff --git a/STATUS.md b/STATUS.md new file mode 100644 index 0000000..ece929f --- /dev/null +++ b/STATUS.md @@ -0,0 +1,140 @@ +# PhysCom — Project Status + +## Architecture Overview + +``` +src/ +├── physcom/ Core engine (CLI + SQLite + 5-pass pipeline) +│ ├── cli.py Click CLI: init, seed, entity, domain, run, results, review, export +│ ├── db/ +│ │ ├── schema.py DDL — 9 tables, 4 indexes +│ │ └── repository.py Data-access layer (24 methods) +│ ├── engine/ +│ │ ├── combinator.py Cartesian product generator +│ │ ├── constraint_resolver.py Pass 1 — requires/provides/excludes/range matching +│ │ ├── scorer.py Pass 3 — weighted geometric mean normalizer +│ │ └── pipeline.py Orchestrator for passes 1–5 +│ ├── llm/ +│ │ ├── base.py Abstract LLMProvider interface (2 methods) +│ │ ├── prompts.py Prompt templates for passes 2 and 4 +│ │ └── providers/ +│ │ └── mock.py Deterministic stub for tests +│ ├── models/ Dataclasses: Entity, Dependency, Domain, MetricBound, Combination, Score +│ └── seed/ +│ └── transport_example.py 9 platforms + 9 power sources + 2 domains +│ +├── physcom_web/ Flask web UI +│ ├── app.py App factory, per-request DB connection +│ ├── routes/ +│ │ ├── entities.py Entity + dependency CRUD (9 routes, HTMX) +│ │ ├── domains.py Domain listing (1 route) +│ │ ├── pipeline.py Run form + execution (2 routes) +│ │ └── results.py Browse, detail, human review (4 routes) +│ ├── templates/ Jinja2 + HTMX — 11 templates +│ └── static/style.css +│ +tests/ 37 passing tests +Dockerfile Single-stage Python 3.13-slim +docker-compose.yml web + cli services, shared volume +``` + +## What Works + +| Area | Status | Notes | +|------|--------|-------| +| Database schema | Done | 9 tables, WAL mode, foreign keys | +| Entity/dependency CRUD | Done | CLI + web UI | +| Domain + metric weights | Done | CLI seed + web read-only | +| Pass 1 — constraint resolution | Done | requires/provides/excludes/range logic | +| Pass 2 — physics estimation | Done | Stub heuristic 
(force/mass-based); LLM path exists but no real provider | | Pass 3 — scoring + ranking | Done | Weighted geometric mean with min/max normalization | | Pass 4 — LLM plausibility review | Wired | Pipeline calls `self.llm.review_plausibility()` when `llm` is not None; only MockLLMProvider exists | | Pass 5 — human review | Done | CLI interactive + web HTMX form | | Web UI | Done | Entity CRUD, domain view, pipeline run, results browse + review | | Docker | Done | Compose with web + cli services, named volume | | Tests | 37/37 passing | Repository, combinator, constraints, scorer, pipeline | + +## What's Missing + +### LLM provider — no real implementation yet + +The `LLMProvider` abstract class defines two methods: + +```python +class LLMProvider(ABC): + def estimate_physics(self, combination_description: str, metrics: list[str]) -> dict[str, float]: ... + def review_plausibility(self, combination_description: str, scores: dict[str, float]) -> tuple[str, bool]: ... +``` + +**Pass 2** (`estimate_physics`) — given a combination description like *"platform: Bicycle + power_source: Hydrogen Combustion Engine"*, return estimated metric values (speed, cost_efficiency, safety, etc.) as floats. + +**Pass 4** (`review_plausibility`) — given a combination description and its normalized scores, return a 2–4 sentence plausibility assessment together with a boolean plausible/implausible verdict. + +Prompt templates already exist in `src/physcom/llm/prompts.py`. The pipeline already checks `if self.llm:` and skips gracefully when None. + +### To enable LLM reviews, you need to: + +1. **Create a real provider** at `src/physcom/llm/providers/<provider>.py` that subclasses `LLMProvider`.
For example, an Anthropic provider: + + ```python + # src/physcom/llm/providers/anthropic.py + import json + from anthropic import Anthropic + from physcom.llm.base import LLMProvider + from physcom.llm.prompts import PHYSICS_ESTIMATION_PROMPT, PLAUSIBILITY_REVIEW_PROMPT + + class AnthropicProvider(LLMProvider): + def __init__(self, model: str = "claude-sonnet-4-20250514"): + self.client = Anthropic() # reads ANTHROPIC_API_KEY from env + self.model = model + + def estimate_physics(self, description: str, metrics: list[str]) -> dict[str, float]: + prompt = PHYSICS_ESTIMATION_PROMPT.format( + description=description, + metrics=", ".join(metrics), + ) + resp = self.client.messages.create( + model=self.model, + max_tokens=256, + messages=[{"role": "user", "content": prompt}], + ) + return json.loads(resp.content[0].text) + + def review_plausibility(self, description: str, scores: dict[str, float]) -> tuple[str, bool]: + prompt = PLAUSIBILITY_REVIEW_PROMPT.format( + description=description, + scores=json.dumps(scores, indent=2), + ) + resp = self.client.messages.create( + model=self.model, + max_tokens=512, + messages=[{"role": "user", "content": prompt}], + ) + text = resp.content[0].text + # Prompt asks for a final "VERDICT: PLAUSIBLE/IMPLAUSIBLE" line + return (text, "VERDICT: IMPLAUSIBLE" not in text.upper()) + ``` + +2. **Add the dependency** to `pyproject.toml`: + ```toml + [project.optional-dependencies] + llm = ["anthropic>=0.40"] + ``` + +3. **Wire it into the CLI** — in `cli.py`'s `run` command, instantiate the provider when an `--llm` flag is passed and include pass 4 in the pass list. + +4. **Wire it into the web UI** — in `routes/pipeline.py`, same logic: read a config flag or env var (`PHYSCOM_LLM_PROVIDER`), instantiate the provider, pass it to `Pipeline(...)`. + +5. **Set the API key** — `ANTHROPIC_API_KEY` env var (or equivalent for your chosen provider).
In Docker, add it to `docker-compose.yml`: + ```yaml + environment: + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + ``` + +The same pattern works for OpenAI, Databricks, or any other provider — just subclass `LLMProvider` and implement the two methods. + +### Other future work + +- **Domain creation via web UI** — currently seed-only +- **Database upgrade** — SQLite → Postgres (docker-compose has a commented placeholder) +- **Async pipeline runs** — currently synchronous; fine for 81 combos, may need background tasks at scale +- **Export from web UI** — currently CLI-only (`physcom export`) +- **Authentication** — no auth on the web UI diff --git a/src/physcom/cli.py b/src/physcom/cli.py index 4efa3db..ce868c2 100644 --- a/src/physcom/cli.py +++ b/src/physcom/cli.py @@ -146,13 +146,16 @@ def run(ctx, domain_name, passes, threshold, dimensions): click.echo(f" Total combinations: {result.total_generated}") click.echo(f" Pass 1 — valid: {result.pass1_valid}, " f"conditional: {result.pass1_conditional}, " - f"blocked: {result.pass1_blocked}") + f"failed: {result.pass1_failed}") if 2 in pass_list: - click.echo(f" Pass 2 — estimated: {result.pass2_estimated}") + click.echo(f" Pass 2 — estimated: {result.pass2_estimated}, " + f"failed: {result.pass2_failed}") if 3 in pass_list: - click.echo(f" Pass 3 — above threshold: {result.pass3_above_threshold}") + click.echo(f" Pass 3 — above threshold: {result.pass3_above_threshold}, " + f"failed: {result.pass3_failed}") if 4 in pass_list: - click.echo(f" Pass 4 — LLM reviewed: {result.pass4_reviewed}") + click.echo(f" Pass 4 — LLM reviewed: {result.pass4_reviewed}, " + f"failed: {result.pass4_failed}") @main.command() diff --git a/src/physcom/db/repository.py b/src/physcom/db/repository.py index f23a243..1a33af0 100644 --- a/src/physcom/db/repository.py +++ b/src/physcom/db/repository.py @@ -193,6 +193,17 @@ class Repository: self.conn.commit() return row["id"] + def backfill_lower_is_better(self, domain_name: str, metric_name: str) 
-> None: + """Set lower_is_better=1 for an existing domain-metric row that still has the default 0.""" + self.conn.execute( + """UPDATE domain_metric_weights SET lower_is_better = 1 + WHERE lower_is_better = 0 + AND domain_id = (SELECT id FROM domains WHERE name = ?) + AND metric_id = (SELECT id FROM metrics WHERE name = ?)""", + (domain_name, metric_name), + ) + self.conn.commit() + def add_domain(self, domain: Domain) -> Domain: cur = self.conn.execute( "INSERT INTO domains (name, description) VALUES (?, ?)", @@ -204,9 +215,10 @@ class Repository: mb.metric_id = metric_id self.conn.execute( """INSERT INTO domain_metric_weights - (domain_id, metric_id, weight, norm_min, norm_max) - VALUES (?, ?, ?, ?, ?)""", - (domain.id, metric_id, mb.weight, mb.norm_min, mb.norm_max), + (domain_id, metric_id, weight, norm_min, norm_max, lower_is_better) + VALUES (?, ?, ?, ?, ?, ?)""", + (domain.id, metric_id, mb.weight, mb.norm_min, mb.norm_max, + int(mb.lower_is_better)), ) self.conn.commit() return domain @@ -216,7 +228,8 @@ class Repository: if not row: return None weights = self.conn.execute( - """SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, dmw.metric_id + """SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, + dmw.metric_id, dmw.lower_is_better FROM domain_metric_weights dmw JOIN metrics m ON dmw.metric_id = m.id WHERE dmw.domain_id = ?""", @@ -231,6 +244,7 @@ class Repository: metric_name=w["name"], weight=w["weight"], norm_min=w["norm_min"], norm_max=w["norm_max"], metric_id=w["metric_id"], unit=w["unit"] or "", + lower_is_better=bool(w["lower_is_better"]), ) for w in weights ], @@ -245,7 +259,8 @@ class Repository: if not row: return None weights = self.conn.execute( - """SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, dmw.metric_id + """SELECT m.name, m.unit, dmw.weight, dmw.norm_min, dmw.norm_max, + dmw.metric_id, dmw.lower_is_better FROM domain_metric_weights dmw JOIN metrics m ON dmw.metric_id = m.id WHERE dmw.domain_id = 
?""", @@ -260,6 +275,7 @@ class Repository: metric_name=w["name"], weight=w["weight"], norm_min=w["norm_min"], norm_max=w["norm_max"], metric_id=w["metric_id"], unit=w["unit"] or "", + lower_is_better=bool(w["lower_is_better"]), ) for w in weights ], @@ -277,21 +293,23 @@ class Repository: mb.metric_id = metric_id self.conn.execute( """INSERT OR REPLACE INTO domain_metric_weights - (domain_id, metric_id, weight, norm_min, norm_max) - VALUES (?, ?, ?, ?, ?)""", - (domain_id, metric_id, mb.weight, mb.norm_min, mb.norm_max), + (domain_id, metric_id, weight, norm_min, norm_max, lower_is_better) + VALUES (?, ?, ?, ?, ?, ?)""", + (domain_id, metric_id, mb.weight, mb.norm_min, mb.norm_max, + int(mb.lower_is_better)), ) self.conn.commit() return mb def update_metric_bound( - self, domain_id: int, metric_id: int, weight: float, norm_min: float, norm_max: float, unit: str + self, domain_id: int, metric_id: int, weight: float, norm_min: float, norm_max: float, + unit: str, lower_is_better: bool = False, ) -> None: self.conn.execute( """UPDATE domain_metric_weights - SET weight = ?, norm_min = ?, norm_max = ? + SET weight = ?, norm_min = ?, norm_max = ?, lower_is_better = ? WHERE domain_id = ? AND metric_id = ?""", - (weight, norm_min, norm_max, domain_id, metric_id), + (weight, norm_min, norm_max, int(lower_is_better), domain_id, metric_id), ) if unit: self.conn.execute( @@ -330,6 +348,21 @@ class Repository: self.conn.execute("DELETE FROM combination_scores WHERE domain_id = ?", (domain.id,)) self.conn.execute("DELETE FROM combination_results WHERE domain_id = ?", (domain.id,)) self.conn.execute("DELETE FROM pipeline_runs WHERE domain_id = ?", (domain.id,)) + # Delete orphaned combos (no results left in any domain) and all their + # related rows — scores, entity links — so FK constraints don't block. 
+ orphan_sql = """SELECT c.id FROM combinations c + WHERE c.id NOT IN ( + SELECT DISTINCT combination_id FROM combination_results + )""" + self.conn.execute( + f"DELETE FROM combination_scores WHERE combination_id IN ({orphan_sql})" + ) + self.conn.execute( + f"DELETE FROM combination_entities WHERE combination_id IN ({orphan_sql})" + ) + self.conn.execute( + f"DELETE FROM combinations WHERE id IN ({orphan_sql})" + ) self.conn.commit() return count @@ -371,12 +404,15 @@ class Repository: self, combo_id: int, status: str, block_reason: str | None = None ) -> None: # Don't downgrade from higher pass states — preserves human/LLM review data - if status in ("scored", "llm_reviewed"): + if status in ("scored", "llm_reviewed") or status.endswith("_fail"): row = self.conn.execute( "SELECT status FROM combinations WHERE id = ?", (combo_id,) ).fetchone() if row: cur = row["status"] + # Fail statuses should not overwrite llm_reviewed or reviewed + if status.endswith("_fail") and cur in ("llm_reviewed", "reviewed"): + return if status == "scored" and cur in ("llm_reviewed", "reviewed"): return if status == "llm_reviewed" and cur == "reviewed": @@ -550,12 +586,12 @@ class Repository: ).fetchone() if not row or row["total"] == 0: return None - blocked = self.conn.execute( + failed = self.conn.execute( """SELECT COUNT(*) as cnt FROM combinations c JOIN combination_results cr ON cr.combination_id = c.id JOIN domains d ON cr.domain_id = d.id - WHERE c.status = 'blocked' AND d.name = ?""", + WHERE c.status LIKE '%\\_fail' ESCAPE '\\' AND d.name = ?""", (domain_name,), ).fetchone() return { @@ -564,7 +600,7 @@ class Repository: "max_score": row["max_score"], "min_score": row["min_score"], "last_pass": row["last_pass"], - "blocked": blocked["cnt"] if blocked else 0, + "failed": failed["cnt"] if failed else 0, } def get_result(self, combo_id: int, domain_id: int) -> dict | None: diff --git a/src/physcom/db/schema.py b/src/physcom/db/schema.py index 5e637bb..9bc5eee 100644 --- 
a/src/physcom/db/schema.py +++ b/src/physcom/db/schema.py @@ -44,12 +44,13 @@ CREATE TABLE IF NOT EXISTS metrics ( ); CREATE TABLE IF NOT EXISTS domain_metric_weights ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - domain_id INTEGER NOT NULL REFERENCES domains(id), - metric_id INTEGER NOT NULL REFERENCES metrics(id), - weight REAL NOT NULL, - norm_min REAL, - norm_max REAL, + id INTEGER PRIMARY KEY AUTOINCREMENT, + domain_id INTEGER NOT NULL REFERENCES domains(id), + metric_id INTEGER NOT NULL REFERENCES metrics(id), + weight REAL NOT NULL, + norm_min REAL, + norm_max REAL, + lower_is_better INTEGER NOT NULL DEFAULT 0, UNIQUE(domain_id, metric_id) ); @@ -117,6 +118,23 @@ CREATE INDEX IF NOT EXISTS idx_pipeline_runs_domain ON pipeline_runs(domain_id); """ +def _migrate(conn: sqlite3.Connection) -> None: + """Apply incremental migrations for existing databases.""" + cols = {r[1] for r in conn.execute("PRAGMA table_info(domain_metric_weights)").fetchall()} + if "lower_is_better" not in cols: + conn.execute( + "ALTER TABLE domain_metric_weights ADD COLUMN lower_is_better INTEGER NOT NULL DEFAULT 0" + ) + + # Backfill: cost_efficiency is lower-is-better in all domains + conn.execute( + """UPDATE domain_metric_weights SET lower_is_better = 1 + WHERE lower_is_better = 0 + AND metric_id IN (SELECT id FROM metrics WHERE name = 'cost_efficiency')""" + ) + conn.commit() + + def init_db(db_path: str | Path) -> sqlite3.Connection: """Create/open the database and ensure all tables exist.""" db_path = Path(db_path) @@ -125,5 +143,6 @@ def init_db(db_path: str | Path) -> sqlite3.Connection: conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA foreign_keys=ON") conn.executescript(DDL) + _migrate(conn) conn.commit() return conn diff --git a/src/physcom/engine/constraint_resolver.py b/src/physcom/engine/constraint_resolver.py index 77fa929..6eaef07 100644 --- a/src/physcom/engine/constraint_resolver.py +++ b/src/physcom/engine/constraint_resolver.py @@ -46,7 +46,7 @@ class 
ConstraintResolver: self._check_unmet_requirements(all_deps, result) if result.violations: - result.status = "blocked" + result.status = "p1_fail" elif result.warnings: result.status = "conditional" diff --git a/src/physcom/engine/pipeline.py b/src/physcom/engine/pipeline.py index 3a4b431..296a73e 100644 --- a/src/physcom/engine/pipeline.py +++ b/src/physcom/engine/pipeline.py @@ -21,12 +21,15 @@ class PipelineResult: total_generated: int = 0 pass1_valid: int = 0 - pass1_blocked: int = 0 + pass1_failed: int = 0 pass1_conditional: int = 0 pass2_estimated: int = 0 + pass2_failed: int = 0 pass3_scored: int = 0 pass3_above_threshold: int = 0 + pass3_failed: int = 0 pass4_reviewed: int = 0 + pass4_failed: int = 0 pass5_human_reviewed: int = 0 top_results: list[dict] = field(default_factory=list) @@ -77,7 +80,7 @@ class Pipeline: run_id, combos_pass1=result.pass1_valid + result.pass1_conditional - + result.pass1_blocked, + + result.pass1_failed, combos_pass2=result.pass2_estimated, combos_pass3=result.pass3_scored, combos_pass4=result.pass4_reviewed, @@ -142,22 +145,22 @@ class Pipeline: # ── Pass 1: Constraint Resolution ──────────────── if 1 in passes and existing_pass < 1: cr: ConstraintResult = self.resolver.resolve(combo) - if cr.status == "blocked": - combo.status = "blocked" + if cr.status == "p1_fail": + combo.status = "p1_fail" combo.block_reason = "; ".join(cr.violations) self.repo.update_combination_status( - combo.id, "blocked", combo.block_reason + combo.id, "p1_fail", combo.block_reason ) - # Save a result row so blocked combos appear in results + # Save a result row so failed combos appear in results self.repo.save_result( combo.id, domain.id, composite_score=0.0, pass_reached=1, ) - result.pass1_blocked += 1 + result.pass1_failed += 1 self._update_run_counters(run_id, result, current_pass=1) - continue # blocked — skip remaining passes + continue # p1_fail — skip remaining passes else: combo.status = "valid" self.repo.update_combination_status(combo.id, 
"valid") @@ -168,16 +171,16 @@ class Pipeline: self._update_run_counters(run_id, result, current_pass=1) elif 1 in passes: - # Already pass1'd — check if it was blocked - if combo.status == "blocked": - result.pass1_blocked += 1 + # Already pass1'd — check if it failed + if combo.status.endswith("_fail"): + result.pass1_failed += 1 continue else: result.pass1_valid += 1 else: - # Pass 1 not requested; check if blocked from a prior run - if combo.status == "blocked": - result.pass1_blocked += 1 + # Pass 1 not requested; check if failed from a prior run + if combo.status.endswith("_fail"): + result.pass1_failed += 1 continue # ── Pass 2: Physics Estimation ─────────────────── @@ -207,6 +210,21 @@ class Pipeline: combo.id, domain.id, estimate_dicts ) + # Check for all-zero estimates → p2_fail + if raw_metrics and all(v == 0.0 for v in raw_metrics.values()): + combo.status = "p2_fail" + combo.block_reason = "All metric estimates are zero" + self.repo.update_combination_status( + combo.id, "p2_fail", combo.block_reason + ) + self.repo.save_result( + combo.id, domain.id, + composite_score=0.0, pass_reached=2, + ) + result.pass2_failed += 1 + self._update_run_counters(run_id, result, current_pass=2) + continue + result.pass2_estimated += 1 self._update_run_counters(run_id, result, current_pass=2) elif 2 in passes: @@ -249,6 +267,26 @@ class Pipeline: existing_result["human_notes"] if existing_result else None ) + if sr.composite_score < score_threshold: + self.repo.save_result( + combo.id, domain.id, + sr.composite_score, pass_reached=3, + novelty_flag=novelty_flag, + human_notes=human_notes, + ) + combo.status = "p3_fail" + combo.block_reason = ( + f"Composite score {sr.composite_score:.4f} " + f"below threshold {score_threshold}" + ) + self.repo.update_combination_status( + combo.id, "p3_fail", combo.block_reason + ) + result.pass3_failed += 1 + result.pass3_scored += 1 + self._update_run_counters(run_id, result, current_pass=3) + continue + self.repo.save_result( 
combo.id, domain.id, @@ -260,8 +298,7 @@ class Pipeline: self.repo.update_combination_status(combo.id, "scored") result.pass3_scored += 1 - if sr.composite_score >= score_threshold: - result.pass3_above_threshold += 1 + result.pass3_above_threshold += 1 self._update_run_counters(run_id, result, current_pass=3) elif 3 in passes and existing_pass >= 3: @@ -294,33 +331,49 @@ class Pipeline: for s in db_scores if s["normalized_score"] is not None } - review: str | None = None + review_result: tuple[str, bool] | None = None try: - review = self.llm.review_plausibility( + review_result = self.llm.review_plausibility( description, score_dict ) except LLMRateLimitError as exc: self._wait_for_rate_limit(run_id, exc.retry_after) try: - review = self.llm.review_plausibility( + review_result = self.llm.review_plausibility( description, score_dict ) except LLMRateLimitError: pass # still limited; skip, retry next run - if review is not None: - self.repo.save_result( - combo.id, - domain.id, - cur_result["composite_score"], - pass_reached=4, - novelty_flag=cur_result.get("novelty_flag"), - llm_review=review, - human_notes=cur_result.get("human_notes"), - ) - self.repo.update_combination_status( - combo.id, "llm_reviewed" - ) - result.pass4_reviewed += 1 + if review_result is not None: + review_text, plausible = review_result + if not plausible: + self.repo.save_result( + combo.id, domain.id, + cur_result["composite_score"], + pass_reached=4, + novelty_flag=cur_result.get("novelty_flag"), + llm_review=review_text, + human_notes=cur_result.get("human_notes"), + ) + combo.status = "p4_fail" + combo.block_reason = "LLM deemed implausible" + self.repo.update_combination_status( + combo.id, "p4_fail", combo.block_reason + ) + result.pass4_failed += 1 + else: + self.repo.save_result( + combo.id, domain.id, + cur_result["composite_score"], + pass_reached=4, + novelty_flag=cur_result.get("novelty_flag"), + llm_review=review_text, + human_notes=cur_result.get("human_notes"), + ) + 
self.repo.update_combination_status( + combo.id, "llm_reviewed" + ) + result.pass4_reviewed += 1 self._update_run_counters( run_id, result, current_pass=4 ) diff --git a/src/physcom/engine/scorer.py b/src/physcom/engine/scorer.py index d9449ad..6727dce 100644 --- a/src/physcom/engine/scorer.py +++ b/src/physcom/engine/scorer.py @@ -69,6 +69,8 @@ class Scorer: for mb in self.domain.metric_bounds: raw = raw_metrics.get(mb.metric_name, 0.0) normed = normalize(raw, mb.norm_min, mb.norm_max) + if mb.lower_is_better: + normed = 1.0 - normed scores.append(Score( metric_name=mb.metric_name, raw_value=raw, diff --git a/src/physcom/llm/base.py b/src/physcom/llm/base.py index 2762134..ee072de 100644 --- a/src/physcom/llm/base.py +++ b/src/physcom/llm/base.py @@ -31,7 +31,7 @@ class LLMProvider(ABC): @abstractmethod def review_plausibility( self, combination_description: str, scores: dict[str, float] - ) -> str: - """Given a combination and its scores, return a natural-language - plausibility and novelty assessment.""" + ) -> tuple[str, bool]: + """Given a combination and its scores, return a (text, is_plausible) + tuple: natural-language assessment and whether the concept is plausible.""" ... diff --git a/src/physcom/llm/prompts.py b/src/physcom/llm/prompts.py index 68e679f..b7e03de 100644 --- a/src/physcom/llm/prompts.py +++ b/src/physcom/llm/prompts.py @@ -33,5 +33,8 @@ Review this concept for: 3. Novelty — does anything similar already exist? 4. Overall plausibility — is this a genuinely interesting innovation or nonsense? -Provide a concise 2-4 sentence assessment. 
+Provide a concise 2-4 sentence assessment, then on a final line write exactly: +VERDICT: PLAUSIBLE +or +VERDICT: IMPLAUSIBLE """ diff --git a/src/physcom/llm/providers/gemini.py b/src/physcom/llm/providers/gemini.py index 2b46575..ac1f9d5 100644 --- a/src/physcom/llm/providers/gemini.py +++ b/src/physcom/llm/providers/gemini.py @@ -42,7 +42,7 @@ class GeminiLLMProvider(LLMProvider): def review_plausibility( self, combination_description: str, scores: dict[str, float] - ) -> str: + ) -> tuple[str, bool]: scores_str = "\n".join(f"- {k}: {v:.3f}" for k, v in scores.items()) prompt = PLAUSIBILITY_REVIEW_PROMPT.format( description=combination_description, @@ -56,7 +56,16 @@ class GeminiLLMProvider(LLMProvider): if "429" in str(exc) or "RESOURCE_EXHAUSTED" in str(exc): raise LLMRateLimitError(str(exc), self._parse_retry_after(exc)) from exc raise - return response.text.strip() + text = response.text.strip() + plausible = self._parse_verdict(text) + return (text, plausible) + + def _parse_verdict(self, text: str) -> bool: + """Extract VERDICT: PLAUSIBLE/IMPLAUSIBLE from response; default to True.""" + m = re.search(r"VERDICT:\s*(PLAUSIBLE|IMPLAUSIBLE)", text, re.IGNORECASE) + if m: + return m.group(1).upper() == "PLAUSIBLE" + return True def _parse_retry_after(self, exc: Exception) -> int: """Extract retry delay from the error message, with a safe default.""" diff --git a/src/physcom/llm/providers/mock.py b/src/physcom/llm/providers/mock.py index 50a2f26..bb3d8b0 100644 --- a/src/physcom/llm/providers/mock.py +++ b/src/physcom/llm/providers/mock.py @@ -21,8 +21,8 @@ class MockLLMProvider(LLMProvider): def review_plausibility( self, combination_description: str, scores: dict[str, float] - ) -> str: + ) -> tuple[str, bool]: avg = sum(scores.values()) / max(len(scores), 1) if avg > 0.5: - return "This concept appears plausible and worth further investigation." - return "This concept has significant feasibility challenges." 
+ return ("This concept appears plausible and worth further investigation.", True) + return ("This concept has significant feasibility challenges.", False) diff --git a/src/physcom/models/combination.py b/src/physcom/models/combination.py index dd04e15..7785507 100644 --- a/src/physcom/models/combination.py +++ b/src/physcom/models/combination.py @@ -39,7 +39,7 @@ class Combination: """A generated combination of entities (one per dimension).""" entities: list[Entity] = field(default_factory=list) - status: str = "pending" # pending → valid/blocked → scored → reviewed + status: str = "pending" # pending → valid/p1_fail/p2_fail/p3_fail/p4_fail → scored → reviewed block_reason: str | None = None hash: str | None = None id: int | None = None diff --git a/src/physcom/models/domain.py b/src/physcom/models/domain.py index 347fc79..d02670f 100644 --- a/src/physcom/models/domain.py +++ b/src/physcom/models/domain.py @@ -11,9 +11,10 @@ class MetricBound: metric_name: str weight: float # 0.0–1.0 - norm_min: float # Below this → score 0 - norm_max: float # Above this → score 1 + norm_min: float # Below this → score 0 (or 1 if lower_is_better) + norm_max: float # Above this → score 1 (or 0 if lower_is_better) unit: str = "" + lower_is_better: bool = False # Invert scale (e.g., cost: lower = better) metric_id: int | None = None diff --git a/src/physcom/seed/transport_example.py b/src/physcom/seed/transport_example.py index aea3ccf..339c29f 100644 --- a/src/physcom/seed/transport_example.py +++ b/src/physcom/seed/transport_example.py @@ -254,7 +254,7 @@ URBAN_COMMUTING = Domain( description="Daily travel within a city, 1-50km range", metric_bounds=[ MetricBound("speed", weight=0.25, norm_min=5, norm_max=120, unit="km/h"), - MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0, unit="$/km"), + MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0, unit="$/km", lower_is_better=True), MetricBound("safety", weight=0.25, norm_min=0.0, norm_max=1.0, 
unit="0-1"), MetricBound("availability", weight=0.15, norm_min=0.0, norm_max=1.0, unit="0-1"), MetricBound("range_fuel", weight=0.10, norm_min=5, norm_max=500, unit="km"), @@ -268,7 +268,7 @@ INTERPLANETARY = Domain( MetricBound("speed", weight=0.30, norm_min=1000, norm_max=300000, unit="km/s"), MetricBound("range_fuel", weight=0.30, norm_min=1e6, norm_max=1e10, unit="km"), MetricBound("safety", weight=0.20, norm_min=0.0, norm_max=1.0, unit="0-1"), - MetricBound("cost_efficiency", weight=0.10, norm_min=1e3, norm_max=1e9, unit="$/km"), + MetricBound("cost_efficiency", weight=0.10, norm_min=1e3, norm_max=1e9, unit="$/km", lower_is_better=True), MetricBound("range_degradation", weight=0.10, norm_min=100, norm_max=36500, unit="days"), ], ) @@ -302,8 +302,10 @@ def load_transport_seed(repo) -> dict: counts["domains"] += 1 except sqlite3.IntegrityError: pass - # Backfill metric units on existing DBs (ensure_metric is idempotent). + # Backfill metric units and lower_is_better on existing DBs. for mb in domain.metric_bounds: repo.ensure_metric(mb.metric_name, unit=mb.unit) + if mb.lower_is_better: + repo.backfill_lower_is_better(domain.name, mb.metric_name) return counts diff --git a/src/physcom_web/app.py b/src/physcom_web/app.py index b56fce3..5bb717e 100644 --- a/src/physcom_web/app.py +++ b/src/physcom_web/app.py @@ -2,6 +2,7 @@ from __future__ import annotations +import math import os import secrets from pathlib import Path @@ -53,10 +54,54 @@ def close_db(exc: BaseException | None = None) -> None: repo.conn.close() +_SI_PREFIXES = [ + (1e12, "T"), + (1e9, "G"), + (1e6, "M"), + (1e3, "k"), +] + + +def _si_format(value: object) -> str: + """Format a number with SI prefixes for readability. + + Handles string inputs (like dep.value) by trying float conversion first. + Non-numeric values are returned as-is. 
+ """ + if isinstance(value, str): + try: + num = float(value) + except (ValueError, TypeError): + return value + elif isinstance(value, (int, float)): + num = float(value) + else: + return str(value) + + if math.isnan(num) or math.isinf(num): + return str(value) + + abs_num = abs(num) + if abs_num < 1000: + # Small numbers: drop trailing zeros, cap at 4 significant figures + if num == int(num) and abs_num < 100: + return str(int(num)) + return f"{num:.4g}" + + for threshold, prefix in _SI_PREFIXES: + if abs_num >= threshold: + scaled = num / threshold + return f"{scaled:.4g}{prefix}" + + return f"{num:.4g}" + + def create_app() -> Flask: app = Flask(__name__) app.secret_key = _load_or_generate_secret_key() + app.jinja_env.filters["si"] = _si_format + app.teardown_appcontext(close_db) # Register blueprints diff --git a/src/physcom_web/routes/domains.py b/src/physcom_web/routes/domains.py index 6f8b326..e7bb5e5 100644 --- a/src/physcom_web/routes/domains.py +++ b/src/physcom_web/routes/domains.py @@ -80,7 +80,11 @@ def metric_add(domain_id: int): if not metric_name: flash("Metric name is required.", "error") else: - mb = MetricBound(metric_name=metric_name, weight=weight, norm_min=norm_min, norm_max=norm_max, unit=unit) + lower_is_better = bool(request.form.get("lower_is_better")) + mb = MetricBound( + metric_name=metric_name, weight=weight, norm_min=norm_min, + norm_max=norm_max, unit=unit, lower_is_better=lower_is_better, + ) repo.add_metric_bound(domain_id, mb) flash("Metric added.", "success") domain = repo.get_domain_by_id(domain_id) @@ -99,7 +103,8 @@ def metric_edit(domain_id: int, metric_id: int): domain = repo.get_domain_by_id(domain_id) return render_template("domains/_metrics_table.html", domain=domain) unit = request.form.get("unit", "").strip() - repo.update_metric_bound(domain_id, metric_id, weight, norm_min, norm_max, unit) + lower_is_better = bool(request.form.get("lower_is_better")) + repo.update_metric_bound(domain_id, metric_id, weight, norm_min, 
norm_max, unit, lower_is_better) flash("Metric updated.", "success") domain = repo.get_domain_by_id(domain_id) return render_template("domains/_metrics_table.html", domain=domain) diff --git a/src/physcom_web/routes/results.py b/src/physcom_web/routes/results.py index a8e141f..675e9c7 100644 --- a/src/physcom_web/routes/results.py +++ b/src/physcom_web/routes/results.py @@ -54,6 +54,9 @@ def result_detail(domain_name: str, combo_id: int): return redirect(url_for("results.results_domain", domain_name=domain_name)) result = repo.get_result(combo_id, domain.id) + if not result: + flash("No results for this combination in this domain.", "error") + return redirect(url_for("results.results_domain", domain_name=domain_name)) scores = repo.get_combination_scores(combo_id, domain.id) return render_template( diff --git a/src/physcom_web/static/style.css b/src/physcom_web/static/style.css index 9aa71be..2a1a7ce 100644 --- a/src/physcom_web/static/style.css +++ b/src/physcom_web/static/style.css @@ -54,6 +54,7 @@ h3 { font-size: 1rem; margin-bottom: 0.25rem; } grid-template-columns: repeat(auto-fill, minmax(320px, 1fr)); gap: 1rem; } +.card-grid > * { min-width: 0; overflow-x: auto; } /* ── Tables ──────────────────────────────────────────────── */ table { width: 100%; border-collapse: collapse; font-size: 0.9rem; } @@ -79,7 +80,10 @@ table.compact th, table.compact td { padding: 0.25rem 0.4rem; font-size: 0.85rem .badge-range_min, .badge-range_max { background: #fef3c7; color: #92400e; } .badge-excludes { background: #fee2e2; color: #991b1b; } .badge-valid { background: #dcfce7; color: #166534; } -.badge-blocked { background: #fee2e2; color: #991b1b; } +.badge-p1_fail { background: #fee2e2; color: #991b1b; } +.badge-p2_fail { background: #fee2e2; color: #991b1b; } +.badge-p3_fail { background: #fee2e2; color: #991b1b; } +.badge-p4_fail { background: #fee2e2; color: #991b1b; } .badge-scored { background: #dbeafe; color: #1e40af; } .badge-llm_reviewed { background: #e0f2fe; 
color: #0369a1; } .badge-reviewed { background: #f3e8ff; color: #6b21a8; } diff --git a/src/physcom_web/templates/domains/_metrics_table.html b/src/physcom_web/templates/domains/_metrics_table.html index f6f33cb..6d830b2 100644 --- a/src/physcom_web/templates/domains/_metrics_table.html +++ b/src/physcom_web/templates/domains/_metrics_table.html @@ -6,6 +6,7 @@ Weight Norm Min Norm Max + Direction @@ -15,8 +16,9 @@ {{ mb.metric_name }} {{ mb.unit or '—' }} {{ mb.weight }} - {{ mb.norm_min }} - {{ mb.norm_max }} + {{ mb.norm_min|si }} + {{ mb.norm_max|si }} + {{ '↓ lower' if mb.lower_is_better else '↑ higher' }} diff --git a/src/physcom_web/templates/domains/list.html b/src/physcom_web/templates/domains/list.html index ceba19a..49b01bc 100644 --- a/src/physcom_web/templates/domains/list.html +++ b/src/physcom_web/templates/domains/list.html @@ -16,7 +16,7 @@

{{ d.description }}

- + {% for mb in d.metric_bounds %} @@ -24,8 +24,9 @@ - - + + + {% endfor %} diff --git a/src/physcom_web/templates/entities/_dep_table.html b/src/physcom_web/templates/entities/_dep_table.html index 4bb3560..2e07544 100644 --- a/src/physcom_web/templates/entities/_dep_table.html +++ b/src/physcom_web/templates/entities/_dep_table.html @@ -14,7 +14,7 @@ - + diff --git a/src/physcom_web/templates/pipeline/run.html b/src/physcom_web/templates/pipeline/run.html index 4544876..d2e02fc 100644 --- a/src/physcom_web/templates/pipeline/run.html +++ b/src/physcom_web/templates/pipeline/run.html @@ -97,7 +97,7 @@ - + @@ -113,7 +113,7 @@ - + @@ -133,7 +133,7 @@

{{ d.name }} {{ d.description }}

Results
{{ s.total_results }} scored combinations
-
Blocked
{{ s.blocked }} combinations
+
Failed
{{ s.failed }} combinations
Score range
{{ "%.4f"|format(s.min_score) }} — {{ "%.4f"|format(s.max_score) }}
Avg score
{{ "%.4f"|format(s.avg_score) }}
Last pass
{{ s.last_pass }}
diff --git a/src/physcom_web/templates/results/detail.html b/src/physcom_web/templates/results/detail.html index ac39b89..7c7f2de 100644 --- a/src/physcom_web/templates/results/detail.html +++ b/src/physcom_web/templates/results/detail.html @@ -43,7 +43,7 @@ {% for dep in e.dependencies %}
- + {% endfor %} @@ -77,10 +77,10 @@ {% set unit = s.metric_unit or '' %} - + - + {% endfor %} diff --git a/src/physcom_web/templates/results/list.html b/src/physcom_web/templates/results/list.html index aacf4c2..e5c7a87 100644 --- a/src/physcom_web/templates/results/list.html +++ b/src/physcom_web/templates/results/list.html @@ -64,7 +64,7 @@
MetricUnitWeightNorm MinNorm Max
MetricUnitWeightNorm MinNorm MaxDirection
{{ mb.metric_name }} {{ mb.unit }} {{ mb.weight }}{{ mb.norm_min }}{{ mb.norm_max }}{{ mb.norm_min|si }}{{ mb.norm_max|si }}{{ '↓ lower' if mb.lower_is_better else '↑ higher' }}
{{ dep.category }} {{ dep.key }}{{ dep.value }}{{ dep.value|si }} {{ dep.unit or '—' }} {{ dep.constraint_type }} diff --git a/src/physcom_web/templates/pipeline/_run_status.html b/src/physcom_web/templates/pipeline/_run_status.html index b7d8d28..f988069 100644 --- a/src/physcom_web/templates/pipeline/_run_status.html +++ b/src/physcom_web/templates/pipeline/_run_status.html @@ -34,7 +34,7 @@ 1 — Constraints {{ run.combos_pass1 or 0 }} checked {%- if (run.combos_pass2 or 0) > 0 and (run.combos_pass1 or 0) > (run.combos_pass2 or 0) %}, - {{ (run.combos_pass1 or 0) - (run.combos_pass2 or 0) }} blocked + {{ (run.combos_pass1 or 0) - (run.combos_pass2 or 0) }} failed {%- endif -%}
Status Total P1 CheckedP1 BlockedP1 Failed P2 Estimated P3 Scored P4 Reviewed{{ run.status }} {{ run.total_combos or '—' }} {{ run.combos_pass1 or '—' }}{% if blocked %}{{ blocked }}{% else %}—{% endif %}{% if blocked %}{{ blocked }}{% else %}—{% endif %} {{ run.combos_pass2 or '—' }} {{ run.combos_pass3 or '—' }} {{ run.combos_pass4 or '—' }}
{{ dep.key }}{{ dep.value }}{{ ' ' + dep.unit if dep.unit else '' }}{{ dep.value|si }}{{ ' ' + dep.unit if dep.unit else '' }} {{ dep.constraint_type }}
{{ s.metric_name }}{{ "%.2f"|format(s.raw_value) if s.raw_value is not none else '—' }}{{ ' ' + unit if unit and s.raw_value is not none else '' }}{{ s.raw_value|si if s.raw_value is not none else '—' }}{{ ' ' + unit if unit and s.raw_value is not none else '' }} {%- if mb -%} - {{ "%.2f"|format(mb.norm_min) }} — {{ "%.2f"|format(mb.norm_max) }}{{ ' ' + unit if unit else '' }} + {{ mb.norm_min|si }} — {{ mb.norm_max|si }}{{ ' ' + unit if unit else '' }} {%- else -%} — {%- endif -%} @@ -88,22 +88,22 @@ {%- if mb and s.raw_value is not none -%} {%- if s.raw_value <= mb.norm_min -%} - at/below min + at/below min{{ ' (best)' if mb.lower_is_better else '' }} {%- elif s.raw_value >= mb.norm_max -%} - at/above max + at/above max{{ ' (worst)' if mb.lower_is_better else '' }} {%- else -%} {% set pct = ((s.raw_value - mb.norm_min) / (mb.norm_max - mb.norm_min) * 100) | int %}
- ~{{ pct }}% + ~{{ pct }}%{{ ' ↓' if mb.lower_is_better else '' }} {%- endif -%} {%- else -%} — {%- endif -%}
{{ "%.4f"|format(s.normalized_score) if s.normalized_score is not none else '—' }}{{ "%.0f%%"|format(mb.weight * 100) if mb else '—' }}{{ "%.0f%%"|format(mb.weight * 100) if mb else '—' }}{{ ' ↓' if mb and mb.lower_is_better else '' }}
{{ r.combination.entities|map(attribute='name')|join(' + ') }} {{ r.combination.status }} - {%- if r.combination.status == 'blocked' and r.combination.block_reason -%} + {%- if r.combination.status.endswith('_fail') and r.combination.block_reason -%} {{ r.combination.block_reason }} {%- elif r.novelty_flag -%} {{ r.novelty_flag }} diff --git a/tests/conftest.py b/tests/conftest.py index 2a1b2a4..6ad09da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -148,7 +148,7 @@ def urban_domain(): description="Daily city travel", metric_bounds=[ MetricBound("speed", weight=0.25, norm_min=5, norm_max=120), - MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0), + MetricBound("cost_efficiency", weight=0.25, norm_min=0.01, norm_max=2.0, lower_is_better=True), MetricBound("safety", weight=0.25, norm_min=0.0, norm_max=1.0), MetricBound("availability", weight=0.15, norm_min=0.0, norm_max=1.0), MetricBound("range_fuel", weight=0.10, norm_min=5, norm_max=500), diff --git a/tests/test_constraint_resolver.py b/tests/test_constraint_resolver.py index 0daca05..f3d0cfa 100644 --- a/tests/test_constraint_resolver.py +++ b/tests/test_constraint_resolver.py @@ -10,7 +10,7 @@ def test_compatible_ground_combo(bicycle, human_pedalling): resolver = ConstraintResolver() combo = Combination(entities=[bicycle, human_pedalling]) result = resolver.resolve(combo) - assert result.status != "blocked", f"Unexpected block: {result.violations}" + assert result.status != "p1_fail", f"Unexpected block: {result.violations}" def test_solar_sail_blocks_with_walking(walking, solar_sail): @@ -18,7 +18,7 @@ def test_solar_sail_blocks_with_walking(walking, solar_sail): resolver = ConstraintResolver() combo = Combination(entities=[walking, solar_sail]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("mutually exclusive" in v for v in result.violations) @@ -37,7 +37,7 @@ def 
test_nuclear_reactor_blocks_with_bicycle(bicycle, nuclear_reactor): resolver = ConstraintResolver() combo = Combination(entities=[bicycle, nuclear_reactor]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("mass" in v.lower() for v in result.violations) @@ -58,7 +58,7 @@ def test_force_scale_mismatch_blocks(): resolver = ConstraintResolver() combo = Combination(entities=[platform, power]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("force deficit" in v for v in result.violations) @@ -80,7 +80,7 @@ def test_force_under_powered_warning(): combo = Combination(entities=[platform, power]) result = resolver.resolve(combo) # Under-powered but within 100x → warning, not block - assert result.status != "blocked" + assert result.status != "p1_fail" assert any("under-powered" in w for w in result.warnings) @@ -97,7 +97,7 @@ def test_requires_vs_excludes(): resolver = ConstraintResolver() combo = Combination(entities=[a, b]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("excludes" in v for v in result.violations) @@ -106,7 +106,7 @@ def test_ice_engine_blocks_with_spaceship(spaceship, ice_engine): resolver = ConstraintResolver() combo = Combination(entities=[spaceship, ice_engine]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("atmosphere" in v for v in result.violations) @@ -138,7 +138,7 @@ def test_energy_density_deficit_blocks(): resolver = ConstraintResolver() combo = Combination(entities=[platform, power]) result = resolver.resolve(combo) - assert result.status == "blocked" + assert result.status == "p1_fail" assert any("energy density deficit" in v for v in result.violations) @@ -159,7 +159,7 @@ def test_energy_density_under_density_warning(): resolver = ConstraintResolver() combo = 
Combination(entities=[platform, power]) result = resolver.resolve(combo) - assert result.status != "blocked" + assert result.status != "p1_fail" assert any("under-density" in w for w in result.warnings) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index d785821..50cd22d 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -16,8 +16,8 @@ def test_pass1_filters_impossible_combos(seeded_repo): result = pipeline.run(domain, ["platform", "power_source"], passes=[1]) assert result.total_generated == 81 - assert result.pass1_blocked > 0 - assert result.pass1_valid + result.pass1_conditional + result.pass1_blocked == 81 + assert result.pass1_failed > 0 + assert result.pass1_valid + result.pass1_conditional + result.pass1_failed == 81 def test_pass123_produces_scored_results(seeded_repo): diff --git a/tests/test_pipeline_async.py b/tests/test_pipeline_async.py index 4e0616e..cd58144 100644 --- a/tests/test_pipeline_async.py +++ b/tests/test_pipeline_async.py @@ -239,18 +239,18 @@ def test_blocked_combos_have_results(seeded_repo): score_threshold=0.01, passes=[1, 2, 3], ) - assert result.pass1_blocked > 0 + assert result.pass1_failed > 0 # All combos (blocked + scored) should have result rows all_results = repo.get_all_results(domain.name) total_with_results = len(all_results) # blocked combos get pass_reached=1 results, non-blocked get pass_reached=3 - assert total_with_results == result.pass1_blocked + result.pass3_scored + assert total_with_results == result.pass1_failed + result.pass3_scored - # Blocked combos should have pass_reached=1 and composite_score=0.0 - blocked_results = [r for r in all_results if r["combination"].status == "blocked"] - assert len(blocked_results) == result.pass1_blocked - for br in blocked_results: + # Failed combos should have pass_reached=1 and composite_score=0.0 + failed_results = [r for r in all_results if r["combination"].status == "p1_fail"] + assert len(failed_results) == result.pass1_failed + for br in 
failed_results: assert br["pass_reached"] == 1 assert br["composite_score"] == 0.0 @@ -291,12 +291,96 @@ def test_save_combination_loads_existing_status(seeded_repo): saved = repo.save_combination(combo) assert saved.status == "pending" - # Mark it blocked in DB - repo.update_combination_status(saved.id, "blocked", "test reason") + # Mark it p1_fail in DB + repo.update_combination_status(saved.id, "p1_fail", "test reason") - # Re-saving should pick up the blocked status + # Re-saving should pick up the p1_fail status combo2 = Combination(entities=entities) reloaded = repo.save_combination(combo2) assert reloaded.id == saved.id - assert reloaded.status == "blocked" + assert reloaded.status == "p1_fail" assert reloaded.block_reason == "test reason" + + +def test_p3_fail_below_threshold(seeded_repo): + """Combos scoring below threshold should get p3_fail status.""" + repo = seeded_repo + domain = repo.get_domain("urban_commuting") + + resolver = ConstraintResolver() + scorer = Scorer(domain) + pipeline = Pipeline(repo, resolver, scorer) + + # Use a very high threshold so most combos fail pass 3 + result = pipeline.run( + domain, ["platform", "power_source"], + score_threshold=0.99, passes=[1, 2, 3], + ) + + assert result.pass3_failed > 0 + # above_threshold should be much smaller than scored + assert result.pass3_above_threshold <= result.pass3_scored + + # p3_fail combos should exist in DB + p3_fail_combos = repo.list_combinations(status="p3_fail") + assert len(p3_fail_combos) == result.pass3_failed + for c in p3_fail_combos: + assert c.block_reason is not None + assert "below threshold" in c.block_reason + + +def test_p4_fail_implausible(seeded_repo): + """Combos deemed implausible by LLM should get p4_fail status.""" + from physcom.llm.providers.mock import MockLLMProvider + + repo = seeded_repo + domain = repo.get_domain("urban_commuting") + + resolver = ConstraintResolver() + scorer = Scorer(domain) + # Low estimates → normalized scores avg <= 0.5 → 
MockLLMProvider returns (text, False) + # Use threshold=0.0 so no combo gets p3_fail and all reach pass 4 + mock_llm = MockLLMProvider(default_estimates={ + "speed": 0.1, "cost_efficiency": 0.1, "safety": 0.1, + "availability": 0.1, "range_fuel": 0.1, + }) + pipeline = Pipeline(repo, resolver, scorer, llm=mock_llm) + + result = pipeline.run( + domain, ["platform", "power_source"], + score_threshold=0.0, passes=[1, 2, 3, 4], + ) + + # With low normalized scores (avg <= 0.5), reviewed combos should be p4_fail + assert result.pass4_failed > 0 + assert result.pass4_reviewed == 0 + + p4_fail_combos = repo.list_combinations(status="p4_fail") + assert len(p4_fail_combos) == result.pass4_failed + for c in p4_fail_combos: + assert c.block_reason == "LLM deemed implausible" + + +def test_p4_pass_plausible(seeded_repo): + """Combos deemed plausible by LLM should get llm_reviewed status.""" + from physcom.llm.providers.mock import MockLLMProvider + + repo = seeded_repo + domain = repo.get_domain("urban_commuting") + + resolver = ConstraintResolver() + scorer = Scorer(domain) + # High estimates → avg > 0.5 → MockLLMProvider returns (text, True) + mock_llm = MockLLMProvider(default_estimates={ + "speed": 50.0, "cost_efficiency": 0.5, "safety": 0.6, + "availability": 0.7, "range_fuel": 200.0, + }) + pipeline = Pipeline(repo, resolver, scorer, llm=mock_llm) + + result = pipeline.run( + domain, ["platform", "power_source"], + score_threshold=0.01, passes=[1, 2, 3, 4], + ) + + assert result.pass4_reviewed > 0 + assert result.pass4_failed == 0 diff --git a/tests/test_scorer.py b/tests/test_scorer.py index 0565f10..c728a5e 100644 --- a/tests/test_scorer.py +++ b/tests/test_scorer.py @@ -81,10 +81,34 @@ class TestScorer: assert len(result.scores) == 5 def test_scorer_zero_metric_kills_score(self, urban_domain): + """A zero on a higher-is-better metric should drive composite to 0.""" scorer = Scorer(urban_domain) combo = Combination(entities=[]) combo.id = 1 - raw = {"speed": 60.0, 
"cost_efficiency": 0.0, "safety": 0.7, + raw = {"speed": 60.0, "cost_efficiency": 0.5, "safety": 0.0, "availability": 0.8, "range_fuel": 400} result = scorer.score_combination(combo, raw) assert result.composite_score == 0.0 + + def test_lower_is_better_inverts_score(self, urban_domain): + """cost_efficiency is lower_is_better: low raw value should score high.""" + scorer = Scorer(urban_domain) + combo = Combination(entities=[]) + combo.id = 1 + # cost_efficiency: norm_min=0.01, norm_max=2.0, lower_is_better=True + # A low cost (0.02) should get a HIGH normalized score (near 1.0) + # A high cost (1.9) should get a LOW normalized score (near 0.0) + raw_cheap = {"speed": 60.0, "cost_efficiency": 0.02, "safety": 0.7, + "availability": 0.8, "range_fuel": 400} + raw_expensive = {"speed": 60.0, "cost_efficiency": 1.9, "safety": 0.7, + "availability": 0.8, "range_fuel": 400} + result_cheap = scorer.score_combination(combo, raw_cheap) + result_expensive = scorer.score_combination(combo, raw_expensive) + + # Find the cost_efficiency score in each + cost_cheap = next(s for s in result_cheap.scores if s.metric_name == "cost_efficiency") + cost_expensive = next(s for s in result_expensive.scores if s.metric_name == "cost_efficiency") + + assert cost_cheap.normalized_score > cost_expensive.normalized_score + assert cost_cheap.normalized_score > 0.9 # near the best + assert cost_expensive.normalized_score < 0.1 # near the worst