responder) =>
+ new(new TestHttpHandler(responder)) { BaseAddress = new Uri("http://report-api.test/") };
+
+    /// <summary>
+    /// Creates a response with the given status whose content is
+    /// <paramref name="body"/> sent as UTF-8 <c>application/json</c>.
+    /// </summary>
+    /// <param name="body">Raw JSON text to use as the response content.</param>
+    /// <param name="status">HTTP status of the response; defaults to 200 OK.</param>
+    public static HttpResponseMessage Json(string body, HttpStatusCode status = HttpStatusCode.OK)
+    {
+        var response = new HttpResponseMessage(status);
+        response.Content = new StringContent(body, System.Text.Encoding.UTF8, "application/json");
+        return response;
+    }
+}
diff --git a/services/FileService/tests/FileService.Tests/fixtures/full-payload.json b/services/FileService/tests/FileService.Tests/fixtures/full-payload.json
new file mode 100644
index 00000000..9ac6728f
--- /dev/null
+++ b/services/FileService/tests/FileService.Tests/fixtures/full-payload.json
@@ -0,0 +1,128 @@
+{
+ "site_name": "example.com",
+ "report_title": "Technical SEO Audit Report",
+ "report_generated_at": "2025-06-01T12:00:00Z",
+ "overall_score": 72,
+ "summary": {
+ "total_urls": 200,
+ "indexable": 180,
+ "total_issues": 45,
+ "critical_issues": 2
+ },
+ "status_counts": {
+ "200": 180,
+ "301": 10,
+ "404": 5
+ },
+ "executive_summary": {
+ "summary": "The site has several high-priority SEO issues.",
+ "source": "deterministic",
+ "priorities": ["Fix broken links", "Improve meta descriptions"]
+ },
+ "report_meta": {
+ "data_sources": ["Crawl", "Google Search Console", "Google Analytics 4"],
+ "google_fetched_at": "2025-06-01T11:00:00Z",
+ "crawl_scope": {
+ "pages_crawled": 150,
+ "max_pages_configured": 500,
+ "render_mode": "auto"
+ }
+ },
+ "categories": [
+ {
+ "name": "technical_seo",
+ "score": 68,
+ "issues": [
+ {
+ "priority": "critical",
+ "message": "Server error on https://example.com/",
+ "url": "https://example.com/",
+ "recommendation": "Fix 500 response.",
+ "gsc_clicks": 120,
+ "gsc_impressions": 4500
+ }
+ ]
+ }
+ ],
+ "lighthouse_summary": {
+ "url": "https://example.com/",
+ "performance": 85,
+ "accessibility": 92,
+ "best_practices": 88,
+ "seo": 90
+ },
+ "lighthouse_human_summary": "Core Web Vitals are generally good.",
+ "lighthouse_diagnostics": [
+ { "title": "Largest Contentful Paint", "description": "2.1s" }
+ ],
+ "search_performance": {
+ "queries": [
+ { "query": "example brand", "clicks": "50", "impressions": "1200" }
+ ],
+ "pages": [
+ { "page": "https://example.com/", "clicks": "200", "impressions": "8000" }
+ ]
+ },
+ "ga4": {
+ "channels": [
+ { "channel": "Organic Search", "sessions": "1200" }
+ ],
+ "devices": [
+ { "device": "mobile", "sessions": "800" }
+ ]
+ },
+ "security_findings": [
+ {
+ "severity": "high",
+ "finding_type": "missing_header",
+ "url": "https://example.com/",
+ "message": "Missing Content-Security-Policy header"
+ }
+ ],
+ "content_analytics": {
+ "word_count_stats": { "mean": 450, "median": 380 },
+ "thin_content_count": 12,
+ "top_keywords_site": [
+ { "word": "services", "count": "45" }
+ ]
+ },
+ "indexation_coverage": {
+ "indexable": 180,
+ "non_indexable": 15,
+ "blocked": 5
+ },
+ "links": [
+ { "url": "https://example.com/", "status": "200", "title": "Home", "inlinks": 1, "outlinks": 2 },
+ {
+ "url": "https://example.com/products",
+ "status": "200",
+ "title": "Products",
+ "custom_extract": "SKU-1",
+ "custom_fields": "{\"price\":\"9.99\",\"sku\":\"SKU-1\"}"
+ }
+ ],
+ "link_edges": [
+ {
+ "from_url": "https://example.com/",
+ "to_url": "https://example.com/about",
+ "anchor_text": "About",
+ "rel": "",
+ "is_nofollow": false,
+ "is_sponsored": false,
+ "is_ugc": false,
+ "link_type": "internal",
+ "position": 1
+ }
+ ],
+ "redirects": [
+ {
+ "url": "https://example.com/old",
+ "message": "301 redirect chain",
+ "priority": "medium",
+ "recommendation": "Update internal links"
+ }
+ ],
+ "top_pages": [
+ { "url": "https://example.com/" }
+ ]
+}
diff --git a/services/FileService/tests/FileService.Tests/fixtures/minimal-payload.json b/services/FileService/tests/FileService.Tests/fixtures/minimal-payload.json
new file mode 100644
index 00000000..36afa384
--- /dev/null
+++ b/services/FileService/tests/FileService.Tests/fixtures/minimal-payload.json
@@ -0,0 +1,73 @@
+{
+ "site_name": "example.com",
+ "report_title": "Technical SEO Audit Report",
+ "report_generated_at": "2025-06-01T12:00:00Z",
+ "overall_score": 72,
+ "executive_summary": {
+ "summary": "The site has several high-priority SEO issues.",
+ "source": "deterministic",
+ "priorities": ["Fix broken links", "Improve meta descriptions"],
+ "top_issues": [
+ {
+ "category": "Technical SEO",
+ "priority": "high",
+ "message": "Missing title tag",
+ "url": "https://example.com/page",
+ "recommendation": "Add a unique title."
+ }
+ ]
+ },
+ "report_meta": {
+ "data_sources": ["Crawl", "Google Search Console"],
+ "crawl_scope": {
+ "pages_crawled": 150,
+ "max_pages_configured": 500
+ }
+ },
+ "categories": [
+ {
+ "name": "technical_seo",
+ "score": 68,
+ "issues": [
+ {
+ "priority": "critical",
+ "message": "Server error on homepage",
+ "url": "https://example.com/",
+ "recommendation": "Fix 500 response."
+ },
+ {
+ "priority": "high",
+ "message": "Missing meta description",
+ "url": "https://example.com/about",
+ "recommendation": "Add meta description."
+ }
+ ]
+ },
+ {
+ "name": "content",
+ "score": 80,
+ "issues": [
+ {
+ "priority": "medium",
+ "message": "Thin content",
+ "url": "https://example.com/blog/post",
+ "recommendation": "Expand content."
+ }
+ ]
+ }
+ ],
+ "lighthouse_summary": {
+ "url": "https://example.com/",
+ "performance": 85,
+ "accessibility": 92,
+ "best_practices": 88,
+ "seo": 90
+ },
+ "top_pages": [
+ { "url": "https://example.com/" },
+ { "url": "https://example.com/about" }
+ ],
+ "links": [
+ { "url": "https://example.com/", "status": "200", "title": "Home" }
+ ]
+}
diff --git a/src/website_profiling/api/__init__.py b/src/website_profiling/api/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/website_profiling/api/deps.py b/src/website_profiling/api/deps.py
new file mode 100644
index 00000000..8675c831
--- /dev/null
+++ b/src/website_profiling/api/deps.py
@@ -0,0 +1,19 @@
+"""Shared FastAPI dependencies."""
+from __future__ import annotations
+
+from typing import Iterator
+
+from psycopg import Connection
+
+from website_profiling.db.pool import db_session
+
+
+def get_db() -> Iterator[Connection]:
+    """FastAPI dependency that lends out one synchronous psycopg connection.
+
+    The connection is obtained by entering ``db_session()`` and handed to the
+    caller through ``yield``; once the caller is finished the ``with`` block
+    exits and ``db_session`` performs its own cleanup.
+
+    Route handlers that use this dependency should be plain ``def`` functions
+    (not ``async def``) so FastAPI runs them in its thread pool — this matches
+    the existing synchronous codebase and requires no pool migration.
+    """
+    with db_session() as pooled_conn:
+        yield pooled_conn
diff --git a/src/website_profiling/api/main.py b/src/website_profiling/api/main.py
new file mode 100644
index 00000000..e26b2d4d
--- /dev/null
+++ b/src/website_profiling/api/main.py
@@ -0,0 +1,117 @@
+"""FastAPI application entry point."""
+from __future__ import annotations
+
+import os
+from contextlib import asynccontextmanager
+from typing import AsyncIterator
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+
+from .routers import (
+ alerts,
+ chat,
+ compare,
+ config,
+ content,
+ crawl,
+ dashboards,
+ filters,
+ health,
+ integrations,
+ issues,
+ keywords,
+ logs,
+ mcp_tools,
+ ollama,
+ page_coach,
+ page_markdown,
+ pipeline,
+ portfolio,
+ properties,
+ report,
+ report_audit_tool,
+ report_export,
+ report_portfolio,
+ schedule,
+)
+
+
+@asynccontextmanager
+async def _lifespan(app: FastAPI) -> AsyncIterator[None]:
+    """Application lifespan: nothing to do on startup, close the DB pool on exit.
+
+    The pool is closed in a ``finally`` block so the cleanup also runs when the
+    surrounding ``async with`` is left because of an exception, not only on a
+    normal shutdown. Failures while closing are logged as warnings and never
+    propagated, so they cannot mask the original reason for the shutdown.
+    """
+    try:
+        yield
+    finally:
+        # Close the psycopg connection pool on shutdown.
+        try:
+            from website_profiling.db.pool import close_db_pool
+
+            close_db_pool()
+        except Exception as exc:
+            import logging
+            logging.getLogger(__name__).warning("Error closing DB pool on shutdown: %s", exc)
+
+
+# The ASGI application object; _lifespan handles shutdown cleanup.
+app = FastAPI(
+    title="Website Profiling API",
+    version="1.0.0",
+    lifespan=_lifespan,
+)
+
+# CORS — only added when FASTAPI_ALLOWED_ORIGINS is set (local Swagger in dev).
+# The value is split on commas; entries are stripped and empty ones dropped.
+_origins_raw = os.getenv("FASTAPI_ALLOWED_ORIGINS", "").strip()
+if _origins_raw:
+    _origins = [o.strip() for o in _origins_raw.split(",") if o.strip()]
+    # Fail at import time instead of starting with a wildcard origin while
+    # allow_credentials=True is configured below.
+    if "*" in _origins:
+        raise RuntimeError(
+            "FASTAPI_ALLOWED_ORIGINS cannot contain '*' when allow_credentials=True. "
+            "List explicit origins (e.g. http://localhost:3000) instead."
+        )
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=_origins,
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+
+# Every router is mounted under the same "/api" prefix, in exactly the order
+# listed below.
+for _router_module in (
+    # ── Core routes ──────────────────────────────────────────────────────────
+    health,
+    report,
+    # ── Batch B: Pipeline jobs ───────────────────────────────────────────────
+    pipeline,
+    # ── Batch C: Chat (SSE + sessions) ───────────────────────────────────────
+    chat,
+    # ── Batch D: Crawl ───────────────────────────────────────────────────────
+    crawl,
+    # ── Batch E: Config (pipeline, LLM, secrets, app-settings) ───────────────
+    config,
+    # ── Batch F: Properties ──────────────────────────────────────────────────
+    properties,
+    # ── Batch G: Dashboards + Filters ────────────────────────────────────────
+    dashboards,
+    filters,
+    # ── Batch H: Google + Bing integrations ──────────────────────────────────
+    integrations,
+    # ── Batch I: Issues, keywords, content, page markdown, long-tail ─────────
+    issues,
+    keywords,
+    content,
+    page_markdown,
+    ollama,
+    mcp_tools,
+    portfolio,
+    alerts,
+    schedule,
+    logs,
+    compare,
+    page_coach,
+    report_audit_tool,
+    report_export,
+    report_portfolio,
+):
+    app.include_router(_router_module.router, prefix="/api")
+del _router_module  # keep the loop variable out of the module namespace
diff --git a/src/website_profiling/api/routers/__init__.py b/src/website_profiling/api/routers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/website_profiling/api/routers/alerts.py b/src/website_profiling/api/routers/alerts.py
new file mode 100644
index 00000000..ae95963f
--- /dev/null
+++ b/src/website_profiling/api/routers/alerts.py
@@ -0,0 +1,32 @@
+"""Property alert checks — /api/alerts/*."""
+from __future__ import annotations
+
+from typing import Annotated, Any
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from psycopg import Connection
+
+from ..deps import get_db
+
+# Router for the alert endpoints; no path prefix is set at this level.
+router = APIRouter(tags=["alerts"])
+
+# Annotated alias: a psycopg Connection supplied by the get_db dependency.
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+@router.post("/alerts/check")
+def alerts_check(
+    conn: DbDep,
+    propertyId: int = Query(...),
+) -> dict[str, Any]:
+    """Run the alert checks for one property.
+
+    Args:
+        conn: Database connection from the ``get_db`` dependency.
+        propertyId: Property to check; a falsy value (e.g. ``0``) is rejected.
+
+    Returns:
+        Whatever ``run_alerts_for_property`` returns, or
+        ``{"ok": True, "checked": 0}`` when the alerts runner module cannot be
+        imported in this build.
+
+    Raises:
+        HTTPException: 400 for a falsy ``propertyId``; 500 (with the error text
+            as detail) when the runner fails. An ``HTTPException`` raised by
+            the runner itself is passed through unchanged.
+    """
+    if not propertyId:
+        raise HTTPException(status_code=400, detail="propertyId required")
+
+    # Only the import is optional. Keeping the call outside this ``try`` means
+    # an ImportError raised *inside* the runner is reported as a failure
+    # instead of being mistaken for "module not available".
+    try:
+        from website_profiling.tools.alerts_runner import run_alerts_for_property
+    except ImportError:
+        import logging
+        logging.getLogger(__name__).debug("alerts_runner module not available in this build")
+        return {"ok": True, "checked": 0}
+
+    try:
+        return run_alerts_for_property(conn, propertyId)
+    except HTTPException:
+        # Deliberate HTTP errors keep their own status code.
+        raise
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
diff --git a/src/website_profiling/api/routers/chat.py b/src/website_profiling/api/routers/chat.py
new file mode 100644
index 00000000..099e5515
--- /dev/null
+++ b/src/website_profiling/api/routers/chat.py
@@ -0,0 +1,243 @@
+"""Chat routers — /api/chat, /api/chat/sessions/*, /api/chat/artifacts/*."""
+from __future__ import annotations
+
+import json
+import queue
+import re
+import threading
+from typing import Annotated, Any, Generator, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi.responses import StreamingResponse
+from psycopg import Connection
+
+from ..deps import get_db
+from ..schemas.chat import ArtifactUpdateBody, ChatRequest, ChatSessionCreate
+
+# All routes in this module live under the "/chat" prefix.
+router = APIRouter(prefix="/chat", tags=["chat"])
+
+# Annotated alias: a psycopg Connection supplied by the get_db dependency.
+DbDep = Annotated[Connection, Depends(get_db)]
+
+# Leading run of 8-80 characters (newlines included, via DOTALL) followed by
+# one of ".", "!" or "?". Used by _derive_title.
+# NOTE(review): the quantifier is greedy, so with several short sentences this
+# captures up to the LAST terminator inside the window, not the first —
+# confirm that is intended given the name.
+_FIRST_SENTENCE_RE = re.compile(r"^(.{8,80}[.!?])", re.DOTALL)
+
+
+def _fmt_session(s: dict[str, Any]) -> dict[str, Any]:
+    """Convert a session row (snake_case keys) into its camelCase API shape.
+
+    All five source keys must be present; a missing one raises ``KeyError``.
+    Keys of ``s`` that are not listed here are not copied.
+    """
+    # (API field name, row column name), in output order.
+    field_pairs = (
+        ("id", "id"),
+        ("propertyId", "property_id"),
+        ("title", "title"),
+        ("createdAt", "created_at"),
+        ("updatedAt", "updated_at"),
+    )
+    return {api_name: s[column] for api_name, column in field_pairs}
+
+
+def _messages_for_agent_context(
+    rows: list[dict[str, Any]], max_turns: int = 20
+) -> list[dict[str, str]]:
+    """Port of messagesForAgentContext from chatDb.ts.
+
+    Keeps only ``user`` and ``assistant`` rows, then returns the most recent
+    ``max_turns * 2`` of them as ``{"role", "content"}`` dicts. Missing or
+    ``None`` content becomes an empty string.
+
+    Args:
+        rows: Stored messages, oldest first.
+        max_turns: Number of user/assistant exchanges to keep. Zero or a
+            negative value yields an empty list.
+
+    Returns:
+        The trimmed message list in the original order.
+    """
+    window = max_turns * 2
+    if window <= 0:
+        # A plain ``[-0:]`` slice would return the whole history, and a
+        # negative limit would drop leading messages instead of trimming.
+        return []
+    relevant = [m for m in rows if m.get("role") in ("user", "assistant")]
+    return [
+        {"role": m["role"], "content": str(m.get("content") or "")}
+        for m in relevant[-window:]
+    ]
+
+
+def _derive_title(text: str) -> str | None:
+    """Build a short session title from free text.
+
+    The text is stripped first. If ``_FIRST_SENTENCE_RE`` matches at the
+    start, the matched group is used; otherwise the first 60 characters are
+    used. The candidate is stripped again and capped at 80 characters.
+
+    Returns:
+        The title, or ``None`` when nothing but whitespace is available.
+    """
+    cleaned = text.strip()
+    if not cleaned:
+        return None
+    sentence = _FIRST_SENTENCE_RE.match(cleaned)
+    if sentence:
+        candidate = sentence.group(1).strip()
+    else:
+        candidate = cleaned[:60].strip()
+    return candidate[:80] or None
+
+
+# ── POST /api/chat (SSE streaming) ────────────────────────────────────────────
+
+# NOTE(review): with the router prefix "/chat" this path is registered with a
+# trailing slash, while the section header above names /api/chat — confirm
+# clients use the slash form or rely on a redirect.
+@router.post("/")
+def chat_turn(body: ChatRequest, conn: DbDep) -> StreamingResponse:
+    """Run one chat turn and stream the agent's events as Server-Sent Events.
+
+    Steps, in order:
+      1. Load the session; respond 404 unless it exists and belongs to
+         ``body.propertyId``.
+      2. Store the user's message, reload the session history and reduce it
+         with ``_messages_for_agent_context``.
+      3. Start ``run_agent_turn`` on a daemon thread. Every event it reports
+         is put on a queue, followed by a ``None`` sentinel when it finishes.
+      4. Stream each queued event as an ``event:`` / ``data:`` frame.
+      5. After the sentinel, store the concatenated ``token`` text as the
+         assistant message and, when the session title is still "New chat",
+         empty or ``None``, set a title derived from the user message (or,
+         failing that, from the assistant text).
+
+    Raises:
+        HTTPException: 404 when the session is missing or belongs to another
+            property.
+    """
+    from website_profiling.db.chat_store import (
+        append_message,
+        get_messages,
+        get_session,
+        update_session_title,
+    )
+    from website_profiling.llm.agent import run_agent_turn
+    from website_profiling.tools.audit_tools import AuditToolContext
+
+    # Validate session
+    session = get_session(conn, body.sessionId)
+    if not session or session["property_id"] != body.propertyId:
+        raise HTTPException(status_code=404, detail="session not found")
+
+    # Persist user message
+    append_message(conn, body.sessionId, "user", body.message)
+
+    # Build agent context
+    history = get_messages(conn, body.sessionId)
+    agent_messages = _messages_for_agent_context(history)
+    context = AuditToolContext(
+        property_id=body.propertyId,
+        report_id=body.reportId,
+    )
+
+    # Hand-off between the agent thread (producer) and the response generator
+    # (consumer); ``None`` marks the end of the stream.
+    q: queue.Queue[dict[str, Any] | None] = queue.Queue()
+    assistant_parts: list[str] = []
+    # NOTE(review): tool_events and result_holder are filled below but never
+    # read in this function — confirm whether they are still needed.
+    tool_events: list[dict[str, Any]] = []
+    result_holder: list[dict[str, Any]] = []
+
+    def on_event(event: dict[str, Any]) -> None:
+        # Called by the agent for each event: collect token text and tool_end
+        # events, and forward every event to the stream.
+        if event.get("type") == "token":
+            assistant_parts.append(str(event.get("text") or ""))
+        elif event.get("type") == "tool_end":
+            tool_events.append(event)
+        q.put(event)
+
+    def run_agent() -> None:
+        # Thread target: a failure becomes an "error" event, and the sentinel
+        # is always queued so the consumer loop terminates.
+        try:
+            result = run_agent_turn(agent_messages, context, on_event=on_event)
+            result_holder.append(result)
+        except Exception as exc:
+            q.put({"type": "error", "message": str(exc)})
+        finally:
+            q.put(None)  # sentinel
+
+    thread = threading.Thread(target=run_agent, daemon=True)
+    thread.start()
+
+    def generate() -> Generator[str, None, None]:
+        # Blocks on the queue and emits one SSE frame per event until the
+        # sentinel arrives. Events without a "type" are sent as "message".
+        while True:
+            item = q.get()
+            if item is None:
+                break
+            event_type = str(item.get("type") or "message")
+            yield f"event: {event_type}\ndata: {json.dumps(item)}\n\n"
+
+        # The sentinel is queued from the thread's ``finally``, so the thread
+        # is about to exit; wait at most 5 seconds for it.
+        thread.join(timeout=5)
+
+        # NOTE(review): everything below runs only after the loop above ends,
+        # so it presumably is skipped when the stream is abandoned before the
+        # sentinel arrives — confirm that losing the reply then is acceptable.
+        # Persist assistant response. The injected `conn` dependency is released
+        # by FastAPI as soon as StreamingResponse is returned (before this
+        # generator resumes), so we open a fresh connection for persistence.
+        assistant_text = "".join(assistant_parts).strip()
+        if assistant_text:
+            try:
+                from website_profiling.db.pool import db_session
+                with db_session() as fresh_conn:
+                    append_message(fresh_conn, body.sessionId, "assistant", assistant_text)
+                    if session.get("title") in ("New chat", "", None):
+                        derived = _derive_title(body.message) or _derive_title(assistant_text)
+                        if derived:
+                            update_session_title(fresh_conn, body.sessionId, derived)
+            except Exception as exc:
+                # Best effort: the reply has already been streamed, so a
+                # persistence failure is only logged.
+                import logging
+                logging.getLogger(__name__).warning(
+                    "Failed to persist chat assistant message for session %s: %s",
+                    body.sessionId,
+                    exc,
+                )
+
+    return StreamingResponse(generate(), media_type="text/event-stream")
+
+
+# ── Session CRUD ──────────────────────────────────────────────────────────────
+
+@router.get("/sessions")
+def list_sessions(
+    conn: DbDep,
+    propertyId: int = Query(...),
+) -> dict[str, Any]:
+    """List the chat sessions of a property in API (camelCase) form.
+
+    Raises:
+        HTTPException: 400 when ``propertyId`` is falsy.
+    """
+    from website_profiling.db.chat_store import list_sessions as _list
+
+    if not propertyId:
+        raise HTTPException(status_code=400, detail="propertyId required")
+    formatted = []
+    for row in _list(conn, propertyId):
+        formatted.append(_fmt_session(row))
+    return {"sessions": formatted}
+
+
+@router.post("/sessions")
+def create_session(body: ChatSessionCreate, conn: DbDep) -> dict[str, Any]:
+    """Create a chat session and echo back its id, property id and title.
+
+    Raises:
+        HTTPException: 400 when ``body.propertyId`` is falsy.
+    """
+    from website_profiling.db.chat_store import create_session as _create
+
+    property_id = body.propertyId
+    if not property_id:
+        raise HTTPException(status_code=400, detail="propertyId required")
+    new_id = _create(conn, property_id, body.title)
+    return {"id": new_id, "propertyId": property_id, "title": body.title}
+
+
+@router.get("/sessions/{session_id}")
+def get_session_route(session_id: int, conn: DbDep) -> dict[str, Any]:
+    """Return one chat session in API (camelCase) form.
+
+    Raises:
+        HTTPException: 404 when no session with this id is found.
+    """
+    from website_profiling.db.chat_store import get_session
+
+    row = get_session(conn, session_id)
+    if row:
+        return {"session": _fmt_session(row)}
+    raise HTTPException(status_code=404, detail="session not found")
+
+
+@router.delete("/sessions/{session_id}")
+def delete_session_route(
+    session_id: int,
+    conn: DbDep,
+    propertyId: int = Query(...),
+) -> dict[str, Any]:
+    """Delete a chat session that belongs to the given property.
+
+    Raises:
+        HTTPException: 404 when the session does not exist, belongs to a
+            different property, or ``delete_session`` reports nothing deleted.
+    """
+    from website_profiling.db.chat_store import delete_session, get_session
+
+    row = get_session(conn, session_id)
+    owned = bool(row) and row["property_id"] == propertyId
+    # The delete is only attempted once ownership has been confirmed.
+    if not owned or not delete_session(conn, session_id):
+        raise HTTPException(status_code=404, detail="session not found")
+    return {"ok": True}
+
+
+@router.get("/sessions/{session_id}/messages")
+def get_session_messages(
+    session_id: int,
+    conn: DbDep,
+    propertyId: int = Query(...),
+) -> dict[str, Any]:
+    """Return the stored messages of a session that belongs to the property.
+
+    Raises:
+        HTTPException: 404 when the session does not exist or belongs to a
+            different property.
+    """
+    from website_profiling.db.chat_store import get_messages, get_session
+
+    row = get_session(conn, session_id)
+    if row and row["property_id"] == propertyId:
+        return {"messages": get_messages(conn, session_id)}
+    raise HTTPException(status_code=404, detail="session not found")
+
+
+# ── Artifacts ────────────────────────────────────────────────────────────────
+
+@router.get("/artifacts/{artifact_id}")
+def get_artifact(artifact_id: str) -> Any:
+    """Download a stored export artifact as an attachment.
+
+    Args:
+        artifact_id: 36 characters drawn from ``a-f``, ``0-9`` and ``-``.
+
+    Returns:
+        A ``Response`` carrying the artifact bytes, its stored MIME type
+        (default ``application/octet-stream``) and a ``Content-Disposition``
+        header with an ASCII fallback name plus a percent-encoded UTF-8 name.
+
+    Raises:
+        HTTPException: 400 for a malformed id, 404 when the artifact is not
+            found, 500 when the artifact module cannot be imported.
+    """
+    from urllib.parse import quote
+
+    from fastapi import Response
+
+    # fullmatch, because "$" with re.match would also accept a trailing newline.
+    if not re.fullmatch(r"[a-f0-9\-]{36}", artifact_id):
+        raise HTTPException(status_code=400, detail="Invalid artifact id")
+
+    try:
+        from website_profiling.tools.export_artifacts import read_artifact_bytes
+
+        result = read_artifact_bytes(artifact_id)
+    except ImportError as exc:
+        raise HTTPException(status_code=500, detail="Artifact module unavailable") from exc
+
+    if not result:
+        raise HTTPException(status_code=404, detail="Artifact not found")
+
+    meta, data = result
+    filename = meta.get("filename") or "export.bin"
+    mime_type = meta.get("mime_type") or "application/octet-stream"
+
+    # Fallback for clients that ignore filename*: printable ASCII only, with
+    # quotes and path separators replaced so the quoted-string stays intact.
+    ascii_name = re.sub(r'[^\x20-\x7e]', '_', filename)
+    ascii_name = re.sub(r'["\\/]', '_', ascii_name) or "export.bin"
+
+    # Extended parameter: the real (possibly non-ASCII) name, percent-encoded
+    # as UTF-8. Path separators are neutralised first, like in the fallback.
+    encoded_name = quote(re.sub(r'[\\/]', '_', filename), safe="")
+
+    return Response(
+        content=data,
+        media_type=mime_type,
+        headers={
+            "Content-Disposition": (
+                f'attachment; filename="{ascii_name}"; '
+                f"filename*=UTF-8''{encoded_name}"
+            ),
+        },
+    )
diff --git a/src/website_profiling/api/routers/compare.py b/src/website_profiling/api/routers/compare.py
new file mode 100644
index 00000000..14264d9f
--- /dev/null
+++ b/src/website_profiling/api/routers/compare.py
@@ -0,0 +1,61 @@
+"""Report comparison export — /api/compare/*."""
+from __future__ import annotations
+
+from typing import Annotated, Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import Response
+from psycopg import Connection
+from pydantic import BaseModel
+
+from ..deps import get_db
+
+# Router for the comparison endpoints; no path prefix is set at this level.
+router = APIRouter(tags=["compare"])
+
+# Annotated alias: a psycopg Connection supplied by the get_db dependency.
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+# Request body for POST /compare/export: the ids of the two reports to diff.
+# Both fields default to None; the handler answers 400 when either is falsy.
+class CompareExportBody(BaseModel):
+    reportIdA: Optional[int] = None
+    reportIdB: Optional[int] = None
+
+
+def _csv_escape(val: str) -> str:
+    """Quote a single CSV field when it needs quoting.
+
+    A value containing a double quote, a comma, a line feed or a carriage
+    return is wrapped in double quotes with embedded quotes doubled; any other
+    value is returned unchanged.
+    """
+    # An explicit tuple: the earlier ('",\n') was a plain string that only
+    # worked because iterating it yields its characters, and it missed "\r".
+    if any(ch in val for ch in ('"', ",", "\n", "\r")):
+        return '"' + val.replace('"', '""') + '"'
+    return val
+
+
+@router.post("/compare/export")
+def compare_export(body: CompareExportBody, conn: DbDep) -> Response:
+    """Export an added/removed/unchanged issue comparison of two reports as CSV.
+
+    Categories are matched by ``id`` (falling back to ``name``) and issues by
+    ``title`` (falling back to ``message``). Rows are sorted by category and
+    issue key so the same pair of reports always gives the same file, and
+    every cell is CSV-escaped.
+
+    Raises:
+        HTTPException: 400 when either report id is missing or falsy; 404 when
+            either report payload cannot be read.
+    """
+    if not body.reportIdA or not body.reportIdB:
+        raise HTTPException(status_code=400, detail="reportIdA and reportIdB required")
+
+    from website_profiling.db.report_store import read_report_payload
+
+    payload_a = read_report_payload(conn, body.reportIdA)
+    payload_b = read_report_payload(conn, body.reportIdB)
+    if not payload_a or not payload_b:
+        raise HTTPException(status_code=404, detail="One or both reports not found")
+
+    def _category_key(cat: dict) -> object:
+        return cat.get("id") or cat.get("name")
+
+    def _issue_key(issue: dict) -> object:
+        # NOTE(review): issues may carry their text under "message" rather
+        # than "title"; without the fallback they would all share the key
+        # None — confirm against the report schema.
+        return issue.get("title") or issue.get("message")
+
+    def _index(items: object, key_of) -> dict:
+        return {key_of(item): item for item in (items or [])}
+
+    cats_a = _index(payload_a.get("categories"), _category_key)
+    cats_b = _index(payload_b.get("categories"), _category_key)
+
+    lines = ["Category,Issue Title,Priority,Change\n"]
+    # key=str keeps the sort well-defined when ids and names (or None) mix.
+    for key in sorted(cats_a.keys() | cats_b.keys(), key=str):
+        issues_a = _index((cats_a.get(key) or {}).get("issues"), _issue_key)
+        issues_b = _index((cats_b.get(key) or {}).get("issues"), _issue_key)
+        for title in sorted(issues_a.keys() | issues_b.keys(), key=str):
+            in_a = title in issues_a
+            in_b = title in issues_b
+            if in_a and not in_b:
+                change = "removed"
+            elif in_b and not in_a:
+                change = "added"
+            else:
+                change = "unchanged"
+            # Prefer report B's priority, falling back to report A's.
+            priority = (issues_b.get(title) or issues_a.get(title) or {}).get("priority", "")
+            cells = (key, title, priority, change)
+            lines.append(",".join(_csv_escape(str(cell)) for cell in cells) + "\n")
+
+    csv_content = "".join(lines)
+    return Response(
+        content=csv_content,
+        media_type="text/csv",
+        headers={"Content-Disposition": "attachment; filename=compare_export.csv"},
+    )
diff --git a/src/website_profiling/api/routers/config.py b/src/website_profiling/api/routers/config.py
new file mode 100644
index 00000000..70c08f5a
--- /dev/null
+++ b/src/website_profiling/api/routers/config.py
@@ -0,0 +1,260 @@
+"""Config routes: pipeline-config, llm-config, secrets, app-settings."""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from psycopg import Connection
+from pydantic import BaseModel
+
+from ..deps import get_db
+
+# Router for the configuration endpoints; no path prefix is set at this level.
+router = APIRouter(tags=["config"])
+
+# Placeholder sent to clients in place of a stored secret value. The PUT
+# handlers in this module treat it as "keep the stored value".
+_MASK = "*"
+
+
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+
+def _mask_secrets(data: dict[str, Any]) -> dict[str, Any]:
+    """Return a copy of *data* with secret-ish values replaced by ``'*'``.
+
+    A value is replaced only when its key passes ``_is_secret_key`` and the
+    value is neither ``None`` nor something whose ``str()`` is empty; all
+    other entries are copied as they are.
+    """
+    def _masked_or_same(key: str, value: Any) -> Any:
+        has_text = value is not None and str(value) != ""
+        return _MASK if has_text and _is_secret_key(key) else value
+
+    return {key: _masked_or_same(key, value) for key, value in data.items()}
+
+
+def _is_secret_key(key: str) -> bool:
+    """Heuristic: does this config key name look like it holds a secret?
+
+    Case-insensitive. True when the name ends in ``_key`` or contains any of
+    ``api_key``, ``secret``, ``password`` or ``token``.
+    """
+    # NOTE(review): the substring test on "token" also matches names such as
+    # "max_tokens" — confirm whether such settings should be masked.
+    lowered = key.lower()
+    if lowered.endswith("_key"):
+        return True
+    return any(marker in lowered for marker in ("api_key", "secret", "password", "token"))
+
+
+def _read_llm_config_full(conn: Connection) -> list[dict[str, Any]]:
+    """Return the LLM config rows from ``config_store.read_llm_config_full``.
+
+    Callers in this module read ``key``, ``value`` and ``is_secret`` from each
+    row. The store is imported at call time rather than at module import.
+    """
+    from website_profiling.db.config_store import read_llm_config_full
+    return read_llm_config_full(conn)
+
+
+def _read_app_setting(conn: Connection, key: str) -> Optional[str]:
+    """Return the result of ``config_store.read_app_setting`` for *key*.
+
+    The store is imported at call time rather than at module import.
+    """
+    from website_profiling.db.config_store import read_app_setting
+    return read_app_setting(conn, key)
+
+
+def _write_app_setting(conn: Connection, key: str, value: str) -> None:
+    """Pass *key* and *value* to ``config_store.write_app_setting``.
+
+    The store is imported at call time rather than at module import.
+    """
+    from website_profiling.db.config_store import write_app_setting
+    write_app_setting(conn, key, value)
+
+
+# ---------------------------------------------------------------------------
+# pipeline-config
+# ---------------------------------------------------------------------------
+
+
+@router.get("/pipeline-config")
+def get_pipeline_config(conn: Annotated[Connection, Depends(get_db)]) -> dict[str, Any]:
+    """Return the stored pipeline configuration and its unknown keys."""
+    from website_profiling.db.config_store import read_pipeline_config
+
+    stored = read_pipeline_config(conn)
+    state, unknown_keys = stored
+    return {
+        "state": state,
+        "unknownKeys": unknown_keys,
+        "source": "db",
+    }
+
+
+# Request body for PUT /pipeline-config.
+#   state:       config values keyed by name; the handler stringifies them.
+#   unknownKeys: optional list of string-to-string dicts, passed through to
+#                the store unchanged (an empty list when omitted).
+class PipelineConfigBody(BaseModel):
+    state: dict[str, Any]
+    unknownKeys: Optional[list[dict[str, str]]] = None
+
+
+@router.put("/pipeline-config")
+def put_pipeline_config(
+    body: PipelineConfigBody,
+    conn: Annotated[Connection, Depends(get_db)],
+) -> dict[str, Any]:
+    """Store the pipeline configuration, with every key and value as a string."""
+    from website_profiling.db.config_store import write_pipeline_config
+
+    # NOTE(review): str() turns a None value into the text "None" — confirm
+    # that is what the store expects.
+    coerced: dict[str, str] = {}
+    for name, raw_value in body.state.items():
+        coerced[str(name)] = str(raw_value)
+    extras: list[dict[str, str]] = body.unknownKeys or []
+    write_pipeline_config(conn, coerced, extras)
+    return {"ok": True, "source": "db"}
+
+
+# ---------------------------------------------------------------------------
+# llm-config
+# ---------------------------------------------------------------------------
+
+
+@router.get("/llm-config")
+def get_llm_config(conn: Annotated[Connection, Depends(get_db)]) -> dict[str, Any]:
+    """Return the LLM configuration with non-empty secret values masked.
+
+    A row flagged ``is_secret`` whose stringified value is non-empty is
+    reported as ``'*'``; every other row is reported with its string value.
+    """
+    state: dict[str, Any] = {}
+    for row in _read_llm_config_full(conn):
+        name = str(row["key"])
+        value = str(row["value"])
+        hide = bool(row.get("is_secret")) and bool(value)
+        state[name] = _MASK if hide else value
+    return {"state": state, "source": "db"}
+
+
+# Request body for PUT /llm-config: config values keyed by name. A masked
+# placeholder value means "keep what is stored" (see put_llm_config).
+class LlmConfigBody(BaseModel):
+    state: dict[str, Any]
+
+
+@router.put("/llm-config")
+def put_llm_config(
+    body: LlmConfigBody,
+    conn: Annotated[Connection, Depends(get_db)],
+) -> dict[str, Any]:
+    """Write the submitted LLM configuration.
+
+    A value that looks like a masked placeholder (``*``, ``••••``, or at most
+    four characters starting with ``*``, after stripping) keeps the stored
+    value when the key already exists; otherwise the submitted text is written
+    (``None`` becomes an empty string). A key is marked secret when it is
+    already stored as secret or its name passes ``_is_secret_key``.
+    """
+    from website_profiling.db.config_store import write_llm_config
+
+    stored_rows = _read_llm_config_full(conn)
+    stored_values: dict[str, str] = {str(r["key"]): str(r["value"]) for r in stored_rows}
+    stored_secret_names: set[str] = {str(r["key"]) for r in stored_rows if r.get("is_secret")}
+
+    entries: dict[str, str] = {}
+    secret_keys: set[str] = set()
+
+    for name, raw_value in body.state.items():
+        text = "" if raw_value is None else str(raw_value)
+        trimmed = text.strip()
+        looks_masked = trimmed in (_MASK, "••••") or (
+            trimmed.startswith("*") and len(trimmed) <= 4
+        )
+        # The client echoed the mask back: preserve the value already stored.
+        keep_stored = looks_masked and name in stored_values
+        entries[name] = stored_values[name] if keep_stored else text
+
+        if name in stored_secret_names or _is_secret_key(name):
+            secret_keys.add(name)
+
+    write_llm_config(conn, entries, secret_keys)
+    return {"ok": True}
+
+
+# ---------------------------------------------------------------------------
+# secrets
+# ---------------------------------------------------------------------------
+
+
+@router.get("/secrets")
+def get_secrets(conn: Annotated[Connection, Depends(get_db)]) -> dict[str, Any]:
+    """Return LLM config values and Google app settings with secrets masked.
+
+    LLM rows with an empty value are omitted. A row that is flagged secret,
+    or whose key passes ``_is_secret_key``, is reported as ``'*'`` together
+    with a ``<key>_masked: True`` marker. Non-empty Google fields get a
+    ``google_`` prefix and the same treatment based on the field name.
+    ``google_has_service_account`` reports whether service account JSON is
+    stored.
+    """
+    from website_profiling.db.google_app_store import read_google_app_settings
+
+    state: dict[str, Any] = {}
+    for row in _read_llm_config_full(conn):
+        name = str(row["key"])
+        value = str(row["value"])
+        secret = bool(row.get("is_secret")) or _is_secret_key(name)
+        if not value:
+            continue
+        if secret:
+            state[name] = _MASK
+            state[f"{name}_masked"] = True
+        else:
+            state[name] = value
+
+    google = read_google_app_settings(conn)
+    for field in ("client_id", "client_secret", "developer_token", "login_customer_id"):
+        raw = str(google.get(field) or "")
+        if not raw:
+            continue
+        if _is_secret_key(field):
+            state[f"google_{field}"] = _MASK
+            state[f"google_{field}_masked"] = True
+        else:
+            state[f"google_{field}"] = raw
+    state["google_has_service_account"] = bool(google.get("service_account_json"))
+
+    return {"state": state, "source": "db"}
+
+
+# Request body for PUT /secrets: a partial update keyed like the GET /secrets
+# state. A masked placeholder value leaves the stored value untouched.
+class SecretsBody(BaseModel):
+    state: dict[str, Any]
+
+
+@router.put("/secrets")
+def put_secrets(
+    body: SecretsBody,
+    conn: Annotated[Connection, Depends(get_db)],
+) -> dict[str, Any]:
+    """Apply a partial update to the LLM config and the Google app settings.
+
+    Keys ending in ``_masked`` and ``google_has_service_account`` are ignored.
+    ``google_*`` keys naming one of the four known Google fields are saved via
+    ``save_google_app_settings``; every other key is merged over the existing
+    LLM config. A masked placeholder value leaves the stored value untouched.
+
+    Returns:
+        ``{"ok": True}``.
+    """
+    from website_profiling.db.config_store import read_llm_config, write_llm_config
+    from website_profiling.db.google_app_store import read_google_app_settings, save_google_app_settings
+
+    existing_llm = read_llm_config(conn)
+    existing_rows = _read_llm_config_full(conn)
+    existing_secrets_set: set[str] = {str(r["key"]) for r in existing_rows if r.get("is_secret")}
+
+    # Start from what is stored so keys absent from the request are written
+    # back unchanged and stay flagged as secret.
+    llm_updates: dict[str, str] = dict(existing_llm)
+    llm_secret_keys: set[str] = set(existing_secrets_set)
+    google_patch: dict[str, Any] = {}
+
+    for k, v in body.state.items():
+        # Marker/flag keys produced by GET /secrets carry no value to store.
+        if k.endswith("_masked") or k == "google_has_service_account":
+            continue
+
+        val = str(v) if v is not None else ""
+        # Placeholder forms: "*", "••••", or up to four characters starting
+        # with "*" (after stripping whitespace).
+        is_masked_sentinel = val.strip() in (_MASK, "••••") or (
+            val.strip().startswith("*") and len(val.strip()) <= 4
+        )
+
+        if k.startswith("google_"):
+            field = k[len("google_"):]
+            # Any other google_* key falls through and is dropped silently.
+            if field in ("client_id", "client_secret", "developer_token", "login_customer_id"):
+                if not is_masked_sentinel:
+                    google_patch[field] = val
+        else:
+            if is_masked_sentinel:
+                # Preserve existing
+                pass
+            else:
+                llm_updates[k] = val
+                if _is_secret_key(k):
+                    llm_secret_keys.add(k)
+
+    # The LLM config is rewritten on every call, even when nothing changed.
+    write_llm_config(conn, llm_updates, llm_secret_keys)
+
+    if google_patch:
+        save_google_app_settings(conn, google_patch)
+
+    return {"ok": True}
+
+
+# ---------------------------------------------------------------------------
+# app-settings
+# ---------------------------------------------------------------------------
+
+
+@router.get("/app-settings")
+def get_app_setting(
+    conn: Annotated[Connection, Depends(get_db)],
+    key: str = Query(..., description="Settings key to retrieve"),
+) -> dict[str, Any]:
+    """Look up one application setting by its (whitespace-stripped) key.
+
+    Raises:
+        HTTPException: 400 when the key is empty or only whitespace.
+    """
+    if not key or not key.strip():
+        raise HTTPException(status_code=400, detail="Missing key query parameter")
+    setting_name = key.strip()
+    stored_value = _read_app_setting(conn, setting_name)
+    return {"key": setting_name, "value": stored_value}
+
+
+# Request body for PUT /app-settings: the setting's key (stripped by the
+# handler, must not be blank) and the value to store for it.
+class AppSettingBody(BaseModel):
+    key: str
+    value: str
+
+
+@router.put("/app-settings")
+def put_app_setting(
+    body: AppSettingBody,
+    conn: Annotated[Connection, Depends(get_db)],
+) -> dict[str, Any]:
+    """Store one application setting under its whitespace-stripped key.
+
+    Raises:
+        HTTPException: 400 when the key is empty or only whitespace.
+    """
+    setting_name = body.key.strip() if body.key else ""
+    if not setting_name:
+        raise HTTPException(status_code=400, detail="key must not be empty")
+    _write_app_setting(conn, setting_name, body.value)
+    return {"ok": True}
diff --git a/src/website_profiling/api/routers/content.py b/src/website_profiling/api/routers/content.py
new file mode 100644
index 00000000..aa281200
--- /dev/null
+++ b/src/website_profiling/api/routers/content.py
@@ -0,0 +1,257 @@
+"""Content routers — /api/content/* and /api/backlinks/* and /api/content-drafts/*."""
+from __future__ import annotations
+
+from typing import Annotated, Any
+
+from fastapi import APIRouter, Body, Depends, HTTPException, Query
+from psycopg import Connection
+
+from ..deps import get_db
+from website_profiling.db import content_draft_store
+from website_profiling.integrations.google.gsc_links_store import list_backlinks_velocity
+
+router = APIRouter(tags=["content"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+_VALID_WIZARD_STEPS = {"intents", "content_types", "tones", "titles", "outline", "draft", "research"}
+
+
+# ── GET /api/backlinks/velocity ──────────────────────────────────────────────
+
+@router.get("/backlinks/velocity")
+def backlinks_velocity(
+    conn: DbDep,
+    propertyId: int = Query(...),
+) -> dict[str, Any]:
+    """Return the backlink velocity snapshots stored for ``propertyId``.
+
+    A ``propertyId`` of 0 is treated as missing and answered with 400.
+    """
+    if propertyId:
+        return {"snapshots": list_backlinks_velocity(conn, propertyId)}
+    raise HTTPException(status_code=400, detail="propertyId required")
+
+
+# ── POST /api/backlinks/competitor-import ────────────────────────────────────
+
+@router.post("/backlinks/competitor-import")
+def backlinks_competitor_import(
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Build a referring-domain gap from a competitor's CSV export.
+
+    Body keys read: ``competitor`` and ``csvText`` (both required, 400 when
+    blank) and an optional ``ourDomains`` collection. Any exception raised
+    while importing the helpers, parsing the CSV, or building the gap is
+    reported as a 500 whose detail includes the exception text.
+    """
+    competitor_name = str(body.get("competitor") or "").strip()
+    raw_csv = str(body.get("csvText") or "")
+    own_domains = body.get("ourDomains") or []
+
+    if competitor_name == "" or raw_csv.strip() == "":
+        raise HTTPException(status_code=400, detail="competitor and csvText required")
+
+    try:
+        from website_profiling.integrations.google.competitor_links import (  # type: ignore[import]
+            build_competitor_domain_gap,
+            parse_referring_domains_from_csv,
+        )
+
+        referring = parse_referring_domains_from_csv(raw_csv)
+        return {"gap": build_competitor_domain_gap(set(own_domains), competitor_name, referring)}
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"Competitor backlink import failed: {exc}")
+
+
+# ── POST /api/backlinks/third-party-import ───────────────────────────────────
+
+@router.post("/backlinks/third-party-import")
+def backlinks_third_party_import(
+    conn: DbDep,
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Import a third-party (Moz / Majestic) backlink CSV as an overlay.
+
+    Body keys read: ``propertyId`` and ``csvText`` (required), ``provider``
+    (defaults to ``"moz"``; must be ``moz`` or ``majestic``) and optional
+    ``ourDomains``.
+
+    Raises:
+        HTTPException 400: ``propertyId`` is missing, zero, or not coercible
+            to an integer; ``csvText`` is blank; or ``provider`` is unknown.
+        HTTPException 500: building or storing the overlay failed.
+    """
+    # A non-numeric propertyId is a client error, not a server crash.
+    try:
+        property_id = int(body.get("propertyId") or 0)
+    except (TypeError, ValueError, OverflowError):
+        raise HTTPException(status_code=400, detail="propertyId must be an integer") from None
+    provider = str(body.get("provider") or "moz").strip().lower()
+    csv_text = str(body.get("csvText") or "")
+    our_domains = body.get("ourDomains") or []
+
+    if not property_id or not csv_text.strip():
+        raise HTTPException(status_code=400, detail="propertyId and csvText required")
+    if provider not in ("moz", "majestic"):
+        raise HTTPException(status_code=400, detail="provider must be moz or majestic")
+
+    try:
+        from website_profiling.integrations.links.third_party_csv import (  # type: ignore[import]
+            build_third_party_overlay,
+        )
+        from website_profiling.integrations.google.gsc_links_store import (  # type: ignore[import]
+            import_third_party_links_overlay,
+        )
+
+        overlay = build_third_party_overlay(provider, csv_text, our_domains)
+        result = import_third_party_links_overlay(conn, property_id, overlay)
+        return result  # type: ignore[return-value]
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"Third-party backlink import failed: {exc}")
+
+
+# ── POST /api/content/analyze ─────────────────────────────────────────────────
+
+@router.post("/content/analyze")
+def content_analyze(
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Run ``analyze_content_draft`` for a keyword and draft content.
+
+    Body keys read: ``keyword`` (required), optional ``propertyId``,
+    ``bodyHtml``, ``titleTag``, ``metaDescription``, ``landingUrl``,
+    ``useAi``, ``refresh`` and ``title``.
+
+    Raises:
+        HTTPException 400: ``keyword`` is blank, or ``propertyId`` is present
+            but not coercible to an integer.
+        HTTPException 500: the analysis helper failed.
+    """
+    keyword = str(body.get("keyword") or "").strip()
+    if not keyword:
+        raise HTTPException(status_code=400, detail="keyword required")
+
+    property_id_raw = body.get("propertyId")
+    # Malformed ids are rejected as a client error instead of surfacing as a 500.
+    try:
+        property_id = int(property_id_raw) if property_id_raw else None
+    except (TypeError, ValueError, OverflowError):
+        raise HTTPException(status_code=400, detail="propertyId must be an integer") from None
+
+    try:
+        from website_profiling.content_studio.ai_suggest import analyze_content_draft  # type: ignore[import]
+
+        analysis = analyze_content_draft(
+            property_id,
+            keyword,
+            body.get("bodyHtml") or "",
+            body.get("titleTag") or "",
+            body.get("metaDescription") or "",
+            body.get("landingUrl") or None,
+            use_ai=bool(body.get("useAi")),
+            refresh=bool(body.get("refresh")),
+            title=body.get("title") or "",
+        )
+        return {"analysis": analysis}
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"Content analyze failed: {exc}")
+
+
+# ── POST /api/content/score ───────────────────────────────────────────────────
+
+@router.post("/content/score")
+def content_score(
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Run ``score_content_draft`` for a keyword and draft content.
+
+    Body keys read: ``keyword`` (required), optional ``propertyId``,
+    ``bodyHtml``, ``titleTag``, ``metaDescription`` and ``landingUrl``.
+
+    Raises:
+        HTTPException 400: ``keyword`` is blank, or ``propertyId`` is present
+            but not coercible to an integer.
+        HTTPException 500: the scoring helper failed.
+    """
+    keyword = str(body.get("keyword") or "").strip()
+    if not keyword:
+        raise HTTPException(status_code=400, detail="keyword required")
+
+    property_id_raw = body.get("propertyId")
+    # Malformed ids are rejected as a client error instead of surfacing as a 500.
+    try:
+        property_id = int(property_id_raw) if property_id_raw else None
+    except (TypeError, ValueError, OverflowError):
+        raise HTTPException(status_code=400, detail="propertyId must be an integer") from None
+
+    try:
+        from website_profiling.content_studio.score import score_content_draft  # type: ignore[import]
+
+        score = score_content_draft(
+            property_id,
+            keyword,
+            body.get("bodyHtml") or "",
+            body.get("titleTag") or "",
+            body.get("metaDescription") or "",
+            body.get("landingUrl") or None,
+        )
+        return {"score": score}
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"Content score failed: {exc}")
+
+
+# ── POST /api/content/wizard ──────────────────────────────────────────────────
+
+@router.post("/content/wizard")
+def content_wizard(
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Execute one content-wizard step via ``run_wizard_step``.
+
+    ``step`` must be one of ``_VALID_WIZARD_STEPS`` (400 otherwise). The
+    remaining body fields are normalised to strings, and ``outline`` to a
+    list, before being handed to the step runner. A dict result whose ``ok``
+    is ``False`` becomes a 400; any other exception becomes a 500.
+    """
+    step = str(body.get("step") or "").strip()
+    if step not in _VALID_WIZARD_STEPS:
+        raise HTTPException(status_code=400, detail="Invalid wizard step")
+
+    outline = body.get("outline")
+    if not isinstance(outline, list):
+        outline = []
+
+    payload: dict[str, Any] = {
+        "keyword": str(body.get("keyword") or "").strip(),
+        "locale": str(body.get("locale") or "en-US"),
+    }
+    # Plain string fields that default to "" when absent or falsy.
+    for field in ("intent", "contentType", "tone", "title"):
+        payload[field] = str(body.get(field) or "")
+    payload["outline"] = outline
+
+    try:
+        from website_profiling.content_studio.wizard import run_wizard_step  # type: ignore[import]
+
+        outcome = run_wizard_step(step, payload)
+        step_failed = isinstance(outcome, dict) and outcome.get("ok") is False
+        if step_failed:
+            raise HTTPException(status_code=400, detail=outcome.get("error") or "Wizard step failed")
+        return {"result": outcome}
+    except HTTPException:
+        # Let deliberate HTTP errors through instead of wrapping them as 500s.
+        raise
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"Wizard step failed: {exc}")
+
+
+# ── GET /api/content-drafts ───────────────────────────────────────────────────
+
+@router.get("/content-drafts")
+def list_content_drafts_route(
+    conn: DbDep,
+    propertyId: int = Query(...),
+) -> dict[str, Any]:
+    """List the content drafts stored for ``propertyId`` (400 when the id is 0)."""
+    if propertyId:
+        return {"drafts": content_draft_store.list_content_drafts(conn, propertyId)}
+    raise HTTPException(status_code=400, detail="propertyId required")
+
+
+# ── POST /api/content-drafts ──────────────────────────────────────────────────
+
+@router.post("/content-drafts")
+def create_content_draft_route(
+    conn: DbDep,
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Create a content draft for a property and return its id.
+
+    Body keys read: ``propertyId`` (required), ``title``, ``target_keyword``,
+    ``landing_url``, ``status``, ``body_html``, ``title_tag`` and
+    ``meta_description``. Missing text fields fall back to the defaults
+    visible below.
+
+    Raises:
+        HTTPException 400: ``propertyId`` is missing, zero, or not coercible
+            to an integer.
+    """
+    # A non-numeric propertyId is a client error, not a server crash.
+    try:
+        property_id = int(body.get("propertyId") or 0)
+    except (TypeError, ValueError, OverflowError):
+        raise HTTPException(status_code=400, detail="propertyId must be an integer") from None
+    if not property_id:
+        raise HTTPException(status_code=400, detail="propertyId required")
+
+    draft_id = content_draft_store.create_content_draft(
+        conn,
+        property_id,
+        title=str(body.get("title") or "Untitled draft"),
+        target_keyword=str(body.get("target_keyword") or ""),
+        # A blank landing URL is stored as None rather than "".
+        landing_url=str(body.get("landing_url") or "").strip() or None,
+        status=str(body.get("status") or "draft"),
+        body_html=str(body.get("body_html") or ""),
+        title_tag=str(body.get("title_tag") or ""),
+        meta_description=str(body.get("meta_description") or ""),
+    )
+    return {"id": draft_id, "propertyId": property_id}
+
+
+# ── GET /api/content-drafts/{id} ─────────────────────────────────────────────
+
+@router.get("/content-drafts/{draft_id}")
+def get_content_draft_route(conn: DbDep, draft_id: int) -> dict[str, Any]:
+    """Fetch one content draft; 400 for id 0, 404 when the store returns nothing."""
+    if draft_id == 0:
+        raise HTTPException(status_code=400, detail="invalid draft id")
+    found = content_draft_store.get_content_draft(conn, draft_id)
+    if found:
+        return {"draft": found}
+    raise HTTPException(status_code=404, detail="draft not found")
+
+
+# ── PATCH /api/content-drafts/{id} ───────────────────────────────────────────
+
+@router.patch("/content-drafts/{draft_id}")
+def update_content_draft_route(
+    conn: DbDep,
+    draft_id: int,
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Apply the raw JSON body as a patch to a content draft.
+
+    Responds with 400 for id 0 and 404 when the store returns a falsy result.
+    """
+    if draft_id == 0:
+        raise HTTPException(status_code=400, detail="invalid draft id")
+    updated = content_draft_store.update_content_draft(conn, draft_id, body)
+    if updated:
+        return {"draft": updated}
+    raise HTTPException(status_code=404, detail="draft not found")
+
+
+# ── DELETE /api/content-drafts/{id} ──────────────────────────────────────────
+
+@router.delete("/content-drafts/{draft_id}")
+def delete_content_draft_route(conn: DbDep, draft_id: int) -> dict[str, Any]:
+    """Delete a content draft; 400 for id 0, 404 when the store reports no deletion."""
+    if draft_id == 0:
+        raise HTTPException(status_code=400, detail="invalid draft id")
+    removed = content_draft_store.delete_content_draft(conn, draft_id)
+    if removed:
+        return {"ok": True}
+    raise HTTPException(status_code=404, detail="draft not found")
diff --git a/src/website_profiling/api/routers/crawl.py b/src/website_profiling/api/routers/crawl.py
new file mode 100644
index 00000000..3638c8c5
--- /dev/null
+++ b/src/website_profiling/api/routers/crawl.py
@@ -0,0 +1,40 @@
+"""Crawl routes: /api/crawl/*"""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from psycopg import Connection
+
+from ..deps import get_db
+
+router = APIRouter(tags=["crawl"])
+
+
+@router.get("/crawl/browser-status")
+def browser_status_check() -> dict[str, Any]:
+    """Report Playwright + Chromium availability as returned by ``ensure_browser_deps``."""
+    # Imported lazily, as in the rest of this module's handlers.
+    from website_profiling.crawl.fetchers import ensure_browser_deps
+
+    status = ensure_browser_deps()
+    return status
+
+
+@router.get("/crawl/page-html")
+def get_page_html(
+    conn: Annotated[Connection, Depends(get_db)],
+    url: str = Query(..., description="Page URL to retrieve stored HTML for"),
+    crawlRunId: Optional[int] = Query(None, description="Crawl run ID"),
+) -> dict[str, Any]:
+    """Return what ``read_page_html`` has stored for ``url`` in a crawl run.
+
+    Responds with 400 when ``crawlRunId`` is absent or 0, and with 404 when
+    the store has no entry for that URL / run pair.
+    """
+    from website_profiling.db.html_store import read_page_html
+
+    if crawlRunId is None or crawlRunId == 0:
+        raise HTTPException(status_code=400, detail="crawlRunId is required")
+
+    stored = read_page_html(conn, crawlRunId, url)
+    if stored is not None:
+        return stored
+    raise HTTPException(
+        status_code=404,
+        detail=f"No stored HTML found for url={url!r} in crawlRunId={crawlRunId}",
+    )
diff --git a/src/website_profiling/api/routers/dashboards.py b/src/website_profiling/api/routers/dashboards.py
new file mode 100644
index 00000000..095767b5
--- /dev/null
+++ b/src/website_profiling/api/routers/dashboards.py
@@ -0,0 +1,147 @@
+"""Dashboards router — /api/dashboards/*"""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from psycopg import Connection
+
+from ..deps import get_db
+from website_profiling.db import dashboard_store
+
+router = APIRouter(tags=["dashboards"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+class DashboardCreateBody(BaseModel):
+ # Request payload for creating a dashboard.
+ # Property the dashboard belongs to.
+ propertyId: int
+ # Optional display name.
+ name: Optional[str] = None
+ # Optional layout document of arbitrary JSON shape.
+ layoutJson: Optional[Any] = None
+
+
+class DashboardUpdateBody(BaseModel):
+ # Request payload for updating a dashboard; optional fields default to None.
+ # Property the dashboard belongs to.
+ propertyId: int
+ # New display name, if any.
+ name: Optional[str] = None
+ # New layout document of arbitrary JSON shape, if any.
+ layoutJson: Optional[Any] = None
+ # New default-dashboard flag, if any.
+ isDefault: Optional[bool] = None
+
+
+@router.get("/dashboards")
+def list_dashboards(
+    conn: DbDep,
+    propertyId: int = Query(..., description="Property ID"),
+) -> dict[str, Any]:
+    """List the dashboards stored for ``propertyId``."""
+    rows = dashboard_store.list_dashboards(conn, propertyId)
+    return {"dashboards": rows}
+
+
+@router.post("/dashboards", status_code=201)
+def create_dashboard(body: DashboardCreateBody, conn: DbDep) -> dict[str, Any]:
+    """Create a dashboard, defaulting a blank name and a missing layout."""
+    fallback_name = "Untitled dashboard"
+    # Falls back both when the name is absent and when it is only whitespace.
+    title = (body.name or fallback_name).strip() or fallback_name
+    layout = {} if body.layoutJson is None else body.layoutJson
+    return {"dashboard": dashboard_store.create_dashboard(conn, body.propertyId, title, layout)}
+
+
+@router.get("/dashboards/{dashboard_id}")
+def get_dashboard(
+    dashboard_id: int,
+    conn: DbDep,
+    propertyId: int = Query(..., description="Property ID"),
+) -> dict[str, Any]:
+    """Fetch one dashboard for a property; 404 when the store returns nothing."""
+    found = dashboard_store.get_dashboard(conn, dashboard_id, propertyId)
+    if found:
+        return {"dashboard": found}
+    raise HTTPException(status_code=404, detail="Not found")
+
+
+@router.put("/dashboards/{dashboard_id}")
+def update_dashboard(dashboard_id: int, body: DashboardUpdateBody, conn: DbDep) -> dict[str, Any]:
+    """Update a dashboard's name, layout and default flag.
+
+    The name is trimmed when supplied. Responds with 404 when the store
+    returns a falsy result.
+    """
+    trimmed_name = None if body.name is None else body.name.strip()
+    updated = dashboard_store.update_dashboard(
+        conn,
+        dashboard_id,
+        body.propertyId,
+        name=trimmed_name,
+        layout_json=body.layoutJson,
+        is_default=body.isDefault,
+    )
+    if updated:
+        return {"dashboard": updated}
+    raise HTTPException(status_code=404, detail="Not found")
+
+
+@router.delete("/dashboards/{dashboard_id}")
+def delete_dashboard(
+    dashboard_id: int,
+    conn: DbDep,
+    propertyId: int = Query(..., description="Property ID"),
+) -> dict[str, Any]:
+    """Delete a dashboard for a property; 404 when the store reports no deletion."""
+    removed = dashboard_store.delete_dashboard(conn, dashboard_id, propertyId)
+    if removed:
+        return {"ok": True}
+    raise HTTPException(status_code=404, detail="Not found")
+
+
+class DashboardAiGenerateBody(BaseModel):
+ # Request payload for AI-assisted dashboard generation.
+ # Generation mode string.
+ mode: str
+ # Free-text instruction for the generator.
+ prompt: str
+ # Catalog entries, a list of JSON objects.
+ catalog: list[dict[str, Any]]
+ # String-to-string mapping of visualisation types.
+ viz_types: dict[str, str]
+ # Help text for DashScript.
+ dashscript_help: str
+ # Optional tool name.
+ toolName: Optional[str] = None
+ # Optional property id.
+ propertyId: Optional[int] = None
+ # Optional report id.
+ reportId: Optional[int] = None
+ # Optional current state, arbitrary JSON.
+ current: Optional[Any] = None
+ # Optional caller-supplied sample data.
+ sample: Optional[dict[str, Any]] = None
+
+
+def _truncate_tool_sample(data: dict[str, Any]) -> dict[str, Any]:
+    """Return a shallow copy of ``data`` with every list value cut to its first two items.
+
+    Non-list values (including tuples) are carried over unchanged; the input
+    mapping is not modified.
+    """
+    return {
+        name: (value[:2] if isinstance(value, list) else value)
+        for name, value in data.items()
+    }
+
+
+@router.post("/dashboards/ai-generate")
+def dashboards_ai_generate(body: DashboardAiGenerateBody, conn: DbDep) -> JSONResponse:
+    """Generate DashScript, a widget, or a full dashboard via LLM.
+
+    Validates ``mode`` and ``prompt`` (400 on failure) and assembles the
+    generator payload. When the caller supplies no ``sample`` but does name a
+    tool and a property (script / widget modes only), it tries to attach a
+    truncated tool result as the sample. The generator's result is returned
+    as JSON, with status 503 when the result flags ``missing``, 500 for any
+    other ``ok: False`` result, and 200 otherwise.
+    """
+    mode = str(body.mode or "widget").strip().lower()
+    if mode not in {"script", "widget", "dashboard"}:
+        raise HTTPException(status_code=400, detail="mode must be script, widget, or dashboard")
+    prompt = str(body.prompt or "").strip()
+    if prompt == "":
+        raise HTTPException(status_code=400, detail="prompt required")
+
+    payload: dict[str, Any] = dict(
+        mode=mode,
+        prompt=prompt,
+        catalog=body.catalog,
+        viz_types=body.viz_types,
+        dashscript_help=body.dashscript_help,
+        current=body.current,
+    )
+
+    wants_tool_sample = bool(body.toolName and body.propertyId and mode in ("script", "widget"))
+    if body.sample is not None:
+        payload["sample"] = body.sample
+    elif wants_tool_sample:
+        # Best effort: a failing tool lookup must not block generation.
+        try:
+            from website_profiling.tools.audit_tools import AuditToolContext
+            from website_profiling.tools.audit_tools.registry import dispatch_tool
+
+            tool_ctx = AuditToolContext(property_id=body.propertyId, report_id=body.reportId)
+            tool_output = dispatch_tool(body.toolName, {}, context=tool_ctx, conn=conn)
+            if isinstance(tool_output, dict) and "error" not in tool_output:
+                payload["sample"] = _truncate_tool_sample(tool_output)
+        except Exception:
+            pass
+
+    from website_profiling.db.config_store import read_llm_config
+    from website_profiling.llm.dashboard_ai import generate_dashboard_ai
+
+    llm_cfg = read_llm_config(conn)
+    outcome = generate_dashboard_ai(payload, cfg=llm_cfg or None)
+    if outcome.get("ok") is not False:
+        return JSONResponse(content=outcome)
+    failure_status = 503 if outcome.get("missing") else 500
+    return JSONResponse(content=outcome, status_code=failure_status)
diff --git a/src/website_profiling/api/routers/filters.py b/src/website_profiling/api/routers/filters.py
new file mode 100644
index 00000000..b2fb19d7
--- /dev/null
+++ b/src/website_profiling/api/routers/filters.py
@@ -0,0 +1,55 @@
+"""Saved filters router — /api/filters"""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from pydantic import BaseModel
+from psycopg import Connection
+
+from ..deps import get_db
+from website_profiling.db import saved_filter_store
+
+router = APIRouter(tags=["filters"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+class FilterUpsertBody(BaseModel):
+ # Request payload for creating or replacing a saved filter.
+ # Property the filter belongs to.
+ propertyId: int
+ # Filter name.
+ name: str
+ # Optional filter definition of arbitrary JSON shape.
+ filterJson: Optional[Any] = None
+
+
+class FilterDeleteBody(BaseModel):
+ # Request payload identifying the saved filter to delete.
+ # Property the filter belongs to.
+ propertyId: int
+ # Filter name.
+ name: str
+
+
+@router.get("/filters")
+def list_filters(
+    conn: DbDep,
+    propertyId: int = Query(..., description="Property ID"),
+) -> dict[str, Any]:
+    """List the saved filters stored for ``propertyId``."""
+    saved = saved_filter_store.list_saved_filters(conn, propertyId)
+    return {"filters": saved}
+
+
+@router.post("/filters")
+def upsert_filter(body: FilterUpsertBody, conn: DbDep) -> dict[str, Any]:
+    """Create or replace a saved filter by (property, name).
+
+    Responds with 400 when the property id is 0 or the trimmed name is empty.
+    A ``filterJson`` that is not a dict is stored as an empty dict.
+    """
+    filter_name = (body.name or "").strip()
+    if not (body.propertyId and filter_name):
+        raise HTTPException(status_code=400, detail="propertyId and name required")
+    definition = body.filterJson if isinstance(body.filterJson, dict) else {}
+    saved_filter_store.upsert_saved_filter(conn, body.propertyId, filter_name, definition)
+    return {"ok": True}
+
+
+@router.delete("/filters")
+def delete_filter(body: FilterDeleteBody, conn: DbDep) -> dict[str, Any]:
+    """Delete a saved filter by (property, name).
+
+    Responds with 400 when the property id is 0 or the trimmed name is empty,
+    and with 404 when the store reports that nothing was deleted.
+    """
+    filter_name = (body.name or "").strip()
+    if not (body.propertyId and filter_name):
+        raise HTTPException(status_code=400, detail="propertyId and name required")
+    if saved_filter_store.delete_saved_filter(conn, body.propertyId, filter_name):
+        return {"ok": True}
+    raise HTTPException(status_code=404, detail="filter not found")
diff --git a/src/website_profiling/api/routers/health.py b/src/website_profiling/api/routers/health.py
new file mode 100644
index 00000000..fe00d023
--- /dev/null
+++ b/src/website_profiling/api/routers/health.py
@@ -0,0 +1,17 @@
+"""GET /api/health — liveness + DB check."""
+from __future__ import annotations
+
+from typing import Annotated
+
+from fastapi import APIRouter, Depends
+from psycopg import Connection
+
+from ..deps import get_db
+
+router = APIRouter(tags=["health"])
+
+
+@router.get("/health")
+def health_check(conn: Annotated[Connection, Depends(get_db)]) -> dict:
+    """Liveness probe: run ``SELECT 1`` on the injected connection, then report up."""
+    # Any database failure surfaces as an exception from this call.
+    conn.execute("SELECT 1")
+    report = {"ok": True, "database": "up"}
+    return report
diff --git a/src/website_profiling/api/routers/integrations.py b/src/website_profiling/api/routers/integrations.py
new file mode 100644
index 00000000..c8a6fe1d
--- /dev/null
+++ b/src/website_profiling/api/routers/integrations.py
@@ -0,0 +1,549 @@
+"""Integrations routers — /api/integrations/google/* and /api/integrations/bing/*."""
+from __future__ import annotations
+
+import json
+import sys
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Body, Depends, HTTPException, Query
+from psycopg import Connection
+
+from ..deps import get_db
+
+router = APIRouter(prefix="/integrations", tags=["integrations"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+# ── Helpers ────────────────────────────────────────────────────────────────────
+
+def _google_public_status(conn: Connection) -> dict[str, Any]:
+    """Build a public status dict from google_app_settings.
+
+    Only presence flags are exposed for credentials; the one concrete value
+    surfaced is the service account's ``client_email``.
+
+    Args:
+        conn: Open database connection passed to ``read_google_app_settings``.
+
+    Returns:
+        A dict with ``hasClientId``, ``hasClientSecret``, ``hasOAuthApp``,
+        ``hasServiceAccount``, ``serviceAccountEmail``, ``dateRangeDays``,
+        ``hasDeveloperToken`` and ``hasLoginCustomerId``.
+    """
+    from website_profiling.db.google_app_store import read_google_app_settings
+
+    cfg = read_google_app_settings(conn)
+    has_client_id = bool(cfg.get("client_id"))
+    has_client_secret = bool(cfg.get("client_secret"))
+    sa = cfg.get("service_account_json")
+    has_service_account = bool(sa)
+    # Only a dict can yield an email; a truthy non-dict value (for example raw
+    # JSON text) would otherwise raise AttributeError on ``.get``.
+    sa_email = sa.get("client_email") if has_service_account and isinstance(sa, dict) else None
+    return {
+        "hasClientId": has_client_id,
+        "hasClientSecret": has_client_secret,
+        "hasOAuthApp": has_client_id and has_client_secret,
+        "hasServiceAccount": has_service_account,
+        "serviceAccountEmail": sa_email,
+        "dateRangeDays": cfg.get("default_date_range_days", 28),
+        "hasDeveloperToken": bool(cfg.get("developer_token")),
+        "hasLoginCustomerId": bool(cfg.get("login_customer_id")),
+    }
+
+
+# ── GET /api/integrations/google/credentials ──────────────────────────────────
+
+@router.get("/google/credentials")
+def get_google_credentials(conn: DbDep) -> dict[str, Any]:
+    """Return the full app-level Google settings, secrets included.
+
+    Intended for server-side / local admin use only. String settings are
+    trimmed, the service account is returned only when it is a dict, and the
+    date range falls back to 28 days.
+    """
+    from website_profiling.db.google_app_store import read_google_app_settings
+
+    cfg = read_google_app_settings(conn)
+
+    def _text(setting: str) -> str:
+        # Missing or falsy settings are reported as "".
+        return str(cfg.get(setting) or "").strip()
+
+    service_account = cfg.get("service_account_json")
+    if not isinstance(service_account, dict):
+        service_account = None
+
+    return {
+        "clientId": _text("client_id"),
+        "clientSecret": _text("client_secret"),
+        "serviceAccount": service_account,
+        "dateRangeDays": int(cfg.get("default_date_range_days") or 28),
+        "developerToken": _text("developer_token"),
+        "loginCustomerId": _text("login_customer_id"),
+    }
+
+
+# ── GET /api/integrations/google/status ───────────────────────────────────────
+
+@router.get("/google/status")
+def google_status(conn: DbDep) -> dict[str, Any]:
+    """Return the public Google status plus the last fetch timestamp."""
+    from website_profiling.integrations.google.store import read_last_google_fetched_at
+
+    payload = _google_public_status(conn)
+    last_fetch = read_last_google_fetched_at(conn)
+    payload["lastFetchedAt"] = last_fetch
+    return payload
+
+
+# ── POST /api/integrations/google/credentials ─────────────────────────────────
+
+@router.post("/google/credentials")
+def save_google_credentials(
+    conn: DbDep,
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Save app-level Google credentials and return the refreshed public status.
+
+    Body keys read: ``clientId``, ``clientSecret``, ``developerToken`` and
+    ``loginCustomerId`` (non-blank strings, trimmed; dashes are removed from
+    the customer id) and ``dateRangeDays`` (a finite number whose integer
+    part is at least 1). Fields that fail these checks are ignored.
+
+    Raises:
+        HTTPException 400: the body contains a per-site key (``refreshToken``,
+            ``gscSiteUrl``, ``ga4PropertyId``), or no field passed validation.
+    """
+    import math
+
+    _PROPERTY_ONLY_MSG = (
+        "Per-site settings (GSC, GA4, refresh token) must be saved via property "
+        "Integrations when a Site URL is set."
+    )
+    if any(k in body for k in ("refreshToken", "gscSiteUrl", "ga4PropertyId")):
+        raise HTTPException(status_code=400, detail=_PROPERTY_ONLY_MSG)
+
+    from website_profiling.db.google_app_store import save_google_app_settings
+
+    patch: dict[str, Any] = {}
+    if isinstance(body.get("clientId"), str) and body["clientId"].strip():
+        patch["client_id"] = body["clientId"].strip()
+    if isinstance(body.get("clientSecret"), str) and body["clientSecret"].strip():
+        patch["client_secret"] = body["clientSecret"].strip()
+
+    days = body.get("dateRangeDays")
+    # bool is a subclass of int, so JSON ``true`` would otherwise be saved as
+    # 1 day; NaN / Infinity would crash int(); 0.5 would be saved as 0.
+    is_number = isinstance(days, (int, float)) and not isinstance(days, bool)
+    if is_number and (isinstance(days, int) or math.isfinite(days)) and int(days) > 0:
+        patch["default_date_range_days"] = int(days)
+
+    if isinstance(body.get("developerToken"), str) and body["developerToken"].strip():
+        patch["developer_token"] = body["developerToken"].strip()
+    if isinstance(body.get("loginCustomerId"), str) and body["loginCustomerId"].strip():
+        patch["login_customer_id"] = body["loginCustomerId"].strip().replace("-", "")
+
+    if not patch:
+        raise HTTPException(status_code=400, detail="No valid fields provided")
+
+    save_google_app_settings(conn, patch)
+    return {"ok": True, "status": _google_public_status(conn)}
+
+
+# ── POST /api/integrations/google/credentials/upload ──────────────────────────
+
+@router.post("/google/credentials/upload")
+def upload_google_credentials(
+    conn: DbDep,
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Store an uploaded Google service-account key file.
+
+    ``fileContent`` must be a non-empty string holding JSON that decodes to a
+    dict with ``type == "service_account"`` and string ``client_email`` and
+    ``private_key`` values. Any violation is answered with 400. On success the
+    parsed dict is saved and the refreshed public status is returned.
+    """
+    from website_profiling.db.google_app_store import save_google_app_settings
+
+    file_text = body.get("fileContent")
+    if not (file_text and isinstance(file_text, str)):
+        raise HTTPException(status_code=400, detail="fileContent is required")
+
+    try:
+        key_doc = json.loads(file_text)
+    except Exception:
+        raise HTTPException(status_code=400, detail="This doesn't look like a valid JSON file.")
+
+    looks_like_key = (
+        isinstance(key_doc, dict)
+        and key_doc.get("type") == "service_account"
+        and isinstance(key_doc.get("client_email"), str)
+        and isinstance(key_doc.get("private_key"), str)
+    )
+    if not looks_like_key:
+        raise HTTPException(
+            status_code=400,
+            detail=(
+                "This doesn't look like a Google service account key file. "
+                "Make sure you downloaded the JSON key from Google Cloud Console > "
+                "IAM & Admin > Service Accounts."
+            ),
+        )
+
+    save_google_app_settings(conn, {"service_account_json": key_doc})
+    return {"ok": True, "status": _google_public_status(conn)}
+
+
+# ── POST /api/integrations/google/disconnect ──────────────────────────────────
+
+@router.post("/google/disconnect")
+def google_disconnect(conn: DbDep) -> dict[str, Any]:
+    """Deprecated global disconnect: always answers ``ok: False`` with guidance.
+
+    Nothing is changed; the response points to the per-property disconnect and
+    carries the current public status.
+    """
+    guidance = (
+        "Disconnect Google per site: set Site URL, open Integrations, "
+        "and use Disconnect on that property."
+    )
+    return {"ok": False, "error": guidance, "status": _google_public_status(conn)}
+
+
+# ── GET /api/integrations/google/properties ───────────────────────────────────
+
+@router.get("/google/properties")
+def google_properties_deprecated(
+    property_id: Optional[int] = Query(None, alias="propertyId"),
+) -> dict[str, Any]:
+    """Deprecated endpoint that never returns data.
+
+    Always raises: 400 when ``propertyId`` is absent or 0, otherwise 301 with
+    a detail naming the per-property replacement route.
+    """
+    if property_id:
+        # NOTE(review): this 301 is raised without a Location header, so
+        # clients will not follow it automatically - confirm that is intended.
+        raise HTTPException(
+            status_code=301,
+            detail=f"Use /api/properties/{property_id}/google/properties",
+        )
+    raise HTTPException(
+        status_code=400,
+        detail="propertyId query parameter is required. Use /api/properties/{id}/google/properties instead.",
+    )
+
+
+# ── POST /api/integrations/google/test ────────────────────────────────────────
+
+@router.post("/google/test")
+def google_test() -> dict[str, Any]:
+    """Run `python -m src google --test` and return its combined output.
+
+    The log is the last 28,000 characters of stdout followed by stderr. A
+    timeout (30s) or any other failure is reported in the response body as
+    ``ok: False`` rather than raised.
+    """
+    import subprocess
+    import sys
+
+    command = [sys.executable, "-m", "src", "google", "--test"]
+    try:
+        completed = subprocess.run(command, capture_output=True, text=True, timeout=30)
+        combined = completed.stdout + completed.stderr
+        exit_code = completed.returncode
+        return {"ok": exit_code == 0, "log": combined[-28_000:], "exitCode": exit_code}
+    except subprocess.TimeoutExpired:
+        return {"ok": False, "log": "", "error": "Test timed out after 30s"}
+    except Exception as exc:
+        return {"ok": False, "log": "", "error": str(exc)}
+
+
+# ── GET /api/integrations/google/page-data ────────────────────────────────────
+
+@router.get("/google/page-data")
+def google_page_data(
+    conn: DbDep,
+    url: str = Query(...),
+    googleSnapshotId: Optional[int] = Query(None),
+    propertyId: Optional[str] = Query(None),
+    domain: Optional[str] = Query(None),
+) -> dict[str, Any]:
+    """Return the stored Google data slice for one page URL.
+
+    The property is resolved from ``url`` / ``propertyId`` / ``domain``. When
+    no property or no snapshot row is found, an empty placeholder payload is
+    returned instead of an error. Otherwise the slice for ``url`` is returned
+    with the snapshot's id and fetch time merged in.
+    """
+    from website_profiling.db.property_store import resolve_property_id_for_page
+    from website_profiling.integrations.google.page_lookup import slice_from_google_row
+    from website_profiling.integrations.google.store import read_google_snapshot_row
+
+    if not url:
+        raise HTTPException(status_code=400, detail="url parameter required")
+
+    property_id = resolve_property_id_for_page(conn, url, propertyId, domain)
+
+    snapshot = None
+    if property_id is not None:
+        snapshot = read_google_snapshot_row(
+            conn,
+            property_id,
+            snapshot_id=googleSnapshotId,
+        )
+
+    if not snapshot:
+        # Same placeholder for "unknown property" and "no snapshot yet".
+        return {
+            "source": "snapshot",
+            "snapshotId": None,
+            "gsc": None,
+            "ga4": None,
+            "coverage": {"inCrawl": False, "inGsc": False, "inGa4": False},
+            "siteBenchmarks": {"gsc": None, "ga4": None},
+            "dateRange": {},
+            "fetchedAt": None,
+        }
+
+    page_slice = slice_from_google_row(snapshot["data"], url)
+    merged = {**page_slice}
+    merged["snapshotId"] = snapshot["id"]
+    # Prefer the snapshot row's timestamp; fall back to the slice's own.
+    merged["fetchedAt"] = snapshot["fetchedAt"] or page_slice.get("fetchedAt")
+    return merged
+
+
+# ── GET /api/integrations/google/page-data/history ────────────────────────────
+
+@router.get("/google/page-data/history")
+def google_page_data_history(
+    conn: DbDep,
+    url: str = Query(...),
+    propertyId: Optional[str] = Query(None),
+    domain: Optional[str] = Query(None),
+) -> dict[str, Any]:
+    """Summarise a page's Google data across up to 10 stored snapshots.
+
+    Snapshots whose slice for ``url`` has neither GSC nor GA4 data are left
+    out. An unresolvable property yields an empty history rather than an
+    error.
+    """
+    from website_profiling.db.property_store import resolve_property_id_for_page
+    from website_profiling.integrations.google.page_lookup import (
+        slice_from_google_row,
+        summary_from_slice,
+    )
+    from website_profiling.integrations.google.store import list_google_snapshot_rows
+
+    if not url:
+        raise HTTPException(status_code=400, detail="url parameter required")
+
+    property_id = resolve_property_id_for_page(conn, url, propertyId, domain)
+    if property_id is None:
+        return {"url": url, "history": []}
+
+    entries: list[dict[str, Any]] = []
+    for row in list_google_snapshot_rows(conn, property_id, limit=10):
+        page_slice = slice_from_google_row(row["data"], url)
+        if page_slice.get("gsc") or page_slice.get("ga4"):
+            digest = summary_from_slice(page_slice.get("gsc"), page_slice.get("ga4"))
+            entries.append(
+                {
+                    "id": row["id"],
+                    "fetchedAt": row["fetchedAt"],
+                    "type": "snapshot",
+                    "gsc": digest.get("gsc"),
+                    "ga4": digest.get("ga4"),
+                }
+            )
+
+    return {"url": url, "history": entries}
+
+
+# ── POST /api/integrations/google/page-live ───────────────────────────────────
+
+@router.post("/google/page-live")
+def google_page_live(
+    body: dict[str, Any] = Body(default={}),
+) -> dict[str, Any]:
+    """Fetch live Google data for one URL by running `python -m src page-live`.
+
+    The child process's last non-empty stdout line is parsed as JSON; if that
+    fails an empty dict is used. The parsed keys are merged into the response
+    after ``ok`` and a UTC ``fetchedAt`` timestamp, so they can override both.
+
+    Raises:
+        HTTPException 400: ``url`` is missing or blank.
+        HTTPException 500: the process exited non-zero and its output carried
+            no ``ok``, ``gsc`` or ``ga4`` data.
+        HTTPException 504: the process exceeded the 45s timeout.
+    """
+    # ``subprocess`` is not imported at module level; without this local import
+    # both the run call and the TimeoutExpired handler raise NameError.
+    import datetime
+    import subprocess
+
+    url = str(body.get("url") or "").strip()
+    if not url:
+        raise HTTPException(status_code=400, detail="url is required")
+
+    try:
+        result = subprocess.run(
+            [sys.executable, "-m", "src", "page-live", "--url", url],
+            capture_output=True,
+            text=True,
+            timeout=45,
+        )
+        lines = [ln for ln in result.stdout.strip().splitlines() if ln]
+        last = lines[-1] if lines else "{}"
+        try:
+            data = json.loads(last)
+        except Exception:
+            data = {}
+
+        if result.returncode != 0 and not data.get("ok") and not data.get("gsc") and not data.get("ga4"):
+            raise HTTPException(
+                status_code=500,
+                detail=data.get("error") or "Live fetch failed",
+            )
+        fetched_at = datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z")
+        return {"ok": True, "fetchedAt": fetched_at, **data}
+    except subprocess.TimeoutExpired:
+        raise HTTPException(status_code=504, detail="Live fetch timed out after 45s")
+
+
+# ── GET /api/integrations/google/keywords/by-page ─────────────────────────────
+
+@router.get("/google/keywords/by-page")
+def google_keywords_by_page(
+    conn: DbDep,
+    url: str = Query(..., alias="url"),
+    propertyId: Optional[str] = Query(None),
+    domain: Optional[str] = Query(None),
+) -> dict[str, Any]:
+    """Return the latest keyword rows and cannibalisation groups for one page.
+
+    Keyword rows are selected with ``_matches_url`` on their ``gsc_url``;
+    cannibalisation groups are kept only when one of their pages equals the
+    target URL (case-insensitive, trailing slashes ignored). Responds with 400
+    when ``url`` is blank or the property cannot be resolved.
+    """
+    from website_profiling.db.property_store import resolve_property_id_for_page
+    from website_profiling.integrations.google.keyword_store import read_latest_keyword_data
+
+    page_url = url.strip()
+    if page_url == "":
+        raise HTTPException(status_code=400, detail="url parameter is required")
+
+    property_id = resolve_property_id_for_page(conn, page_url, propertyId, domain)
+    if property_id is None:
+        raise HTTPException(status_code=400, detail="propertyId or domain required")
+
+    keyword_data = read_latest_keyword_data(conn, property_id) or {}
+    target = page_url.lower().rstrip("/")
+
+    matching_rows = []
+    for row in keyword_data.get("rows") or []:
+        if _matches_url(row.get("gsc_url") or "", target):
+            matching_rows.append(row)
+
+    def _group_has_target(group: dict[str, Any]) -> bool:
+        # Exact comparison after the same normalisation applied to the target.
+        for page in group.get("pages") or []:
+            if (page.get("url") or "").lower().rstrip("/") == target:
+                return True
+        return False
+
+    cannibalising = [
+        group for group in (keyword_data.get("cannibalisation") or []) if _group_has_target(group)
+    ]
+
+    return {
+        "url": page_url,
+        "propertyId": property_id,
+        "keyword_count": len(matching_rows),
+        "keywords": matching_rows,
+        "cannibalisation": cannibalising,
+        "fetched_at": keyword_data.get("fetched_at"),
+    }
+
+
+def _matches_url(candidate: str, target: str) -> bool:
+    """Loosely compare a row URL against an already-normalised target URL.
+
+    Args:
+        candidate: URL from a data row; lower-cased and stripped of trailing
+            slashes here before comparison.
+        target: URL to match, expected to be lower-cased and stripped of
+            trailing slashes by the caller.
+
+    Returns:
+        True when the two are equal or one contains the other. An empty
+        candidate or target never matches: the empty string is a substring of
+        everything, so rows without a URL would otherwise match every page.
+    """
+    u = candidate.lower().rstrip("/")
+    if not u or not target:
+        return False
+    return u == target or u in target or target in u
+
+
+# ── GET /api/integrations/google/keywords/history ────────────────────────────
+
+@router.get("/google/keywords/history")
+def google_keywords_history(
+    conn: DbDep,
+    keyword: str = Query(...),
+    propertyId: Optional[str] = Query(None),
+    domain: Optional[str] = Query(None),
+    limit: int = Query(30, ge=1, le=90),
+) -> dict[str, Any]:
+    """Return up to ``limit`` history entries for one keyword on a property.
+
+    Responds with 400 when the trimmed keyword is empty or when no property
+    can be resolved from ``propertyId`` / ``domain``.
+    """
+    from website_profiling.db.property_store import resolve_property_id_for_page
+    from website_profiling.integrations.google.keyword_store import read_keyword_history
+
+    keyword = keyword.strip()
+    if keyword == "":
+        raise HTTPException(status_code=400, detail="keyword parameter is required")
+
+    # No page URL is involved here, so the resolver gets an empty one.
+    property_id = resolve_property_id_for_page(conn, "", propertyId, domain)
+    if property_id is None:
+        raise HTTPException(status_code=400, detail="propertyId or domain required")
+
+    return {
+        "keyword": keyword,
+        "propertyId": property_id,
+        "history": read_keyword_history(conn, keyword, limit, property_id=property_id),
+    }
+
+
+# ── POST /api/integrations/bing/sync ─────────────────────────────────────────
+
+ @router.post("/bing/sync")
+ def bing_sync(conn: DbDep) -> dict[str, Any]:
+     """Fetch Bing Webmaster backlinks summary using config from DB.
+
+     Reads ``bing_webmaster_api_key`` and ``start_url`` from the saved pipeline
+     config. Responds 400 when either is blank, and 500 when loading the config
+     or fetching the summary raises.
+     """
+     from website_profiling.db.config_store import read_pipeline_config
+
+     try:
+         settings, _unknown = read_pipeline_config(conn)
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+
+     def _setting(name: str) -> str:
+         # Missing or null settings collapse to an empty string.
+         return str(settings.get(name) or "").strip()
+
+     key = _setting("bing_webmaster_api_key")
+     site = _setting("start_url")
+     if not (key and site):
+         raise HTTPException(
+             status_code=400,
+             detail="Set bing_webmaster_api_key and start_url in pipeline settings.",
+         )
+
+     try:
+         from website_profiling.integrations.bing.webmaster import fetch_bing_backlinks_summary
+
+         return fetch_bing_backlinks_summary(key, site)  # type: ignore[return-value]
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+
+
+# ── GET /api/integrations/google/page-compare ────────────────────────────────
+
+ @router.get("/google/page-compare")
+ def google_page_compare(
+     conn: DbDep,
+     url: str = Query(...),
+     currentType: str = Query("snapshot"),
+     currentId: int = Query(...),
+     baselineType: str = Query("snapshot"),
+     baselineId: int = Query(...),
+ ) -> dict[str, Any]:
+     """Compare two page Google data snapshots.
+
+     Both snapshots are loaded by id before either is checked; a missing one
+     yields HTTP 404. ``currentType`` and ``baselineType`` are accepted but not
+     read by this handler, and ``url`` is only echoed back in the response.
+     """
+     from website_profiling.integrations.google.page_snapshot_store import read_page_snapshot_compare
+
+     current = read_page_snapshot_compare(conn, currentId)
+     baseline = read_page_snapshot_compare(conn, baselineId)
+
+     checks = (
+         (current, "Current snapshot not found"),
+         (baseline, "Baseline snapshot not found"),
+     )
+     for snapshot, missing_detail in checks:
+         if snapshot is None:
+             raise HTTPException(status_code=404, detail=missing_detail)
+
+     return {"url": url, "current": current, "baseline": baseline}
+
+
+# ── GET /api/integrations/google/page-live/history ────────────────────────────
+
+ @router.get("/google/page-live/history")
+ def google_page_live_history(
+     conn: DbDep,
+     url: str = Query(...),
+     limit: int = Query(15, ge=1, le=50),
+ ) -> dict[str, Any]:
+     """Return history of page Google snapshots for a URL.
+
+     Any exception raised while listing the history is reported as HTTP 500
+     with the exception text as the detail.
+     """
+     from website_profiling.integrations.google.page_snapshot_store import list_page_snapshot_api_history
+
+     try:
+         entries = list_page_snapshot_api_history(conn, url, limit=limit)
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+     return {"url": url, "history": entries}
+
+
+# ── POST /api/integrations/google/keywords/history/batch ─────────────────────
+
+ @router.post("/google/keywords/history/batch")
+ def google_keywords_history_batch(
+     conn: DbDep,
+     body: dict[str, Any],
+ ) -> dict[str, Any]:
+     """Batch keyword history: { keywords: str[], limit?: int, propertyId?: int, domain?: str }
+
+     Only the first 100 entries of ``keywords`` are considered; each is
+     stringified and trimmed, and blank results are dropped. ``limit`` defaults
+     to 30 and is clamped to 1..90.
+
+     The property is taken from ``propertyId`` when it parses as an integer,
+     otherwise it is looked up from ``domain``.
+
+     Responds 400 when ``keywords`` is not a list, when ``limit`` is not an
+     integer, or when no property id can be determined.
+     """
+     from website_profiling.db.property_store import get_property_id_by_domain
+     from website_profiling.integrations.google.keyword_store import read_keyword_history_batch
+
+     keywords_raw = body.get("keywords") or []
+     if not isinstance(keywords_raw, list):
+         raise HTTPException(status_code=400, detail="keywords must be a list")
+     # Cap first, then trim; whitespace-only entries must not reach the store as "".
+     trimmed = (str(k).strip() for k in keywords_raw[:100] if k)
+     keywords = [text for text in trimmed if text]
+
+     # A non-numeric limit is a client error, not a server fault.
+     try:
+         limit = max(1, min(int(body.get("limit") or 30), 90))
+     except (TypeError, ValueError, OverflowError) as exc:
+         raise HTTPException(status_code=400, detail="limit must be an integer") from exc
+
+     property_id = None
+     if body.get("propertyId"):
+         try:
+             property_id = int(body["propertyId"])
+         except (TypeError, ValueError):
+             # Unusable id: fall through to the domain lookup below.
+             property_id = None
+     if property_id is None and body.get("domain"):
+         property_id = get_property_id_by_domain(conn, str(body["domain"]))
+
+     if property_id is None:
+         raise HTTPException(status_code=400, detail="propertyId or domain required")
+
+     results = read_keyword_history_batch(
+         conn,
+         keywords,
+         property_id=property_id,
+         limit=limit,
+     )
+     return {"keywords": results, "propertyId": property_id}
+
+
+ # ── POST /api/integrations/google/keywords/expand ────────────────────────────
+
+ @router.post("/google/keywords/expand")
+ def google_keywords_expand(
+     conn: DbDep,
+     body: dict[str, Any],
+ ) -> dict[str, Any]:
+     """Expand keyword ideas from Google Keyword Planner or suggest API.
+
+     Requires a non-blank ``keyword`` in the body (400 otherwise). An
+     ``ImportError`` while loading or running the expander is reported as 501;
+     any other exception as 500. A non-dict result is wrapped under
+     ``"keywords"``.
+     """
+     seed = str(body.get("keyword") or "").strip()
+     if seed == "":
+         raise HTTPException(status_code=400, detail="keyword required")
+
+     try:
+         from website_profiling.tools.keyword_suggestions import expand_keyword
+
+         expansion = expand_keyword(seed, body.get("propertyId"), conn)
+     except ImportError:
+         raise HTTPException(status_code=501, detail="Keyword expansion unavailable")
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+
+     if isinstance(expansion, dict):
+         return expansion
+     return {"keywords": expansion}
+
+
+# ── POST /api/integrations/google/keywords/planner ────────────────────────────
+
+ @router.post("/google/keywords/planner")
+ def google_keywords_planner(
+     conn: DbDep,
+     body: dict[str, Any],
+ ) -> dict[str, Any]:
+     """Fetch keyword planner data from Google Ads API.
+
+     ``keywords`` must be a list when present (400 otherwise); a missing or
+     empty value is passed on as an empty list. An ``ImportError`` is reported
+     as 501 and any other exception as 500. A non-dict result is wrapped under
+     ``"ideas"``.
+     """
+     seeds = body.get("keywords") or []
+     if not isinstance(seeds, list):
+         raise HTTPException(status_code=400, detail="keywords must be a list")
+
+     try:
+         from website_profiling.integrations.google.keyword_planner import fetch_keyword_ideas
+
+         ideas = fetch_keyword_ideas(conn, seeds)
+     except ImportError:
+         raise HTTPException(status_code=501, detail="Google Keyword Planner unavailable")
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+
+     if isinstance(ideas, dict):
+         return ideas
+     return {"ideas": ideas}
diff --git a/src/website_profiling/api/routers/issues.py b/src/website_profiling/api/routers/issues.py
new file mode 100644
index 00000000..1f405ef8
--- /dev/null
+++ b/src/website_profiling/api/routers/issues.py
@@ -0,0 +1,148 @@
+"""Issues routers — /api/issues/* and /api/ai/*."""
+from __future__ import annotations
+
+from typing import Annotated, Any
+
+from fastapi import APIRouter, Body, Depends, HTTPException, Query
+from psycopg import Connection
+
+from ..deps import get_db
+from website_profiling.db import issue_status_store
+
+router = APIRouter(tags=["issues"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+# ── GET /api/issues/status ────────────────────────────────────────────────────
+
+ @router.get("/issues/status")
+ def list_issue_status_route(
+     conn: DbDep,
+     propertyId: int = Query(...),
+ ) -> dict[str, Any]:
+     """List the stored issue statuses for one property.
+
+     A falsy ``propertyId`` (such as 0) is rejected with HTTP 400.
+     """
+     if not propertyId:
+         raise HTTPException(status_code=400, detail="propertyId required")
+
+     statuses = issue_status_store.list_issue_status(conn, propertyId)
+     return {"issues": statuses}
+
+
+# ── PUT /api/issues/status ────────────────────────────────────────────────────
+
+ @router.put("/issues/status")
+ def upsert_issue_status_route(
+     conn: DbDep,
+     body: dict[str, Any] = Body(default={}),
+ ) -> dict[str, Any]:
+     """Create or update the tracked status of one issue.
+
+     Required body fields: ``propertyId`` (integer), ``message`` and ``status``.
+     Optional: ``reportId`` (integer), ``url``, ``priority`` (defaults to
+     ``"Medium"``), ``categoryId``, ``assignee`` and ``note``.
+
+     Responds 400 when a required field is missing or unusable, when
+     ``reportId`` is not an integer, or when the store raises ``ValueError``.
+     """
+     # An unparsable propertyId is treated like a missing one, so the client
+     # gets the 400 below instead of an unhandled conversion error.
+     try:
+         property_id = int(body.get("propertyId") or 0)
+     except (TypeError, ValueError):
+         property_id = 0
+     message = str(body.get("message") or "").strip()
+     status = str(body.get("status") or "")
+
+     if not property_id or not message or not status:
+         raise HTTPException(
+             status_code=400,
+             detail="propertyId, message, and valid status required",
+         )
+
+     report_id = body.get("reportId")
+     if report_id is not None:
+         try:
+             report_id = int(report_id)
+         except (TypeError, ValueError) as exc:
+             raise HTTPException(status_code=400, detail="reportId must be an integer") from exc
+
+     try:
+         issue = issue_status_store.upsert_issue_status(
+             conn,
+             property_id=property_id,
+             message=message,
+             status=status,
+             report_id=report_id,
+             url=str(body.get("url") or ""),
+             priority=str(body.get("priority") or "Medium"),
+             category_id=body.get("categoryId") or None,
+             assignee=body.get("assignee") or None,
+             note=body.get("note") or None,
+         )
+     except ValueError as exc:
+         raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+     return {"issue": issue}
+
+
+# ── POST /api/issues/fix-suggestion ──────────────────────────────────────────
+
+ @router.post("/issues/fix-suggestion")
+ def issues_fix_suggestion(
+     body: dict[str, Any] = Body(default={}),
+ ) -> Any:
+     """Generate a fix suggestion for one audit issue.
+
+     Requires a non-blank ``message`` (400 otherwise). The payload source is
+     always ``"issue"``; the remaining fields are copied from the body as-is.
+     Any failure while loading or running the generator is reported as 500.
+     """
+     text = str(body.get("message") or "").strip()
+     if text == "":
+         raise HTTPException(status_code=400, detail="message required")
+
+     payload: dict[str, Any] = {"source": "issue", "message": text}
+     for field in ("url", "priority", "category", "recommendation", "type", "refresh"):
+         payload[field] = body.get(field)
+
+     try:
+         from website_profiling.llm.fix_suggestions import generate_fix_suggestion  # type: ignore[import]
+
+         return generate_fix_suggestion(payload, refresh=bool(payload["refresh"]))
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=f"Fix suggestion failed: {exc}")
+
+
+# ── POST /api/issues/action-plan ──────────────────────────────────────────────
+
+ @router.post("/issues/action-plan")
+ def issues_action_plan(
+     body: dict[str, Any] = Body(default={}),
+ ) -> Any:
+     """Generate an action plan for a domain's issues.
+
+     Requires a non-blank ``domain`` and a non-empty ``issues`` list (400
+     otherwise). Any failure while loading or running the generator is
+     reported as 500.
+     """
+     site = str(body.get("domain") or "").strip()
+     if site == "":
+         raise HTTPException(status_code=400, detail="domain required")
+
+     issue_list = body.get("issues")
+     if not isinstance(issue_list, list) or not issue_list:
+         raise HTTPException(status_code=400, detail="issues required")
+
+     payload = {
+         "domain": site,
+         "issues": issue_list,
+         "refresh": body.get("refresh"),
+     }
+
+     try:
+         from website_profiling.llm.issues_action_plan import generate_issues_action_plan  # type: ignore[import]
+
+         return generate_issues_action_plan(payload, refresh=bool(payload["refresh"]))
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=f"Action plan failed: {exc}")
+
+
+# ── POST /api/ai/fix-suggestion ──────────────────────────────────────────────
+
+ @router.post("/ai/fix-suggestion")
+ def ai_fix_suggestion(
+     body: dict[str, Any] = Body(default={}),
+ ) -> Any:
+     """Generate a fix suggestion, letting the caller choose the source.
+
+     Requires a non-blank ``message`` (400 otherwise). ``source`` falls back to
+     ``"issue"`` when absent or falsy; the other fields, including ``context``,
+     are copied from the body as-is. Any failure while loading or running the
+     generator is reported as 500.
+     """
+     text = str(body.get("message") or "").strip()
+     if text == "":
+         raise HTTPException(status_code=400, detail="message required")
+
+     payload: dict[str, Any] = {
+         "source": body.get("source") or "issue",
+         "message": text,
+     }
+     for field in ("url", "refresh", "context", "priority", "category", "recommendation", "type"):
+         payload[field] = body.get(field)
+
+     try:
+         from website_profiling.llm.fix_suggestions import generate_fix_suggestion  # type: ignore[import]
+
+         return generate_fix_suggestion(payload, refresh=bool(payload["refresh"]))
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=f"Fix suggestion failed: {exc}")
diff --git a/src/website_profiling/api/routers/keywords.py b/src/website_profiling/api/routers/keywords.py
new file mode 100644
index 00000000..4c676cf0
--- /dev/null
+++ b/src/website_profiling/api/routers/keywords.py
@@ -0,0 +1,72 @@
+"""Keywords routers — /api/keywords/*."""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Body, Depends, HTTPException
+
+from ..deps import get_db
+from psycopg import Connection
+
+router = APIRouter(prefix="/keywords", tags=["keywords"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+# ── POST /api/keywords/competitor-import ──────────────────────────────────────
+
+ @router.post("/competitor-import")
+ def keywords_competitor_import(
+     conn: DbDep,
+     body: dict[str, Any] = Body(default={}),
+ ) -> dict[str, Any]:
+     """Import a competitor keyword CSV for a property.
+
+     Required body fields: ``propertyId`` (integer), ``competitor`` and
+     ``csvText``. The CSV text is parsed with ``parse_competitor_keyword_csv``
+     and the rows are handed to ``merge_competitor_keyword_import``.
+
+     Returns the parsed and merged row counts, plus at most the first 500 rows
+     of each list. Responds 400 when a required field is missing or unusable
+     and 500 when parsing or merging raises.
+     """
+     # An unparsable propertyId is treated like a missing one, so the client
+     # gets the 400 below instead of an unhandled conversion error.
+     try:
+         property_id = int(body.get("propertyId") or 0)
+     except (TypeError, ValueError):
+         property_id = 0
+     competitor = str(body.get("competitor") or "").strip()
+     csv_text = str(body.get("csvText") or "")
+
+     if not property_id or not competitor or not csv_text.strip():
+         raise HTTPException(
+             status_code=400,
+             detail="propertyId, competitor, and csvText required",
+         )
+
+     try:
+         from website_profiling.integrations.keywords.competitor_csv import (  # type: ignore[import]
+             parse_competitor_keyword_csv,
+         )
+         from website_profiling.integrations.keywords.competitor_gap_store import (  # type: ignore[import]
+             merge_competitor_keyword_import,
+         )
+
+         rows = parse_competitor_keyword_csv(csv_text, competitor=competitor)
+         merged = merge_competitor_keyword_import(conn, property_id, competitor, rows)
+         return {
+             "count": len(rows),
+             "rows": rows[:500],
+             "mergedCount": len(merged),
+             "mergedRows": merged[:500],
+         }
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=f"Competitor keyword import failed: {exc}")
+
+
+# ── POST /api/keywords/content-brief ─────────────────────────────────────────
+
+ @router.post("/content-brief")
+ def keywords_content_brief(
+     body: dict[str, Any] = Body(default={}),
+ ) -> dict[str, Any]:
+     """Generate a content brief for a keyword.
+
+     Requires a non-blank ``keyword`` (400 otherwise). ``rows`` and ``gaps``
+     default to empty lists when absent or falsy. Any failure while loading or
+     running the generator is reported as 500.
+     """
+     topic = str(body.get("keyword") or "").strip()
+     if topic == "":
+         raise HTTPException(status_code=400, detail="keyword required")
+
+     keyword_rows = body.get("rows") or []
+     keyword_gaps = body.get("gaps") or []
+
+     try:
+         from website_profiling.llm.content_brief import generate_content_brief  # type: ignore[import]
+
+         return {"brief": generate_content_brief(topic, keyword_rows, keyword_gaps)}
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=f"Content brief generation failed: {exc}")
diff --git a/src/website_profiling/api/routers/logs.py b/src/website_profiling/api/routers/logs.py
new file mode 100644
index 00000000..f8e472db
--- /dev/null
+++ b/src/website_profiling/api/routers/logs.py
@@ -0,0 +1,33 @@
+"""Access log upload and analysis — /api/logs/*."""
+from __future__ import annotations
+
+from typing import Annotated, Any
+
+from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
+from psycopg import Connection
+
+from ..deps import get_db
+
+router = APIRouter(tags=["logs"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+ @router.post("/logs/upload")
+ def logs_upload(
+     conn: DbDep,
+     propertyId: int = Form(...),
+     file: UploadFile = File(...),
+ ) -> dict[str, Any]:
+     """Accept an uploaded access log and hand its text to the log analyser.
+
+     The whole upload is read and decoded as UTF-8, with undecodable bytes
+     replaced. A falsy ``propertyId`` is rejected with 400; an ``ImportError``
+     is reported as 501 and any other exception as 500. A non-dict result from
+     the analyser is replaced by ``{"ok": True}``.
+     """
+     if not propertyId:
+         raise HTTPException(status_code=400, detail="propertyId required")
+
+     raw_bytes = file.file.read()
+     log_text = raw_bytes.decode("utf-8", errors="replace")
+
+     try:
+         from website_profiling.tools.log_analysis import parse_and_store_access_log
+
+         outcome = parse_and_store_access_log(conn, propertyId, log_text)
+     except ImportError:
+         raise HTTPException(status_code=501, detail="Log analysis module unavailable")
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+
+     if isinstance(outcome, dict):
+         return outcome
+     return {"ok": True}
diff --git a/src/website_profiling/api/routers/mcp_tools.py b/src/website_profiling/api/routers/mcp_tools.py
new file mode 100644
index 00000000..45cb6b49
--- /dev/null
+++ b/src/website_profiling/api/routers/mcp_tools.py
@@ -0,0 +1,41 @@
+"""MCP audit tool catalog — /api/mcp-tools."""
+from __future__ import annotations
+
+from typing import Any
+
+from fastapi import APIRouter, HTTPException
+
+router = APIRouter(tags=["mcp-tools"])
+
+
+ @router.get("/mcp-tools")
+ def mcp_tools() -> dict[str, Any]:
+     """Return the audit tool catalog with each tool's domain and bundles.
+
+     Definitions without a name are skipped. A tool's domain comes from its
+     registry metadata when set, otherwise from ``classify_tool_domain``. Any
+     exception, including a failed import, is reported as HTTP 500.
+     """
+     try:
+         from website_profiling.tools.audit_tools.registry import (
+             TOOL_DEFINITIONS,
+             get_tool_meta,
+             mcp_tool_names,
+         )
+         from website_profiling.tools.audit_tools.tool_domains import (
+             MCP_DOMAIN_BUNDLES,
+             classify_tool_domain,
+         )
+
+         # Tool names belonging to each bundle, keyed by bundle.
+         members = {bundle: mcp_tool_names(bundle) for bundle in MCP_DOMAIN_BUNDLES.keys()}
+
+         catalog = []
+         for definition in TOOL_DEFINITIONS:
+             tool_name = definition.get("name", "")
+             if tool_name:
+                 info = get_tool_meta(tool_name) or {}
+                 tool_domain = info.get("domain") or classify_tool_domain(tool_name)
+                 containing = [bundle for bundle, names in members.items() if tool_name in names]
+                 catalog.append({
+                     "name": tool_name,
+                     "description": definition.get("description", ""),
+                     "domain": tool_domain,
+                     "bundles": containing,
+                 })
+
+         return {"tools": catalog, "bundles": list(MCP_DOMAIN_BUNDLES.keys())}
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
diff --git a/src/website_profiling/api/routers/ollama.py b/src/website_profiling/api/routers/ollama.py
new file mode 100644
index 00000000..315d64ec
--- /dev/null
+++ b/src/website_profiling/api/routers/ollama.py
@@ -0,0 +1,65 @@
+"""Ollama LLM runtime status — /api/ollama/*."""
+from __future__ import annotations
+
+from typing import Annotated, Any
+
+from fastapi import APIRouter, Depends
+from psycopg import Connection
+
+from ..deps import get_db
+
+router = APIRouter(tags=["ollama"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+DEFAULT_BASE = "http://127.0.0.1:11434"
+
+
+ @router.get("/ollama/status")
+ def ollama_status(conn: DbDep) -> dict[str, Any]:
+     """Report whether Ollama is reachable and whether the configured model is listed.
+
+     The base URL comes from ``llm_base_url`` in the LLM config, falling back
+     to ``DEFAULT_BASE``, with trailing slashes removed. When the model fetch
+     does not report ``ok``, a reduced payload with ``ok: False`` and an error
+     message is returned instead of raising.
+     """
+     from website_profiling.db.config_store import read_llm_config
+     from website_profiling.llm.ollama_catalog import (
+         fetch_ollama_models,
+         model_is_configured,
+         models_support_tools,
+     )
+
+     llm_cfg = read_llm_config(conn)
+     base_url = str(llm_cfg.get("llm_base_url") or DEFAULT_BASE).rstrip("/")
+     wanted = str(llm_cfg.get("llm_model") or "").strip()
+
+     catalog = fetch_ollama_models(base_url)
+     reported_base = catalog.get("baseUrl", base_url)
+
+     if not catalog.get("ok"):
+         return {
+             "ok": False,
+             "baseUrl": reported_base,
+             "configuredModel": wanted,
+             "error": catalog.get("error") or "Cannot reach Ollama. Is it running?",
+             "models": [],
+             "cloudCatalogOk": False,
+             "localOk": False,
+         }
+
+     models = catalog.get("models") or []
+     installed = model_is_configured(models, wanted)
+
+     # First catalog entry whose name matches the configured model, ignoring case.
+     wanted_lower = wanted.lower()
+     entry = None
+     for candidate in models:
+         if str(candidate.get("name") or "").lower() == wanted_lower:
+             entry = candidate
+             break
+
+     # Prefer the matched entry's own capabilities; otherwise ask the helper
+     # about the model list as a whole.
+     if entry:
+         supports_tools = "tools" in (entry.get("capabilities") or [])
+     else:
+         supports_tools = models_support_tools(models)
+
+     cloud_count = len([m for m in models if m.get("source") == "cloud"])
+
+     return {
+         "ok": True,
+         "baseUrl": reported_base,
+         "configuredModel": wanted,
+         "modelInstalled": installed,
+         "supportsTools": supports_tools,
+         "cloudCatalogOk": catalog.get("cloudCatalogOk", False),
+         "localOk": catalog.get("localOk", False),
+         "catalogSource": "live",
+         "cloudModelCount": cloud_count,
+         "models": models,
+     }
diff --git a/src/website_profiling/api/routers/page_coach.py b/src/website_profiling/api/routers/page_coach.py
new file mode 100644
index 00000000..3567175e
--- /dev/null
+++ b/src/website_profiling/api/routers/page_coach.py
@@ -0,0 +1,48 @@
+"""Internal link page coach — /api/links/page-coach."""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+from psycopg import Connection
+from pydantic import BaseModel
+
+from ..deps import get_db
+
+router = APIRouter(tags=["page-coach"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+ # Request body for POST /links/page-coach. Every field is optional at the
+ # schema level; the route rejects a missing or blank `url` with HTTP 400 and
+ # forwards the remaining fields to run_page_coach unchanged.
+ class PageCoachBody(BaseModel):
+ # Page URL to coach; trimmed by the route before use.
+ url: Optional[str] = None
+ # Forwarded as `refresh`.
+ refresh: bool = False
+ # Forwarded as `current_type` / `current_id`.
+ currentType: Optional[str] = None
+ currentId: Optional[int] = None
+ # Forwarded as `baseline_type` / `baseline_id`.
+ baselineType: Optional[str] = None
+ baselineId: Optional[int] = None
+ # Forwarded as `property_id`.
+ propertyId: Optional[int] = None
+
+
+ @router.post("/links/page-coach")
+ def page_coach(body: PageCoachBody, conn: DbDep) -> dict[str, Any]:
+     """Run the page coach for one URL and return its result.
+
+     A missing or blank ``url`` is rejected with 400. An ``ImportError`` is
+     reported as 501 and any other exception as 500.
+     """
+     target = (body.url or "").strip()
+     if target == "":
+         raise HTTPException(status_code=400, detail="url required")
+
+     options = {
+         "url": target,
+         "refresh": body.refresh,
+         "current_type": body.currentType,
+         "current_id": body.currentId,
+         "baseline_type": body.baselineType,
+         "baseline_id": body.baselineId,
+         "property_id": body.propertyId,
+     }
+
+     try:
+         from website_profiling.tools.page_coach import run_page_coach
+
+         return run_page_coach(conn, **options)
+     except ImportError:
+         raise HTTPException(status_code=501, detail="Page coach module unavailable")
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
diff --git a/src/website_profiling/api/routers/page_markdown.py b/src/website_profiling/api/routers/page_markdown.py
new file mode 100644
index 00000000..801b6dbe
--- /dev/null
+++ b/src/website_profiling/api/routers/page_markdown.py
@@ -0,0 +1,157 @@
+"""Page markdown routers — /api/page-markdown/*."""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Body, Depends, HTTPException, Query
+from psycopg import Connection
+
+from ..deps import get_db
+from website_profiling.db.markdown_store import (
+ delete_page_markdown_for_run,
+ list_markdown_crawl_runs,
+ list_page_markdown,
+ read_page_markdown,
+)
+
+router = APIRouter(prefix="/page-markdown", tags=["page-markdown"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+ @router.get("")
+ def list_page_markdown_route(
+     conn: DbDep,
+     crawlRunId: int = Query(...),
+     page: int = Query(1, ge=1),
+     limit: int = Query(25, ge=1, le=100),
+     q: Optional[str] = Query(None),
+ ) -> dict[str, Any]:
+     """Return one page of extracted-markdown summaries for a crawl run.
+
+     ``q`` is trimmed and passed to the store as ``query``. The response
+     carries the items plus ``total``, ``page``, ``pageSize`` and
+     ``totalPages`` (never less than 1). A falsy ``crawlRunId`` is rejected
+     with 400; any store error is reported as 500.
+     """
+     if not crawlRunId:
+         raise HTTPException(status_code=400, detail="crawlRunId required")
+
+     page = max(1, page)
+     page_size = min(100, max(1, limit))
+
+     def _summary(row: Any) -> dict[str, Any]:
+         # Timestamps are stringified; a missing one stays None.
+         stamp = row.get("extracted_at")
+         return {
+             "url": row.get("url"),
+             "title": row.get("title"),
+             "word_count": row.get("word_count"),
+             "strategy": row.get("strategy"),
+             "extracted_at": str(stamp) if stamp else None,
+         }
+
+     try:
+         listing = list_page_markdown(
+             conn,
+             crawlRunId,
+             limit=page_size,
+             offset=(page - 1) * page_size,
+             query=(q or "").strip(),
+         )
+         summaries = [_summary(row) for row in listing.get("items") or []]
+         total = int(listing.get("total") or 0)
+         # Ceiling division, floored at one page.
+         page_count = max(1, (total + page_size - 1) // page_size)
+         return {
+             "items": summaries,
+             "total": total,
+             "page": page,
+             "pageSize": page_size,
+             "totalPages": page_count,
+         }
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+
+
+ @router.delete("")
+ def delete_page_markdown_route(
+     conn: DbDep,
+     body: dict[str, Any] = Body(default={}),
+ ) -> dict[str, Any]:
+     """Delete the stored page markdown for one crawl run.
+
+     Requires an integer ``crawlRunId`` in the body (400 otherwise). Returns
+     the store's result under ``deletedRows``; any store error is reported as
+     500.
+     """
+     # An unparsable id is treated like a missing one, so the client gets the
+     # 400 below instead of an unhandled conversion error.
+     try:
+         crawl_run_id = int(body.get("crawlRunId") or 0)
+     except (TypeError, ValueError):
+         crawl_run_id = 0
+     if not crawl_run_id:
+         raise HTTPException(status_code=400, detail="crawlRunId required")
+
+     try:
+         deleted = delete_page_markdown_for_run(conn, crawl_run_id)
+         return {"ok": True, "crawlRunId": crawl_run_id, "deletedRows": deleted}
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+
+
+ @router.get("/content")
+ def page_markdown_content_route(
+     conn: DbDep,
+     crawlRunId: int = Query(...),
+     url: str = Query(...),
+ ) -> dict[str, Any]:
+     """Return the stored markdown record for one URL of a crawl run.
+
+     Responds 400 when ``crawlRunId`` or ``url`` is falsy, 404 when the store
+     has no record, and 500 for any other error.
+     """
+     if not crawlRunId:
+         raise HTTPException(status_code=400, detail="crawlRunId required")
+     if not url:
+         raise HTTPException(status_code=400, detail="url required")
+
+     exposed_fields = ("url", "title", "markdown", "word_count", "strategy", "source_byte_length")
+
+     try:
+         record = read_page_markdown(conn, crawlRunId, url)
+         if not record:
+             raise HTTPException(status_code=404, detail="Not found")
+
+         stamp = record.get("extracted_at")
+         content = {field: record.get(field) for field in exposed_fields}
+         content["extracted_at"] = str(stamp) if stamp else None
+         return {"content": content}
+     except HTTPException:
+         # Let the 404 above through instead of turning it into a 500.
+         raise
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+
+
+ @router.post("/extract")
+ def page_markdown_extract(
+     conn: DbDep,
+     body: dict[str, Any] = Body(default={}),
+ ) -> dict[str, Any]:
+     """Enqueue a ``page-markdown`` job for a crawl run.
+
+     Body fields: ``crawlRunId`` (integer, required); ``strategy``
+     (``"full_body"``, anything else means ``"main_only"``); ``overwrite``
+     (defaults to True; a falsy value adds ``--no-overwrite``); ``workers``
+     (integer, defaults to 4, clamped to 1..16).
+
+     Responds 400 when ``crawlRunId`` is missing or unusable, when ``workers``
+     is not an integer, or when ``enqueue_job`` reports failure; other errors
+     are reported as 500.
+     """
+     # An unparsable id is treated like a missing one, so the client gets the
+     # 400 below instead of an unhandled conversion error.
+     try:
+         crawl_run_id = int(body.get("crawlRunId") or 0)
+     except (TypeError, ValueError):
+         crawl_run_id = 0
+     if not crawl_run_id:
+         raise HTTPException(status_code=400, detail="crawlRunId required")
+
+     strategy = "full_body" if body.get("strategy") == "full_body" else "main_only"
+     overwrite = body.get("overwrite", True)
+     try:
+         workers = min(16, max(1, int(body.get("workers") or 4)))
+     except (TypeError, ValueError) as exc:
+         raise HTTPException(status_code=400, detail="workers must be an integer") from exc
+
+     # Only an int, a whitelisted strategy and a clamped int are interpolated.
+     command = f"page-markdown --crawl-run-id {crawl_run_id} --strategy {strategy} --workers {workers}"
+     if not overwrite:
+         command += " --no-overwrite"
+
+     try:
+         from website_profiling.db.pipeline_jobs import enqueue_job
+         import uuid
+
+         job_id = str(uuid.uuid4())
+         ok = enqueue_job(conn, job_id, "page-markdown", command, None, None)
+         if not ok:
+             raise HTTPException(status_code=400, detail="A pipeline job is already running")
+         return {"jobId": job_id, "crawlRunId": crawl_run_id, "strategy": strategy, "overwrite": overwrite}
+     except HTTPException:
+         raise
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
+
+
+ @router.get("/runs")
+ def page_markdown_runs_route(
+     conn: DbDep,
+     propertyId: Optional[int] = Query(None),
+ ) -> dict[str, Any]:
+     """List markdown crawl runs, passing the optional ``propertyId`` to the store.
+
+     Any store error is reported as HTTP 500.
+     """
+     try:
+         return {"runs": list_markdown_crawl_runs(conn, propertyId)}
+     except Exception as exc:
+         raise HTTPException(status_code=500, detail=str(exc))
diff --git a/src/website_profiling/api/routers/pipeline.py b/src/website_profiling/api/routers/pipeline.py
new file mode 100644
index 00000000..bd30d04d
--- /dev/null
+++ b/src/website_profiling/api/routers/pipeline.py
@@ -0,0 +1,258 @@
+"""Pipeline job routers — /api/run, /api/jobs."""
+from __future__ import annotations
+
+import re
+import uuid
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from psycopg import Connection
+
+from ..deps import get_db
+from ..schemas.pipeline import (
+ ALLOWED_COMMANDS,
+ CancelResponse,
+ JobResponse,
+ JobsListResponse,
+ PauseResponse,
+ ResumeResponse,
+ RunPostBody,
+ RunResponse,
+ coerce_llm_state,
+ coerce_pipeline_state,
+ validate_pipeline_run,
+)
+
+router = APIRouter(tags=["pipeline"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+_PAUSE_RUN_ID_RE = re.compile(r"CRAWL_RUN_ID=(\d+)")
+
+
+ def _get_pipeline_jobs_db(conn: Connection):
+ """Late import to avoid circular deps at startup.
+
+ Returns this function's local namespace as a dict: the imported
+ ``pipeline_jobs`` callables keyed by name, plus the ``conn`` argument,
+ which is also a local.
+ """
+ from website_profiling.db.pipeline_jobs import (
+ cancel_job_in_db,
+ check_flags,
+ enqueue_job,
+ get_active_job,
+ get_job,
+ list_jobs,
+ reconcile_stale_jobs,
+ set_cancel_flag,
+ set_pause_flag,
+ )
+ # NOTE(review): the route handlers visible in this module import what they
+ # need directly; confirm whether anything still calls this helper.
+ return locals()
+
+
+# ── POST /api/run ─────────────────────────────────────────────────────────────
+
+ @router.post("/run", response_model=RunResponse)
+ def run_pipeline(body: RunPostBody, conn: DbDep) -> dict[str, Any]:
+ """Validate a run request, persist its configuration, and enqueue a job.
+
+ Steps, in order:
+ 1. Check the first token of ``body.command`` against ``ALLOWED_COMMANDS``.
+ 2. Take the state from ``body.state``, or from the saved pipeline config
+ when the body has none.
+ 3. Coerce the state, filter unknown keys, and resolve the property id.
+ 4. Validate the run, then save the pipeline config and, when supplied, the
+ LLM config.
+ 5. Enqueue the job and return ``{"jobId": ...}``.
+
+ Responds 400 for an invalid command, missing state, validation errors, or
+ when ``enqueue_job`` reports failure; 500 when saving config or enqueueing
+ raises.
+ """
+ from website_profiling.db.config_store import (
+ read_pipeline_config,
+ read_llm_config,
+ write_llm_config,
+ write_pipeline_config,
+ )
+ # NOTE(review): read_llm_config and reconcile_stale_jobs are imported but not
+ # used in this function.
+ from website_profiling.db.pipeline_jobs import enqueue_job, reconcile_stale_jobs
+ from website_profiling.db.property_store import upsert_property_by_domain
+
+ # Only the first whitespace-separated token is checked; the full command
+ # string is what gets enqueued.
+ command = body.command or None
+ command_base = command.split()[0] if command else None
+ if command_base is not None and command_base not in {
+ c for c in ALLOWED_COMMANDS if c is not None and c
+ }:
+ raise HTTPException(status_code=400, detail=f"Invalid command: {command_base}")
+
+ # Resolve state — fall back to saved config if not provided
+ raw_state = body.state
+ unknown_keys = [{"key": u.key, "value": u.value} for u in (body.unknownKeys or [])]
+
+ if not raw_state:
+ try:
+ saved_state, saved_unknown = read_pipeline_config(conn)
+ raw_state = saved_state
+ unknown_keys = saved_unknown
+ except Exception as exc:
+ raise HTTPException(
+ status_code=400,
+ detail=f"Missing state and could not load config: {exc}",
+ )
+
+ if not raw_state:
+ raise HTTPException(status_code=400, detail="Missing state object")
+
+ state = coerce_pipeline_state(raw_state)
+
+ # Filter unknown keys
+ # Keeps only dict entries whose key does not start with "llm_" or "ml_".
+ safe_unknown = [
+ u for u in unknown_keys
+ if isinstance(u, dict)
+ and not str(u.get("key", "")).startswith("llm_")
+ and not str(u.get("key", "")).startswith("ml_")
+ ]
+
+ # Resolve property ID from start_url
+ # Starts from body.propertyId; an upsert by canonical domain replaces it when
+ # start_url has a hostname and a canonical domain.
+ start_url = str(state.get("start_url") or "").strip()
+ property_id: int | None = body.propertyId
+ if start_url:
+ from urllib.parse import urlparse
+ hostname = urlparse(start_url).hostname or ""
+ if hostname:
+ try:
+ from website_profiling.db.property_store import (
+ canonical_domain_from_start_url,
+ upsert_property_by_domain,
+ )
+ domain = canonical_domain_from_start_url(start_url)
+ if domain:
+ property_id = upsert_property_by_domain(
+ conn, domain, domain, start_url
+ )
+ # NOTE(review): any failure here is swallowed and the run proceeds with the
+ # property id from the request body — confirm this best-effort is intended.
+ except Exception:
+ pass
+ state["active_property_id"] = str(property_id or "")
+
+ # Validate
+ errors = validate_pipeline_run(state, command)
+ if errors:
+ raise HTTPException(status_code=400, detail=" ".join(errors))
+
+ # Save pipeline config
+ # Values are stringified; keys whose value is None are dropped.
+ str_state = {k: str(v) for k, v in state.items() if v is not None}
+ try:
+ write_pipeline_config(conn, str_state, safe_unknown)
+ except Exception as exc:
+ raise HTTPException(status_code=500, detail=f"Failed to save config: {exc}")
+
+ # Save LLM config if provided
+ # Keys ending in "_masked" are excluded from what gets written.
+ if body.llmState and isinstance(body.llmState, dict):
+ llm_coerced = coerce_llm_state(body.llmState)
+ str_llm = {k: str(v) for k, v in llm_coerced.items() if not str(k).endswith("_masked")}
+ try:
+ write_llm_config(conn, str_llm)
+ except Exception as exc:
+ raise HTTPException(status_code=500, detail=f"Failed to save LLM config: {exc}")
+
+ # Enqueue job
+ # The third argument is the command's first token, or "full" when no command was given.
+ job_id = str(uuid.uuid4())
+ try:
+ ok = enqueue_job(conn, job_id, command_base or "full", command, property_id, None)
+ except Exception as exc:
+ raise HTTPException(status_code=500, detail=str(exc))
+
+ if not ok:
+ raise HTTPException(status_code=400, detail="An audit job is already running")
+
+ return {"jobId": job_id}
+
+
+# ── GET /api/jobs ─────────────────────────────────────────────────────────────
+
+ @router.get("/jobs", response_model=JobsListResponse)
+ def list_pipeline_jobs(
+     conn: DbDep,
+     limit: int = Query(20, ge=1, le=100),
+ ) -> dict[str, Any]:
+     """Return pipeline jobs (up to ``limit``) plus the currently active job.
+
+     ``reconcile_stale_jobs`` runs first; its result is returned under
+     ``reconciled``.
+     """
+     from website_profiling.db.pipeline_jobs import (
+         get_active_job,
+         list_jobs,
+         reconcile_stale_jobs,
+     )
+
+     stale_result = reconcile_stale_jobs(conn)
+     running = get_active_job(conn)
+     return {
+         "jobs": list_jobs(conn, limit),
+         "active": running,
+         "reconciled": stale_result,
+     }
+
+
+# ── GET /api/jobs/{id} ────────────────────────────────────────────────────────
+
+ @router.get("/jobs/{job_id}")
+ def get_pipeline_job(job_id: str, conn: DbDep) -> dict[str, Any]:
+     """Return status, exit code, log and error details for one job.
+
+     Responds 404 when the job does not exist.
+     """
+     from website_profiling.db.pipeline_jobs import get_job
+
+     record = get_job(conn, job_id)
+     if record:
+         return {
+             "status": record["status"],
+             "exitCode": record["exitCode"],
+             "log": record["log"],
+             "error": record.get("error"),
+             "logTruncated": record.get("logTruncated", False),
+         }
+     raise HTTPException(status_code=404, detail="Job not found")
+
+
+# ── POST /api/jobs/{id}/cancel ────────────────────────────────────────────────
+
+ @router.post("/jobs/{job_id}/cancel", response_model=CancelResponse)
+ def cancel_pipeline_job(job_id: str, conn: DbDep) -> dict[str, Any]:
+     """Request cancellation of a pending or running job.
+
+     Responds 404 when the job does not exist and 409 when its status is
+     neither ``pending`` nor ``running``. The returned ``status`` is the one
+     read before the cancel flag was set.
+     """
+     from website_profiling.db.pipeline_jobs import cancel_job_in_db, get_job, set_cancel_flag
+
+     record = get_job(conn, job_id)
+     if not record:
+         raise HTTPException(status_code=404, detail="Job not found")
+
+     current_status = record["status"]
+     if current_status != "pending" and current_status != "running":
+         raise HTTPException(status_code=409, detail="Job is not running")
+
+     # Set the cancel flag — the worker will pick it up and kill the subprocess.
+     set_cancel_flag(conn, job_id)
+     return {"ok": True, "status": record["status"]}
+
+
+# ── POST /api/jobs/{id}/pause ─────────────────────────────────────────────────
+
+ @router.post("/jobs/{job_id}/pause", response_model=PauseResponse)
+ def pause_pipeline_job(job_id: str, conn: DbDep) -> dict[str, Any]:
+     """Set the pause flag on a running job.
+
+     Responds 404 when the job does not exist and 409 when its status is not
+     ``running``.
+     """
+     from website_profiling.db.pipeline_jobs import get_job, set_pause_flag
+
+     record = get_job(conn, job_id)
+     if not record:
+         raise HTTPException(status_code=404, detail="Job not found")
+
+     is_running = record["status"] == "running"
+     if not is_running:
+         raise HTTPException(status_code=409, detail="Job is not running")
+
+     set_pause_flag(conn, job_id)
+     return {"ok": True}
+
+
+# ── POST /api/jobs/{id}/resume ────────────────────────────────────────────────
+
+ @router.post("/jobs/{job_id}/resume", response_model=ResumeResponse)
+ def resume_pipeline_job(job_id: str, conn: DbDep) -> dict[str, Any]:
+     """Resume a paused job by enqueueing a new ``crawl-resume`` job.
+
+     The crawl run id is taken from the first ``CRAWL_RUN_ID=<digits>`` match
+     in the paused job's log. Responds 404 when the job does not exist, 409
+     when it is not paused or its log has no run id, and 400 when
+     ``enqueue_job`` reports failure.
+     """
+     from website_profiling.db.pipeline_jobs import enqueue_job, get_job
+
+     record = get_job(conn, job_id)
+     if not record:
+         raise HTTPException(status_code=404, detail="Job not found")
+
+     if record["status"] != "paused":
+         raise HTTPException(status_code=409, detail="Job is not paused")
+
+     # Extract paused crawl run ID from log
+     found = _PAUSE_RUN_ID_RE.search(str(record.get("log") or ""))
+     if found is None:
+         raise HTTPException(status_code=409, detail="No paused crawl run found for this job")
+
+     run_id = int(found.group(1))
+     resume_command = f"--resume-run-id {run_id}"
+     new_job_id = str(uuid.uuid4())
+
+     queued = enqueue_job(
+         conn, new_job_id, "crawl-resume", resume_command, record.get("propertyId"), None
+     )
+     if queued:
+         return {"ok": True, "newJobId": new_job_id}
+     raise HTTPException(status_code=400, detail="An audit job is already running")
diff --git a/src/website_profiling/api/routers/portfolio.py b/src/website_profiling/api/routers/portfolio.py
new file mode 100644
index 00000000..ed0c8d76
--- /dev/null
+++ b/src/website_profiling/api/routers/portfolio.py
@@ -0,0 +1,35 @@
+"""Portfolio item deletion — /api/portfolio/*."""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+from psycopg import Connection
+from pydantic import BaseModel
+
+from ..deps import get_db
+from website_profiling.db import portfolio_store
+
+router = APIRouter(tags=["portfolio"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+class DeletePortfolioBody(BaseModel):
+ """Body of DELETE /portfolio/delete; the handler rejects a body with both IDs unset."""
+ reportId: Optional[int] = None
+ crawlRunId: Optional[int] = None
+
+
+@router.delete("/portfolio/delete")
+def delete_portfolio_item(body: DeletePortfolioBody, conn: DbDep) -> dict[str, Any]:
+    """Delete the portfolio entry identified by ``reportId`` and/or ``crawlRunId``.
+
+    Raises:
+        HTTPException: 400 when neither ID is supplied, 404 when the store
+            returns a falsy value for the deletion.
+    """
+    report_id, crawl_run_id = body.reportId, body.crawlRunId
+    if report_id is None and crawl_run_id is None:
+        raise HTTPException(status_code=400, detail="reportId or crawlRunId required")
+
+    removed = portfolio_store.delete_portfolio_item(
+        conn, report_id=report_id, crawl_run_id=crawl_run_id
+    )
+    if not removed:
+        raise HTTPException(status_code=404, detail="portfolio item not found")
+    return {"ok": True}
diff --git a/src/website_profiling/api/routers/properties.py b/src/website_profiling/api/routers/properties.py
new file mode 100644
index 00000000..43f68197
--- /dev/null
+++ b/src/website_profiling/api/routers/properties.py
@@ -0,0 +1,324 @@
+"""Properties router — /api/properties/*"""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from pydantic import BaseModel
+from psycopg import Connection
+
+from ..deps import get_db
+
+router = APIRouter(tags=["properties"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+class PropertyUpsertBody(BaseModel):
+ """Body of POST /properties; fields are optional here, the handler enforces name and domain."""
+ name: Optional[str] = None
+ canonical_domain: Optional[str] = None
+ site_url: Optional[str] = None
+
+
+class OpsSettingsBody(BaseModel):
+ """Body of PUT /properties/{property_id}/ops: schedule and alert settings, all optional."""
+ scheduleCron: Optional[str] = None
+ alertWebhookUrl: Optional[str] = None
+ alertEmail: Optional[str] = None
+
+
+class PresetBody(BaseModel):
+ """Body of PUT /properties/{property_id}/preset; a blank or missing preset is stored as None."""
+ preset: Optional[str] = None
+
+
+class GoogleCredentialsPatch(BaseModel):
+ """Body of PATCH /properties/{property_id}/google/credentials.
+
+ The POST credentials handler also builds one of these internally. Fields left
+ as None are not reported as set when the patch is forwarded to the store.
+ """
+ refreshToken: Optional[str] = None
+ authMode: Optional[str] = None
+ gscSiteUrl: Optional[str] = None
+ ga4PropertyId: Optional[str] = None
+ dateRangeDays: Optional[int] = None
+ connectedEmail: Optional[str] = None
+
+
+class GoogleCredentialsPostBody(BaseModel):
+ """Body of POST /properties/{property_id}/google/credentials; every field is optional."""
+ gscSiteUrl: Optional[str] = None
+ ga4PropertyId: Optional[str] = None
+ dateRangeDays: Optional[int] = None
+ refreshToken: Optional[str] = None
+
+
+@router.get("/properties")
+def list_properties(conn: DbDep) -> dict[str, Any]:
+    """Return the result of ``list_properties_public`` under the "properties" key."""
+    from website_profiling.db.property_store import list_properties_public
+
+    listing = list_properties_public(conn)
+    return {"properties": listing}
+
+
+@router.post("/properties", status_code=201)
+def create_property(body: PropertyUpsertBody, conn: DbDep) -> dict[str, Any]:
+    """Register a property through ``upsert_property_by_domain`` and echo its ID.
+
+    The name and site URL are trimmed, the domain is trimmed and lower-cased,
+    and a blank site URL is passed on as ``None``.
+
+    Raises:
+        HTTPException: 400 if the name or the domain is blank after trimming.
+    """
+    from website_profiling.db.property_store import upsert_property_by_domain
+
+    clean_name = (body.name or "").strip()
+    clean_domain = (body.canonical_domain or "").strip().lower()
+    if not (clean_name and clean_domain):
+        raise HTTPException(status_code=400, detail="name and canonical_domain required")
+
+    clean_site_url = (body.site_url or "").strip() or None
+    new_id = upsert_property_by_domain(conn, clean_name, clean_domain, clean_site_url)
+    return {"id": new_id, "name": clean_name, "canonical_domain": clean_domain}
+
+
+@router.get("/properties/resolve")
+def resolve_property(
+    conn: DbDep,
+    startUrl: str = Query(..., description="Start URL to resolve a property from"),
+) -> dict[str, Any]:
+    """Resolve a start URL to a property ID, canonical domain and default preset.
+
+    ``default_crawl_preset`` is ``None`` when no domain could be derived from
+    the URL or no property is stored for that domain.
+
+    Raises:
+        HTTPException: 400 if ``startUrl`` is blank after trimming.
+    """
+    from website_profiling.db.property_store import (
+        canonical_domain_from_start_url,
+        get_property_by_domain,
+        resolve_property_id_from_start_url,
+    )
+
+    url = startUrl.strip()
+    if not url:
+        raise HTTPException(status_code=400, detail="startUrl required")
+
+    resolved_id = resolve_property_id_from_start_url(conn, url)
+    canonical = canonical_domain_from_start_url(url)
+
+    preset = None
+    if canonical:
+        match = get_property_by_domain(conn, canonical)
+        if match:
+            preset = match.get("default_crawl_preset")
+
+    return {
+        "id": resolved_id,
+        "canonical_domain": canonical,
+        "default_crawl_preset": preset,
+    }
+
+
+@router.get("/properties/{property_id}")
+def get_property(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Return the stored property record; respond 404 when the lookup is falsy."""
+    from website_profiling.db.property_store import get_property_by_id
+
+    record = get_property_by_id(conn, property_id)
+    if record:
+        return record
+    raise HTTPException(status_code=404, detail="Property not found")
+
+
+@router.delete("/properties/{property_id}")
+def delete_property_route(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Delete a property; respond 404 when ``delete_property`` returns a falsy value."""
+    from website_profiling.db.property_store import delete_property
+
+    removed = delete_property(conn, property_id)
+    if removed:
+        return {"ok": True}
+    raise HTTPException(status_code=404, detail="Property not found")
+
+
+@router.get("/properties/{property_id}/ops")
+def get_property_ops_route(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Return the ops settings from ``get_property_ops``; respond 404 when they are falsy."""
+    from website_profiling.db.property_store import get_property_ops
+
+    settings = get_property_ops(conn, property_id)
+    if settings:
+        return settings
+    raise HTTPException(status_code=404, detail="Property not found")
+
+
+@router.put("/properties/{property_id}/ops")
+def update_property_ops_route(property_id: int, body: OpsSettingsBody, conn: DbDep) -> dict[str, Any]:
+    """Pass the body's schedule and alert settings to ``update_property_ops``.
+
+    All three values are forwarded as given, including ``None``.
+
+    Raises:
+        HTTPException: 404 if the property lookup is falsy.
+    """
+    from website_profiling.db.property_store import get_property_by_id, update_property_ops
+
+    if not get_property_by_id(conn, property_id):
+        raise HTTPException(status_code=404, detail="Property not found")
+
+    settings = {
+        "schedule_cron": body.scheduleCron,
+        "alert_webhook_url": body.alertWebhookUrl,
+        "alert_email": body.alertEmail,
+    }
+    update_property_ops(conn, property_id, **settings)
+    return {"ok": True}
+
+
+@router.get("/properties/{property_id}/preset")
+def get_property_preset(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Return the property's ``default_crawl_preset``; respond 404 when the lookup is falsy."""
+    from website_profiling.db.property_store import get_property_by_id
+
+    record = get_property_by_id(conn, property_id)
+    if record:
+        return {"default_crawl_preset": record.get("default_crawl_preset")}
+    raise HTTPException(status_code=404, detail="Property not found")
+
+
+@router.put("/properties/{property_id}/preset")
+def update_property_preset(property_id: int, body: PresetBody, conn: DbDep) -> dict[str, Any]:
+    """Store a property's default crawl preset and echo the stored value.
+
+    The preset is trimmed; a missing or blank preset is stored as ``None``.
+
+    Raises:
+        HTTPException: 404 if the property lookup is falsy.
+    """
+    from website_profiling.db.property_store import get_property_by_id, update_property_crawl_preset
+
+    if not get_property_by_id(conn, property_id):
+        raise HTTPException(status_code=404, detail="Property not found")
+
+    trimmed = (body.preset or "").strip()
+    new_preset = trimmed if trimmed else None
+    update_property_crawl_preset(conn, property_id, new_preset)
+    return {"ok": True, "default_crawl_preset": new_preset}
+
+
+@router.post("/properties/{property_id}/authorize")
+def authorize_property_crawl_route(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Call ``authorize_property_crawl`` for an existing property; 404 if the lookup is falsy."""
+    from website_profiling.db.property_store import authorize_property_crawl, get_property_by_id
+
+    exists = get_property_by_id(conn, property_id)
+    if not exists:
+        raise HTTPException(status_code=404, detail="Property not found")
+
+    authorize_property_crawl(conn, property_id)
+    return {"ok": True}
+
+
+@router.get("/properties/{property_id}/google/status")
+def property_google_status(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Return ``get_property_google_status`` for a property; 404 when the result is falsy."""
+    from website_profiling.db.property_store import get_property_google_status
+
+    google_status = get_property_google_status(conn, property_id)
+    if google_status:
+        return google_status
+    raise HTTPException(status_code=404, detail="Property not found")
+
+
+@router.post("/properties/{property_id}/google/test")
+def property_google_test(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Run the Google connection test for a property.
+
+    A dict result is returned unchanged; any other result is wrapped as
+    ``{"ok": True, "log": str(result)}``.
+
+    Raises:
+        HTTPException: 404 if the property lookup is falsy; 501 if importing
+            or running the tester raises ``ImportError``; 500, with the
+            exception text as detail, for any other failure.
+    """
+    from website_profiling.db.property_store import get_property_by_id
+
+    if not get_property_by_id(conn, property_id):
+        raise HTTPException(status_code=404, detail="Property not found")
+    try:
+        from website_profiling.integrations.google.test import test_google_connection
+        result = test_google_connection(conn, property_id)
+        return result if isinstance(result, dict) else {"ok": True, "log": str(result)}
+    except ImportError as exc:
+        # Chain explicitly, matching _apply_google_credentials_from_patch in this module.
+        raise HTTPException(status_code=501, detail="Google test unavailable") from exc
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+
+@router.get("/properties/{property_id}/google/properties")
+def property_google_properties(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """List the Google properties discovered for a property.
+
+    A dict result is returned unchanged; any other result is wrapped as
+    ``{"properties": result}``.
+
+    Raises:
+        HTTPException: 404 if the property lookup is falsy; 501 if importing
+            or running the discovery raises ``ImportError``; 500, with the
+            exception text as detail, for any other failure.
+    """
+    from website_profiling.db.property_store import get_property_by_id
+
+    if not get_property_by_id(conn, property_id):
+        raise HTTPException(status_code=404, detail="Property not found")
+    try:
+        from website_profiling.integrations.google.discover import list_google_properties
+        result = list_google_properties(conn, property_id)
+        return result if isinstance(result, dict) else {"properties": result}
+    except ImportError as exc:
+        # Chain explicitly, matching _apply_google_credentials_from_patch in this module.
+        raise HTTPException(status_code=501, detail="Google properties discovery unavailable") from exc
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+
+@router.get("/properties/{property_id}/google/links/status")
+def property_google_links_status(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Return the GSC links status for a property.
+
+    This endpoint is best-effort: any exception raised while importing or
+    calling the status reader is swallowed and answered with
+    ``{"hasData": False}``.
+
+    Raises:
+        HTTPException: 404 if the property lookup is falsy.
+    """
+    from website_profiling.db.property_store import get_property_by_id
+
+    if not get_property_by_id(conn, property_id):
+        raise HTTPException(status_code=404, detail="Property not found")
+    try:
+        from website_profiling.integrations.google.gsc_links_store import read_gsc_links_status
+        links_status = read_gsc_links_status(conn, property_id)
+    except Exception:
+        links_status = {"hasData": False}
+    return links_status
+
+
+@router.post("/properties/{property_id}/google/links/import")
+def property_google_links_import(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Import GSC links for a property.
+
+    A dict result is returned unchanged; any other result is wrapped as
+    ``{"ok": True, "imported": result}``.
+
+    Raises:
+        HTTPException: 404 if the property lookup is falsy; 501 if importing
+            or running the importer raises ``ImportError``; 500, with the
+            exception text as detail, for any other failure.
+    """
+    from website_profiling.db.property_store import get_property_by_id
+
+    if not get_property_by_id(conn, property_id):
+        raise HTTPException(status_code=404, detail="Property not found")
+    try:
+        from website_profiling.integrations.google.links import import_gsc_links
+        result = import_gsc_links(conn, property_id)
+        return result if isinstance(result, dict) else {"ok": True, "imported": result}
+    except ImportError as exc:
+        # Chain explicitly, matching _apply_google_credentials_from_patch in this module.
+        raise HTTPException(status_code=501, detail="GSC links import unavailable") from exc
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+
+def _apply_google_credentials_from_patch(
+    conn: Connection,
+    property_id: int,
+    body: GoogleCredentialsPatch,
+) -> None:
+    """Forward a credentials patch to ``apply_property_google_credentials_patch``.
+
+    Every attribute of ``body`` is passed through as a keyword argument.
+    ``fields_set`` names, in snake_case, only the attributes that are not
+    ``None``; it is ``None`` itself when all of them are.
+
+    Raises:
+        HTTPException: 400, with the error text as detail, when the store
+            raises ``ValueError``.
+    """
+    from website_profiling.db.property_store import apply_property_google_credentials_patch
+
+    values = {
+        "refresh_token": body.refreshToken,
+        "auth_mode": body.authMode,
+        "gsc_site_url": body.gscSiteUrl,
+        "ga4_property_id": body.ga4PropertyId,
+        "date_range_days": body.dateRangeDays,
+        "connected_email": body.connectedEmail,
+    }
+    provided = frozenset(name for name, value in values.items() if value is not None)
+
+    try:
+        apply_property_google_credentials_patch(
+            conn,
+            property_id,
+            **values,
+            fields_set=provided or None,
+        )
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+
+@router.patch("/properties/{property_id}/google/credentials")
+def patch_property_google_credentials(
+    property_id: int, body: GoogleCredentialsPatch, conn: DbDep
+) -> dict[str, Any]:
+    """Apply a credentials patch to an existing property.
+
+    Raises:
+        HTTPException: 404 if the property lookup is falsy; 400 is propagated
+            from ``_apply_google_credentials_from_patch``.
+    """
+    from website_profiling.db.property_store import get_property_by_id
+
+    exists = get_property_by_id(conn, property_id)
+    if not exists:
+        raise HTTPException(status_code=404, detail="Property not found")
+
+    _apply_google_credentials_from_patch(conn, property_id, body)
+    return {"ok": True}
+
+
+@router.post("/properties/{property_id}/google/credentials")
+def post_property_google_credentials(
+    property_id: int, body: GoogleCredentialsPostBody, conn: DbDep
+) -> dict[str, Any]:
+    """Save Google credentials for a property and return its public status.
+
+    Only fields the client actually sent are copied into the patch:
+    ``gscSiteUrl`` and ``ga4PropertyId`` as sent, ``dateRangeDays`` only when
+    it is not ``None``. A non-blank ``refreshToken`` is stored trimmed and
+    also sets ``authMode`` to "oauth".
+
+    Raises:
+        HTTPException: 404 if the property lookup is falsy; 400 is propagated
+            from ``_apply_google_credentials_from_patch``.
+    """
+    from website_profiling.db.property_store import get_property_by_id, get_property_google_public_status
+
+    if not get_property_by_id(conn, property_id):
+        raise HTTPException(status_code=404, detail="Property not found")
+
+    sent = body.model_fields_set
+    patch = GoogleCredentialsPatch()
+    for field_name in ("gscSiteUrl", "ga4PropertyId"):
+        if field_name in sent:
+            setattr(patch, field_name, getattr(body, field_name))
+    if body.dateRangeDays is not None and "dateRangeDays" in sent:
+        patch.dateRangeDays = body.dateRangeDays
+
+    token = body.refreshToken
+    if isinstance(token, str) and token.strip():
+        patch.refreshToken = token.strip()
+        patch.authMode = "oauth"
+
+    _apply_google_credentials_from_patch(conn, property_id, patch)
+    public_status = get_property_google_public_status(conn, property_id)
+    return {"ok": True, "status": public_status}
+
+
+@router.post("/properties/{property_id}/google/disconnect")
+def post_property_google_disconnect(property_id: int, conn: DbDep) -> dict[str, Any]:
+    """Call ``disconnect_property_google`` for an existing property; 404 if the lookup is falsy."""
+    from website_profiling.db.property_store import disconnect_property_google, get_property_by_id
+
+    exists = get_property_by_id(conn, property_id)
+    if not exists:
+        raise HTTPException(status_code=404, detail="Property not found")
+
+    disconnect_property_google(conn, property_id)
+    return {"ok": True}
diff --git a/src/website_profiling/api/routers/report.py b/src/website_profiling/api/routers/report.py
new file mode 100644
index 00000000..507c6aaf
--- /dev/null
+++ b/src/website_profiling/api/routers/report.py
@@ -0,0 +1,83 @@
+"""Report data routers — /api/report/*."""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from psycopg import Connection
+
+from ..deps import get_db
+from ..services.report_loader import (
+ SECTION_KEYS,
+ get_crawl_preview_payload,
+ get_mobile_desktop_delta,
+ get_report_payload,
+ list_audit_history,
+ list_crawl_runs,
+ list_reports,
+)
+
+router = APIRouter(prefix="/report", tags=["report"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+@router.get("/meta")
+def report_meta(conn: DbDep) -> dict[str, Any]:
+    """Return the report listing and the crawl-run listing from the report loader."""
+    reports = list_reports(conn)
+    crawl_runs = list_crawl_runs(conn)
+    return {"reports": reports, "crawlRuns": crawl_runs}
+
+
+@router.get("/payload")
+def report_payload(
+    conn: DbDep,
+    reportId: Optional[int] = Query(None),
+    domain: Optional[str] = Query(None),
+    section: Optional[str] = Query(None),
+) -> dict[str, Any]:
+    """Return a report payload, echoing ``section`` when a truthy one was requested.
+
+    Raises:
+        HTTPException: 400 if ``section`` is given but not in ``SECTION_KEYS``;
+            404 if the loader returns ``None``.
+    """
+    if section is not None and section not in SECTION_KEYS:
+        raise HTTPException(status_code=400, detail="Invalid section")
+
+    loaded = get_report_payload(conn, reportId, domain, section)
+    if loaded is None:
+        raise HTTPException(status_code=404, detail="Report not found")
+
+    response: dict[str, Any] = {"payload": loaded}
+    if section:
+        response["section"] = section
+    return response
+
+
+@router.get("/history")
+def report_history(
+    conn: DbDep,
+    propertyId: Optional[int] = Query(None),
+    domain: Optional[str] = Query(None),
+    limit: int = Query(20, ge=1, le=100),
+) -> dict[str, Any]:
+    """Return ``list_audit_history`` for the given property, domain and limit.
+
+    ``limit`` defaults to 20 and is constrained by the query declaration to
+    the range 1 to 100.
+    """
+    rows = list_audit_history(conn, propertyId, domain, limit)
+    return {"history": rows}
+
+
+@router.get("/crawl-payload")
+def crawl_payload(
+    conn: DbDep,
+    crawlRunId: Optional[int] = Query(None),
+) -> dict[str, Any]:
+    """Return the crawl preview payload for one crawl run.
+
+    Raises:
+        HTTPException: 400 if ``crawlRunId`` is missing, zero or negative;
+            404, with the error text as detail, if the loader raises
+            ``ValueError``.
+    """
+    if not crawlRunId or crawlRunId <= 0:
+        raise HTTPException(status_code=400, detail="Invalid crawlRunId")
+    try:
+        payload = get_crawl_preview_payload(conn, crawlRunId)
+    except ValueError as exc:
+        # Chain the cause so the loader's traceback stays attached to the 404.
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    return {"payload": payload}
+
+
+@router.get("/mobile-delta")
+def mobile_delta(
+    conn: DbDep,
+    id: Optional[int] = Query(None),
+) -> dict[str, Any]:
+    """Return ``get_mobile_desktop_delta`` for ``id``; 400 when ``id`` is missing or zero."""
+    if id:
+        return {"deltas": get_mobile_desktop_delta(conn, id)}
+    raise HTTPException(status_code=400, detail="id required")
diff --git a/src/website_profiling/api/routers/report_audit_tool.py b/src/website_profiling/api/routers/report_audit_tool.py
new file mode 100644
index 00000000..53a11ab0
--- /dev/null
+++ b/src/website_profiling/api/routers/report_audit_tool.py
@@ -0,0 +1,40 @@
+"""Audit tool dispatch — POST /api/report/audit-tool."""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+from psycopg import Connection
+from pydantic import BaseModel
+
+from ..deps import get_db
+
+router = APIRouter(prefix="/report", tags=["report-audit-tool"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+class AuditToolBody(BaseModel):
+ """Body of POST /report/audit-tool: tool name, property, optional report and tool arguments."""
+ toolName: str
+ propertyId: int
+ reportId: Optional[int] = None
+ args: dict[str, Any] = {}
+
+
+@router.post("/audit-tool")
+def run_audit_tool(body: AuditToolBody, conn: DbDep) -> dict[str, Any]:
+    """Dispatch one audit tool by name and return its result under "result".
+
+    Raises:
+        HTTPException: 400 if ``toolName`` or ``propertyId`` is falsy; 500,
+            with the exception text as detail, for any failure while
+            importing or dispatching the tool.
+    """
+    if not (body.toolName and body.propertyId):
+        raise HTTPException(status_code=400, detail="toolName and propertyId required")
+
+    try:
+        from website_profiling.tools.audit_tools import AuditToolContext
+        from website_profiling.tools.audit_tools.registry import dispatch_tool
+
+        tool_context = AuditToolContext(property_id=body.propertyId, report_id=body.reportId)
+        outcome = dispatch_tool(body.toolName, body.args, context=tool_context, conn=conn)
+        return {"result": outcome}
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc))
diff --git a/src/website_profiling/api/routers/report_export.py b/src/website_profiling/api/routers/report_export.py
new file mode 100644
index 00000000..024548a0
--- /dev/null
+++ b/src/website_profiling/api/routers/report_export.py
@@ -0,0 +1,72 @@
+"""Report export downloads — /api/report/export*."""
+from __future__ import annotations
+
+from typing import Annotated, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi.responses import Response
+from psycopg import Connection
+
+from ..deps import get_db
+
+router = APIRouter(prefix="/report", tags=["report-export"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+EXPORT_FORMATS = {"csv", "json"}
+
+
+@router.get("/export")
+def export_report(
+    conn: DbDep,
+    format: str = Query("csv"),
+    reportId: Optional[int] = Query(None),
+) -> Response:
+    """Return a report as a downloadable CSV or JSON attachment.
+
+    Exporter output that is not already bytes is encoded with ``str.encode``;
+    JSON exporter output that is neither str nor bytes is serialised with
+    ``json.dumps`` first.
+
+    Raises:
+        HTTPException: 400 for a format outside ``EXPORT_FORMATS``; 501 if
+            the export code raises ``ImportError``; 500, with the exception
+            text as detail, for any other failure.
+    """
+    if format not in EXPORT_FORMATS:
+        raise HTTPException(status_code=400, detail=f"Invalid format. Use one of {sorted(EXPORT_FORMATS)}")
+
+    try:
+        if format == "csv":
+            from website_profiling.tools.export_audit import export_audit_csv as _export
+            exported = _export(conn, reportId)
+            csv_bytes = exported if isinstance(exported, bytes) else exported.encode()
+            return Response(
+                content=csv_bytes,
+                media_type="text/csv",
+                headers={"Content-Disposition": "attachment; filename=report.csv"},
+            )
+        if format == "json":
+            import json
+            from website_profiling.tools.export_audit import export_audit_json as _export
+            exported = _export(conn, reportId)
+            serialised = exported if isinstance(exported, (str, bytes)) else json.dumps(exported)
+            json_bytes = serialised if isinstance(serialised, bytes) else serialised.encode()
+            return Response(
+                content=json_bytes,
+                media_type="application/json",
+                headers={"Content-Disposition": "attachment; filename=report.json"},
+            )
+    except ImportError as exc:
+        raise HTTPException(status_code=501, detail=f"Export module unavailable: {exc}")
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc))
+
+    # Reached only if neither format branch returned.
+    raise HTTPException(status_code=500, detail="Export failed")
+
+
+@router.get("/export-sitemap")
+def export_sitemap(
+    conn: DbDep,
+    reportId: Optional[int] = Query(None),
+) -> Response:
+    """Return the sitemap export as a downloadable ``sitemap.xml`` attachment.
+
+    Exporter output that is not already bytes is encoded with ``str.encode``.
+
+    Raises:
+        HTTPException: 501 if the export code raises ``ImportError``; 500,
+            with the exception text as detail, for any other failure.
+    """
+    try:
+        from website_profiling.tools.export_sitemap import export_sitemap as _export
+        exported = _export(conn, reportId)
+        xml_bytes = exported if isinstance(exported, bytes) else exported.encode()
+        return Response(
+            content=xml_bytes,
+            media_type="application/xml",
+            headers={"Content-Disposition": "attachment; filename=sitemap.xml"},
+        )
+    except ImportError:
+        raise HTTPException(status_code=501, detail="Sitemap export unavailable")
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc))
+
diff --git a/src/website_profiling/api/routers/report_portfolio.py b/src/website_profiling/api/routers/report_portfolio.py
new file mode 100644
index 00000000..28dac5a0
--- /dev/null
+++ b/src/website_profiling/api/routers/report_portfolio.py
@@ -0,0 +1,54 @@
+"""Portfolio report widget — GET /api/report/portfolio."""
+from __future__ import annotations
+
+from typing import Annotated, Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from psycopg import Connection
+
+from ..deps import get_db
+from ..services.portfolio_loader import get_portfolio_response
+
+router = APIRouter(prefix="/report", tags=["report-portfolio"])
+
+DbDep = Annotated[Connection, Depends(get_db)]
+
+
+@router.get("/portfolio")
+def report_portfolio(
+    conn: DbDep,
+    widget: str = Query("full"),
+    ids: Optional[str] = Query(None),
+    reportId: Optional[int] = Query(None),
+    crawlRunId: Optional[int] = Query(None),
+) -> dict[str, Any]:
+    """Return portfolio data — groups, crawl history, summary, or single card.
+
+    ``ids`` is a comma-separated list; tokens that are not integers or are
+    not positive are silently dropped.
+
+    Raises:
+        HTTPException: 400 for an unknown ``widget`` or a "card" request with
+            neither ``reportId`` nor ``crawlRunId``; 500, with the exception
+            text as detail, if the loader fails.
+    """
+    if widget not in {"full", "groups", "summary", "card"}:
+        raise HTTPException(status_code=400, detail="Invalid widget")
+
+    missing_card_target = reportId is None and crawlRunId is None
+    if widget == "card" and missing_card_target:
+        raise HTTPException(
+            status_code=400, detail="reportId or crawlRunId required for card widget"
+        )
+
+    id_list: list[int] = []
+    for token in (ids.split(",") if ids else ()):
+        try:
+            parsed = int(token.strip())
+        except ValueError:
+            continue
+        if parsed > 0:
+            id_list.append(parsed)
+
+    try:
+        return get_portfolio_response(
+            conn,
+            widget=widget,
+            ids=id_list,
+            report_id=reportId,
+            crawl_run_id=crawlRunId,
+        )
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc))
diff --git a/src/website_profiling/api/routers/schedule.py b/src/website_profiling/api/routers/schedule.py
new file mode 100644
index 00000000..fef4b7c2
--- /dev/null
+++ b/src/website_profiling/api/routers/schedule.py
@@ -0,0 +1,22 @@
+"""Scheduled crawl checks — /api/schedule/*."""
+from __future__ import annotations
+
+from typing import Any
+
+from fastapi import APIRouter, HTTPException
+
+router = APIRouter(tags=["schedule"])
+
+
+@router.post("/schedule/check")
+def schedule_check() -> dict[str, Any]:
+    """Run the schedule runner once and report its result.
+
+    A dict result is returned unchanged; anything else becomes
+    ``{"ok": True}``. An ``ImportError`` raised while importing or running
+    the runner is also answered with ``{"ok": True}``.
+
+    Raises:
+        HTTPException: 500, with the exception text as detail, for any other
+            failure.
+    """
+    try:
+        from website_profiling.tools import schedule_runner
+
+        outcome = schedule_runner.run()
+    except ImportError:
+        return {"ok": True}
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=str(exc))
+    return outcome if isinstance(outcome, dict) else {"ok": True}
diff --git a/src/website_profiling/api/schemas/__init__.py b/src/website_profiling/api/schemas/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/website_profiling/api/schemas/chat.py b/src/website_profiling/api/schemas/chat.py
new file mode 100644
index 00000000..ca4b2d8b
--- /dev/null
+++ b/src/website_profiling/api/schemas/chat.py
@@ -0,0 +1,41 @@
+"""Chat request/response Pydantic schemas."""
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from pydantic import BaseModel
+
+
+class ChatRequest(BaseModel):
+ """Chat request payload: session, property, message text and an optional report ID."""
+ sessionId: int
+ propertyId: int
+ message: str
+ reportId: Optional[int] = None
+
+
+class ChatSessionCreate(BaseModel):
+ """Payload for creating a chat session; ``title`` defaults to "New chat"."""
+ propertyId: int
+ title: str = "New chat"
+
+
+class ChatSessionResponse(BaseModel):
+ """Response shape for a chat session; both timestamps are carried as strings."""
+ id: int
+ propertyId: int
+ title: str
+ createdAt: str
+ updatedAt: str
+
+
+class ChatMessageResponse(BaseModel):
+ """Response shape for one chat message, with optional tool name, arguments and result."""
+ id: int
+ role: str
+ content: str
+ tool_name: Optional[str] = None
+ tool_args: Optional[dict[str, Any]] = None
+ tool_result: Optional[dict[str, Any]] = None
+ created_at: str
+
+
+class ArtifactUpdateBody(BaseModel):
+ """Update payload for an artifact; ``title`` and ``pinned`` are both optional."""
+ title: Optional[str] = None
+ pinned: Optional[bool] = None
diff --git a/src/website_profiling/api/schemas/pipeline.py b/src/website_profiling/api/schemas/pipeline.py
new file mode 100644
index 00000000..6580def9
--- /dev/null
+++ b/src/website_profiling/api/schemas/pipeline.py
@@ -0,0 +1,155 @@
+"""Pipeline job and config Pydantic schemas."""
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+# ── Config field type registry (mirrors pipelineConfigSchema.ts) ─────────────
+
+# bool fields — coerce_pipeline_state maps these to Python bool
+# (True only for the value True or the string "true")
+_BOOL_KEYS: frozenset[str] = frozenset({
+ "run_crawl", "run_report", "run_keywords", "run_lighthouse", "run_plot",
+ "run_security", "run_enrich", "run_google", "run_page_markdown",
+ "ignore_robots", "allow_external", "store_outlinks", "store_content_excerpt",
+ "store_page_html", "run_content_analysis", "probe_image_inventory",
+ "compare_mobile_desktop", "lighthouse_run_mobile", "enable_ner",
+ "enable_rich_results_validation", "ner_only_top_pages",
+ "enable_hreflang_validation", "enable_crux_summary",
+ "enable_executive_summary", "enable_google_keyword_planner",
+ "enable_competitor_keywords", "export_csv", "export_json", "export_html",
+ "export_pdf", "enable_bing_backlinks",
+})
+
+# tristate fields — 'auto' | 'true' | 'false'
+# (coerce_pipeline_state falls back to 'auto' for any other value)
+_TRISTATE_KEYS: frozenset[str] = frozenset({
+ "crawl_render_mode_tristate",
+})
+
+# Keys written internally by the server (not shown in UI)
+INTERNAL_PIPELINE_KEYS: frozenset[str] = frozenset({"active_property_id"})
+
+# Allow-list of command values; both None and "" are members.
+# NOTE(review): presumably checked by the run endpoint before a job is queued — confirm.
+ALLOWED_COMMANDS: frozenset[str | None] = frozenset({
+ None, "", "crawl", "report", "plot", "lighthouse", "keywords",
+ "keywords --enrich-google", "warnings", "enrich", "google", "page-markdown",
+})
+
+
+def coerce_pipeline_state(raw: dict[str, Any]) -> dict[str, Any]:
+    """Coerce raw state values to correct Python types, mirroring run/route.ts logic.
+
+    * keys starting with ``llm_`` are dropped;
+    * keys in ``_BOOL_KEYS`` become ``True`` only for ``True`` or ``"true"``;
+    * keys in ``_TRISTATE_KEYS`` become ``"true"``, ``"false"`` or ``"auto"``;
+    * every other value becomes ``str(value)``, with ``None`` mapped to ``""``.
+    """
+    def _tristate(value: Any) -> str:
+        # Falsy values and anything unrecognised fall back to "auto".
+        text = str(value or "auto").lower()
+        return text if text in ("true", "false") else "auto"
+
+    coerced: dict[str, Any] = {}
+    for name, value in raw.items():
+        if name.startswith("llm_"):
+            continue
+        if name in _BOOL_KEYS:
+            coerced[name] = value is True or value == "true"
+        elif name in _TRISTATE_KEYS:
+            coerced[name] = _tristate(value)
+        elif value is None:
+            coerced[name] = ""
+        else:
+            coerced[name] = str(value)
+    return coerced
+
+
+def coerce_llm_state(raw: dict[str, Any]) -> dict[str, Any]:
+ """Coerce LLM config state, mirroring run/route.ts llm coercion.
+
+ Keys ending in ``_masked`` are not coerced themselves. The two boolean LLM
+ keys become ``True`` only for ``True`` or ``"true"``; every other value
+ becomes ``str(value)``, with ``None`` mapped to ``""``. A ``<key>_masked``
+ flag is copied to the output only when its raw value is exactly ``True``.
+ """
+ # LLM fields that are booleans
+ _LLM_BOOL_KEYS = frozenset({
+ "llm_chat_unlimited_tool_rounds",
+ "llm_reasoning_enabled",
+ })
+ out: dict[str, Any] = {}
+ for key, val in raw.items():
+ if key.endswith("_masked"):
+ continue
+ if key in _LLM_BOOL_KEYS:
+ out[key] = val is True or val == "true"
+ else:
+ out[key] = "" if val is None else str(val)
+ # preserve _masked flags
+ # NOTE(review): indentation is not visible in this view — presumably this
+ # check runs once per key inside the loop; confirm against the real file.
+ if raw.get(f"{key}_masked") is True:
+ out[f"{key}_masked"] = True
+ return out
+
+
+def validate_pipeline_run(state: dict[str, Any], command: str | None) -> list[str]:
+    """Return validation error messages (empty list = OK).
+
+    The only rule checked is that ``state["start_url"]`` is non-blank when
+    the run needs one:
+
+    * always for the "crawl", "report" and "keywords" commands;
+    * for a run without a command, when ``run_crawl`` or ``run_report`` is
+      enabled (each defaults to enabled; string flags count as enabled only
+      when they equal "true", ignoring case);
+    * never for any other command.
+
+    ``None`` and a blank string both count as "no command", because
+    ``ALLOWED_COMMANDS`` accepts both; previously a blank string skipped the
+    check entirely.
+    """
+    def _enabled(flag_name: str) -> bool:
+        flag = state.get(flag_name, True)
+        if isinstance(flag, str):
+            return flag.lower() == "true"
+        return bool(flag)
+
+    no_command = command is None or not command.strip()
+    if no_command:
+        needs_start_url = _enabled("run_crawl") or _enabled("run_report")
+    else:
+        needs_start_url = command in ("crawl", "report", "keywords")
+
+    errors: list[str] = []
+    start_url = str(state.get("start_url") or "").strip()
+    if needs_start_url and not start_url:
+        errors.append("Site URL is required. Enter it in Audit settings before continuing.")
+    return errors
+
+
+# ── Request / response models ─────────────────────────────────────────────────
+
+class UnknownKeyEntry(BaseModel):
+ """One key/value pair, the element type of ``RunPostBody.unknownKeys``."""
+ key: str
+ value: str
+
+
+class RunPostBody(BaseModel):
+ """Request body for a pipeline run; every field is optional.
+
+ ``unknownKeys`` defaults to a fresh empty list per instance.
+ NOTE(review): presumably consumed by the run endpoint, which is not visible here — confirm.
+ """
+ command: Optional[str] = None
+ state: Optional[dict[str, Any]] = None
+ unknownKeys: list[UnknownKeyEntry] = Field(default_factory=list)
+ llmState: Optional[dict[str, Any]] = None
+ propertyId: Optional[int] = None
+ python: Optional[str] = None
+ repoRoot: Optional[str] = None
+
+
+class RunResponse(BaseModel):
+ """Response carrying a single job ID."""
+ jobId: str
+
+
+class JobResponse(BaseModel):
+ """Response shape describing one pipeline job.
+
+ Only ``id``, ``jobType`` and ``status`` are required; ``log`` defaults to
+ an empty string and ``logTruncated`` to False.
+ """
+ id: str
+ jobType: str
+ status: str
+ exitCode: Optional[int] = None
+ log: str = ""
+ error: Optional[str] = None
+ logTruncated: bool = False
+ propertyId: Optional[int] = None
+ startedAt: Optional[str] = None
+ finishedAt: Optional[str] = None
+ command: Optional[str] = None
+
+
+class JobsListResponse(BaseModel):
+ """Response shape for a job listing: job dicts, an optional active job and a reconciled count.
+
+ NOTE(review): ``reconciled`` presumably counts jobs whose state was repaired while listing — confirm.
+ """
+ jobs: list[dict[str, Any]]
+ active: Optional[dict[str, Any]] = None
+ reconciled: int = 0
+
+
+class CancelResponse(BaseModel):
+ """Response shape for a cancel request: success flag, job status and an optional error."""
+ ok: bool
+ status: str
+ error: Optional[str] = None
+
+
+class PauseResponse(BaseModel):
+ """Response model of POST /jobs/{job_id}/pause."""
+ ok: bool
+ error: Optional[str] = None
+
+
+class ResumeResponse(BaseModel):
+ """Response model of POST /jobs/{job_id}/resume; ``newJobId`` names the queued resume job."""
+ ok: bool
+ newJobId: Optional[str] = None
+ error: Optional[str] = None
diff --git a/src/website_profiling/api/services/__init__.py b/src/website_profiling/api/services/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/website_profiling/api/services/portfolio_loader.py b/src/website_profiling/api/services/portfolio_loader.py
new file mode 100644
index 00000000..c449c487
--- /dev/null
+++ b/src/website_profiling/api/services/portfolio_loader.py
@@ -0,0 +1,606 @@
+"""Portfolio grouping for /api/report/portfolio — port of web/src/lib/homePortfolio.ts."""
+from __future__ import annotations
+
+import re
+from datetime import datetime
+from typing import Any, Callable, Optional
+from urllib.parse import urlparse
+
+from psycopg import Connection
+
+from website_profiling.db.report_store import read_report_payload
+
+from .report_loader import (
+ list_crawl_run_summaries,
+ list_crawl_runs,
+ list_reports,
+ slice_payload_for_section,
+)
+
+PORTFOLIO_CATEGORY_ORDER = (
+ "technical_seo",
+ "performance",
+ "core_web_vitals",
+ "link_health",
+ "security",
+ "html_accessibility",
+ "mobile",
+ "intelligence",
+)
+
+EMPTY_ISSUE_COUNTS = {"critical": 0, "high": 0, "medium": 0, "low": 0}
+
+DATA_SOURCE_IDS = frozenset({
+ "crawl",
+ "lighthouse",
+ "search_console",
+ "analytics",
+ "backlinks",
+})
+
+UNKNOWN_BRAND = "Unknown property"
+EM_DASH = "—"
+
+
+def _extract_hostname(url: str | None) -> str:
+ if not url:
+ return ""
+ try:
+ host = urlparse(str(url)).hostname
+ return host.lower() if host else ""
+ except Exception:
+ return ""
+
+
+def _slugify_domain(name: str | None) -> str:
+ if not name:
+ return ""
+ s = re.sub(r"[^a-z0-9]+", "-", str(name).strip().lower()).strip("-")
+ return s
+
+
+def _canonical_domain_from_payload(
+ payload: dict[str, Any],
+ start_url_by_run_id: dict[int, str],
+) -> str:
+ run_id = payload.get("crawl_run_id")
+ run_id = int(run_id) if run_id is not None else None
+ run_start = start_url_by_run_id.get(run_id, "") if run_id is not None else ""
+ top_pages = payload.get("top_pages") or []
+ links = payload.get("links") or []
+ fallback = ""
+ if top_pages and isinstance(top_pages[0], dict):
+ fallback = str(top_pages[0].get("url") or "")
+ if not fallback and links and isinstance(links[0], dict):
+ fallback = str(links[0].get("url") or "")
+ start_domain = _extract_hostname(run_start)
+ fallback_domain = _extract_hostname(fallback)
+ return (start_domain or fallback_domain or "").lower()
+
+
+def _crawled_url_count(payload: dict[str, Any]) -> int:
+ scope = (payload.get("report_meta") or {}).get("crawl_scope") or {}
+ pages = scope.get("pages_crawled")
+ if pages is not None:
+ try:
+ n = int(pages)
+ if n > 0:
+ return n
+ except (TypeError, ValueError):
+ pass
+ summary = payload.get("summary") or {}
+ total = summary.get("total_urls")
+ if total is not None:
+ try:
+ n = int(total)
+ if n > 0:
+ return n
+ except (TypeError, ValueError):
+ pass
+ links = payload.get("links") or []
+ return len(links) if links else 0
+
+
+def _score_from_categories(categories: list[dict[str, Any]]) -> int | None:
+ nums = [
+ float(c["score"])
+ for c in categories
+ if isinstance(c.get("score"), (int, float))
+ ]
+ if not nums:
+ return None
+ return round(sum(nums) / len(nums))
+
+
+def _issue_counts_from_payload(payload: dict[str, Any]) -> tuple[dict[str, int], int]:
+ counts = dict(EMPTY_ISSUE_COUNTS)
+ for cat in payload.get("categories") or []:
+ for iss in cat.get("issues") or []:
+ p = str(iss.get("priority") or "Medium")
+ if p == "Critical":
+ counts["critical"] += 1
+ elif p == "High":
+ counts["high"] += 1
+ elif p == "Low":
+ counts["low"] += 1
+ else:
+ counts["medium"] += 1
+ total = sum(counts.values())
+ return counts, total
+
+
+def _category_score(payload: dict[str, Any], cat_id: str) -> int | None:
+ for cat in payload.get("categories") or []:
+ if cat.get("id") == cat_id and isinstance(cat.get("score"), (int, float)):
+ return round(float(cat["score"]))
+ return None
+
+
+def _lh_scores(payload: dict[str, Any]) -> tuple[int | None, int | None]:
+ summary = payload.get("lighthouse_summary")
+ if not isinstance(summary, dict):
+ return None, None
+ mm = summary.get("median_metrics") or {}
+ cs = summary.get("category_scores") or {}
+ perf_raw = mm.get("performance_score") or cs.get("performance")
+ seo_raw = mm.get("seo_score") or cs.get("seo")
+ perf = round(float(perf_raw)) if isinstance(perf_raw, (int, float)) else None
+ seo = round(float(seo_raw)) if isinstance(seo_raw, (int, float)) else None
+ return perf, seo
+
+
+def _category_snapshots(payload: dict[str, Any]) -> list[dict[str, Any]]:
+ cats = payload.get("categories") or []
+ by_id = {str(c.get("id") or ""): c for c in cats}
+ out: list[dict[str, Any]] = []
+
+ def push(cat_id: str) -> None:
+ cat = by_id.get(cat_id)
+ if not cat or not isinstance(cat.get("score"), (int, float)):
+ return
+ out.append({
+ "id": cat_id,
+ "name": str(cat.get("name") or cat_id),
+ "score": round(float(cat["score"])),
+ "issueCount": len(cat.get("issues") or []),
+ })
+
+ for cat_id in PORTFOLIO_CATEGORY_ORDER:
+ push(cat_id)
+ for cat in cats:
+ cat_id = str(cat.get("id") or "")
+ if not cat_id or any(r["id"] == cat_id for r in out):
+ continue
+ if not isinstance(cat.get("score"), (int, float)):
+ continue
+ out.append({
+ "id": cat_id,
+ "name": str(cat.get("name") or cat_id),
+ "score": round(float(cat["score"])),
+ "issueCount": len(cat.get("issues") or []),
+ })
+ return out
+
+
+def _seo_signals(payload: dict[str, Any]) -> dict[str, int] | None:
+ s = payload.get("seo_health")
+ if not isinstance(s, dict):
+ return None
+ return {
+ "missingTitles": int(s.get("missing_title") or 0),
+ "missingMetaDesc": int(s.get("missing_meta_desc") or 0),
+ "thinContent": int(s.get("thin_content") or 0),
+ "h1Issues": int(s.get("h1_zero") or 0) + int(s.get("h1_multi") or 0),
+ }
+
+
+def _median_word_count(payload: dict[str, Any]) -> int | None:
+ median = (payload.get("content_analytics") or {}).get("word_count_stats", {}).get("median")
+ return round(float(median)) if isinstance(median, (int, float)) else None
+
+
+def _median_response_ms(payload: dict[str, Any]) -> int | None:
+ median = (payload.get("response_time_stats") or {}).get("p50")
+ return round(float(median)) if isinstance(median, (int, float)) else None
+
+
+def _data_sources(payload: dict[str, Any]) -> list[str] | None:
+ raw = (payload.get("report_meta") or {}).get("data_sources") or []
+ out = [str(s) for s in raw if str(s) in DATA_SOURCE_IDS]
+ return out or None
+
+
+def _crawl_config_from_payload(
+ payload: dict[str, Any],
+ run_meta: dict[str, Any] | None,
+) -> dict[str, Any] | None:
+ scope = (payload.get("report_meta") or {}).get("crawl_scope")
+ if not scope and not (run_meta or {}).get("render_mode") and not (run_meta or {}).get("discovery_mode"):
+ return None
+ cfg: dict[str, Any] = dict(scope) if isinstance(scope, dict) else {}
+ if run_meta:
+ if run_meta.get("render_mode") and "render_mode" not in cfg:
+ cfg["render_mode"] = run_meta["render_mode"]
+ if run_meta.get("discovery_mode"):
+ cfg["discovery_mode"] = run_meta["discovery_mode"]
+ return cfg or None
+
+
+def _crawl_config_from_summary(row: dict[str, Any]) -> dict[str, Any] | None:
+ if not row.get("render_mode") and not row.get("discovery_mode") and not row.get("url_count"):
+ return None
+ return {
+ "pages_crawled": row.get("url_count"),
+ "render_mode": row.get("render_mode"),
+ "discovery_mode": row.get("discovery_mode"),
+ }
+
+
+def _to_display_datetime(value: str | None) -> str:
+ if not value:
+ return ""
+ try:
+ if isinstance(value, datetime):
+ return value.isoformat()
+ dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
+ return dt.isoformat()
+ except Exception:
+ return str(value)
+
+
+def _generated_at_ms(value: str | None) -> float:
+ if not value:
+ return 0.0
+ try:
+ return datetime.fromisoformat(str(value).replace("Z", "+00:00")).timestamp() * 1000
+ except Exception:
+ return 0.0
+
+
+def _title_coverage_pct(with_title: int, url_count: int) -> int:
+ if url_count <= 0:
+ return 0
+ return round((with_title / url_count) * 100)
+
+
+def load_portfolio_maps(conn: Connection) -> dict[str, Any]:
+ crawl_rows = list_crawl_runs(conn)
+ start_url_by_run_id = {int(r["id"]): r["start_url"] for r in crawl_rows}
+ run_created_at_by_run_id = {int(r["id"]): r["created_at"] for r in crawl_rows}
+ run_meta_by_run_id = {
+ int(r["id"]): {
+ "render_mode": r.get("render_mode"),
+ "discovery_mode": r.get("discovery_mode"),
+ }
+ for r in crawl_rows
+ }
+ crawl_summaries = list_crawl_run_summaries(conn)
+ return {
+ "start_url_by_run_id": start_url_by_run_id,
+ "run_created_at_by_run_id": run_created_at_by_run_id,
+ "run_meta_by_run_id": run_meta_by_run_id,
+ "crawl_summaries": crawl_summaries,
+ }
+
+
+def compute_domain_groups(
+ report_list: list[dict[str, Any]],
+ maps: dict[str, Any],
+ get_payload: Callable[[int], dict[str, Any] | None],
+) -> list[dict[str, Any]]:
+ """Build one portfolio group per brand from stored report payloads.
+
+ For every entry of ``report_list`` the payload is fetched via ``get_payload``
+ (entries without a payload are skipped) and condensed into a group dict.
+ Groups are keyed by brand; when several reports share a brand key only the
+ one with the greatest ``generatedAtMs`` is kept. The result is sorted by
+ ``generatedAtMs``, newest first.
+
+ ``maps`` is expected to carry ``start_url_by_run_id``,
+ ``run_created_at_by_run_id`` and ``run_meta_by_run_id``, each keyed by
+ integer crawl run id (see ``load_portfolio_maps``).
+ """
+ start_url_by_run_id: dict[int, str] = maps["start_url_by_run_id"]
+ run_created_at_by_run_id: dict[int, str] = maps["run_created_at_by_run_id"]
+ run_meta_by_run_id: dict[int, dict[str, Any]] = maps["run_meta_by_run_id"]
+ brand_map: dict[str, dict[str, Any]] = {}
+
+ for r in report_list:
+ report_id = int(r["id"])
+ payload = get_payload(report_id)
+ if not payload:
+ continue
+
+ run_id = payload.get("crawl_run_id")
+ run_id_int = int(run_id) if run_id is not None else None
+ run_start_url = start_url_by_run_id.get(run_id_int, "") if run_id_int is not None else ""
+ # Fallback URL: first top page, otherwise first link.
+ top = payload.get("top_pages") or []
+ links = payload.get("links") or []
+ if top and isinstance(top[0], dict):
+ fallback_url = str(top[0].get("url") or "")
+ elif links and isinstance(links[0], dict):
+ fallback_url = str(links[0].get("url") or "")
+ else:
+ fallback_url = ""
+ crawl_url = (run_start_url or fallback_url or "").strip()
+ start_domain = _extract_hostname(run_start_url)
+ fallback_domain = _extract_hostname(crawl_url)
+ domain_name = start_domain or fallback_domain or str(payload.get("site_name") or UNKNOWN_BRAND)
+ # Brand key precedence: start-URL host, then "fallback:<host>", then a per-report key
+ # so reports without any host never merge with each other.
+ brand_key = start_domain or (f"fallback:{fallback_domain}" if fallback_domain else f"report:{report_id}")
+
+ summary = payload.get("summary") or {}
+ status_counts = {
+ "s2xx": int(summary.get("count_2xx") or 0),
+ "s3xx": int(summary.get("count_3xx") or 0),
+ "s4xx": int(summary.get("count_4xx") or 0),
+ "s5xx": int(summary.get("count_5xx") or 0),
+ "other": int(summary.get("count_error") or 0),
+ }
+ url_count = _crawled_url_count(payload)
+ success_pct = round((status_counts["s2xx"] / url_count) * 100) if url_count > 0 else 0
+ # A missing category average is reported as 0 here.
+ health_score = _score_from_categories(payload.get("categories") or []) or 0
+ run_created_at = run_created_at_by_run_id.get(run_id_int, "") if run_id_int is not None else ""
+ last_crawl = _to_display_datetime(
+ run_created_at or payload.get("crawl_run_created_at") or payload.get("report_generated_at") or r.get("generated_at")
+ )
+ last_audit = _to_display_datetime(payload.get("report_generated_at") or r.get("generated_at"))
+ # Recency is taken from the report list row, not from the payload.
+ generated_at_ms = _generated_at_ms(r.get("generated_at"))
+ issue_counts, total_issues = _issue_counts_from_payload(payload)
+ perf_score, seo_score = _lh_scores(payload)
+ technical_seo_score = _category_score(payload, "technical_seo")
+ # Prefer the stored success rate; otherwise derive it from the 2xx share.
+ success_rate_raw = summary.get("success_rate")
+ success_rate = (
+ round(float(success_rate_raw))
+ if isinstance(success_rate_raw, (int, float))
+ else (success_pct if url_count > 0 else None)
+ )
+ crawl_duration_s = (
+ round(float(summary["crawl_time_s"]))
+ if isinstance(summary.get("crawl_time_s"), (int, float))
+ else None
+ )
+ run_meta = run_meta_by_run_id.get(run_id_int) if run_id_int is not None else None
+ canonical_host = _canonical_domain_from_payload(payload, start_url_by_run_id) or _slugify_domain(
+ str(payload.get("site_name") or "")
+ )
+ data_sources = _data_sources(payload)
+
+ group = {
+ "domainName": domain_name,
+ "crawlUrl": crawl_url or EM_DASH,
+ "urlCount": url_count,
+ "healthScore": health_score,
+ "statusCounts": status_counts,
+ "lastCrawl": last_crawl,
+ "lastAudit": last_audit,
+ "totalIssues": total_issues,
+ "issueCounts": issue_counts,
+ "successRate": success_rate,
+ "titleCoverage": None,
+ "avgWordCount": None,
+ "thinPages": None,
+ "technicalSeoScore": technical_seo_score,
+ "perfScore": perf_score,
+ "seoScore": seo_score,
+ "crawlDurationS": crawl_duration_s,
+ "categorySnapshots": _category_snapshots(payload),
+ "seoSignals": _seo_signals(payload),
+ "securityFindings": len(payload.get("security_findings") or []),
+ "duplicateClusters": len(payload.get("content_duplicates") or []),
+ "medianWordCount": _median_word_count(payload),
+ "medianResponseMs": _median_response_ms(payload),
+ "reportId": report_id,
+ "crawlRunId": run_id_int,
+ "generatedAtMs": generated_at_ms,
+ "domainParam": canonical_host,
+ "crawlConfig": _crawl_config_from_payload(payload, run_meta),
+ "dataSources": data_sources,
+ }
+
+ # Keep only the most recently generated report per brand.
+ existing = brand_map.get(brand_key)
+ if not existing or generated_at_ms > existing["generatedAtMs"]:
+ brand_map[brand_key] = group
+
+ return sorted(brand_map.values(), key=lambda g: g["generatedAtMs"], reverse=True)
+
+
+def compute_crawl_only_groups(
+ crawl_summaries: list[dict[str, Any]],
+ report_groups: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+ """Build portfolio groups for crawl runs that have no report group.
+
+ A summary row is skipped when its crawl run id already appears in
+ ``report_groups`` or when its domain is already covered by one of them.
+ Remaining rows are grouped by lower-cased host; per host only the row with
+ the greatest ``generatedAtMs`` is kept. Each group is flagged with
+ ``crawlOnly: True`` and has no report id.
+ """
+ covered_domains = {
+ (g.get("domainParam") or _extract_hostname(g.get("crawlUrl")) or g.get("domainName", "")).lower()
+ for g in report_groups
+ if g.get("domainParam") or g.get("crawlUrl") or g.get("domainName")
+ }
+ covered_run_ids = {
+ int(g["crawlRunId"])
+ for g in report_groups
+ if g.get("crawlRunId") is not None
+ }
+
+ brand_map: dict[str, dict[str, Any]] = {}
+ for row in crawl_summaries:
+ crawl_run_id = int(row["crawl_run_id"])
+ if crawl_run_id in covered_run_ids:
+ continue
+ start_url = str(row.get("start_url") or "").strip()
+ # NOTE(review): rows whose start URL has no host all share the UNKNOWN_BRAND key
+ # and therefore collapse into a single group — confirm this is intended.
+ domain_name = _extract_hostname(start_url) or UNKNOWN_BRAND
+ domain_key = domain_name.lower()
+ if not domain_key or domain_key in covered_domains:
+ continue
+
+ url_count = int(row.get("url_count") or 0)
+ with_title = int(row.get("with_title") or 0)
+ title_coverage = _title_coverage_pct(with_title, url_count)
+ avg_word_count = round(float(row.get("avg_word_count") or 0))
+ thin_pages = int(row.get("thin_pages") or 0)
+ generated_at_ms = _generated_at_ms(row.get("created_at"))
+
+ # Keep only the newest crawl per domain.
+ existing = brand_map.get(domain_key)
+ if existing and generated_at_ms <= existing["generatedAtMs"]:
+ continue
+
+ brand_map[domain_key] = {
+ "domainName": domain_name,
+ "crawlUrl": start_url or EM_DASH,
+ "urlCount": url_count,
+ # Without a report, title coverage stands in for the health score.
+ "healthScore": title_coverage,
+ "statusCounts": {
+ "s2xx": int(row.get("s2xx") or 0),
+ "s3xx": int(row.get("s3xx") or 0),
+ "s4xx": int(row.get("s4xx") or 0),
+ "s5xx": int(row.get("s5xx") or 0),
+ "other": int(row.get("other") or 0),
+ },
+ "lastCrawl": _to_display_datetime(row.get("created_at")),
+ "lastAudit": "",
+ "totalIssues": 0,
+ "issueCounts": dict(EMPTY_ISSUE_COUNTS),
+ "successRate": None,
+ "titleCoverage": title_coverage,
+ "avgWordCount": avg_word_count,
+ "thinPages": thin_pages,
+ "technicalSeoScore": None,
+ "perfScore": None,
+ "seoScore": None,
+ "crawlDurationS": None,
+ "categorySnapshots": [],
+ "seoSignals": None,
+ "securityFindings": 0,
+ "duplicateClusters": 0,
+ # The average is reused for the median field; 0 is reported as None.
+ "medianWordCount": avg_word_count or None,
+ "medianResponseMs": None,
+ "reportId": None,
+ "crawlRunId": crawl_run_id,
+ "crawlOnly": True,
+ "generatedAtMs": generated_at_ms,
+ "domainParam": domain_key,
+ "crawlConfig": _crawl_config_from_summary(row),
+ }
+
+ return list(brand_map.values())
+
+
+def merge_portfolio_groups(
+ report_groups: list[dict[str, Any]],
+ crawl_only_groups: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+ return sorted(
+ report_groups + crawl_only_groups,
+ key=lambda g: g["generatedAtMs"],
+ reverse=True,
+ )
+
+
+def build_crawl_history_by_domain(
+ summaries: list[dict[str, Any]],
+) -> dict[str, list[dict[str, Any]]]:
+ by_domain: dict[str, list[dict[str, Any]]] = {}
+ for row in summaries:
+ key = _extract_hostname(row.get("start_url"))
+ if not key:
+ continue
+ pages = int(row.get("url_count") or 0)
+ point = {
+ "pagesDiscovered": pages,
+ "titleCoverage": _title_coverage_pct(int(row.get("with_title") or 0), pages),
+ "avgWordCount": round(float(row.get("avg_word_count") or 0)),
+ "createdAtMs": _generated_at_ms(row.get("created_at")),
+ }
+ by_domain.setdefault(key, []).append(point)
+
+ out: dict[str, list[dict[str, Any]]] = {}
+ for key, points in by_domain.items():
+ out[key] = sorted(points, key=lambda p: p["createdAtMs"])[-8:]
+ return out
+
+
+def compute_portfolio_summary(groups: list[dict[str, Any]]) -> dict[str, Any]:
+ total_brands = len(groups)
+ total_urls = sum(int(g.get("urlCount") or 0) for g in groups)
+ avg_health = (
+ round(sum(int(g.get("healthScore") or 0) for g in groups) / total_brands)
+ if total_brands
+ else None
+ )
+ return {"totalBrands": total_brands, "totalUrls": total_urls, "avgHealth": avg_health}
+
+
+def build_portfolio_card(
+ conn: Connection,
+ report_list: list[dict[str, Any]],
+ maps: dict[str, Any],
+ *,
+ report_id: int | None = None,
+ crawl_run_id: int | None = None,
+) -> dict[str, Any] | None:
+ def get_full_payload(rid: int) -> dict[str, Any] | None:
+ return read_report_payload(conn, rid)
+
+ if report_id is not None:
+ row = next((r for r in report_list if int(r["id"]) == report_id), None)
+ if not row:
+ return None
+ groups = compute_domain_groups([row], maps, get_full_payload)
+ return groups[0] if groups else None
+
+ if crawl_run_id is not None:
+ report_groups = compute_domain_groups(report_list, maps, get_full_payload)
+ from_report = next((g for g in report_groups if g.get("crawlRunId") == crawl_run_id), None)
+ if from_report:
+ return from_report
+ summary = next(
+ (s for s in maps["crawl_summaries"] if int(s["crawl_run_id"]) == crawl_run_id),
+ None,
+ )
+ if not summary:
+ return None
+ crawl_only = compute_crawl_only_groups([summary], report_groups)
+ return crawl_only[0] if crawl_only else None
+
+ return None
+
+
+def build_groups_bundle(
+ conn: Connection,
+ report_list: list[dict[str, Any]],
+ *,
+ lite: bool,
+) -> dict[str, Any]:
+ maps = load_portfolio_maps(conn)
+
+ def get_payload(rid: int) -> dict[str, Any] | None:
+ payload = read_report_payload(conn, rid)
+ if payload is None:
+ return None
+ return slice_payload_for_section(payload, "core") if lite else payload
+
+ report_groups = compute_domain_groups(report_list, maps, get_payload)
+ crawl_only = compute_crawl_only_groups(maps["crawl_summaries"], report_groups)
+ groups = merge_portfolio_groups(report_groups, crawl_only)
+ crawl_history = build_crawl_history_by_domain(maps["crawl_summaries"])
+ return {"groups": groups, "crawlHistoryByDomain": crawl_history}
+
+
+def get_portfolio_response(
+ conn: Connection,
+ *,
+ widget: str,
+ ids: list[int],
+ report_id: int | None = None,
+ crawl_run_id: int | None = None,
+) -> dict[str, Any]:
+ all_reports = list_reports(conn)
+ id_set = set(ids)
+ report_list = [r for r in all_reports if r["id"] in id_set] if ids else all_reports
+
+ if widget == "card":
+ maps = load_portfolio_maps(conn)
+ group = build_portfolio_card(
+ conn,
+ report_list,
+ maps,
+ report_id=report_id,
+ crawl_run_id=crawl_run_id,
+ )
+ return {"group": group}
+
+ lite = widget in ("groups", "summary")
+ bundle = build_groups_bundle(conn, report_list, lite=lite)
+
+ if widget == "summary":
+ return compute_portfolio_summary(bundle["groups"])
+
+ return {
+ "groups": bundle["groups"],
+ "crawlHistoryByDomain": bundle["crawlHistoryByDomain"],
+ }
diff --git a/src/website_profiling/api/services/report_loader.py b/src/website_profiling/api/services/report_loader.py
new file mode 100644
index 00000000..18dcd440
--- /dev/null
+++ b/src/website_profiling/api/services/report_loader.py
@@ -0,0 +1,382 @@
+"""Report data loading service — DB queries for the /api/report/* routes."""
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from psycopg import Connection
+
+from website_profiling.db._common import _parse_row_json, _row_field
+from website_profiling.db.report_store import read_report_payload
+
+# ── Section slicing ─────────────────────────────────────────────────────────
+
+SECTION_FIELDS: dict[str, list[str]] = {
+ "core": [
+ "site_name", "summary", "categories", "top_pages", "recommendations",
+ "seo_health", "social_coverage", "status_counts", "portfolio_benchmark",
+ "executive_summary", "crux_summary", "report_meta", "report_generated_at",
+ "crawl_only_preview", "crawl_run_id", "crawl_run_created_at", "site_level",
+ "ml_errors",
+ ],
+ "links": [
+ "links", "link_edges", "link_rel_summary", "inlink_anchor_matrix",
+ "outbound_link_domains", "outlink_labels", "outlink_counts",
+ ],
+ "traffic": ["google"],
+ "keywords": [
+ "keywords", "keyword_opportunities", "competitor_keyword_gap",
+ "semantic_keyword_clusters",
+ ],
+ "issues": ["issues", "redirects"],
+ "content": [
+ "content_urls", "content_duplicates", "content_analytics",
+ "text_content_analysis", "response_time_stats",
+ ],
+ "lighthouse": [
+ "lighthouse_summary", "lighthouse_by_url", "lighthouse_diagnostics",
+ "lighthouse_human_summary",
+ ],
+ "security": ["security_findings"],
+ "gsc-links": ["gsc_links", "bing_backlinks"],
+ "structure": ["graph_nodes", "graph_edges", "depth_distribution"],
+ "tech": ["tech_stack_summary", "subdomains", "contact_intelligence"],
+ "indexation": [
+ "indexation_coverage", "hreflang_summary", "ner_site_summary",
+ "language_summary", "rich_results_validation", "url_fingerprints",
+ "rich_results_meta",
+ ],
+ "gallery": [
+ "mime_labels", "mime_values", "title_labels", "title_counts",
+ "domain_labels", "domain_values",
+ ],
+}
+
+SECTION_KEYS = list(SECTION_FIELDS.keys())
+
+
+def slice_payload_for_section(
+ payload: dict[str, Any], section: str
+) -> dict[str, Any]:
+ fields = SECTION_FIELDS.get(section, [])
+ return {k: payload[k] for k in fields if k in payload}
+
+
+# ── Report list ──────────────────────────────────────────────────────────────
+
+def list_reports(conn: Connection) -> list[dict[str, Any]]:
+ cur = conn.execute(
+ "SELECT id, canonical_domain, site_name, generated_at FROM report_payload ORDER BY id DESC"
+ )
+ rows = cur.fetchall()
+ result = []
+ for row in rows:
+ generated = _row_field(row, "generated_at")
+ result.append({
+ "id": int(_row_field(row, "id")),
+ "canonical_domain": _row_field(row, "canonical_domain"),
+ "site_name": _row_field(row, "site_name"),
+ "generated_at": generated.isoformat() if hasattr(generated, "isoformat") else generated,
+ })
+ return result
+
+
+# ── Crawl runs ───────────────────────────────────────────────────────────────
+
+def list_crawl_runs(conn: Connection) -> list[dict[str, Any]]:
+ try:
+ cur = conn.execute(
+ "SELECT id, start_url, created_at, render_mode, discovery_mode FROM crawl_runs ORDER BY id DESC"
+ )
+ rows = cur.fetchall()
+ except Exception:
+ return []
+ result = []
+ for row in rows:
+ created = _row_field(row, "created_at")
+ result.append({
+ "id": int(_row_field(row, "id")),
+ "start_url": str(_row_field(row, "start_url") or ""),
+ "created_at": created.isoformat() if hasattr(created, "isoformat") else str(created or ""),
+ "render_mode": _row_field(row, "render_mode"),
+ "discovery_mode": _row_field(row, "discovery_mode"),
+ })
+ return result
+
+
+def list_crawl_run_summaries(conn: Connection) -> list[dict[str, Any]]:
+ """Aggregate crawl run stats for portfolio cards and crawl history."""
+ try:
+ cur = conn.execute(
+ """
+ SELECT
+ cr.id AS crawl_run_id,
+ cr.start_url,
+ cr.created_at,
+ cr.render_mode,
+ cr.discovery_mode,
+ COUNT(crl.id)::int AS url_count,
+ COUNT(*) FILTER (WHERE crl.status LIKE '2%%')::int AS s2xx,
+ COUNT(*) FILTER (WHERE crl.status LIKE '3%%')::int AS s3xx,
+ COUNT(*) FILTER (WHERE crl.status LIKE '4%%')::int AS s4xx,
+ COUNT(*) FILTER (WHERE crl.status LIKE '5%%')::int AS s5xx,
+ COUNT(*) FILTER (
+ WHERE crl.status IS NULL
+ OR crl.status = ''
+ OR crl.status !~ '^[2345]'
+ )::int AS other,
+ COUNT(*) FILTER (
+ WHERE NULLIF(TRIM(COALESCE(crl.title, crl.data->>'title', '')), '') IS NOT NULL
+ )::int AS with_title,
+ COALESCE(ROUND(AVG(NULLIF((crl.data->>'word_count')::numeric, 0))), 0)::int AS avg_word_count,
+ COUNT(*) FILTER (
+ WHERE COALESCE((crl.data->>'word_count')::int, 0) > 0
+ AND COALESCE((crl.data->>'word_count')::int, 0) < 300
+ )::int AS thin_pages
+ FROM crawl_runs cr
+ LEFT JOIN crawl_results crl ON crl.crawl_run_id = cr.id
+ GROUP BY cr.id, cr.start_url, cr.created_at, cr.render_mode, cr.discovery_mode
+ ORDER BY cr.id DESC
+ """
+ )
+ rows = cur.fetchall()
+ except Exception:
+ return []
+ result = []
+ for row in rows:
+ created = _row_field(row, "created_at")
+ result.append({
+ "crawl_run_id": int(_row_field(row, "crawl_run_id")),
+ "start_url": str(_row_field(row, "start_url") or ""),
+ "created_at": created.isoformat() if hasattr(created, "isoformat") else str(created or ""),
+ "url_count": int(_row_field(row, "url_count") or 0),
+ "s2xx": int(_row_field(row, "s2xx") or 0),
+ "s3xx": int(_row_field(row, "s3xx") or 0),
+ "s4xx": int(_row_field(row, "s4xx") or 0),
+ "s5xx": int(_row_field(row, "s5xx") or 0),
+ "other": int(_row_field(row, "other") or 0),
+ "with_title": int(_row_field(row, "with_title") or 0),
+ "avg_word_count": int(_row_field(row, "avg_word_count") or 0),
+ "thin_pages": int(_row_field(row, "thin_pages") or 0),
+ "render_mode": _row_field(row, "render_mode"),
+ "discovery_mode": _row_field(row, "discovery_mode"),
+ })
+ return result
+
+
+# ── Report payload ───────────────────────────────────────────────────────────
+
+def get_report_payload(
+ conn: Connection,
+ report_id: Optional[int] = None,
+ domain: Optional[str] = None,
+ section: Optional[str] = None,
+) -> Optional[dict[str, Any]]:
+ """Load a report payload by id or by canonical domain, optionally sliced.
+
+ ``report_id`` wins when given. Otherwise ``domain`` is matched
+ case-insensitively against ``canonical_domain`` of the listed reports and
+ the first match (highest id, per ``list_reports`` ordering) is used.
+ When ``section`` is a known key of ``SECTION_FIELDS`` only that section's
+ fields are returned; unknown sections return the full payload.
+ Returns ``None`` when no payload is found.
+ """
+ resolved_id = report_id
+
+ if resolved_id is None and domain:
+ domain_lower = domain.strip().lower()
+ reports = list_reports(conn)
+ match = next(
+ (r for r in reports if (r.get("canonical_domain") or "").lower() == domain_lower),
+ None,
+ )
+ if match:
+ resolved_id = match["id"]
+
+ # NOTE(review): when a domain was supplied but matched nothing, resolved_id is still None here
+ # and is passed through; what read_report_payload does with None is not visible in this file —
+ # confirm it does not return an unrelated site's report.
+ payload = read_report_payload(conn, resolved_id)
+ if payload is None:
+ return None
+
+ if section and section in SECTION_FIELDS:
+ return slice_payload_for_section(payload, section)
+ return payload
+
+
+# ── Crawl preview ────────────────────────────────────────────────────────────
+
+def get_crawl_preview_payload(conn: Connection, crawl_run_id: int) -> dict[str, Any]:
+ cur = conn.execute(
+ "SELECT id, start_url, created_at FROM crawl_runs WHERE id = %s",
+ (crawl_run_id,),
+ )
+ run_row = cur.fetchone()
+ if not run_row:
+ raise ValueError("Crawl run not found")
+
+ start_url = str(_row_field(run_row, "start_url") or "")
+ from urllib.parse import urlparse
+ try:
+ site_host = urlparse(start_url).hostname or ""
+ except Exception:
+ site_host = ""
+
+ cur2 = conn.execute(
+ "SELECT url, data FROM crawl_results WHERE crawl_run_id = %s",
+ (crawl_run_id,),
+ )
+ pages = []
+ for row in cur2.fetchall():
+ data = _parse_row_json(row, "data", index=1)
+ if not isinstance(data, dict):
+ data = {}
+ pages.append({"url": str(_row_field(row, "url") or ""), **data})
+
+ return {
+ "crawl_only_preview": True,
+ "crawl_run_id": crawl_run_id,
+ "site_name": site_host,
+ "top_pages": pages,
+ }
+
+
+# ── Audit history ────────────────────────────────────────────────────────────
+
+def _avg_score(categories: list[dict[str, Any]]) -> Optional[int]:
+ nums = [float(c["score"]) for c in categories if isinstance(c.get("score"), (int, float))]
+ if not nums:
+ return None
+ return round(sum(nums) / len(nums))
+
+
+def _issue_counts(categories: list[dict[str, Any]]) -> dict[str, int]:
+ counts: dict[str, int] = {"Critical": 0, "High": 0, "Medium": 0, "Low": 0}
+ for cat in categories:
+ for issue in (cat.get("issues") or []):
+ p = str(issue.get("priority") or "Medium")
+ counts[p] = counts.get(p, 0) + 1
+ return counts
+
+
+def _lh_scores(payload: dict[str, Any]) -> tuple[Optional[int], Optional[int]]:
+ summary = payload.get("lighthouse_summary")
+ if not isinstance(summary, dict):
+ return None, None
+ mm = summary.get("median_metrics") or {}
+ cs = summary.get("category_scores") or {}
+ perf_raw = mm.get("performance_score") or cs.get("performance")
+ seo_raw = mm.get("seo_score") or cs.get("seo")
+ perf = round(float(perf_raw)) if isinstance(perf_raw, (int, float)) else None
+ seo = round(float(seo_raw)) if isinstance(seo_raw, (int, float)) else None
+ return perf, seo
+
+
+def list_audit_history(
+ conn: Connection,
+ property_id: Optional[int] = None,
+ domain: Optional[str] = None,
+ limit: int = 20,
+) -> list[dict[str, Any]]:
+ """Return per-report score history, newest first.
+
+ Reports are filtered by ``property_id`` when it is a positive integer,
+ otherwise by ``domain`` (matched against the lower-cased canonical domain
+ or its slugified form); with neither, all reports are considered.
+ ``limit`` is clamped to the range 1..100. Each entry carries the report id,
+ domain/site name, generated-at timestamp, average health score,
+ per-category scores, issue counts and Lighthouse/technical-SEO scores.
+ """
+ clauses: list[str] = []
+ vals: list[Any] = []
+
+ if property_id is not None and property_id > 0:
+ clauses.append("property_id = %s")
+ vals.append(property_id)
+ elif domain:
+ normalized = domain.strip().lower()
+ clauses.append(
+ "(LOWER(canonical_domain) = %s OR regexp_replace(LOWER(COALESCE(canonical_domain, '')), '[^a-z0-9]+', '-', 'g') = %s)"
+ )
+ # The same value feeds both placeholders of the clause above.
+ vals.append(normalized)
+ vals.append(normalized)
+
+ limit = max(1, min(100, limit))
+ # LIMIT is the last placeholder, so its value must be appended last.
+ vals.append(limit)
+ # Only fixed clause strings are interpolated; user values go through placeholders.
+ where = f"WHERE {' AND '.join(clauses)}" if clauses else ""
+
+ cur = conn.execute(
+ f"""SELECT id, canonical_domain, site_name, generated_at, data
+ FROM report_payload {where}
+ ORDER BY generated_at DESC LIMIT %s""",
+ vals,
+ )
+ rows = cur.fetchall()
+ result = []
+ for row in rows:
+ data = _parse_row_json(row, "data")
+ if not isinstance(data, dict):
+ data = {}
+ categories = data.get("categories") or []
+ # Keyed by category id, falling back to its name, then "unknown".
+ cat_scores = {
+ (c.get("id") or c.get("name") or "unknown"): float(c["score"])
+ for c in categories
+ if isinstance(c.get("score"), (int, float))
+ }
+ perf, seo = _lh_scores(data)
+ tech_seo_cat = next((c for c in categories if c.get("id") == "technical_seo"), None)
+ tech_seo = round(float(tech_seo_cat["score"])) if tech_seo_cat and isinstance(tech_seo_cat.get("score"), (int, float)) else None
+ generated_at = _row_field(row, "generated_at")
+ result.append({
+ "reportId": int(_row_field(row, "id")),
+ "canonicalDomain": _row_field(row, "canonical_domain"),
+ "siteName": _row_field(row, "site_name"),
+ "generatedAt": generated_at.isoformat() if hasattr(generated_at, "isoformat") else generated_at,
+ "healthScore": _avg_score(categories),
+ "categoryScores": cat_scores,
+ "issueCounts": _issue_counts(categories),
+ "perfScore": perf,
+ "seoScore": seo,
+ "technicalSeoScore": tech_seo,
+ })
+ return result
+
+
+# ── Mobile-desktop delta ─────────────────────────────────────────────────────
+
+def get_mobile_desktop_delta(conn: Connection, run_id: int) -> list[dict[str, Any]]:
+ cur = conn.execute(
+ "SELECT mobile_run_id FROM crawl_runs WHERE id = %s", (run_id,)
+ )
+ row = cur.fetchone()
+ mobile_run_id = _row_field(row, "mobile_run_id")
+ if not row or mobile_run_id is None:
+ return []
+ mobile_run_id = int(mobile_run_id)
+
+ def fetch_run(rid: int) -> dict[str, dict[str, Any]]:
+ c = conn.execute(
+ "SELECT url, data FROM crawl_results WHERE crawl_run_id = %s", (rid,)
+ )
+ m: dict[str, dict[str, Any]] = {}
+ for r in c.fetchall():
+ d = _parse_row_json(r, "data", index=1)
+ if not isinstance(d, dict):
+ d = {}
+ key = str(_row_field(r, "url") or "").rstrip("/").lower()
+ m[key] = {
+ "title": str(d.get("title") or ""),
+ "h1": str(d.get("h1") or ""),
+ "word_count": int(d.get("word_count") or 0),
+ "status": int(d.get("status") or 0),
+ }
+ return m
+
+ desktop_map = fetch_run(run_id)
+ mobile_map = fetch_run(mobile_run_id)
+
+ deltas = []
+ for key, desktop in desktop_map.items():
+ mobile = mobile_map.get(key)
+ if not mobile:
+ continue
+ title_differs = desktop["title"] != mobile["title"]
+ h1_differs = desktop["h1"] != mobile["h1"]
+ word_count_delta = abs(desktop["word_count"] - mobile["word_count"])
+ status_differs = desktop["status"] != mobile["status"]
+ if not title_differs and not h1_differs and word_count_delta <= 50 and not status_differs:
+ continue
+ deltas.append({
+ "url": key,
+ "desktop": desktop,
+ "mobile": mobile,
+ "title_differs": title_differs,
+ "h1_differs": h1_differs,
+ "word_count_delta": word_count_delta,
+ "status_differs": status_differs,
+ })
+
+ deltas.sort(
+ key=lambda d: (d["status_differs"] * 4 + d["title_differs"] * 2 + d["h1_differs"]),
+ reverse=True,
+ )
+ return deltas
diff --git a/src/website_profiling/clients/__init__.py b/src/website_profiling/clients/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/website_profiling/clients/file_service.py b/src/website_profiling/clients/file_service.py
new file mode 100644
index 00000000..3285c3b1
--- /dev/null
+++ b/src/website_profiling/clients/file_service.py
@@ -0,0 +1,59 @@
+"""HTTP client for FileService (.NET) — PDF and workbook exports."""
+from __future__ import annotations
+
+import os
+from typing import Optional
+from urllib.parse import urlencode
+
+import requests
+
+_DEFAULT_BASE = "http://127.0.0.1:8080"
+_TIMEOUT_SECONDS = 120
+
+
+def _base_url() -> str:  # FILE_SERVICE_URL override (else the local default), trimmed, without a trailing "/"
+    return (os.getenv("FILE_SERVICE_URL") or _DEFAULT_BASE).strip().rstrip("/")
+
+
+def _get_bytes(path: str, *, params: Optional[dict[str, str]] = None) -> bytes:
+    """GET ``path`` from FileService and return the raw body; 404 -> FileNotFoundError, other failures -> RuntimeError."""
+    base = _base_url()
+    query = f"?{urlencode(params)}" if params else ""
+    try:
+        reply = requests.get(f"{base}{path}{query}", timeout=_TIMEOUT_SECONDS)
+    except requests.RequestException as exc:
+        raise RuntimeError(f"File service unreachable at {base}: {exc}") from exc
+    code = reply.status_code
+    if code == 404:
+        raise FileNotFoundError(reply.text or "Report not found")
+    if code >= 400:
+        detail = reply.text[:500]
+        raise RuntimeError(f"File service returned {code}: {detail}")
+    return reply.content
+
+
+def fetch_report_pdf(
+    report_id: Optional[int] = None,
+    *,
+    profile: str = "standard",
+    branding: bool = True,
+) -> bytes:
+    """Return the audit PDF for ``report_id`` as bytes; raises ValueError when no id is given."""
+    if report_id is None:
+        raise ValueError("report_id is required for PDF export")
+    query = {
+        "profile": profile,
+        "disposition": "attachment",
+        "branding": "true" if branding else "false",
+    }
+    return _get_bytes(f"/v1/reports/{int(report_id)}/pdf", params=query)
+
+
+def fetch_report_workbook(report_id: Optional[int] = None) -> bytes:
+    """Return the crawl workbook (.xlsx) for ``report_id`` as bytes; ValueError without an id."""
+    if report_id is not None:
+        # int() guarantees that only a number is interpolated into the path.
+        endpoint = f"/v1/reports/{int(report_id)}/workbook"
+        query = {"disposition": "attachment"}
+        return _get_bytes(endpoint, params=query)
+    raise ValueError("report_id is required for workbook export")
diff --git a/src/website_profiling/commands/pipeline_cmd.py b/src/website_profiling/commands/pipeline_cmd.py
index 967468bd..06853905 100644
--- a/src/website_profiling/commands/pipeline_cmd.py
+++ b/src/website_profiling/commands/pipeline_cmd.py
@@ -97,13 +97,13 @@ def select_lighthouse_urls_from_gsc(
if not google_data or max_pages <= 0:
return []
gsc = google_data.get("gsc") if isinstance(google_data.get("gsc"), dict) else {}
- pages = gsc.get("pages") if isinstance(gsc.get("pages"), list) else []
+ pages = gsc.get("top_pages") if isinstance(gsc.get("top_pages"), list) else []
crawl_set = {u.rstrip("/") for u in crawl_urls}
ranked: list[tuple[float, str]] = []
for row in pages:
if not isinstance(row, dict):
continue
- url = str(row.get("page") or row.get("url") or "").strip()
+ url = str(row.get("page") or "").strip()
if not url:
continue
norm = url.rstrip("/")
diff --git a/src/website_profiling/db/config_store.py b/src/website_profiling/db/config_store.py
index 7bcc10cd..856ca49b 100644
--- a/src/website_profiling/db/config_store.py
+++ b/src/website_profiling/db/config_store.py
@@ -1,11 +1,6 @@
"""Pipeline and LLM config tables."""
from __future__ import annotations
-import json
-import os
-import subprocess
-import time
-from pathlib import Path
from typing import Any, Optional
import pandas as pd
@@ -17,6 +12,7 @@
_json_val,
_now_iso,
_parse_json_field,
+ _row_field,
_sanitize_for_json,
)
from .pool import db_session, get_data_dir, get_database_url
@@ -82,3 +78,41 @@ def write_llm_config(conn: Connection, entries: dict[str, str], secret_keys: set
)
+def read_llm_config_full(conn: Connection) -> list[dict[str, Any]]:
+    """List every llm_config entry (key, value, is_secret) ordered by key; [] on any error."""
+    entries: list[dict[str, Any]] = []
+    try:
+        cur = conn.execute("SELECT key, value, is_secret FROM llm_config ORDER BY key")
+        for rec in cur.fetchall() or []:
+            key = str(_row_field(rec, "key", index=0))
+            value = str(_row_field(rec, "value", index=1))
+            secret = bool(_row_field(rec, "is_secret", index=2))
+            entries.append({"key": key, "value": value, "is_secret": secret})
+    except Exception:
+        # Best effort: any failure is reported as "no rows", never a partial list.
+        return []
+    return entries
+
+
+def read_app_setting(conn: Connection, key: str) -> str | None:
+    """Return the app_settings value stored under ``key`` as text; None if absent, NULL, or unreadable."""
+    try:
+        cur = conn.execute("SELECT value FROM app_settings WHERE key = %s", (key,))
+        hit = cur.fetchone()
+        stored = _row_field(hit, "value", index=0) if hit else None
+        return None if stored is None else str(stored)
+    except Exception:
+        # Best effort: any failure reads as "setting not present".
+        return None
+
+
+def write_app_setting(conn: Connection, key: str, value: str) -> None:  # Upsert one app_settings row keyed by `key`, refresh updated_at, then commit.
+ conn.execute(
+ """INSERT INTO app_settings (key, value, updated_at)
+ VALUES (%s, %s, now())
+ ON CONFLICT (key) DO UPDATE
+ SET value = EXCLUDED.value,
+ updated_at = now()""",
+ (key, value),
+ )
+ conn.commit()  # committed here, so the write is durable as soon as the function returns
diff --git a/src/website_profiling/db/content_draft_store.py b/src/website_profiling/db/content_draft_store.py
new file mode 100644
index 00000000..41f41118
--- /dev/null
+++ b/src/website_profiling/db/content_draft_store.py
@@ -0,0 +1,177 @@
+"""Content drafts for Content Studio (content_drafts table)."""
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from psycopg import Connection
+from psycopg.types.json import Json
+
+from ._common import _parse_row_json, _row_field
+
+_LIST_COLUMNS = """
+ id, property_id, title, target_keyword, landing_url, status,
+ grade_score, created_at::text, updated_at::text
+"""
+
+_DETAIL_COLUMNS = """
+ id, property_id, title, target_keyword, landing_url, status,
+ body_html, title_tag, meta_description, grade_score, grade_snapshot,
+ created_at::text, updated_at::text
+"""
+
+
+def _grade_score_value(raw: Any) -> float | None:
+    """Convert a grade_score column value to float, passing None through unchanged."""
+    # Only None maps to None; a stored 0 still comes back as 0.0.
+    return float(raw) if raw is not None else None
+
+
+def _map_list_row(row: Any) -> dict[str, Any]:
+    """Shape a content_drafts list row (``_LIST_COLUMNS``) into a plain dict."""
+    passthrough = ("title", "target_keyword", "landing_url", "status")
+    mapped: dict[str, Any] = {
+        "id": int(_row_field(row, "id")),
+        "property_id": int(_row_field(row, "property_id")),
+    }
+    mapped.update((col, _row_field(row, col)) for col in passthrough)
+    mapped["grade_score"] = _grade_score_value(_row_field(row, "grade_score"))
+    mapped["created_at"] = _row_field(row, "created_at")
+    mapped["updated_at"] = _row_field(row, "updated_at")
+    return mapped
+
+
+def _map_detail_row(row: Any) -> dict[str, Any]:
+    """Shape a full content_drafts row (``_DETAIL_COLUMNS``) into a plain dict; empty text columns become ""."""
+    detail: dict[str, Any] = {
+        "id": int(_row_field(row, "id")),
+        "property_id": int(_row_field(row, "property_id")),
+    }
+    for col in ("title", "target_keyword", "landing_url", "status"):
+        detail[col] = _row_field(row, col)
+    for col in ("body_html", "title_tag", "meta_description"):
+        detail[col] = _row_field(row, col) or ""
+    detail["grade_score"] = _grade_score_value(_row_field(row, "grade_score"))
+    # grade_snapshot goes through the shared JSON row helper instead of _row_field.
+    detail["grade_snapshot"] = _parse_row_json(row, "grade_snapshot")
+    detail["created_at"] = _row_field(row, "created_at")
+    detail["updated_at"] = _row_field(row, "updated_at")
+    return detail
+
+
+def list_content_drafts(
+ conn: Connection,
+ property_id: int,
+ *,
+ limit: int = 100,
+) -> list[dict[str, Any]]:  # Drafts of one property, most recently updated first (list columns only).
+ limit = max(1, min(int(limit), 200))  # clamp the requested page size to the range 1..200
+ cur = conn.execute(
+ f"""SELECT {_LIST_COLUMNS}
+ FROM content_drafts
+ WHERE property_id = %s
+ ORDER BY updated_at DESC
+ LIMIT %s""",
+ (property_id, limit),
+ )
+ return [_map_list_row(row) for row in cur.fetchall() or []]  # `or []` tolerates an empty/None fetch result
+
+
+def get_content_draft(conn: Connection, draft_id: int) -> dict[str, Any] | None:
+    """Load one draft by id with every detail column; None when no such row exists."""
+    query = f"SELECT {_DETAIL_COLUMNS} FROM content_drafts WHERE id = %s"
+    found = conn.execute(query, (draft_id,)).fetchone()
+    if not found:
+        return None
+    return _map_detail_row(found)
+
+
+def create_content_draft(
+ conn: Connection,
+ property_id: int,
+ *,
+ title: str = "Untitled draft",
+ target_keyword: str = "",
+ landing_url: str | None = None,
+ status: str = "draft",
+ body_html: str = "",
+ title_tag: str = "",
+ meta_description: str = "",
+) -> int:  # Insert a new draft for the property, commit, and return the generated id.
+ cur = conn.execute(
+ """INSERT INTO content_drafts
+ (property_id, title, target_keyword, landing_url, status,
+ body_html, title_tag, meta_description)
+ VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
+ RETURNING id""",
+ (
+ property_id,
+ (title or "Untitled draft").strip() or "Untitled draft",  # None, empty or whitespace-only titles fall back to the default
+ (target_keyword or "").strip(),  # None becomes ""
+ (landing_url or "").strip() or None,  # a blank URL is passed as None (SQL NULL)
+ status or "draft",
+ body_html or "",
+ title_tag or "",
+ meta_description or "",
+ ),
+ )
+ row = cur.fetchone()
+ conn.commit()
+ return int(_row_field(row, "id"))
+
+
+def update_content_draft(
+    conn: Connection,
+    draft_id: int,
+    patch: dict[str, Any],
+) -> dict[str, Any] | None:
+    """Apply a partial update to one draft and return the refreshed row (None if the id is unknown).
+
+    Only keys present in ``patch`` are written. A None title, target_keyword or
+    landing_url is treated as empty text instead of being stored as the literal
+    string "None", matching how ``create_content_draft`` cleans the same fields.
+    """
+    # Free-text input: None -> "", anything else -> stripped str.
+    def text(key: str) -> str:
+        raw = patch[key]
+        return "" if raw is None else str(raw).strip()
+
+    cleaned: dict[str, Any] = {}
+    if "title" in patch:
+        cleaned["title"] = text("title") or "Untitled draft"
+    if "target_keyword" in patch:
+        cleaned["target_keyword"] = text("target_keyword")
+    if "landing_url" in patch:
+        cleaned["landing_url"] = text("landing_url") or None  # blank URL is stored as NULL
+    # Pass-through columns are stored exactly as supplied.
+    for col in ("status", "body_html", "title_tag", "meta_description", "grade_score"):
+        if col in patch:
+            cleaned[col] = patch[col]
+    if "grade_snapshot" in patch:
+        snapshot = patch["grade_snapshot"]
+        cleaned["grade_snapshot"] = Json(snapshot) if snapshot is not None else None
+
+    if not cleaned:
+        # Nothing to write: return the current row without touching updated_at.
+        return get_content_draft(conn, draft_id)
+
+    # Column names come from the fixed set above, never from the caller's keys.
+    assignments = [f"{col} = %s" for col in cleaned] + ["updated_at = now()"]
+    cur = conn.execute(
+        f"""UPDATE content_drafts SET {', '.join(assignments)}
+        WHERE id = %s
+        RETURNING {_DETAIL_COLUMNS}""",
+        [*cleaned.values(), draft_id],
+    )
+    row = cur.fetchone()
+    conn.commit()
+    return _map_detail_row(row) if row else None
+
+
+def delete_content_draft(conn: Connection, draft_id: int) -> bool:
+    """Delete one draft and commit; True only when a row with that id existed."""
+    statement = "DELETE FROM content_drafts WHERE id = %s RETURNING id"
+    removed_row = conn.execute(statement, (draft_id,)).fetchone()
+    # RETURNING yields a row only when the DELETE matched something.
+    existed = removed_row is not None
+    conn.commit()
+    return existed
diff --git a/src/website_profiling/db/crawl_store.py b/src/website_profiling/db/crawl_store.py
index 5546368e..263a7d17 100644
--- a/src/website_profiling/db/crawl_store.py
+++ b/src/website_profiling/db/crawl_store.py
@@ -34,36 +34,13 @@ def create_crawl_run(
) -> int:
mode = (render_mode or "static").strip().lower()
disc = (discovery_mode or "spider").strip().lower()
- statements = [
- (
- "INSERT INTO crawl_runs (created_at, start_url, property_id, render_mode, discovery_mode) VALUES (%s, %s, %s, %s, %s) RETURNING id",
- (_now_iso(), start_url, property_id, mode, disc),
- ),
- (
- "INSERT INTO crawl_runs (created_at, start_url, property_id, render_mode) VALUES (%s, %s, %s, %s) RETURNING id",
- (_now_iso(), start_url, property_id, mode),
- ),
- (
- "INSERT INTO crawl_runs (created_at, start_url, property_id) VALUES (%s, %s, %s) RETURNING id",
- (_now_iso(), start_url, property_id),
- ),
- ]
- last_err: Exception | None = None
- for sql, params in statements:
- try:
- cur = conn.execute(sql, params)
- row = cur.fetchone()
- conn.commit()
- return int(row["id"])
- except Exception as exc:
- last_err = exc
- try:
- conn.rollback()
- except Exception:
- pass
- if last_err is not None:
- raise last_err
- raise RuntimeError("create_crawl_run failed") # pragma: no cover
+ cur = conn.execute(
+ "INSERT INTO crawl_runs (created_at, start_url, property_id, render_mode, discovery_mode) VALUES (%s, %s, %s, %s, %s) RETURNING id",
+ (_now_iso(), start_url, property_id, mode, disc),
+ )
+ row = cur.fetchone()
+ conn.commit()
+ return int(row["id"])
def get_latest_crawl_run_id(conn: Connection) -> Optional[int]:
@@ -84,25 +61,13 @@ def get_crawl_run_info(conn: Connection, run_id: int) -> Optional[dict[str, Any]
row = cur.fetchone()
if row is None:
return None
- out: dict[str, Any] = {
+ return {
"created_at": row["created_at"],
"start_url": row["start_url"],
+ "render_mode": row["render_mode"],
}
- if "render_mode" in row.keys():
- out["render_mode"] = row["render_mode"]
- return out
except Exception:
- try:
- cur = conn.execute(
- "SELECT created_at, start_url FROM crawl_runs WHERE id = %s",
- (run_id,),
- )
- row = cur.fetchone()
- if row is None:
- return None
- return {"created_at": row["created_at"], "start_url": row["start_url"]}
- except Exception:
- return None
+ return None
def set_mobile_run_id(conn: Connection, desktop_run_id: int, mobile_run_id: int) -> None:
@@ -289,14 +254,6 @@ def _canonical_domain_from_report(conn: Connection, report_data: dict[str, Any])
fetch_method = EXCLUDED.fetch_method,
data = EXCLUDED.data"""
-_CRAWL_INSERT_SQL_LEGACY = """INSERT INTO crawl_results (crawl_run_id, url, status, title, data)
-VALUES (%s, %s, %s, %s, %s)
-ON CONFLICT (crawl_run_id, url) DO UPDATE SET
- status = EXCLUDED.status,
- title = EXCLUDED.title,
- data = EXCLUDED.data"""
-
-
def _crawl_rows_from_df(df: pd.DataFrame, crawl_run_id: int) -> list[tuple]:
rows: list[tuple] = []
if df.empty or "url" not in df.columns:
@@ -324,24 +281,8 @@ def _crawl_rows_from_df(df: pd.DataFrame, crawl_run_id: int) -> list[tuple]:
def _write_crawl_rows(conn: Connection, rows: list[tuple]) -> None:
if not rows:
return
- normalized: list[tuple] = []
- for row in rows:
- if len(row) == 5:
- normalized.append((row[0], row[1], row[2], row[3], "static", row[4]))
- else:
- normalized.append(row)
- try:
- # Savepoint so that a failure (e.g. a legacy schema missing the
- # fetch_method column) rolls back ONLY this insert and leaves the
- # transaction usable. Without it the legacy fallback below runs inside an
- # aborted transaction, raises "current transaction is aborted", and
- # silently writes nothing.
- with conn.transaction():
- _executemany(conn, _CRAWL_INSERT_SQL, normalized, page_size=_CRAWL_BATCH_SIZE)
- except Exception:
- legacy = [(r[0], r[1], r[2], r[3], r[5]) for r in normalized]
- with conn.transaction():
- _executemany(conn, _CRAWL_INSERT_SQL_LEGACY, legacy, page_size=_CRAWL_BATCH_SIZE)
+ with conn.transaction():
+ _executemany(conn, _CRAWL_INSERT_SQL, rows, page_size=_CRAWL_BATCH_SIZE)
def write_crawl_batch(
@@ -431,36 +372,13 @@ def merge_crawl_result_fields_batch(
def read_crawl(conn: Connection, run_id: Optional[int] = None) -> pd.DataFrame:
- try:
- return _read_crawl_rows(conn, run_id, include_fetch_method=True)
- except Exception:
- try:
- return _read_crawl_rows(conn, run_id, include_fetch_method=False)
- except Exception:
- return pd.DataFrame()
-
-
-def _read_crawl_rows(
- conn: Connection,
- run_id: Optional[int],
- *,
- include_fetch_method: bool,
-) -> pd.DataFrame:
if run_id is None:
run_id = get_latest_crawl_run_id(conn)
- if include_fetch_method:
- if run_id is None:
- cur = conn.execute("SELECT url, fetch_method, data FROM crawl_results")
- else:
- cur = conn.execute(
- "SELECT url, fetch_method, data FROM crawl_results WHERE crawl_run_id = %s",
- (run_id,),
- )
- elif run_id is None:
- cur = conn.execute("SELECT url, data FROM crawl_results")
+ if run_id is None:
+ cur = conn.execute("SELECT url, fetch_method, data FROM crawl_results")
else:
cur = conn.execute(
- "SELECT url, data FROM crawl_results WHERE crawl_run_id = %s",
+ "SELECT url, fetch_method, data FROM crawl_results WHERE crawl_run_id = %s",
(run_id,),
)
rows = cur.fetchall()
@@ -469,19 +387,12 @@ def _read_crawl_rows(
records = []
for row in rows:
rec: dict[str, Any] = {"url": row["url"]}
- fm_col: Optional[str] = None
- if include_fetch_method and "fetch_method" in row.keys():
- fm_col = str(row["fetch_method"] or "static").strip() or "static"
+ if "fetch_method" in row.keys():
+ rec["fetch_method"] = str(row["fetch_method"] or "static").strip() or "static"
data = _parse_row_json(row) or {}
if isinstance(data, dict):
rec.update(data)
- if fm_col is not None:
- rec["fetch_method"] = fm_col
- elif not include_fetch_method:
- rec["fetch_method"] = str(
- (data.get("fetch_method") if isinstance(data, dict) else None) or "static"
- ).strip() or "static"
- elif "fetch_method" not in rec:
+ if "fetch_method" not in rec:
rec["fetch_method"] = "static"
records.append(rec)
df = pd.DataFrame(records)
diff --git a/src/website_profiling/db/dashboard_store.py b/src/website_profiling/db/dashboard_store.py
new file mode 100644
index 00000000..308d03c2
--- /dev/null
+++ b/src/website_profiling/db/dashboard_store.py
@@ -0,0 +1,116 @@
+"""Custom dashboards (dashboards table)."""
+from __future__ import annotations
+
+from typing import Any
+
+from psycopg import Connection
+from psycopg.types.json import Json
+
+from ._common import _row_field
+
+_SELECT = """
+ SELECT id, property_id, name, layout_json, is_default, created_at, updated_at
+ FROM dashboards
+"""
+
+
+def _map_dashboard(row: Any) -> dict[str, Any]:
+    """Convert a dashboards row (column order of ``_SELECT``) into a camelCase dict with string timestamps."""
+    def stamp(value: Any) -> str:
+        return value.isoformat() if hasattr(value, "isoformat") else str(value or "")
+    return {
+        "id": int(_row_field(row, "id", index=0)),
+        "propertyId": int(_row_field(row, "property_id", index=1)),
+        "name": _row_field(row, "name", index=2),
+        "layoutJson": _row_field(row, "layout_json", index=3) or {},
+        "isDefault": bool(_row_field(row, "is_default", index=4)),
+        "createdAt": stamp(_row_field(row, "created_at", index=5)),
+        "updatedAt": stamp(_row_field(row, "updated_at", index=6)),
+    }
+
+
+def list_dashboards(conn: Connection, property_id: int) -> list[dict[str, Any]]:
+    """Return every dashboard of a property, most recently updated first."""
+    query = f"{_SELECT} WHERE property_id = %s ORDER BY updated_at DESC"
+    fetched = conn.execute(query, (property_id,)).fetchall() or []
+    # An empty/None fetch result simply produces an empty list.
+    return list(map(_map_dashboard, fetched))
+
+
+def get_dashboard(conn: Connection, dashboard_id: int, property_id: int) -> dict[str, Any] | None:
+    """Fetch one dashboard scoped to its property; None when the id/property pair matches no row."""
+    query = f"{_SELECT} WHERE id = %s AND property_id = %s"
+    found = conn.execute(query, (dashboard_id, property_id)).fetchone()
+    if not found:
+        return None
+    return _map_dashboard(found)
+
+
+def create_dashboard(
+ conn: Connection,
+ property_id: int,
+ name: str,
+ layout_json: Any,
+) -> dict[str, Any]:  # Insert a dashboard, commit, and return the stored row as a mapped dict.
+ cur = conn.execute(
+ """
+ INSERT INTO dashboards (property_id, name, layout_json)
+ VALUES (%s, %s, %s)
+ RETURNING id, property_id, name, layout_json, is_default, created_at, updated_at
+ """,
+ (property_id, name, Json(layout_json)),  # the layout is passed through psycopg's Json wrapper
+ )
+ row = cur.fetchone()
+ conn.commit()
+ return _map_dashboard(row)  # RETURNING lists the columns in the positional order _map_dashboard indexes
+
+
+def update_dashboard(
+ conn: Connection,
+ dashboard_id: int,
+ property_id: int,
+ *,
+ name: str | None = None,
+ layout_json: Any | None = None,
+ is_default: bool | None = None,
+) -> dict[str, Any] | None:  # Patch the supplied fields of one dashboard; returns the updated row, or None if no row matched.
+ sets = ["updated_at = now()"]  # always bumped, even when no optional field is supplied
+ vals: list[Any] = []
+
+ if name is not None:
+ sets.append("name = %s")
+ vals.append(name.strip() or "Untitled dashboard")  # blank names fall back to a placeholder
+ if layout_json is not None:
+ sets.append("layout_json = %s")
+ vals.append(Json(layout_json))
+ if is_default is not None:
+ if is_default:
+ conn.execute(  # clear the flag on every dashboard of the property before setting it on this one
+ "UPDATE dashboards SET is_default = false WHERE property_id = %s",
+ (property_id,),
+ )
+ sets.append("is_default = %s")
+ vals.append(is_default)
+
+ vals.extend([dashboard_id, property_id])  # WHERE parameters go last, after the SET values
+ cur = conn.execute(
+ f"""
+ UPDATE dashboards SET {', '.join(sets)}
+ WHERE id = %s AND property_id = %s
+ RETURNING id, property_id, name, layout_json, is_default, created_at, updated_at
+ """,
+ vals,
+ )
+ row = cur.fetchone()
+ conn.commit()  # NOTE(review): the flag reset above is committed even when the final UPDATE matched no row; confirm intended
+ return _map_dashboard(row) if row else None
+
+
+def delete_dashboard(conn: Connection, dashboard_id: int, property_id: int) -> bool:
+    """Delete a dashboard scoped to its property and commit; True only if a row was removed."""
+    outcome = conn.execute(
+        "DELETE FROM dashboards WHERE id = %s AND property_id = %s RETURNING id",
+        (dashboard_id, property_id),
+    ).fetchone()
+    conn.commit()
+    return outcome is not None
diff --git a/src/website_profiling/db/issue_status_store.py b/src/website_profiling/db/issue_status_store.py
new file mode 100644
index 00000000..05870569
--- /dev/null
+++ b/src/website_profiling/db/issue_status_store.py
@@ -0,0 +1,100 @@
+"""Issue workflow status persistence (issue_status table)."""
+from __future__ import annotations
+
+import hashlib
+from typing import Any, Optional
+
+from psycopg import Connection
+
+from ._common import _row_field
+
+_VALID_STATUS = frozenset({"open", "in_progress", "fixed", "ignored"})
+
+_SELECT_COLUMNS = """
+ id, property_id, report_id, issue_fingerprint, category_id,
+ message, url, priority, status, assignee, note, updated_at
+"""
+
+
+def issue_fingerprint(message: str, url: str, category_id: Optional[str] = None) -> str:
+    seed = "|".join(str(part or "") for part in (category_id, url, message))  # "category|url|message"
+    return hashlib.sha256(seed.encode()).hexdigest()[:32]  # first 32 hex characters of the SHA-256 digest
+
+
+def _map_issue_row(row: Any) -> dict[str, Any]:
+    """Convert an issue_status row (``_SELECT_COLUMNS``) into a camelCase dict."""
+    report_ref = _row_field(row, "report_id")
+    touched = _row_field(row, "updated_at")
+    mapped: dict[str, Any] = {
+        "id": int(_row_field(row, "id")),
+        "propertyId": int(_row_field(row, "property_id")),
+        "reportId": None if report_ref is None else int(report_ref),
+        "issueFingerprint": _row_field(row, "issue_fingerprint"),
+        "categoryId": _row_field(row, "category_id"),
+    }
+    # These columns keep their own names and raw values.
+    for col in ("message", "url", "priority", "status", "assignee", "note"):
+        mapped[col] = _row_field(row, col)
+    # Values with an isoformat() method are serialised through it; anything else via str().
+    mapped["updatedAt"] = touched.isoformat() if hasattr(touched, "isoformat") else str(touched or "")
+    return mapped
+
+
+def list_issue_status(conn: Connection, property_id: int) -> list[dict[str, Any]]:  # All issue_status rows of a property, newest update first.
+ cur = conn.execute(
+ f"""SELECT {_SELECT_COLUMNS}
+ FROM issue_status
+ WHERE property_id = %s
+ ORDER BY updated_at DESC""",
+ (property_id,),
+ )
+ return [_map_issue_row(row) for row in cur.fetchall() or []]  # `or []` tolerates an empty/None fetch result
+
+
+def upsert_issue_status(
+ conn: Connection,
+ *,
+ property_id: int,
+ message: str,
+ status: str,
+ report_id: int | None = None,
+ url: str = "",
+ priority: str = "Medium",
+ category_id: str | None = None,
+ assignee: str | None = None,
+ note: str | None = None,
+) -> dict[str, Any]:  # Create or update the workflow status of one issue, commit, and return the stored row.
+ if status not in _VALID_STATUS:  # reject anything outside open / in_progress / fixed / ignored
+ raise ValueError(f"invalid status: {status}")
+
+ fp = issue_fingerprint(message, url, category_id)  # conflict key (with property_id), derived from category, URL and message
+ cur = conn.execute(
+ f"""INSERT INTO issue_status
+ (property_id, report_id, issue_fingerprint, category_id, message, url,
+ priority, status, assignee, note, updated_at)
+ VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, now())
+ ON CONFLICT (property_id, issue_fingerprint) DO UPDATE SET
+ status = EXCLUDED.status,
+ assignee = COALESCE(EXCLUDED.assignee, issue_status.assignee),
+ note = COALESCE(EXCLUDED.note, issue_status.note),
+ report_id = COALESCE(EXCLUDED.report_id, issue_status.report_id),
+ updated_at = now()
+ RETURNING {_SELECT_COLUMNS}""",
+ (
+ property_id,
+ report_id,  # None keeps the stored report_id on conflict (COALESCE in the SQL)
+ fp,
+ category_id,
+ message,
+ url,
+ priority,  # written on insert only; the conflict branch does not update it
+ status,
+ assignee,  # None keeps the stored assignee on conflict
+ note,  # None keeps the stored note on conflict
+ ),
+ )
+ row = cur.fetchone()
+ conn.commit()
+ if not row:
+ raise RuntimeError("issue status upsert failed")
+ return _map_issue_row(row)
diff --git a/src/website_profiling/db/markdown_store.py b/src/website_profiling/db/markdown_store.py
index 9dbbcd28..0677e62c 100644
--- a/src/website_profiling/db/markdown_store.py
+++ b/src/website_profiling/db/markdown_store.py
@@ -5,7 +5,7 @@
from psycopg import Connection
-from ._common import _executemany, _now_iso
+from ._common import _executemany, _now_iso, _row_field
_MD_BATCH_SIZE = 200
@@ -71,7 +71,15 @@ def read_page_markdown(conn: Connection, crawl_run_id: int, url: str) -> Optiona
row = cur.fetchone()
if row is None:
return None
- return dict(row)
+ return {
+ "url": _row_field(row, "url"),
+ "title": _row_field(row, "title"),
+ "markdown": _row_field(row, "markdown"),
+ "word_count": _row_field(row, "word_count"),
+ "strategy": _row_field(row, "strategy"),
+ "source_byte_length": _row_field(row, "source_byte_length"),
+ "extracted_at": _row_field(row, "extracted_at"),
+ }
except Exception:
return None
@@ -97,7 +105,7 @@ def list_page_markdown(
(crawl_run_id, pattern),
)
total_row = count_cur.fetchone()
- total = int(dict(total_row).get("count", 0)) if total_row else 0
+ total = int(_row_field(total_row, "count", index=0) or 0) if total_row else 0
cur = conn.execute(
"""SELECT url, title, word_count, strategy, extracted_at
@@ -113,7 +121,7 @@ def list_page_markdown(
(crawl_run_id,),
)
total_row = count_cur.fetchone()
- total = int(dict(total_row).get("count", 0)) if total_row else 0
+ total = int(_row_field(total_row, "count", index=0) or 0) if total_row else 0
cur = conn.execute(
"""SELECT url, title, word_count, strategy, extracted_at
@@ -123,7 +131,16 @@ def list_page_markdown(
LIMIT %s OFFSET %s""",
(crawl_run_id, limit, offset),
)
- items = [dict(row) for row in cur.fetchall()]
+ items = [
+ {
+ "url": _row_field(row, "url"),
+ "title": _row_field(row, "title"),
+ "word_count": _row_field(row, "word_count"),
+ "strategy": _row_field(row, "strategy"),
+ "extracted_at": _row_field(row, "extracted_at"),
+ }
+ for row in cur.fetchall() or []
+ ]
return {"items": items, "total": total, "limit": limit, "offset": offset}
except Exception:
return {"items": [], "total": 0, "limit": limit, "offset": offset}
@@ -141,7 +158,10 @@ def count_page_markdown_by_run(conn: Connection, crawl_run_ids: list[int]) -> di
GROUP BY crawl_run_id""",
(crawl_run_ids,),
)
- return {int(row["crawl_run_id"]): int(row["cnt"]) for row in cur.fetchall()}
+ return {
+ int(_row_field(row, "crawl_run_id")): int(_row_field(row, "cnt") or 0)
+ for row in cur.fetchall() or []
+ }
except Exception:
return {}
@@ -159,3 +179,47 @@ def delete_page_markdown_for_run(conn: Connection, crawl_run_id: int, *, commit:
return deleted
except Exception:
return 0
+
+
+def list_markdown_crawl_runs(
+ conn: Connection,
+ property_id: int | None = None,
+ *,
+ limit: int = 50,
+) -> list[dict[str, Any]]:
+ """Crawl runs with HTML and markdown page counts for the page-markdown UI."""
+ limit = max(1, min(int(limit), 100))  # clamp the page size to the range 1..100
+ where = "WHERE cr.property_id = %s" if property_id else ""  # NOTE(review): truthiness test, so property_id=0 behaves like None (no filter)
+ params: tuple[Any, ...] = (property_id, limit) if property_id else (limit,)  # must mirror the placeholders present in the SQL
+ cur = conn.execute(
+ f"""
+ SELECT cr.id, cr.created_at, cr.start_url,
+ COALESCE(html_counts.cnt, 0) AS html_page_count,
+ COALESCE(md_counts.cnt, 0) AS markdown_page_count
+ FROM crawl_runs cr
+ LEFT JOIN (
+ SELECT crawl_run_id, COUNT(*)::int AS cnt
+ FROM crawl_page_html GROUP BY crawl_run_id
+ ) html_counts ON html_counts.crawl_run_id = cr.id
+ LEFT JOIN (
+ SELECT crawl_run_id, COUNT(*)::int AS cnt
+ FROM crawl_page_markdown GROUP BY crawl_run_id
+ ) md_counts ON md_counts.crawl_run_id = cr.id
+ {where}
+ ORDER BY cr.id DESC
+ LIMIT %s
+ """,
+ params,
+ )
+ runs: list[dict[str, Any]] = []
+ for row in cur.fetchall() or []:
+ created = _row_field(row, "created_at")
+ runs.append({
+ "id": int(_row_field(row, "id")),
+ "created_at": created.isoformat() if hasattr(created, "isoformat") else str(created or "") or None,  # isoformat() when available; otherwise str(), with None/empty becoming None
+ "start_url": _row_field(row, "start_url"),
+ "html_page_count": int(_row_field(row, "html_page_count") or 0),
+ "markdown_page_count": int(_row_field(row, "markdown_page_count") or 0),
+ })
+ return runs
+
diff --git a/src/website_profiling/db/pipeline_jobs.py b/src/website_profiling/db/pipeline_jobs.py
new file mode 100644
index 00000000..0eb85f2d
--- /dev/null
+++ b/src/website_profiling/db/pipeline_jobs.py
@@ -0,0 +1,262 @@
+"""Pipeline job DB helpers — shared by FastAPI routers and the worker process."""
+from __future__ import annotations
+
+import os
+from typing import Any, Optional
+
+from psycopg import Connection
+
+from .pool import db_session
+
+# Stale job thresholds (minutes for pending, hours for running)
+_STALE_PENDING_MINUTES = int(os.getenv("PIPELINE_JOB_STALE_PENDING_MINUTES", "10"))
+_STALE_RUNNING_HOURS = int(os.getenv("PIPELINE_JOB_STALE_HOURS", "1"))
+
+PIPELINE_LOG_MAX = 256_000
+PIPELINE_LOG_TRIM = 200_000
+
+
+def _trim_log(existing: str, chunk: str) -> tuple[str, bool]:
+    """Append ``chunk``; past PIPELINE_LOG_MAX chars keep only the last PIPELINE_LOG_TRIM and flag truncation."""
+    merged = existing + chunk
+    overflow = len(merged) > PIPELINE_LOG_MAX
+    return (merged[-PIPELINE_LOG_TRIM:] if overflow else merged), overflow
+
+
+# ── Enqueue ──────────────────────────────────────────────────────────────────
+
+def enqueue_job(
+ conn: Connection,
+ job_id: str,
+ job_type: str,
+ command: Optional[str],
+ property_id: Optional[int],
+ config_hash: Optional[str] = None,
+) -> bool:
+ """INSERT a pending job. Returns True if inserted, False if a job is already pending/running."""
+ reconcile_stale_jobs(conn)  # expire stale pending/running jobs first so they cannot block the insert below
+ cur = conn.execute(
+ """INSERT INTO pipeline_jobs (id, job_type, status, command, property_id, config_hash)
+ SELECT %s::uuid, %s, 'pending', %s, %s, %s
+ WHERE NOT EXISTS (
+ SELECT 1 FROM pipeline_jobs WHERE status IN ('pending', 'running')
+ )
+ RETURNING id""",
+ (job_id, job_type, command, property_id, config_hash),
+ )
+ conn.commit()
+ return cur.fetchone() is not None  # RETURNING yields a row only when the guarded INSERT actually happened
+
+
+# ── Worker claim ─────────────────────────────────────────────────────────────
+
+def try_claim_pending_job(conn: Connection, worker_pid: int) -> Optional[dict[str, Any]]:
+ """Atomically claim one pending job for the worker. Returns the job row or None."""
+ cur = conn.execute(  # single UPDATE: picks the pending job with the earliest started_at and marks it running
+ """UPDATE pipeline_jobs
+ SET status = 'running', worker_pid = %s
+ WHERE id = (
+ SELECT id FROM pipeline_jobs
+ WHERE status = 'pending'
+ ORDER BY started_at ASC
+ LIMIT 1
+ FOR UPDATE SKIP LOCKED
+ )
+ RETURNING id, job_type, command, property_id""",
+ (worker_pid,),
+ )
+ row = cur.fetchone()
+ conn.commit()
+ if row is None:  # nothing pending, or the candidate row is locked by another session (SKIP LOCKED)
+ return None
+ return {
+ "id": str(row["id"]),  # stringified id (inserted via %s::uuid elsewhere in this module)
+ "job_type": str(row["job_type"]),
+ "command": row["command"],
+ "property_id": row["property_id"],
+ }
+
+
+# ── Log appending ────────────────────────────────────────────────────────────
+
+def append_job_log(conn: Connection, job_id: str, chunk: str) -> bool:
+    """Append ``chunk`` to a job's log under a row lock; returns the job's truncation flag (False if no such job)."""
+    locked = conn.execute(
+        "SELECT log_text, log_truncated FROM pipeline_jobs WHERE id = %s::uuid FOR UPDATE",
+        (job_id,),
+    ).fetchone()
+    if not locked:
+        # Unknown job id: nothing to append.
+        conn.rollback()
+        return False
+    new_log, cut_now = _trim_log(str(locked["log_text"] or ""), chunk)
+    flag = bool(locked["log_truncated"]) or cut_now
+    conn.execute(
+        "UPDATE pipeline_jobs SET log_text = %s, log_truncated = %s WHERE id = %s::uuid",
+        (new_log, flag, job_id),
+    )
+    conn.commit()
+    return flag
+
+
+# ── Finish ───────────────────────────────────────────────────────────────────
+
+def finish_job(
+ conn: Connection,
+ job_id: str,
+ status: str,
+ exit_code: Optional[int],
+ error: Optional[str] = None,
+ log_truncated: Optional[bool] = None,
+) -> None:  # Record a job's final status, exit code and error, stamp finished_at, clear worker_pid, and commit.
+ if log_truncated is None:  # None means "leave the stored log_truncated flag untouched"
+ conn.execute(
+ """UPDATE pipeline_jobs
+ SET status = %s, exit_code = %s, error_text = %s, finished_at = now(), worker_pid = NULL
+ WHERE id = %s::uuid""",
+ (status, exit_code, error, job_id),
+ )
+ else:  # same update, additionally overwriting log_truncated with the given value
+ conn.execute(
+ """UPDATE pipeline_jobs
+ SET status = %s, exit_code = %s, error_text = %s, finished_at = now(),
+ log_truncated = %s, worker_pid = NULL
+ WHERE id = %s::uuid""",
+ (status, exit_code, error, log_truncated, job_id),
+ )
+ conn.commit()
+
+
+# ── Flags ────────────────────────────────────────────────────────────────────
+
+def check_flags(conn: Connection, job_id: str) -> tuple[bool, bool]:
+    """Read (cancel_requested, pause_requested) for a job; (False, False) when the id is unknown."""
+    flags = conn.execute(
+        "SELECT cancel_requested, pause_requested FROM pipeline_jobs WHERE id = %s::uuid",
+        (job_id,),
+    ).fetchone()
+    if flags:
+        cancel, pause = flags["cancel_requested"], flags["pause_requested"]
+        return bool(cancel), bool(pause)
+    return False, False
+
+
+def set_cancel_flag(conn: Connection, job_id: str) -> bool:  # Request cancellation of a running job by setting cancel_requested, then commit.
+ cur = conn.execute(
+ """UPDATE pipeline_jobs SET cancel_requested = true
+ WHERE id = %s::uuid AND status = 'running'
+ RETURNING id""",
+ (job_id,),
+ )
+ conn.commit()
+ return cur.fetchone() is not None  # False when the job is unknown or not currently running
+
+
+def set_pause_flag(conn: Connection, job_id: str) -> bool:  # Request a pause of a running job by setting pause_requested, then commit.
+ cur = conn.execute(
+ """UPDATE pipeline_jobs SET pause_requested = true
+ WHERE id = %s::uuid AND status = 'running'
+ RETURNING id""",
+ (job_id,),
+ )
+ conn.commit()
+ return cur.fetchone() is not None  # False when the job is unknown or not currently running
+
+
+# ── Reconcile stale jobs ─────────────────────────────────────────────────────
+
+def reconcile_stale_jobs(conn: Connection) -> int:
+ """Mark stale running/pending jobs as error. Returns count reconciled."""
+ cur = conn.execute(  # pass 1: running jobs whose started_at is older than the hour threshold
+ """UPDATE pipeline_jobs
+ SET status = 'error',
+ error_text = COALESCE(error_text, 'Job interrupted (server restart or timeout)'),
+ finished_at = now()
+ WHERE status = 'running'
+ AND started_at < now() - (%s::text || ' hours')::interval
+ RETURNING id""",
+ (str(_STALE_RUNNING_HOURS),),  # threshold in hours, read from PIPELINE_JOB_STALE_HOURS at import time
+ )
+ count = len(cur.fetchall())
+
+ cur2 = conn.execute(  # pass 2: pending jobs older than the minute threshold
+ """UPDATE pipeline_jobs
+ SET status = 'error',
+ error_text = 'Job never started (worker restart)',
+ finished_at = now()
+ WHERE status = 'pending'
+ AND started_at < now() - (%s::text || ' minutes')::interval
+ RETURNING id""",
+ (str(_STALE_PENDING_MINUTES),),  # threshold in minutes, read from PIPELINE_JOB_STALE_PENDING_MINUTES at import time
+ )
+ count += len(cur2.fetchall())
+ if count:  # commit only when at least one row was changed
+ conn.commit()
+ return count
+
+
+# ── Read helpers ─────────────────────────────────────────────────────────────
+
+def get_job(conn: Connection, job_id: str) -> Optional[dict[str, Any]]:  # Fetch one job by id as a camelCase dict; None when it does not exist.
+ cur = conn.execute(
+ """SELECT id, job_type, status, exit_code, log_text, error_text,
+ log_truncated, property_id, started_at, finished_at, command
+ FROM pipeline_jobs WHERE id = %s::uuid""",
+ (job_id,),
+ )
+ row = cur.fetchone()
+ if not row:
+ return None
+ return _job_row_to_dict(row)  # the SELECT lists exactly the columns _job_row_to_dict reads
+
+
+def list_jobs(conn: Connection, limit: int = 50) -> list[dict[str, Any]]:  # Most recently started jobs first, at most `limit` of them.
+ reconcile_stale_jobs(conn)  # listing has a write side effect: stale jobs are marked as error first
+ cur = conn.execute(
+ """SELECT id, job_type, status, exit_code, log_text, error_text,
+ log_truncated, property_id, started_at, finished_at, command
+ FROM pipeline_jobs ORDER BY started_at DESC LIMIT %s""",
+ (limit,),
+ )
+ return [_job_row_to_dict(r) for r in cur.fetchall()]
+
+
+def get_active_job(conn: Connection) -> Optional[dict[str, Any]]:  # The most recently started pending/running job, or None when there is none.
+ cur = conn.execute(
+ """SELECT id, job_type, status, exit_code, log_text, error_text,
+ log_truncated, property_id, started_at, finished_at, command
+ FROM pipeline_jobs WHERE status IN ('pending', 'running')
+ ORDER BY started_at DESC LIMIT 1""",
+ )
+ row = cur.fetchone()
+ return _job_row_to_dict(row) if row else None  # unlike list_jobs, no stale-job reconciliation runs here
+
+
+def cancel_job_in_db(conn: Connection, job_id: str, message: str = "Cancelled by user") -> bool:  # Force a pending/running job into the error state with exit_code -1.
+ cur = conn.execute(  # note: unlike finish_job, this statement leaves worker_pid as it is
+ """UPDATE pipeline_jobs
+ SET status = 'error', error_text = %s, exit_code = -1, finished_at = now()
+ WHERE id = %s::uuid AND status IN ('pending', 'running')
+ RETURNING id""",
+ (message, job_id),
+ )
+ conn.commit()
+ return cur.fetchone() is not None  # False when the job is unknown or already finished
+
+
+def _job_row_to_dict(row: Any) -> dict[str, Any]:
+    """Convert a pipeline_jobs row into a camelCase dict with ISO-formatted timestamps (None when unset)."""
+    job: dict[str, Any] = {
+        "id": str(row["id"]),
+        "jobType": str(row["job_type"] or ""),
+        "status": str(row["status"] or ""),
+        "exitCode": row["exit_code"],
+        "log": str(row["log_text"] or ""),
+        "error": row["error_text"],
+        "logTruncated": bool(row["log_truncated"]),
+        "propertyId": row["property_id"],
+    }
+    for key, col in (("startedAt", "started_at"), ("finishedAt", "finished_at")):
+        job[key] = row[col].isoformat() if row[col] else None
+    job["command"] = row["command"]
+    return job
diff --git a/src/website_profiling/db/portfolio_store.py b/src/website_profiling/db/portfolio_store.py
new file mode 100644
index 00000000..ad89d364
--- /dev/null
+++ b/src/website_profiling/db/portfolio_store.py
@@ -0,0 +1,38 @@
+"""Portfolio item deletion (report_payload / crawl_runs)."""
+from __future__ import annotations
+
+from psycopg import Connection
+
+
+def delete_portfolio_report(conn: Connection, report_id: int) -> bool:
+ cur = conn.execute(
+ "DELETE FROM report_payload WHERE id = %s RETURNING id",
+ (report_id,),
+ )
+ deleted = cur.fetchone() is not None
+ conn.commit()
+ return deleted
+
+
+def delete_portfolio_crawl_run(conn: Connection, crawl_run_id: int) -> bool:
+ cur = conn.execute(
+ "DELETE FROM crawl_runs WHERE id = %s RETURNING id",
+ (crawl_run_id,),
+ )
+ deleted = cur.fetchone() is not None
+ conn.commit()
+ return deleted
+
+
+def delete_portfolio_item(
+ conn: Connection,
+ *,
+ report_id: int | None = None,
+ crawl_run_id: int | None = None,
+) -> bool:
+ deleted = False
+ if report_id is not None:
+ deleted = delete_portfolio_report(conn, report_id)
+ if crawl_run_id is not None:
+ deleted = delete_portfolio_crawl_run(conn, crawl_run_id)
+ return deleted
diff --git a/src/website_profiling/db/property_store.py b/src/website_profiling/db/property_store.py
index 434a2a76..85da80a0 100644
--- a/src/website_profiling/db/property_store.py
+++ b/src/website_profiling/db/property_store.py
@@ -206,3 +206,219 @@ def list_properties_public(conn: Connection) -> list[dict[str, Any]]:
"crawl_authorized_at": crawl_auth.isoformat() if crawl_auth else None,
})
return out
+
+
+def get_property_id_by_domain(conn: Connection, domain: str) -> int | None:
+ """Resolve property id from canonical domain (case-insensitive)."""
+ normalized = (domain or "").strip().lower()
+ if not normalized:
+ return None
+ prop = get_property_by_domain(conn, normalized)
+ return int(prop["id"]) if prop else None
+
+
+def resolve_property_id_for_page(
+ conn: Connection,
+ page_url: str,
+ property_id_str: str | None = None,
+ domain_str: str | None = None,
+) -> int | None:
+ """Resolve property ID from explicit param, domain, or URL hostname."""
+ if property_id_str:
+ try:
+ return int(property_id_str)
+ except (ValueError, TypeError):
+ pass
+
+ if domain_str:
+ prop_id = get_property_id_by_domain(conn, domain_str)
+ if prop_id is not None:
+ return prop_id
+
+ host = _extract_hostname(page_url)
+ if host:
+ return get_property_id_by_domain(conn, host)
+ return None
+
+
+def get_property_ops(conn: Connection, property_id: int) -> dict[str, Any] | None:
+ cur = conn.execute(
+ "SELECT schedule_cron, alert_webhook_url, alert_email FROM properties WHERE id = %s",
+ (property_id,),
+ )
+ row = cur.fetchone()
+ if not row:
+ return None
+ return {
+ "schedule_cron": _row_field(row, "schedule_cron", index=0),
+ "alert_webhook_url": _row_field(row, "alert_webhook_url", index=1),
+ "alert_email": _row_field(row, "alert_email", index=2),
+ }
+
+
+def update_property_ops(
+ conn: Connection,
+ property_id: int,
+ *,
+ schedule_cron: str | None,
+ alert_webhook_url: str | None,
+ alert_email: str | None,
+) -> None:
+ conn.execute(
+ """
+ UPDATE properties
+ SET schedule_cron = %s,
+ alert_webhook_url = %s,
+ alert_email = %s,
+ updated_at = now()
+ WHERE id = %s
+ """,
+ (schedule_cron, alert_webhook_url, alert_email, property_id),
+ )
+ conn.commit()
+
+
+def delete_property(conn: Connection, property_id: int) -> bool:
+ cur = conn.execute(
+ "DELETE FROM properties WHERE id = %s RETURNING id",
+ (property_id,),
+ )
+ deleted = cur.fetchone() is not None
+ conn.commit()
+ return deleted
+
+
+def update_property_crawl_preset(
+ conn: Connection,
+ property_id: int,
+ preset: str | None,
+) -> None:
+ conn.execute(
+ "UPDATE properties SET default_crawl_preset = %s, updated_at = now() WHERE id = %s",
+ (preset, property_id),
+ )
+ conn.commit()
+
+
+def authorize_property_crawl(conn: Connection, property_id: int) -> None:
+ """Mark property as crawl-authorized (OAuth flow)."""
+ conn.execute(
+ "UPDATE properties SET crawl_authorized_at = now(), updated_at = now() WHERE id = %s",
+ (property_id,),
+ )
+ conn.commit()
+
+
+def get_property_google_public_status(conn: Connection, property_id: int) -> dict[str, Any]:
+ row = get_property_by_id(conn, property_id)
+ if not row:
+ return {
+ "connected": False,
+ "authMode": None,
+ "gscSiteUrl": None,
+ "ga4PropertyId": None,
+ "dateRangeDays": 28,
+ "connectedEmail": None,
+ "connectedAt": None,
+ }
+ connected_at = row.get("google_connected_at")
+ return {
+ "connected": connected_at is not None,
+ "authMode": row.get("google_auth_mode"),
+ "gscSiteUrl": row.get("gsc_site_url"),
+ "ga4PropertyId": row.get("ga4_property_id"),
+ "dateRangeDays": int(row.get("google_date_range_days") or 0) or 28,
+ "connectedEmail": row.get("google_connected_email"),
+ "connectedAt": connected_at,
+ }
+
+
+def apply_property_google_credentials_patch(
+ conn: Connection,
+ property_id: int,
+ *,
+ refresh_token: str | None = None,
+ auth_mode: str | None = None,
+ gsc_site_url: str | None = None,
+ ga4_property_id: str | None = None,
+ date_range_days: int | None = None,
+ connected_email: str | None = None,
+ fields_set: frozenset[str] | None = None,
+) -> None:
+ """Merge Google OAuth / site mapping fields on a property row."""
+ allowed = fields_set or frozenset({
+ "refresh_token", "auth_mode", "gsc_site_url", "ga4_property_id",
+ "date_range_days", "connected_email",
+ })
+ sets: list[str] = ["updated_at = now()"]
+ vals: list[Any] = []
+
+ def _add(col: str, val: Any) -> None:
+ sets.append(f"{col} = %s")
+ vals.append(val)
+
+ if "gsc_site_url" in allowed and gsc_site_url is not None:
+ _add("gsc_site_url", gsc_site_url.strip() or None)
+ if "ga4_property_id" in allowed and ga4_property_id is not None:
+ v = ga4_property_id.strip() if ga4_property_id else ""
+ if v and not v.isdigit():
+ raise ValueError(
+ "Analytics property ID must be a numeric ID (e.g. 123456789). "
+ "The G-XXXXXXX code is a Measurement ID."
+ )
+ _add("ga4_property_id", v or None)
+ if "date_range_days" in allowed and date_range_days is not None and date_range_days > 0:
+ _add("google_date_range_days", date_range_days)
+ if "auth_mode" in allowed and auth_mode is not None:
+ _add("google_auth_mode", auth_mode or None)
+ if "connected_email" in allowed and connected_email is not None:
+ _add("google_connected_email", connected_email.strip() or None)
+ if "refresh_token" in allowed and refresh_token is not None:
+ token = refresh_token.strip()
+ _add("google_refresh_token", token or None)
+ if token:
+ sets.append("google_connected_at = now()")
+ else:
+ sets.append("google_connected_at = NULL")
+ if "connected_email" not in allowed or connected_email is None:
+ sets.append("google_connected_email = NULL")
+
+ if len(vals) == 0:
+ raise ValueError("No valid fields provided")
+
+ vals.append(property_id)
+ conn.execute(
+ f"UPDATE properties SET {', '.join(sets)} WHERE id = %s",
+ vals,
+ )
+ conn.commit()
+
+
+def disconnect_property_google(conn: Connection, property_id: int) -> None:
+ apply_property_google_credentials_patch(
+ conn,
+ property_id,
+ refresh_token="",
+ auth_mode=None,
+ fields_set=frozenset({"refresh_token", "auth_mode"}),
+ )
+
+
+def get_property_google_status(conn: Connection, property_id: int) -> dict[str, Any] | None:
+ """Property-level Google integration status for the integrations UI."""
+ from website_profiling.db.google_app_store import read_google_app_settings
+ from website_profiling.integrations.google.store import read_last_google_fetched_at_for_property
+
+ if not get_property_by_id(conn, property_id):
+ return None
+
+ prop_status = get_property_google_public_status(conn, property_id)
+ app_cfg = read_google_app_settings(conn)
+ has_client_id = bool(app_cfg.get("client_id"))
+
+ return {
+ **prop_status,
+ "hasClientId": has_client_id,
+ "lastFetchedAt": read_last_google_fetched_at_for_property(conn, property_id),
+ "propertyId": property_id,
+ }
diff --git a/src/website_profiling/db/saved_filter_store.py b/src/website_profiling/db/saved_filter_store.py
new file mode 100644
index 00000000..fc7ef4ae
--- /dev/null
+++ b/src/website_profiling/db/saved_filter_store.py
@@ -0,0 +1,59 @@
+"""Saved crawl filters (saved_crawl_filters table)."""
+from __future__ import annotations
+
+from typing import Any
+
+from psycopg import Connection
+from psycopg.types.json import Json
+
+from ._common import _row_field
+
+
+def _map_filter_row(row: Any) -> dict[str, Any]:
+ created = _row_field(row, "created_at")
+ return {
+ "id": _row_field(row, "id"),
+ "propertyId": _row_field(row, "property_id"),
+ "name": _row_field(row, "name"),
+ "filterJson": _row_field(row, "filter_json") or {},
+ "createdAt": created.isoformat() if hasattr(created, "isoformat") else str(created or ""),
+ }
+
+
+def list_saved_filters(conn: Connection, property_id: int) -> list[dict[str, Any]]:
+ cur = conn.execute(
+ """
+ SELECT id, property_id, name, filter_json, created_at
+ FROM saved_crawl_filters
+ WHERE property_id = %s
+ ORDER BY name
+ """,
+ (property_id,),
+ )
+ return [_map_filter_row(row) for row in cur.fetchall() or []]
+
+
+def upsert_saved_filter(
+ conn: Connection,
+ property_id: int,
+ name: str,
+ filter_json: dict[str, Any],
+) -> None:
+ conn.execute(
+ """
+ INSERT INTO saved_crawl_filters (property_id, name, filter_json)
+ VALUES (%s, %s, %s)
+ ON CONFLICT (property_id, name) DO UPDATE SET filter_json = EXCLUDED.filter_json
+ """,
+ (property_id, name, Json(filter_json)),
+ )
+ conn.commit()
+
+
+def delete_saved_filter(conn: Connection, property_id: int, name: str) -> bool:
+ cur = conn.execute(
+ "DELETE FROM saved_crawl_filters WHERE property_id = %s AND name = %s",
+ (property_id, name),
+ )
+ conn.commit()
+ return cur.rowcount > 0
diff --git a/src/website_profiling/integrations/google/gsc_links_store.py b/src/website_profiling/integrations/google/gsc_links_store.py
index 85af6030..d9ca1bd3 100644
--- a/src/website_profiling/integrations/google/gsc_links_store.py
+++ b/src/website_profiling/integrations/google/gsc_links_store.py
@@ -164,3 +164,34 @@ def read_gsc_links_status(
"sampleLinkCount": len(data.get("sample_links") or []),
"latestLinkCount": len(data.get("latest_links") or []),
}
+
+
+def list_backlinks_velocity(
+ conn: Connection,
+ property_id: int,
+ *,
+ limit: int = 52,
+) -> list[dict[str, Any]]:
+ """Referring-domain trend snapshots for Backlinks velocity chart."""
+ from ...db._common import _parse_row_json, _row_field
+
+ limit = max(1, min(int(limit), 52))
+ cur = conn.execute(
+ """SELECT fetched_at, referring_domains, top_domains
+ FROM gsc_links_snapshots
+ WHERE property_id = %s
+ ORDER BY fetched_at ASC
+ LIMIT %s""",
+ (property_id, limit),
+ )
+ snapshots: list[dict[str, Any]] = []
+ for row in cur.fetchall() or []:
+ fetched = _row_field(row, "fetched_at", index=0)
+ top_domains = _parse_row_json(row, "top_domains", index=2)
+ snapshots.append({
+ "capturedAt": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or "") or None,
+ "referringDomains": int(_row_field(row, "referring_domains", index=1) or 0),
+ "topDomains": top_domains if isinstance(top_domains, list) else [],
+ })
+ return snapshots
+
diff --git a/src/website_profiling/integrations/google/keyword_store.py b/src/website_profiling/integrations/google/keyword_store.py
index 6a6b5f4a..8c6460ba 100644
--- a/src/website_profiling/integrations/google/keyword_store.py
+++ b/src/website_profiling/integrations/google/keyword_store.py
@@ -9,7 +9,7 @@
from psycopg import Connection
from psycopg.types.json import Json
-from ...db.storage import _parse_row_json, _sanitize_for_json
+from ...db._common import _parse_row_json, _row_field, _sanitize_for_json
def write_keyword_data(
@@ -118,9 +118,13 @@ def read_keyword_snapshots_for_property(
)
out: list[dict[str, Any]] = []
for row in cur.fetchall():
- data = _parse_row_json(row)
+ data = _parse_row_json(row, "data", index=1)
if isinstance(data, dict):
- out.append({"fetched_at": row["fetched_at"], **data})
+ fetched = _row_field(row, "fetched_at", index=0)
+ out.append({
+ "fetched_at": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or ""),
+ **data,
+ })
return out
except Exception:
return []
@@ -144,15 +148,33 @@ def read_keyword_history(
ORDER BY id DESC LIMIT %s""",
(property_id, keyword, limit),
)
- return [
- {
- "fetched_at": row["fetched_at"],
- "position": row["position"],
- "clicks": row["clicks"],
- "impressions": row["impressions"],
- "ctr": row["ctr"],
- }
- for row in cur.fetchall()
- ]
+ rows = list(cur.fetchall() or [])
+ return [_map_keyword_history_row(row) for row in reversed(rows)]
except Exception:
return []
+
+
+def read_keyword_history_batch(
+ conn: Connection,
+ keywords: list[str],
+ *,
+ property_id: int,
+ limit: int = 30,
+) -> dict[str, list[dict[str, Any]]]:
+ """Batch keyword history keyed by keyword string."""
+ limit = max(1, min(int(limit), 90))
+ results: dict[str, list[dict[str, Any]]] = {}
+ for kw in keywords:
+ results[kw] = read_keyword_history(conn, kw, limit, property_id=property_id)
+ return results
+
+
+def _map_keyword_history_row(row: Any) -> dict[str, Any]:
+ fetched = _row_field(row, "fetched_at", index=0)
+ return {
+ "fetched_at": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or ""),
+ "position": _row_field(row, "position", index=1),
+ "clicks": _row_field(row, "clicks", index=2),
+ "impressions": _row_field(row, "impressions", index=3),
+ "ctr": _row_field(row, "ctr", index=4),
+ }
diff --git a/src/website_profiling/integrations/google/page_snapshot_store.py b/src/website_profiling/integrations/google/page_snapshot_store.py
index 06b4f1a0..3067ac58 100644
--- a/src/website_profiling/integrations/google/page_snapshot_store.py
+++ b/src/website_profiling/integrations/google/page_snapshot_store.py
@@ -7,7 +7,7 @@
from psycopg import Connection
from psycopg.types.json import Json
-from ...db.storage import _parse_row_json, _sanitize_for_json
+from ...db._common import _parse_row_json, _row_field, _sanitize_for_json
from .normalize import normalize_url
from .page_lookup import _public_ga4_page, _public_gsc_page, summary_from_slice
@@ -33,7 +33,7 @@ def write_page_snapshot(conn: Connection, page_url: str, data: dict[str, Any]) -
(page_url.strip(), url_norm, Json(_sanitize_for_json(data))),
)
row = cur.fetchone()
- snapshot_id = int(row["id"]) if row else 0
+ snapshot_id = int(_row_field(row, "id", index=0)) if row else 0
limit = max_snapshots_per_url()
conn.execute(
"""
@@ -60,12 +60,13 @@ def read_page_snapshot(conn: Connection, snapshot_id: int) -> dict[str, Any] | N
row = cur.fetchone()
if not row:
return None
- data = _parse_row_json(row) or {}
+ data = _parse_row_json(row, "data", index=4) or {}
+ fetched = _row_field(row, "fetched_at", index=3)
return {
- "snapshotId": int(row["id"]),
- "pageUrl": str(row["page_url"]),
- "urlNorm": str(row["url_norm"]),
- "fetchedAt": row["fetched_at"].isoformat() if row["fetched_at"] else None,
+ "snapshotId": int(_row_field(row, "id", index=0)),
+ "pageUrl": str(_row_field(row, "page_url", index=1)),
+ "urlNorm": str(_row_field(row, "url_norm", index=2)),
+ "fetchedAt": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or ""),
"source": data.get("source") or "live",
"gsc": data.get("gsc"),
"ga4": data.get("ga4"),
@@ -90,13 +91,14 @@ def list_live_history(
)
out: list[dict[str, Any]] = []
for row in cur.fetchall():
- data = _parse_row_json(row) or {}
+ data = _parse_row_json(row, "data", index=2) or {}
gsc = data.get("gsc")
ga4 = data.get("ga4")
+ fetched = _row_field(row, "fetched_at", index=1)
out.append(
{
- "id": int(row["id"]),
- "fetchedAt": row["fetched_at"].isoformat() if row["fetched_at"] else None,
+ "id": int(_row_field(row, "id", index=0)),
+ "fetchedAt": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or ""),
"type": "live",
**summary_from_slice(gsc, ga4),
}
@@ -111,6 +113,59 @@ def latest_live_snapshot(conn: Connection, page_url: str) -> dict[str, Any] | No
return read_page_snapshot(conn, int(rows[0]["id"]))
+def read_page_snapshot_compare(conn: Connection, snapshot_id: int) -> dict[str, Any] | None:
+ """Load snapshot for page-compare API ({id, fetchedAt, data})."""
+ cur = conn.execute(
+ "SELECT id, fetched_at, data FROM page_google_snapshots WHERE id = %s",
+ (snapshot_id,),
+ )
+ row = cur.fetchone()
+ if not row:
+ return None
+ data = _parse_row_json(row, "data", index=2)
+ if not isinstance(data, dict):
+ data = {}
+ fetched = _row_field(row, "fetched_at", index=1)
+ return {
+ "id": int(_row_field(row, "id", index=0)),
+ "fetchedAt": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or ""),
+ "data": data,
+ }
+
+
+def list_page_snapshot_api_history(
+ conn: Connection,
+ page_url: str,
+ *,
+ limit: int = 15,
+) -> list[dict[str, Any]]:
+ """History rows with raw gsc/ga4 blobs for the integrations API."""
+ url_norm = normalize_url(page_url)
+ cur = conn.execute(
+ """
+ SELECT id, fetched_at, data
+ FROM page_google_snapshots
+ WHERE url_norm = %s
+ ORDER BY fetched_at DESC, id DESC
+ LIMIT %s
+ """,
+ (url_norm, limit),
+ )
+ out: list[dict[str, Any]] = []
+ for row in cur.fetchall() or []:
+ data = _parse_row_json(row, "data", index=2) or {}
+ if not isinstance(data, dict):
+ data = {}
+ fetched = _row_field(row, "fetched_at", index=1)
+ out.append({
+ "id": int(_row_field(row, "id", index=0)),
+ "fetchedAt": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or ""),
+ "gsc": data.get("gsc"),
+ "ga4": data.get("ga4"),
+ })
+ return out
+
+
def package_live_payload(
page_url: str,
gsc: dict[str, Any] | None,
diff --git a/src/website_profiling/integrations/google/store.py b/src/website_profiling/integrations/google/store.py
index ba5fdece..1c0e9334 100644
--- a/src/website_profiling/integrations/google/store.py
+++ b/src/website_profiling/integrations/google/store.py
@@ -11,7 +11,7 @@
from psycopg import Connection
from psycopg.types.json import Json
-from ...db.storage import _parse_row_json, _sanitize_for_json
+from ...db._common import _parse_row_json, _row_field, _sanitize_for_json
def write_google_data(
@@ -128,6 +128,125 @@ def read_prior_google_snapshot(
return None
+def read_last_google_fetched_at(conn: Connection) -> str | None:
+ """ISO timestamp of the most recent google_data row (any property)."""
+ try:
+ cur = conn.execute(
+ "SELECT fetched_at FROM google_data ORDER BY id DESC LIMIT 1"
+ )
+ row = cur.fetchone()
+ if not row:
+ return None
+ fetched = _row_field(row, "fetched_at", index=0)
+ if fetched is None:
+ return None
+ return fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched)
+ except Exception:
+ return None
+
+
+def read_google_snapshot_row(
+ conn: Connection,
+ property_id: int,
+ *,
+ snapshot_id: int | None = None,
+) -> dict[str, Any] | None:
+ """Return one google_data row as {id, fetchedAt, data} with full parsed blob."""
+ try:
+ if snapshot_id is not None:
+ cur = conn.execute(
+ """
+ SELECT id, fetched_at, data
+ FROM google_data
+ WHERE id = %s AND property_id = %s
+ """,
+ (snapshot_id, property_id),
+ )
+ else:
+ cur = conn.execute(
+ """
+ SELECT id, fetched_at, data
+ FROM google_data
+ WHERE property_id = %s
+ ORDER BY id DESC
+ LIMIT 1
+ """,
+ (property_id,),
+ )
+ row = cur.fetchone()
+ if not row:
+ return None
+ data = _parse_row_json(row, "data", index=2)
+ if not isinstance(data, dict):
+ return None
+ fetched = _row_field(row, "fetched_at", index=1)
+ return {
+ "id": int(_row_field(row, "id", index=0)),
+ "fetchedAt": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or ""),
+ "data": data,
+ }
+ except Exception:
+ return None
+
+
+def list_google_snapshot_rows(
+ conn: Connection,
+ property_id: int,
+ *,
+ limit: int = 10,
+) -> list[dict[str, Any]]:
+ """Recent google_data rows for a property as {id, fetchedAt, data}."""
+ limit = max(1, min(int(limit), 50))
+ try:
+ cur = conn.execute(
+ """
+ SELECT id, fetched_at, data
+ FROM google_data
+ WHERE property_id = %s
+ ORDER BY id DESC
+ LIMIT %s
+ """,
+ (property_id, limit),
+ )
+ out: list[dict[str, Any]] = []
+ for row in cur.fetchall() or []:
+ data = _parse_row_json(row, "data", index=2)
+ if not isinstance(data, dict):
+ continue
+ fetched = _row_field(row, "fetched_at", index=1)
+ out.append({
+ "id": int(_row_field(row, "id", index=0)),
+ "fetchedAt": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or ""),
+ "data": data,
+ })
+ return out
+ except Exception:
+ return []
+
+
+def read_last_google_fetched_at_for_property(conn: Connection, property_id: int) -> str | None:
+ """ISO timestamp of the most recent google_data row for a property."""
+ try:
+ cur = conn.execute(
+ """
+ SELECT fetched_at FROM google_data
+ WHERE property_id = %s
+ ORDER BY id DESC
+ LIMIT 1
+ """,
+ (property_id,),
+ )
+ row = cur.fetchone()
+ if not row:
+ return None
+ fetched = _row_field(row, "fetched_at", index=0)
+ if fetched is None:
+ return None
+ return fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched)
+ except Exception:
+ return None
+
+
def gsc_row_deltas(
current_rows: list[dict[str, Any]],
prior_rows: list[dict[str, Any]],
diff --git a/src/website_profiling/integrations/keywords/competitor_gap_store.py b/src/website_profiling/integrations/keywords/competitor_gap_store.py
index ea585b2a..7c6ca8b2 100644
--- a/src/website_profiling/integrations/keywords/competitor_gap_store.py
+++ b/src/website_profiling/integrations/keywords/competitor_gap_store.py
@@ -1,7 +1,6 @@
"""Read/write per-property competitor keyword gap rows."""
from __future__ import annotations
-import json
from typing import Any
from psycopg import Connection
@@ -25,7 +24,7 @@ def read_competitor_keyword_gap(conn: Connection, property_id: int | None) -> li
)
row = cur.fetchone()
if row is None:
- return _migrate_legacy_config_if_empty(conn, property_id)
+ return []
data = _parse_row_json(row)
if isinstance(data, list):
return [r for r in data if isinstance(r, dict)]
@@ -34,27 +33,6 @@ def read_competitor_keyword_gap(conn: Connection, property_id: int | None) -> li
return []
-def _migrate_legacy_config_if_empty(conn: Connection, property_id: int) -> list[dict[str, Any]]:
- """One-time read from global pipeline_config when property has no rows yet."""
- try:
- from ...config import get_str
- from ...db.config_store import read_pipeline_config
-
- known, _ = read_pipeline_config(conn)
- raw = (get_str(known or {}, "competitor_keyword_gap_json", "") or "").strip()
- if not raw:
- return []
- parsed = json.loads(raw)
- if not isinstance(parsed, list):
- return []
- rows = [r for r in parsed if isinstance(r, dict)]
- if rows:
- write_competitor_keyword_gap(conn, property_id, rows)
- return rows
- except Exception:
- return []
-
-
def write_competitor_keyword_gap(
conn: Connection,
property_id: int,
diff --git a/src/website_profiling/llm/agent.py b/src/website_profiling/llm/agent.py
index 25d0d0a2..936677d3 100644
--- a/src/website_profiling/llm/agent.py
+++ b/src/website_profiling/llm/agent.py
@@ -77,7 +77,7 @@ def _max_tool_rounds(cfg: dict[str, str]) -> int:
- Export lists: export_list_as_csv with the matching list tool
Export playbook (chat UI shows download buttons after export tools — do not paste file contents):
-- Full audit PDF/HTML/CSV/JSON: export_audit_report with format pdf|html|csv|json
+- Full audit PDF/CSV/JSON: export_audit_report with format pdf|csv|json (PDF via FileService)
- Compare issue diff CSV: export_compare_csv with baseline_report_id
- Export a list as CSV: export_list_as_csv with tool_name and tool_args (e.g. list_broken_links)
- After export tools succeed, tell the user their download is ready; the UI renders file buttons automatically
diff --git a/src/website_profiling/llm/audit_summary.py b/src/website_profiling/llm/audit_summary.py
index 551582df..fc530802 100644
--- a/src/website_profiling/llm/audit_summary.py
+++ b/src/website_profiling/llm/audit_summary.py
@@ -17,7 +17,7 @@ def rank_issues_by_traffic(
for row in gsc_pages or []:
if not isinstance(row, dict):
continue
- url = str(row.get("page") or row.get("url") or "").strip().lower()
+ url = str(row.get("page") or "").strip().lower()
if not url:
continue
try:
@@ -55,7 +55,7 @@ def generate_audit_executive_summary(
categories = report_payload.get("categories") or []
gsc = (report_payload.get("google") or {}).get("gsc") or {}
- gsc_pages = (gsc.get("top_pages") or gsc.get("pages")) if isinstance(gsc, dict) else []
+ gsc_pages = gsc.get("top_pages") if isinstance(gsc, dict) else []
top_issues = rank_issues_by_traffic(categories, gsc_pages)[:5]
scores = [c.get("score") for c in categories if isinstance(c.get("score"), (int, float))]
diff --git a/src/website_profiling/llm/ollama_catalog.py b/src/website_profiling/llm/ollama_catalog.py
new file mode 100644
index 00000000..865a6f8b
--- /dev/null
+++ b/src/website_profiling/llm/ollama_catalog.py
@@ -0,0 +1,174 @@
+"""Ollama local + cloud model catalog (mirrors web/src/server/ollamaModels.ts)."""
+from __future__ import annotations
+
+import json
+import re
+import urllib.error
+import urllib.request
+from typing import Any
+
+OLLAMA_CLOUD_CATALOG_URL = "https://ollama.com/api/tags"
+
+PRO_CLOUD_MODEL_PATTERNS = [
+ re.compile(r"671b", re.I),
+ re.compile(r"480b", re.I),
+ re.compile(r":1t(?:-cloud|:cloud)?$", re.I),
+ re.compile(r"v4-pro", re.I),
+ re.compile(r"nemotron-3-ultra", re.I),
+ re.compile(r"nemotron-3-super", re.I),
+ re.compile(r"mistral-large", re.I),
+ re.compile(r"397b", re.I),
+ re.compile(r"cogito-2\.1:671b", re.I),
+ re.compile(r"deepseek-v4-pro", re.I),
+ re.compile(r"qwen3-coder:480b", re.I),
+ re.compile(r"gpt-oss:120b", re.I),
+]
+
+
+def is_cloud_model_ref(name: str) -> bool:
+ return name.endswith("-cloud") or name.endswith(":cloud")
+
+
+def to_cloud_model_ref(name: str) -> str:
+ trimmed = name.strip()
+ if not trimmed:
+ return trimmed
+ if trimmed.endswith("-cloud") or trimmed.endswith(":cloud"):
+ return trimmed
+ return f"{trimmed}-cloud" if ":" in trimmed else f"{trimmed}:cloud"
+
+
+def resolve_billing_tier(name: str, source: str) -> dict[str, Any]:
+ cloud = source == "cloud" or is_cloud_model_ref(name)
+ if not cloud:
+ return {"billing": "free_local", "requires_subscription": False}
+ if any(p.search(name) for p in PRO_CLOUD_MODEL_PATTERNS):
+ return {"billing": "cloud_pro", "requires_subscription": True}
+ return {"billing": "cloud_free", "requires_subscription": True}
+
+
+def _with_billing(entry: dict[str, Any]) -> dict[str, Any]:
+ tier = resolve_billing_tier(str(entry.get("name") or ""), str(entry.get("source") or "local"))
+ return {**entry, **tier}
+
+
+def _normalize_local_model(raw: dict[str, Any]) -> dict[str, Any] | None:
+ name = str(raw.get("name") or "").strip()
+ if not name:
+ return None
+ cloud = bool(raw.get("remote_host")) or is_cloud_model_ref(name)
+ details = raw.get("details") if isinstance(raw.get("details"), dict) else {}
+ return _with_billing({
+ "name": name,
+ "source": "cloud" if cloud else "local",
+ "installed": True,
+ "capabilities": raw.get("capabilities") if isinstance(raw.get("capabilities"), list) else None,
+ "context_length": details.get("context_length"),
+ })
+
+
+def _normalize_catalog_model(raw: dict[str, Any]) -> dict[str, Any] | None:
+ base = str(raw.get("name") or "").strip()
+ if not base:
+ return None
+ return _with_billing({
+ "name": to_cloud_model_ref(base),
+ "source": "cloud",
+ "installed": False,
+ })
+
+
+def _model_key(name: str) -> str:
+ return name.lower()
+
+
+def merge_ollama_models(
+ local: list[dict[str, Any]],
+ cloud_catalog: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+ by_key: dict[str, dict[str, Any]] = {}
+ for m in cloud_catalog:
+ by_key[_model_key(str(m.get("name") or ""))] = m
+ for m in local:
+ key = _model_key(str(m.get("name") or ""))
+ existing = by_key.get(key)
+ merged = {
+ **(existing or {}),
+ **m,
+ "installed": True,
+ "capabilities": m.get("capabilities") or (existing or {}).get("capabilities"),
+ "context_length": m.get("context_length") or (existing or {}).get("context_length"),
+ }
+ by_key[key] = _with_billing(merged)
+
+ def sort_key(m: dict[str, Any]) -> tuple:
+ return (
+ 0 if m.get("installed") else 1,
+ 0 if m.get("source") == "local" else 1,
+ str(m.get("name") or ""),
+ )
+
+ return sorted(by_key.values(), key=sort_key)
+
+
+def _fetch_json(url: str, *, timeout: float = 8.0) -> dict[str, Any] | None:
+ try:
+ req = urllib.request.Request(url, headers={"Accept": "application/json"})
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
+ return json.loads(resp.read().decode())
+ except (urllib.error.URLError, TimeoutError, json.JSONDecodeError, OSError):
+ return None
+
+
+def fetch_ollama_models(base_url: str) -> dict[str, Any]:
+ normalized_base = (base_url or "http://127.0.0.1:11434").rstrip("/") or "http://127.0.0.1:11434"
+
+ local_data = _fetch_json(f"{normalized_base}/api/tags", timeout=8.0)
+ cloud_data = _fetch_json(OLLAMA_CLOUD_CATALOG_URL, timeout=12.0)
+
+ local_ok = local_data is not None
+ cloud_catalog_ok = cloud_data is not None
+
+ local_models = [
+ m for raw in (local_data or {}).get("models") or []
+ if isinstance(raw, dict)
+ for m in [_normalize_local_model(raw)]
+ if m is not None
+ ]
+ cloud_models = [
+ m for raw in (cloud_data or {}).get("models") or []
+ if isinstance(raw, dict)
+ for m in [_normalize_catalog_model(raw)]
+ if m is not None
+ ]
+ models = merge_ollama_models(local_models, cloud_models)
+
+ if not local_ok and not cloud_catalog_ok:
+ return {
+ "ok": False,
+ "baseUrl": normalized_base,
+ "models": [],
+ "cloudCatalogOk": False,
+ "localOk": False,
+ "error": "Cannot reach Ollama or the cloud model catalog.",
+ }
+
+ return {
+ "ok": local_ok or cloud_catalog_ok,
+ "baseUrl": normalized_base,
+ "models": models,
+ "cloudCatalogOk": cloud_catalog_ok,
+ "localOk": local_ok,
+ }
+
+
+def model_is_configured(models: list[dict[str, Any]], configured_model: str) -> bool:
+ target = configured_model.strip()
+ if not target:
+ return len(models) > 0
+ key = _model_key(target)
+ return any(_model_key(str(m.get("name") or "")) == key for m in models)
+
+
+def models_support_tools(models: list[dict[str, Any]]) -> bool:
+ return any("tools" in (m.get("capabilities") or []) for m in models)
diff --git a/src/website_profiling/reporting/builder.py b/src/website_profiling/reporting/builder.py
index c4c29ef1..8a547baf 100644
--- a/src/website_profiling/reporting/builder.py
+++ b/src/website_profiling/reporting/builder.py
@@ -685,7 +685,7 @@ def run_simple_report(
gsc_pages = []
gsc_block = (report_data.get("google") or {}).get("gsc") or {}
if isinstance(gsc_block, dict):
- gsc_pages = gsc_block.get("top_pages") or gsc_block.get("pages") or []
+ gsc_pages = gsc_block.get("top_pages") or []
enrich_top_issues_with_llm(
report_data.get("categories") or [],
llm_cfg_for_clusters,
diff --git a/src/website_profiling/reporting/indexation.py b/src/website_profiling/reporting/indexation.py
index 8f8394e6..c6913dfd 100644
--- a/src/website_profiling/reporting/indexation.py
+++ b/src/website_profiling/reporting/indexation.py
@@ -31,12 +31,12 @@ def _gsc_page_urls(google_data: dict[str, Any] | None) -> list[str]:
if not google_data:
return []
gsc = google_data.get("gsc") if isinstance(google_data.get("gsc"), dict) else {}
- raw = gsc.get("top_pages") or gsc.get("pages")
+ raw = gsc.get("top_pages")
pages = raw if isinstance(raw, list) else []
out: list[str] = []
for row in pages:
if isinstance(row, dict):
- u = str(row.get("page") or row.get("url") or "").strip()
+ u = str(row.get("page") or "").strip()
if u:
out.append(u)
return out
@@ -46,12 +46,12 @@ def _gsc_by_page(google_data: dict[str, Any] | None) -> dict[str, dict]:
if not google_data:
return {}
gsc = google_data.get("gsc") if isinstance(google_data.get("gsc"), dict) else {}
- raw = gsc.get("top_pages") or gsc.get("pages")
+ raw = gsc.get("top_pages")
pages = raw if isinstance(raw, list) else []
out: dict[str, dict] = {}
for row in pages:
if isinstance(row, dict):
- u = str(row.get("page") or row.get("url") or "").strip()
+ u = str(row.get("page") or "").strip()
if u:
out[u] = row
return out
diff --git a/src/website_profiling/reporting/issue_impact.py b/src/website_profiling/reporting/issue_impact.py
index fd14f1de..1629e406 100644
--- a/src/website_profiling/reporting/issue_impact.py
+++ b/src/website_profiling/reporting/issue_impact.py
@@ -13,10 +13,10 @@ def _metrics_by_url(google_data: dict[str, Any] | None) -> tuple[dict[str, dict]
return clicks, sessions
gsc = google_data.get("gsc") or {}
if isinstance(gsc, dict):
- for row in (gsc.get("pages") or gsc.get("top_pages") or []):
+ for row in (gsc.get("top_pages") or []):
if not isinstance(row, dict):
continue
- url = str(row.get("page") or row.get("url") or "").strip().lower().rstrip("/")
+ url = str(row.get("page") or "").strip().lower().rstrip("/")
if not url:
continue
clicks[url] = {
@@ -25,10 +25,10 @@ def _metrics_by_url(google_data: dict[str, Any] | None) -> tuple[dict[str, dict]
}
ga4 = google_data.get("ga4") or {}
if isinstance(ga4, dict):
- for row in (ga4.get("pages") or ga4.get("top_pages") or []):
+ for row in (ga4.get("top_pages") or []):
if not isinstance(row, dict):
continue
- path = str(row.get("path") or row.get("pagePath") or row.get("url") or "").strip().lower()
+ path = str(row.get("path") or "").strip().lower()
if not path:
continue
sessions[path] = {"ga4_sessions": float(row.get("sessions") or 0)}
diff --git a/src/website_profiling/reporting/pdf/__init__.py b/src/website_profiling/reporting/pdf/__init__.py
deleted file mode 100644
index c2aabc1a..00000000
--- a/src/website_profiling/reporting/pdf/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-"""PDF document model and export pipeline."""
-from __future__ import annotations
-
-from .builder import build_pdf_document
-from .document import PdfDocument
-from .options import PdfBuildOptions, PdfLimits
-from .render import render_pdf_document
-
-__all__ = [
- "build_pdf_document",
- "render_pdf_document",
- "PdfDocument",
- "PdfBuildOptions",
- "PdfLimits",
-]
diff --git a/src/website_profiling/reporting/pdf/adapters/__init__.py b/src/website_profiling/reporting/pdf/adapters/__init__.py
deleted file mode 100644
index 50c1f7df..00000000
--- a/src/website_profiling/reporting/pdf/adapters/__init__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-"""Section adapter registry.
-
-Each adapter maps a section key to a function that accepts the raw payload
-dict + PdfBuildOptions and returns a list of PdfSection objects. Adapters
-that find no relevant data return an empty list.
-"""
-from __future__ import annotations
-
-from typing import Any, Callable
-
-from ..document import PdfSection
-from ..options import PdfBuildOptions
-
-SectionAdapterFn = Callable[[dict[str, Any], PdfBuildOptions], list[PdfSection]]
-
-# Populated by each sub-module at import time
-SECTION_ADAPTERS: dict[str, SectionAdapterFn] = {}
-
-
-def register(key: str) -> Callable[[SectionAdapterFn], SectionAdapterFn]:
- """Decorator: @register("lighthouse") marks a function as a section adapter."""
- def _wrap(fn: SectionAdapterFn) -> SectionAdapterFn:
- SECTION_ADAPTERS[key] = fn
- return fn
- return _wrap
-
-
-# Import adapters so they self-register
-from . import core, findings, appendix # noqa: E402, F401
diff --git a/src/website_profiling/reporting/pdf/adapters/appendix.py b/src/website_profiling/reporting/pdf/adapters/appendix.py
deleted file mode 100644
index 16329ff5..00000000
--- a/src/website_profiling/reporting/pdf/adapters/appendix.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""Appendix adapter — crawled URL sample and data-source glossary."""
-from __future__ import annotations
-
-from typing import Any
-
-from ....tools.export_audit_data import _GLOSSARY_ROWS
-from ..document import (
- KeyValueBlock,
- PdfSection,
- PdfTruncation,
- SpacerBlock,
- UrlListBlock,
-)
-from ..options import PdfBuildOptions
-from . import register
-
-
-@register("appendix")
-def adapt_appendix(payload: dict[str, Any], opts: PdfBuildOptions) -> list[PdfSection]:
- if not opts.include_appendix:
- return []
-
- sections: list[PdfSection] = []
-
- # --- Crawled URLs sample ---
- links = [l for l in (payload.get("links") or []) if isinstance(l, dict)]
- if links:
- limit = opts.limits.urls_sample
- sample = links[:limit]
- rows = [
- {
- "url": str(lnk.get("url") or ""),
- "status": str(lnk.get("status") or ""),
- "title": str(lnk.get("title") or "").strip(),
- }
- for lnk in sample
- ]
- has_titles = any(r["title"] for r in rows)
- trunc = PdfTruncation(shown=len(rows), total=len(links)) if len(links) > limit else None
- sections.append(PdfSection(
- id="appendix.urls",
- section_key="links",
- title="Crawled URLs (sample)",
- priority=80,
- page_break_before=False,
- blocks=[
- UrlListBlock(
- id="appendix.url_list",
- rows=rows,
- show_title=has_titles,
- truncation=trunc,
- ),
- SpacerBlock(id="appendix.url_spacer", height_pt=6),
- ],
- ))
-
- # --- Glossary ---
- if opts.include_glossary:
- gloss_rows = [(term, desc) for term, desc in _GLOSSARY_ROWS]
- sections.append(PdfSection(
- id="appendix.glossary",
- section_key="core",
- title="Data source glossary",
- priority=90,
- blocks=[KeyValueBlock(id="appendix.glossary_kv", rows=gloss_rows, layout="glossary")],
- ))
-
- return sections
diff --git a/src/website_profiling/reporting/pdf/adapters/core.py b/src/website_profiling/reporting/pdf/adapters/core.py
deleted file mode 100644
index 27bef7e0..00000000
--- a/src/website_profiling/reporting/pdf/adapters/core.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""Core adapter — audit-details section (category scores live on cover)."""
-from __future__ import annotations
-
-from typing import Any
-
-from ....tools.export_audit_data import _format_report_date, _summary_lines
-from ..document import KeyValueBlock, PdfSection, SpacerBlock
-from ..options import PdfBuildOptions
-from . import register
-
-
-@register("core")
-def adapt_core(payload: dict[str, Any], opts: PdfBuildOptions) -> list[PdfSection]:
- sections: list[PdfSection] = []
-
- # Category scores are rendered on the cover page — not duplicated here.
-
- # --- Audit details section ---
- summary_rows = _summary_lines(payload)
- if summary_rows:
- formatted_rows: list[tuple[str, str]] = []
- for key, val in summary_rows:
- if key == "Report generated":
- formatted_rows.append((key, _format_report_date(val)))
- else:
- formatted_rows.append((key, val))
- sections.append(PdfSection(
- id="core.audit_details",
- section_key="core",
- title="Audit details",
- priority=70,
- blocks=[
- KeyValueBlock(id="core.audit_kv", rows=formatted_rows, layout="audit"),
- SpacerBlock(id="core.audit_spacer", height_pt=6),
- ],
- ))
-
- return sections
diff --git a/src/website_profiling/reporting/pdf/adapters/findings.py b/src/website_profiling/reporting/pdf/adapters/findings.py
deleted file mode 100644
index ebc18e8d..00000000
--- a/src/website_profiling/reporting/pdf/adapters/findings.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""Findings adapter — normalizes and groups all audit issues."""
-from __future__ import annotations
-
-from typing import Any
-
-from ....tools.export_audit_data import _issues_rows, _priority_sort_key
-from ..document import PdfSection, PdfTruncation
-from ..normalize import group_issues_for_pdf, normalize_issue_for_pdf
-from ..options import PdfBuildOptions
-from . import register
-
-
-@register("findings")
-def adapt_findings(payload: dict[str, Any], opts: PdfBuildOptions) -> list[PdfSection]:
- raw_rows = _issues_rows(payload)
- if not raw_rows:
- return []
-
- raw_rows = sorted(raw_rows, key=_priority_sort_key)
- total = len(raw_rows)
- capped = raw_rows[: opts.limits.issues_total]
-
- pdf_issues = [
- normalize_issue_for_pdf(row, include_recommendation=opts.include_recommendations)
- for row in capped
- ]
-
- groups = group_issues_for_pdf(
- pdf_issues,
- issues_per_group=opts.limits.issues_per_group,
- issues_total=opts.limits.issues_total,
- )
-
- if not groups:
- return []
-
- section_trunc: PdfTruncation | None = None
- if total > opts.limits.issues_total:
- section_trunc = PdfTruncation(
- shown=opts.limits.issues_total,
- total=total,
- reason="limit",
- continue_in=["CSV", "workbook"],
- )
-
- return [PdfSection(
- id="findings",
- section_key="findings",
- title="Findings",
- priority=20,
- page_break_before=False,
- blocks=list(groups), # type: ignore[arg-type]
- truncation=section_trunc,
- )]
diff --git a/src/website_profiling/reporting/pdf/builder.py b/src/website_profiling/reporting/pdf/builder.py
deleted file mode 100644
index 8f2d30b2..00000000
--- a/src/website_profiling/reporting/pdf/builder.py
+++ /dev/null
@@ -1,178 +0,0 @@
-"""build_pdf_document — assembles a PdfDocument from a raw report payload."""
-from __future__ import annotations
-
-from datetime import datetime, timezone
-from typing import Any, Optional
-
-from ...tools.export_audit_data import (
- _executive_export_data,
- _executive_source_label,
- _format_report_date,
- _issue_priority_counts,
- _issues_rows,
- _overall_score,
- _priority_sort_key,
- _score_band,
-)
-from .document import (
- SCHEMA_VERSION,
- PdfCoverBlock,
- PdfDocument,
- PdfFooterBlock,
- PdfIssue,
- PdfMeta,
- PdfScoreHero,
- ScoreCard,
- ScoreCardsBlock,
- StatChip,
- StatGridBlock,
-)
-from .normalize import normalize_issue_for_pdf
-from .options import PdfBuildOptions
-from .adapters import SECTION_ADAPTERS
-
-
-def _build_meta(
- payload: dict[str, Any],
- opts: PdfBuildOptions,
- exported_at: str,
- all_issue_counts: dict[str, int],
- overall: Optional[int],
- included_sections: list[str],
-) -> PdfMeta:
- site = str(payload.get("site_name") or "Site Audit")
- generated_raw = str(payload.get("report_generated_at") or "")
- generated = _format_report_date(generated_raw)
- meta_block = payload.get("report_meta") or {}
- data_sources: list[str] = []
- if isinstance(meta_block, dict):
- data_sources = [str(s) for s in (meta_block.get("data_sources") or [])]
- report_title = str(payload.get("report_title") or "Technical SEO Audit Report")
- return PdfMeta(
- report_id=opts.report_id,
- property=site,
- report_title=report_title,
- generated_at=generated,
- exported_at=exported_at,
- data_sources=data_sources,
- health_score=overall,
- issue_counts=all_issue_counts,
- included_sections=included_sections,
- )
-
-
-def _build_cover(
- payload: dict[str, Any],
- opts: PdfBuildOptions,
- overall: Optional[int],
- all_issue_counts: dict[str, int],
-) -> PdfCoverBlock:
- site = str(payload.get("site_name") or "Site Audit")
- report_title = str(payload.get("report_title") or "Technical SEO Audit Report")
-
- score_txt, band = _score_band(float(overall) if overall is not None else None)
- hero = PdfScoreHero(score=score_txt, band=band, label="Overall health score") # type: ignore[arg-type]
-
- priority_chips = [
- StatChip(label="Critical", value=str(all_issue_counts["critical"]), tone="critical"),
- StatChip(label="High", value=str(all_issue_counts["high"]), tone="high"),
- StatChip(label="Medium", value=str(all_issue_counts["medium"]), tone="medium"),
- StatChip(label="Low", value=str(all_issue_counts["low"]), tone="low"),
- ]
- priority_strip = StatGridBlock(id="cover.priority_strip", chips=priority_chips, columns=4)
-
- categories = payload.get("categories") or []
- score_cards: list[ScoreCard] = []
- for cat in categories:
- if not isinstance(cat, dict):
- continue
- from ...reporting.terminology import category_display_name
- name = category_display_name(str(cat.get("name") or "Category"))
- raw = cat.get("score")
- sv: float | None = None
- if raw is not None:
- try:
- sv = float(raw)
- except (TypeError, ValueError):
- pass
- stxt, sband = _score_band(sv)
- issue_n = len(cat.get("issues") or [])
- score_cards.append(ScoreCard(name=name, score=stxt, issue_count=issue_n, tone=sband)) # type: ignore[arg-type]
- cat_scores_block = ScoreCardsBlock(id="cover.category_scores", cards=score_cards)
-
- # Executive summary
- exec_data = _executive_export_data(payload)
- exec_summary = exec_data.get("summary") or None
- exec_source = _executive_source_label(exec_data.get("source") or "") if exec_data.get("source") else None
- priorities_list: list[str] = exec_data.get("priorities") or []
-
- # Top issues for cover — one row per distinct headline; prefer rows with a URL
- all_rows = sorted(_issues_rows(payload), key=_priority_sort_key)
- top_limit = opts.limits.top_issues_cover
- headline_order: list[str] = []
- by_headline: dict[str, PdfIssue] = {}
- for row in all_rows:
- issue = normalize_issue_for_pdf(row, include_recommendation=False)
- if issue.headline not in by_headline:
- headline_order.append(issue.headline)
- by_headline[issue.headline] = issue
- elif not by_headline[issue.headline].url and issue.url:
- by_headline[issue.headline] = issue
- top_issues = [by_headline[h] for h in headline_order[:top_limit]]
-
- return PdfCoverBlock(
- headline=f"Site Audit — {site}",
- subtitle=report_title,
- hero=hero,
- priority_strip=priority_strip,
- category_scores=cat_scores_block,
- executive_summary=exec_summary,
- executive_source=exec_source,
- priorities_list=priorities_list[:8],
- top_issues=top_issues,
- )
-
-
-def build_pdf_document(
- payload: dict[str, Any],
- opts: Optional[PdfBuildOptions] = None,
-) -> PdfDocument:
- """Transform a raw ReportPayload dict into a PdfDocument ready for rendering."""
- if opts is None:
- opts = PdfBuildOptions()
-
- exported_at = datetime.now(timezone.utc).strftime("%d %B %Y, %H:%M UTC")
- overall = _overall_score(payload)
- all_issues = _issues_rows(payload)
- all_issue_counts = _issue_priority_counts(all_issues)
-
- effective_sections = opts.effective_sections()
-
- # Run each requested adapter
- sections: list = []
- for key in effective_sections:
- adapter = SECTION_ADAPTERS.get(key)
- if adapter is None:
- continue
- result = adapter(payload, opts)
- sections.extend(result)
-
- # Sort sections by priority
- sections.sort(key=lambda s: s.priority)
-
- meta = _build_meta(
- payload, opts, exported_at, all_issue_counts, overall,
- included_sections=effective_sections,
- )
- cover = _build_cover(payload, opts, overall, all_issue_counts)
- footer = PdfFooterBlock(exported_at=exported_at)
-
- return PdfDocument(
- schema_version=SCHEMA_VERSION,
- document_kind="audit",
- meta=meta,
- cover=cover,
- sections=sections,
- footer=footer,
- appendix=None, # appendix content is included as PdfSections in sections list
- )
diff --git a/src/website_profiling/reporting/pdf/document.py b/src/website_profiling/reporting/pdf/document.py
deleted file mode 100644
index 94e9aa4c..00000000
--- a/src/website_profiling/reporting/pdf/document.py
+++ /dev/null
@@ -1,322 +0,0 @@
-"""PdfDocument v1 — versioned, block-based document model.
-
-All types are JSON-serializable dataclasses. The renderer consumes these;
-no ReportLab types appear here.
-"""
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-from typing import Any, Literal, Optional
-
-SCHEMA_VERSION = "1.0"
-
-# ---------------------------------------------------------------------------
-# Primitive / shared
-# ---------------------------------------------------------------------------
-
-PriorityTone = Literal["critical", "high", "medium", "low", "neutral", "good", "fair", "poor"]
-DocumentKind = Literal["audit", "compare"]
-
-
-@dataclass
-class PdfTruncation:
- shown: int
- total: int
- reason: Literal["limit", "page_budget", "empty"] = "limit"
- continue_in: list[str] = field(default_factory=lambda: ["CSV", "workbook"])
-
-
-# ---------------------------------------------------------------------------
-# Block types — renderer handles each `type` discriminator
-# ---------------------------------------------------------------------------
-
-@dataclass
-class HeadingBlock:
- type: str = field(default="heading", init=False)
- id: str = ""
- text: str = ""
- level: int = 2 # 2 = section heading, 3 = sub-heading
- visible: bool = True
-
-
-@dataclass
-class ParagraphBlock:
- type: str = field(default="paragraph", init=False)
- id: str = ""
- text: str = ""
- italic: bool = False
- visible: bool = True
-
-
-@dataclass
-class CalloutBlock:
- type: str = field(default="callout", init=False)
- id: str = ""
- text: str = ""
- severity: Literal["info", "warn", "critical"] = "info"
- visible: bool = True
-
-
-@dataclass
-class SpacerBlock:
- type: str = field(default="spacer", init=False)
- id: str = ""
- height_pt: float = 8.0
- visible: bool = True
-
-
-@dataclass
-class KpiItem:
- label: str
- value: str
- delta: Optional[str] = None
- tone: PriorityTone = "neutral"
- help: Optional[str] = None
-
-
-@dataclass
-class KpiRowBlock:
- type: str = field(default="kpi_row", init=False)
- id: str = ""
- items: list[KpiItem] = field(default_factory=list)
- visible: bool = True
-
-
-@dataclass
-class StatChip:
- label: str
- value: str
- tone: PriorityTone = "neutral"
-
-
-@dataclass
-class StatGridBlock:
- type: str = field(default="stat_grid", init=False)
- id: str = ""
- chips: list[StatChip] = field(default_factory=list)
- columns: int = 4
- visible: bool = True
-
-
-@dataclass
-class KeyValueBlock:
- type: str = field(default="key_value", init=False)
- id: str = ""
- rows: list[tuple[str, str]] = field(default_factory=list)
- layout: Literal["default", "audit", "glossary"] = "default"
- visible: bool = True
-
-
-@dataclass
-class ScoreCard:
- name: str
- score: Optional[str] # formatted string, e.g. "87" or "—"
- issue_count: int = 0
- tone: Literal["score-good", "score-fair", "score-poor", "score-na"] = "score-na"
-
-
-@dataclass
-class ScoreCardsBlock:
- type: str = field(default="score_cards", init=False)
- id: str = ""
- cards: list[ScoreCard] = field(default_factory=list)
- visible: bool = True
-
-
-@dataclass
-class TableColumn:
- key: str
- label: str
- width: Literal["narrow", "medium", "wide", "url"] = "medium"
- align: Literal["left", "center", "right"] = "left"
-
-
-@dataclass
-class MetricTableBlock:
- type: str = field(default="metric_table", init=False)
- id: str = ""
- columns: list[TableColumn] = field(default_factory=list)
- rows: list[dict[str, str]] = field(default_factory=list)
- repeat_header: bool = True
- truncation: Optional[PdfTruncation] = None
- visible: bool = True
-
-
-@dataclass
-class UrlListBlock:
- type: str = field(default="url_list", init=False)
- id: str = ""
- rows: list[dict[str, str]] = field(default_factory=list) # keys: url, status, title
- show_title: bool = True
- truncation: Optional[PdfTruncation] = None
- visible: bool = True
-
-
-# ---------------------------------------------------------------------------
-# Issue blocks — primary findings format
-# ---------------------------------------------------------------------------
-
-@dataclass
-class PdfIssueMetrics:
- gsc_clicks: Optional[int] = None
- gsc_impressions: Optional[int] = None
- ga4_sessions: Optional[int] = None
- impact_score: Optional[float] = None
- lh_audit_id: Optional[str] = None
-
-
-@dataclass
-class PdfIssue:
- id: str
- priority: str
- category: str
- headline: str # ≤ 80 chars, no embedded URL duplication
- url: Optional[str] = None
- path: Optional[str] = None # display-only short path
- detail: Optional[str] = None
- recommendation: Optional[str] = None
- metrics: Optional[PdfIssueMetrics] = None
- tags: list[str] = field(default_factory=list)
- related_urls: list[str] = field(default_factory=list) # collapsed duplicates
-
-
-@dataclass
-class IssueGroupBlock:
- type: str = field(default="issue_group", init=False)
- id: str = ""
- title: str = ""
- group_label: str = "" # e.g. "Critical — 1 issue"
- issues: list[PdfIssue] = field(default_factory=list)
- render_as: Literal["list", "compact_table"] = "list"
- truncation: Optional[PdfTruncation] = None
- visible: bool = True
-
-
-@dataclass
-class IssueTableBlock:
- """Fallback tabular rendering for dense medium/low groups."""
- type: str = field(default="issue_table", init=False)
- id: str = ""
- title: str = ""
- issues: list[PdfIssue] = field(default_factory=list)
- truncation: Optional[PdfTruncation] = None
- visible: bool = True
-
-
-@dataclass
-class MarkdownBlock:
- type: str = field(default="markdown", init=False)
- id: str = ""
- text: str = ""
- visible: bool = True
-
-
-# Union type for IDE / type-checkers
-PdfBlock = (
- HeadingBlock
- | ParagraphBlock
- | CalloutBlock
- | SpacerBlock
- | KpiRowBlock
- | StatGridBlock
- | KeyValueBlock
- | ScoreCardsBlock
- | MetricTableBlock
- | UrlListBlock
- | IssueGroupBlock
- | IssueTableBlock
- | MarkdownBlock
-)
-
-# ---------------------------------------------------------------------------
-# Cover
-# ---------------------------------------------------------------------------
-
-@dataclass
-class PdfScoreHero:
- score: Optional[str]
- band: Literal["score-good", "score-fair", "score-poor", "score-na"]
- label: str # e.g. "Overall health score"
-
-
-@dataclass
-class PdfCoverBlock:
- headline: str
- subtitle: str
- hero: PdfScoreHero
- priority_strip: StatGridBlock
- category_scores: ScoreCardsBlock
- executive_summary: Optional[str] = None # prose paragraph
- executive_source: Optional[str] = None
- priorities_list: list[str] = field(default_factory=list)
- top_issues: list[PdfIssue] = field(default_factory=list)
-
-
-# ---------------------------------------------------------------------------
-# Section
-# ---------------------------------------------------------------------------
-
-@dataclass
-class PdfSection:
- id: str
- section_key: str
- title: str
- priority: int = 50 # lower = earlier in document
- page_break_before: bool = False
- keep_with_next_blocks: int = 1
- source_label: Optional[str] = None
- provenance: Optional[str] = None
- blocks: list[Any] = field(default_factory=list) # list[PdfBlock]
- truncation: Optional[PdfTruncation] = None
-
-
-# ---------------------------------------------------------------------------
-# Appendix
-# ---------------------------------------------------------------------------
-
-@dataclass
-class PdfAppendix:
- url_sample: Optional[UrlListBlock] = None
- audit_details: Optional[KeyValueBlock] = None
- glossary: Optional[KeyValueBlock] = None
-
-
-# ---------------------------------------------------------------------------
-# Meta / Footer
-# ---------------------------------------------------------------------------
-
-@dataclass
-class PdfMeta:
- report_id: Optional[int]
- property: str
- report_title: str
- generated_at: str # formatted for display
- exported_at: str
- data_sources: list[str]
- health_score: Optional[int]
- issue_counts: dict[str, int] # {critical, high, medium, low}
- truncation_summary: list[str] = field(default_factory=list)
- included_sections: list[str] = field(default_factory=list)
- locale: str = "en"
-
-
-@dataclass
-class PdfFooterBlock:
- confidential_note: str = "Confidential — prepared for client review."
- generator: str = "Site Audit"
- exported_at: str = ""
-
-
-# ---------------------------------------------------------------------------
-# Root document
-# ---------------------------------------------------------------------------
-
-@dataclass
-class PdfDocument:
- schema_version: str
- document_kind: DocumentKind
- meta: PdfMeta
- cover: PdfCoverBlock
- sections: list[PdfSection]
- footer: PdfFooterBlock
- appendix: Optional[PdfAppendix] = None
diff --git a/src/website_profiling/reporting/pdf/normalize.py b/src/website_profiling/reporting/pdf/normalize.py
deleted file mode 100644
index b2d7547c..00000000
--- a/src/website_profiling/reporting/pdf/normalize.py
+++ /dev/null
@@ -1,358 +0,0 @@
-"""Issue normalization and grouping for PDF output.
-
-Transforms raw ``_issues_rows`` dicts (which mirror the DB payload) into
-``PdfIssue`` objects suited for print layout:
-- Strips duplicated URLs from headlines
-- Expands Lighthouse audit-id abbreviations into human labels
-- Groups by priority → category for use by IssueGroupBlock
-"""
-from __future__ import annotations
-
-import hashlib
-import re
-from typing import Any, Optional
-from urllib.parse import urlparse
-
-from .document import IssueGroupBlock, PdfIssue, PdfIssueMetrics, PdfTruncation
-
-# ---------------------------------------------------------------------------
-# Lighthouse audit-id → human label registry
-# ---------------------------------------------------------------------------
-
-_LH_AUDIT_LABELS: dict[str, str] = {
- "cache-insight": "Serve assets with efficient cache policy",
- "color-contrast": "Background and foreground colors lack sufficient contrast",
- "unused-css-rules": "Remove unused CSS",
- "errors-in-console": "Browser errors logged to the console",
- "label-content-name-mismatch": "Button/link label does not match accessible name",
- "network-dependency-tree-insight": "Minimize critical request chain depth",
- "render-blocking-insight": "Eliminate render-blocking resources",
- "unused-javascript": "Remove unused JavaScript",
- "uses-optimized-images": "Efficiently encode images",
- "uses-responsive-images": "Properly size images",
- "uses-webp-images": "Serve images in next-gen formats",
- "largest-contentful-paint-element": "Largest Contentful Paint element",
- "total-blocking-time": "Total Blocking Time",
- "cumulative-layout-shift": "Cumulative Layout Shift",
- "first-contentful-paint": "First Contentful Paint",
- "speed-index": "Speed Index",
- "interactive": "Time to Interactive",
- "server-response-time": "Reduce initial server response time",
- "dom-size": "Avoid an excessive DOM size",
- "long-tasks": "Avoid long main-thread tasks",
- "layout-shifts": "Avoid large layout shifts",
- "image-alt": "Image elements do not have alt attributes",
- "link-name": "Links do not have a discernible name",
- "button-name": "Buttons do not have an accessible name",
- "duplicate-id-active": "Document has active focus elements with duplicate ID",
- "heading-order": "Heading elements are not in a sequentially-descending order",
- "meta-description": "Document does not have a meta description",
- "document-title": "Document does not have a element",
- "hreflang": "Document does not have a valid hreflang",
- "canonical": "Page is not canonical",
- "robots-txt": "Robots.txt is not valid",
- "tap-targets": "Touch targets are not sized appropriately",
-}
-
-_URL_IN_MSG_PATTERN = re.compile(
- r"(https?://\S+|(?:^|[\s:])(/\S+))", re.IGNORECASE
-)
-
-# Colon at end of a known-bad audit id: "cache-insight:" → strip colon
-_AUDIT_ID_TRAILING_COLON = re.compile(r"^([\w-]+):$")
-
-
-def _lh_label(audit_id: str) -> str:
- """Return a human-readable label for a Lighthouse audit id."""
- clean = audit_id.rstrip(":").strip().lower()
- return _LH_AUDIT_LABELS.get(clean, clean.replace("-", " ").title())
-
-
-def _strip_url_from_headline(message: str, url: str) -> str:
- """Remove URL from message text when it duplicates the dedicated url field."""
- if not url or not message:
- return message
-
- # Direct inclusion: "Issue text: https://example.com/path"
- stripped = message.replace(url, "").strip().rstrip(":").strip()
- if stripped and stripped != message:
- return stripped
-
- # URL with trailing slash variant
- url_slash = url.rstrip("/") + "/"
- stripped2 = message.replace(url_slash, "").strip().rstrip(":").strip()
- if stripped2 and stripped2 != message:
- return stripped2
-
- return message
-
-
-def _extract_path(url: str) -> Optional[str]:
- """Return just the path component of a URL for compact display."""
- if not url:
- return None
- try:
- parsed = urlparse(url)
- return parsed.path or None
- except Exception:
- return None
-
-
-def _is_lighthouse_row(message: str, tags: list[str]) -> tuple[bool, str]:
- """Detect Lighthouse issue rows and return (is_lh, audit_id)."""
- # Pattern: "audit-id:" alone or at start of message
- m = _AUDIT_ID_TRAILING_COLON.match(message.strip())
- if m:
- return True, m.group(1)
- # Tag-based
- if "lighthouse" in tags:
- return True, ""
- return False, ""
-
-
-def _issue_id(row: dict[str, Any]) -> str:
- key = f"{row.get('category','')}\x00{row.get('priority','')}\x00{row.get('message','')}\x00{row.get('url','')}"
- return hashlib.md5(key.encode()).hexdigest()[:12]
-
-
-def _shorten_headline(headline: str, raw_message: str, url: str) -> str:
- """Apply common headline cleanups after URL strip / lighthouse expansion."""
- lower = headline.lower()
- lower_raw = raw_message.lower()
-
- if "url in sitemap but not crawled" in lower:
- return "In sitemap, not crawled"
-
- if lower_raw.startswith("redirect:"):
- m = re.match(r"redirect:\s*(\d{3})\s*to\b", lower_raw)
- if m:
- return f"{m.group(1)} redirect"
-
- if lower.startswith("lighthouse:"):
- return headline.split(":", 1)[-1].strip()
-
- if lower.startswith("axe:"):
- body = headline.split(":", 1)[-1].strip()
- if len(body) > 90:
- dot = body.find(". ")
- if dot > 0:
- body = body[: dot + 1]
- else:
- body = body[:87].rsplit(" ", 1)[0] + "…"
- return body
-
- if len(headline) > 100:
- return headline[:97].rsplit(" ", 1)[0] + "…"
-
- return headline
-
-
-_GENERIC_CWV_REC = "See Performance (Core Web Vitals) in this audit, or re-run Lighthouse from Run audit."
-
-
-def _normalize_recommendation(rec: Optional[str]) -> Optional[str]:
- if not rec:
- return None
- if rec.strip() == _GENERIC_CWV_REC:
- return "Review Lighthouse audit details for this page."
- return rec.strip()
-
-
-def collapse_duplicate_issues(issues: list[PdfIssue]) -> list[PdfIssue]:
- """Merge rows that share the same headline + recommendation into one card with URL list."""
- buckets: dict[tuple[str, str], list[PdfIssue]] = {}
- order: list[tuple[str, str]] = []
- for iss in issues:
- key = (iss.headline, iss.recommendation or "")
- if key not in buckets:
- order.append(key)
- buckets[key] = []
- buckets[key].append(iss)
-
- collapsed: list[PdfIssue] = []
- for key in order:
- group = buckets[key]
- first = group[0]
- urls: list[str] = []
- for item in group:
- if item.url and item.url not in urls:
- urls.append(item.url)
- if len(urls) <= 1:
- collapsed.append(first)
- continue
- headline = first.headline
- if len(urls) > 1 and not headline.endswith(")"):
- headline = f"{headline} ({len(urls)} URLs)"
- collapsed.append(PdfIssue(
- id=first.id,
- priority=first.priority,
- category=first.category,
- headline=headline,
- url=None,
- path=first.path,
- detail=first.detail,
- recommendation=first.recommendation,
- metrics=first.metrics,
- tags=first.tags,
- related_urls=urls,
- ))
- return collapsed
-
-
-def normalize_issue_for_pdf(
- row: dict[str, Any],
- include_recommendation: bool = True,
-) -> PdfIssue:
- """Convert a raw issues_row dict → PdfIssue for print layout."""
- priority = str(row.get("priority") or "").lower()
- category = str(row.get("category") or "")
- raw_message = str(row.get("message") or "").strip()
- url = str(row.get("url") or "").strip()
- recommendation = _normalize_recommendation(
- str(row.get("recommendation") or "").strip() if include_recommendation else None
- )
-
- # Detect Lighthouse rows (audit-id only, no human label). Pass the row's own
- # tags so tag-based detection actually works (was hardcoded to [], making the
- # `"lighthouse" in tags` branch dead).
- is_lh, audit_id = _is_lighthouse_row(
- raw_message, [str(t).lower() for t in (row.get("tags") or [])]
- )
- if is_lh and audit_id:
- headline = _lh_label(audit_id)
- else:
- headline = _strip_url_from_headline(raw_message, url)
-
- headline = _shorten_headline(headline, raw_message, url)
-
- tags: list[str] = []
- lower_msg = raw_message.lower()
- if "sitemap" in lower_msg:
- tags.append("sitemap")
- if is_lh or "lighthouse" in lower_msg:
- tags.append("lighthouse")
- if "axe" in lower_msg or "wcag" in lower_msg or "contrast" in lower_msg:
- tags.append("axe")
- if "redirect" in lower_msg:
- tags.append("redirect")
- if "canonical" in lower_msg:
- tags.append("canonical")
- if "security" in category.lower():
- tags.append("security")
-
- # Metrics from issue dict (ReportIssue fields)
- gsc_clicks = row.get("gsc_clicks")
- gsc_imp = row.get("gsc_impressions")
- impact = row.get("impact_score")
- lh_id = audit_id if is_lh else row.get("lh_audit_id")
- metrics = None
- if any(v is not None for v in (gsc_clicks, gsc_imp, impact, lh_id)):
- metrics = PdfIssueMetrics(
- gsc_clicks=int(gsc_clicks) if gsc_clicks is not None else None,
- gsc_impressions=int(gsc_imp) if gsc_imp is not None else None,
- impact_score=float(impact) if impact is not None else None,
- lh_audit_id=str(lh_id) if lh_id else None,
- )
-
- return PdfIssue(
- id=_issue_id(row),
- priority=priority,
- category=category,
- headline=headline,
- url=url or None,
- path=_extract_path(url),
- detail=None,
- recommendation=recommendation or None,
- metrics=metrics,
- tags=tags,
- )
-
-
-# ---------------------------------------------------------------------------
-# Grouping
-# ---------------------------------------------------------------------------
-
-_PRIORITY_ORDER = {"critical": 0, "high": 1, "medium": 2, "low": 3}
-_PRIORITY_LABELS = {
- "critical": "Critical",
- "high": "High",
- "medium": "Medium",
- "low": "Low",
-}
-
-# Above this count per priority, sub-group by category
-_SUBGROUP_THRESHOLD = 8
-
-# Always use stacked list layout — tables only for cover top-issues / URL inventory
-_COMPACT_TABLE_THRESHOLD = 999
-
-
-def group_issues_for_pdf(
- issues: list[PdfIssue],
- issues_per_group: int = 25,
- issues_total: int = 120,
-) -> list[IssueGroupBlock]:
- """Group PdfIssue list by priority → category, returning IssueGroupBlock list."""
- # Sort and cap total
- sorted_issues = sorted(issues, key=lambda i: (_PRIORITY_ORDER.get(i.priority, 9), i.category))
- if len(sorted_issues) > issues_total:
- sorted_issues = sorted_issues[:issues_total]
-
- # Bucket by priority
- by_priority: dict[str, list[PdfIssue]] = {}
- for iss in sorted_issues:
- by_priority.setdefault(iss.priority, []).append(iss)
-
- groups: list[IssueGroupBlock] = []
-
- for pri in ("critical", "high", "medium", "low"):
- pri_issues = by_priority.get(pri, [])
- if not pri_issues:
- continue
-
- pri_label = _PRIORITY_LABELS.get(pri, pri.title())
- total_in_pri = len(pri_issues)
-
- if total_in_pri <= _SUBGROUP_THRESHOLD:
- # Single group for this priority
- shown = collapse_duplicate_issues(pri_issues[:issues_per_group])
- trunc = (
- PdfTruncation(shown=len(shown), total=total_in_pri)
- if total_in_pri > len(shown)
- else None
- )
- render_as = "compact_table" if len(shown) >= _COMPACT_TABLE_THRESHOLD else "list"
- groups.append(IssueGroupBlock(
- id=f"findings.{pri}",
- title=f"{pri_label} findings",
- group_label=f"{pri_label} — {total_in_pri} issue{'s' if total_in_pri != 1 else ''}",
- issues=shown,
- render_as=render_as,
- truncation=trunc,
- ))
- else:
- # Sub-group by category
- by_cat: dict[str, list[PdfIssue]] = {}
- for iss in pri_issues:
- by_cat.setdefault(iss.category, []).append(iss)
-
- for cat, cat_issues in sorted(by_cat.items()):
- cat_total = len(cat_issues)
- shown = collapse_duplicate_issues(cat_issues[:issues_per_group])
- trunc = (
- PdfTruncation(shown=len(shown), total=cat_total)
- if cat_total > len(shown)
- else None
- )
- render_as = "compact_table" if len(shown) >= _COMPACT_TABLE_THRESHOLD else "list"
- cat_id = cat.lower().replace(" ", "_").replace("&", "and")
- groups.append(IssueGroupBlock(
- id=f"findings.{pri}.{cat_id}",
- title=f"{pri_label} — {cat}",
- group_label=f"{pri_label} — {cat}: {cat_total} issue{'s' if cat_total != 1 else ''}",
- issues=shown,
- render_as=render_as,
- truncation=trunc,
- ))
-
- return groups
diff --git a/src/website_profiling/reporting/pdf/options.py b/src/website_profiling/reporting/pdf/options.py
deleted file mode 100644
index 678a446a..00000000
--- a/src/website_profiling/reporting/pdf/options.py
+++ /dev/null
@@ -1,43 +0,0 @@
-"""PdfBuildOptions, PdfLimits, and document profiles."""
-from __future__ import annotations
-
-from dataclasses import dataclass, field
-from typing import Literal, Optional
-
-Profile = Literal["executive", "standard", "full"]
-
-# Sections for each profile; None in sections means "use profile default"
-_PROFILE_SECTIONS: dict[str, list[str]] = {
- "executive": ["core"],
- "standard": ["core", "findings", "appendix"],
- "full": ["core", "findings", "lighthouse", "security", "traffic", "keywords",
- "indexation", "content", "links", "appendix"],
-}
-
-
-@dataclass
-class PdfLimits:
- issues_total: int = 120
- issues_per_group: int = 25
- top_issues_cover: int = 6
- urls_sample: int = 20
- metric_table_rows: int = 15
- gsc_queries: int = 10
- keyword_rows: int = 15
- diagnostic_items: int = 20
-
-
-@dataclass
-class PdfBuildOptions:
- profile: Profile = "standard"
- sections: Optional[list[str]] = None # None → derive from profile
- limits: PdfLimits = field(default_factory=PdfLimits)
- include_appendix: bool = True
- include_recommendations: bool = True
- include_glossary: bool = True
- report_id: Optional[int] = None
-
- def effective_sections(self) -> list[str]:
- if self.sections is not None:
- return self.sections
- return _PROFILE_SECTIONS.get(self.profile, _PROFILE_SECTIONS["standard"])
diff --git a/src/website_profiling/reporting/pdf/render/__init__.py b/src/website_profiling/reporting/pdf/render/__init__.py
deleted file mode 100644
index fc2a79ad..00000000
--- a/src/website_profiling/reporting/pdf/render/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-"""PDF/HTML renderers."""
-from __future__ import annotations
-
-from .html import render_html_document
-from .reportlab import render_pdf_document
-
-__all__ = ["render_pdf_document", "render_html_document"]
diff --git a/src/website_profiling/reporting/pdf/render/html.py b/src/website_profiling/reporting/pdf/render/html.py
deleted file mode 100644
index ee1eda8f..00000000
--- a/src/website_profiling/reporting/pdf/render/html.py
+++ /dev/null
@@ -1,774 +0,0 @@
-"""HTML renderer — converts PdfDocument → preview/print HTML matching the PDF layout."""
-from __future__ import annotations
-
-import html
-import re
-from typing import Any
-
-from ..document import (
- CalloutBlock,
- HeadingBlock,
- IssueGroupBlock,
- IssueTableBlock,
- KeyValueBlock,
- KpiRowBlock,
- MarkdownBlock,
- MetricTableBlock,
- ParagraphBlock,
- PdfCoverBlock,
- PdfDocument,
- PdfIssue,
- PdfSection,
- ScoreCardsBlock,
- SpacerBlock,
- StatGridBlock,
- UrlListBlock,
-)
-from . import styles as S
-
-
-def html_styles() -> str:
- """CSS shared by standard export preview HTML."""
- return """
- :root {
- --ink: #0f172a;
- --muted: #64748b;
- --line: #e2e8f0;
- --surface: #ffffff;
- --surface-muted: #f8fafc;
- --header-bg: #f1f5f9;
- --brand-accent: #2563eb;
- --good: #059669;
- --good-bg: #ecfdf5;
- --fair: #d97706;
- --fair-bg: #fffbeb;
- --poor: #dc2626;
- --poor-bg: #fef2f2;
- --critical-fg: #991b1b;
- --critical-bg: #fee2e2;
- --high-fg: #c2410c;
- --high-bg: #ffedd5;
- --medium-fg: #a16207;
- --medium-bg: #fef3c7;
- --low-fg: #475569;
- --low-bg: #f1f5f9;
- }
- * { box-sizing: border-box; }
- body {
- margin: 0;
- background: #eef2f7;
- color: var(--ink);
- font: 400 14px/1.45 "Segoe UI", system-ui, -apple-system, sans-serif;
- }
- .report {
- max-width: 816px;
- margin: 0 auto;
- background: var(--surface);
- box-shadow: 0 1px 3px rgba(15, 23, 42, 0.08);
- }
- .cover {
- padding: 1.75rem 1.85rem 1.25rem;
- background: var(--surface);
- }
- .cover-head {
- display: flex;
- justify-content: space-between;
- align-items: flex-start;
- gap: 1rem;
- margin-bottom: 0.35rem;
- }
- .cover-head h1 {
- margin: 0;
- font-size: 1.35rem;
- font-weight: 700;
- line-height: 1.25;
- }
- .cover-subtitle {
- margin: 0.25rem 0 0;
- color: var(--muted);
- font-size: 0.92rem;
- }
- .hero-score {
- text-align: center;
- min-width: 4.5rem;
- }
- .hero-score .score {
- display: block;
- font-size: 2rem;
- font-weight: 700;
- line-height: 1;
- }
- .hero-score .suffix {
- display: block;
- margin-top: 0.15rem;
- font-size: 0.72rem;
- color: var(--muted);
- }
- .hero-score.score-good .score { color: var(--good); }
- .hero-score.score-fair .score { color: var(--fair); }
- .hero-score.score-poor .score { color: var(--poor); }
- .hero-score.score-na .score { color: var(--muted); }
- .cover-meta-line {
- margin: 0.5rem 0 1rem;
- color: var(--muted);
- font-size: 0.82rem;
- }
- .section-title {
- margin: 1.1rem 0 0.35rem;
- font-size: 0.82rem;
- font-weight: 700;
- color: var(--ink);
- }
- .section-rule {
- border: none;
- border-top: 1px solid var(--line);
- margin: 0 0 0.65rem;
- }
- .section-lead {
- margin: 0 0 0.65rem;
- color: var(--muted);
- font-size: 0.78rem;
- }
- .grid-table {
- width: 100%;
- border-collapse: collapse;
- table-layout: fixed;
- margin-bottom: 0.85rem;
- font-size: 0.82rem;
- }
- .grid-table th,
- .grid-table td {
- border: 1px solid var(--line);
- padding: 0.65rem 0.5rem;
- text-align: center;
- vertical-align: middle;
- }
- .stat-grid td.stat-critical { background: var(--critical-bg); color: var(--critical-fg); }
- .stat-grid td.stat-high { background: var(--high-bg); color: var(--high-fg); }
- .stat-grid td.stat-medium { background: var(--medium-bg); color: var(--medium-fg); }
- .stat-grid td.stat-low { background: var(--low-bg); color: var(--low-fg); }
- .stat-grid .stat-value {
- display: block;
- font-size: 1.15rem;
- font-weight: 700;
- line-height: 1.1;
- }
- .stat-grid .stat-label {
- display: block;
- margin-top: 0.2rem;
- font-size: 0.72rem;
- color: var(--muted);
- }
- .score-grid .score-value {
- display: block;
- font-size: 0.95rem;
- font-weight: 700;
- line-height: 1.1;
- }
- .score-grid .score-name {
- display: block;
- margin-top: 0.25rem;
- font-size: 0.72rem;
- font-weight: 600;
- line-height: 1.25;
- }
- .score-grid .score-meta {
- display: block;
- margin-top: 0.15rem;
- font-size: 0.68rem;
- color: var(--muted);
- }
- .score-grid td { background: var(--surface-muted); }
- .score-grid .score-good .score-value { color: var(--good); }
- .score-grid .score-fair .score-value { color: var(--fair); }
- .score-grid .score-poor .score-value { color: var(--poor); }
- .score-grid .score-na .score-value { color: var(--muted); }
- .exec-panel {
- border: 1px solid var(--line);
- border-left: 3px solid var(--brand-accent);
- background: var(--surface-muted);
- padding: 0.85rem 1rem;
- margin-bottom: 1rem;
- border-radius: 0 4px 4px 0;
- }
- .exec-source {
- margin: 0 0 0.45rem;
- font-size: 0.68rem;
- font-weight: 700;
- color: var(--brand-accent);
- text-transform: uppercase;
- letter-spacing: 0.04em;
- }
- .exec-body { margin: 0; font-size: 0.88rem; line-height: 1.5; }
- .exec-subhead {
- margin: 0.65rem 0 0.35rem;
- font-size: 0.72rem;
- font-weight: 700;
- color: var(--muted);
- }
- .exec-priorities {
- margin: 0;
- padding-left: 1.1rem;
- font-size: 0.82rem;
- line-height: 1.45;
- }
- .data-table {
- width: 100%;
- border-collapse: collapse;
- font-size: 0.82rem;
- border: 1px solid var(--line);
- margin-bottom: 0.85rem;
- }
- .data-table th,
- .data-table td {
- padding: 0.55rem 0.65rem;
- text-align: left;
- vertical-align: middle;
- border-bottom: 1px solid var(--line);
- }
- .data-table thead th {
- background: var(--header-bg);
- font-size: 0.72rem;
- font-weight: 700;
- color: var(--muted);
- }
- .data-table tbody tr:nth-child(even) td { background: var(--surface-muted); }
- .data-table tbody tr:last-child td { border-bottom: none; }
- .data-table .col-status { text-align: center; width: 4.5rem; }
- .data-table .col-priority { text-align: center; width: 5rem; }
- .kv-audit th {
- width: 23%;
- font-weight: 700;
- vertical-align: top;
- }
- .kv-glossary th {
- width: 21%;
- font-weight: 700;
- vertical-align: top;
- background: var(--header-bg);
- }
- .kv-glossary td { line-height: 1.45; }
- .link { color: var(--brand-accent); word-break: break-all; }
- .site-wide { color: var(--muted); font-style: italic; font-size: 0.78rem; }
- .badge {
- display: inline-block;
- padding: 0.15rem 0.45rem;
- border-radius: 3px;
- font-size: 0.65rem;
- font-weight: 700;
- text-transform: uppercase;
- letter-spacing: 0.03em;
- border: 1px solid transparent;
- }
- .badge-critical { background: var(--critical-bg); color: var(--critical-fg); border-color: var(--critical-fg); }
- .badge-high { background: var(--high-bg); color: var(--high-fg); border-color: var(--high-fg); }
- .badge-medium { background: var(--medium-bg); color: var(--medium-fg); border-color: var(--medium-fg); }
- .badge-low { background: var(--low-bg); color: var(--low-fg); border-color: var(--low-fg); }
- .status-200 { background: var(--good-bg); color: var(--good); border-color: var(--good); }
- .status-3xx { background: var(--fair-bg); color: var(--fair); border-color: var(--fair); }
- .status-4xx, .status-5xx { background: var(--poor-bg); color: var(--poor); border-color: var(--poor); }
- .status-other { background: var(--surface-muted); color: var(--muted); border-color: var(--line); }
- .content { padding: 0 1.85rem 1.5rem; }
- .doc-section { margin-bottom: 1.35rem; }
- .doc-section > h2 {
- margin: 0 0 0.35rem;
- font-size: 0.82rem;
- font-weight: 700;
- }
- .doc-section .source-label {
- margin: 0 0 0.5rem;
- font-size: 0.78rem;
- color: var(--muted);
- }
- .group-label {
- margin: 0.65rem 0 0.35rem;
- font-size: 0.78rem;
- font-weight: 700;
- }
- .issue-card {
- border-left: 3px solid var(--line);
- background: var(--surface-muted);
- padding: 0.45rem 0.65rem;
- margin-bottom: 0.45rem;
- font-size: 0.82rem;
- }
- .issue-card.priority-critical { border-color: var(--critical-fg); background: var(--critical-bg); }
- .issue-card.priority-high { border-color: var(--high-fg); background: var(--high-bg); }
- .issue-card.priority-medium { border-color: var(--medium-fg); background: var(--medium-bg); }
- .issue-card.priority-low { border-color: var(--low-fg); background: var(--low-bg); }
- .issue-headline { margin: 0; font-weight: 700; line-height: 1.35; }
- .issue-url {
- margin: 0.2rem 0 0;
- font-size: 0.76rem;
- color: var(--brand-accent);
- word-break: break-all;
- }
- .issue-rec {
- margin: 0.25rem 0 0;
- font-size: 0.76rem;
- color: var(--muted);
- font-style: italic;
- }
- .issue-url-list {
- margin: 0.25rem 0 0;
- padding-left: 1rem;
- font-size: 0.76rem;
- color: var(--brand-accent);
- }
- .muted-note {
- margin: 0.35rem 0 0;
- font-size: 0.76rem;
- color: var(--muted);
- }
- .page-break {
- break-before: page;
- page-break-before: always;
- height: 0;
- margin: 0;
- border-top: 1px dashed var(--line);
- }
- .report-footer {
- border-top: 1px solid var(--line);
- padding: 0.85rem 1.85rem 1.25rem;
- color: var(--muted);
- font-size: 0.72rem;
- line-height: 1.45;
- }
- .content {
- padding: 0 1.85rem 1.5rem;
- }
- .custom-section {
- margin-bottom: 1.35rem;
- }
- .custom-section > h2 {
- margin: 0 0 0.35rem;
- font-size: 0.82rem;
- font-weight: 700;
- }
- .callout {
- border: 1px solid var(--line);
- border-left: 3px solid var(--brand-accent);
- background: var(--surface-muted);
- padding: 0.85rem 1rem;
- border-radius: 0 4px 4px 0;
- margin: 0.5rem 0;
- }
- p.muted, .muted {
- color: var(--muted);
- font-size: 0.82rem;
- margin: 0.35rem 0 0.65rem;
- }
- .url, td.url {
- color: var(--brand-accent);
- word-break: break-all;
- font-size: 0.76rem;
- }
- table.data, .table-wrap table {
- width: 100%;
- border-collapse: collapse;
- font-size: 0.82rem;
- border: 1px solid var(--line);
- margin: 0.5rem 0 0.85rem;
- }
- table.data th, table.data td,
- .table-wrap table th, .table-wrap table td {
- padding: 0.55rem 0.65rem;
- text-align: left;
- vertical-align: top;
- border-bottom: 1px solid var(--line);
- }
- table.data thead th, .table-wrap table thead th {
- background: var(--header-bg);
- font-size: 0.72rem;
- font-weight: 700;
- color: var(--muted);
- }
- .category-cards {
- display: flex;
- flex-wrap: wrap;
- gap: 0.75rem;
- margin: 0.65rem 0;
- }
- article.score-card {
- flex: 1 1 140px;
- max-width: 180px;
- border: 1px solid var(--line);
- border-radius: 4px;
- padding: 0.75rem;
- background: var(--surface-muted);
- text-align: center;
- }
- article.score-card .score-value {
- font-size: 1.1rem;
- font-weight: 700;
- }
- article.score-card .score-name {
- margin-top: 0.35rem;
- font-size: 0.72rem;
- font-weight: 600;
- }
- article.score-card .score-meta {
- margin-top: 0.2rem;
- font-size: 0.68rem;
- color: var(--muted);
- }
- article.score-card.score-good .score-value { color: var(--good); }
- article.score-card.score-fair .score-value { color: var(--fair); }
- article.score-card.score-poor .score-value { color: var(--poor); }
- article.score-card.score-na .score-value { color: var(--muted); }
- .notes, .json-preview {
- line-height: 1.5;
- font-size: 0.82rem;
- }
- .json-preview {
- overflow-x: auto;
- background: var(--surface-muted);
- padding: 0.75rem;
- border: 1px solid var(--line);
- border-radius: 4px;
- }
- @media print {
- body { background: #fff; }
- .report { max-width: none; box-shadow: none; }
- .cover, .content, .report-footer { padding-left: 0.65in; padding-right: 0.65in; }
- .page-break { border: none; }
- }
-"""
-
-
-def _esc(text: Any) -> str:
- return html.escape(str(text) if text is not None else "")
-
-
-def _priority_badge(priority: str) -> str:
- key = priority.lower()
- cls = f"badge badge-{key}" if key in {"critical", "high", "medium", "low"} else "badge badge-low"
- return f'{_esc(priority)}'
-
-
-def _status_badge(code: str) -> str:
- c = str(code or "").strip()
- if c == "200":
- cls = "badge status-200"
- elif c.startswith("3"):
- cls = "badge status-3xx"
- elif c and c[0] in "45":
- cls = "badge status-4xx" if c.startswith("4") else "badge status-5xx"
- else:
- cls = "badge status-other"
- return f'{_esc(c or "—")}'
-
-
-def _issue_location(issue: PdfIssue) -> str:
- if issue.path:
- return f'{_esc(issue.path)}'
- if issue.url:
- return f'{_esc(issue.url)}'
- return 'Site-wide'
-
-
-def _section_heading(title: str) -> str:
- return f'{_esc(title)}
'
-
-
-def _render_stat_grid(block: StatGridBlock) -> str:
- if not block.chips:
- return ""
- cells = []
- for chip in block.chips:
- tone = chip.tone if chip.tone in {"critical", "high", "medium", "low"} else "low"
- cells.append(
- f''
- f'{_esc(chip.value)}'
- f'{_esc(chip.label)}'
- f" | "
- )
- while len(cells) < block.columns:
- cells.append(" | ")
- return f''
-
-
-def _render_score_cards(block: ScoreCardsBlock) -> str:
- if not block.cards:
- return ""
- cols = S.GRID_COLS
- rows_html: list[str] = []
- row: list[str] = []
- for card in block.cards:
- issue_label = f"{card.issue_count} issue{'s' if card.issue_count != 1 else ''}"
- row.append(
- f''
- f'{_esc(card.score or "—")}'
- f'{_esc(card.name)}'
- f'{issue_label}'
- f" | "
- )
- if len(row) == cols:
- rows_html.append(f"{''.join(row)}
")
- row = []
- if row:
- while len(row) < cols:
- row.append(" | ")
- rows_html.append(f"{''.join(row)}
")
- return f''
-
-
-def _render_executive_panel(cover: PdfCoverBlock) -> str:
- if not (cover.executive_summary or cover.priorities_list):
- return ""
- parts = ['']
- if cover.executive_source:
- parts.append(f'
Source · {_esc(cover.executive_source)}
')
- if cover.executive_summary:
- parts.append(f'
{_esc(cover.executive_summary)}
')
- if cover.priorities_list:
- parts.append('
Recommended priorities
')
- parts.append('
')
- for pri in cover.priorities_list[:6]:
- parts.append(f"- {_esc(pri)}
")
- parts.append("
")
- parts.append("
")
- return "".join(parts)
-
-
-def _render_top_issues(issues: list[PdfIssue]) -> str:
- if not issues:
- return ""
- rows = "".join(
- f""
- f'| {_priority_badge(iss.priority)} | '
- f"{_esc(iss.headline)} | "
- f"{_issue_location(iss)} | "
- f"
"
- for iss in issues
- )
- return (
- f"{_section_heading('Top traffic-impacting issues')}"
- f'Ranked by severity and traffic impact — address critical and high items first.
'
- f''
- f"| Priority | Issue | Location |
"
- f"{rows}
"
- )
-
-
-def _render_cover(cover: PdfCoverBlock, meta) -> str:
- counts = meta.issue_counts
- total = sum(counts.values())
- meta_line = (
- f"Report generated {meta.generated_at} · {total} findings "
- f"(Critical {counts.get('critical', 0)}, High {counts.get('high', 0)}, "
- f"Medium {counts.get('medium', 0)}, Low {counts.get('low', 0)})"
- )
- hero = cover.hero
- exec_html = ""
- if cover.executive_summary or cover.priorities_list:
- exec_html = _section_heading("Executive summary") + _render_executive_panel(cover)
- top_html = _render_top_issues(cover.top_issues)
-
- cat_html = ""
- if cover.category_scores.cards:
- cat_html = _section_heading("Category scores") + _render_score_cards(cover.category_scores)
-
- return f"""
- """
-
-
-def _render_issue(issue: PdfIssue) -> str:
- pri = issue.priority.lower()
- cls = f"issue-card priority-{pri}" if pri in {"critical", "high", "medium", "low"} else "issue-card"
- parts = [f'', f'
{_esc(issue.headline)}
']
- if issue.related_urls:
- items = "".join(f"
{_esc(u)}" for u in issue.related_urls[:10])
- extra = len(issue.related_urls) - 10
- if extra > 0:
- items += f'
… and {extra} more (see CSV export)'
- parts.append(f'
')
- elif issue.url:
- parts.append(f'
{_esc(issue.url)}
')
- if issue.recommendation:
- parts.append(f'
Fix: {_esc(issue.recommendation)}
')
- parts.append("
")
- return "".join(parts)
-
-
-def _render_issue_group(block: IssueGroupBlock) -> str:
- parts = [f'{_esc(block.group_label)}
']
- if block.render_as == "compact_table":
- rows = "".join(
- f"| {_esc(iss.headline)} | "
- f'{_esc(iss.url or "")} |
'
- for iss in block.issues
- )
- parts.append(
- f'"
- )
- else:
- for iss in block.issues:
- parts.append(_render_issue(iss))
- if block.truncation:
- t = block.truncation
- parts.append(
- f'Showing {t.shown} of {t.total}. '
- f"Full list in {', '.join(t.continue_in)}.
"
- )
- return "".join(parts)
-
-
-def _render_key_value(block: KeyValueBlock) -> str:
- if not block.rows:
- return ""
- layout = getattr(block, "layout", "default") or "default"
- if layout == "audit":
- table_cls = "data-table kv-audit"
- elif layout == "glossary":
- table_cls = "data-table kv-glossary"
- else:
- table_cls = "data-table kv-audit"
- rows = "".join(
- f"| {_esc(k)} | {_esc(v)} |
" for k, v in block.rows
- )
- return f''
-
-
-def _render_url_list(block: UrlListBlock) -> str:
- if not block.rows:
- return ""
- show_title = getattr(block, "show_title", True)
- head = "URL | Status | "
- if show_title:
- head += "Title | "
- body_rows: list[str] = []
- for row in block.rows:
- url = str(row.get("url") or "")
- status = str(row.get("status") or "")
- cells = (
- f'{_esc(url)} | '
- f'{_status_badge(status)} | '
- )
- if show_title:
- title = str(row.get("title") or "").strip()
- title_cell = _esc(title) if title else '—'
- cells += f"{title_cell} | "
- body_rows.append(f"{cells}
")
- note = ""
- if block.truncation:
- t = block.truncation
- note = (
- f'Showing {t.shown} of {t.total} URLs. '
- f"Export CSV/workbook for full inventory.
"
- )
- return (
- f'{head}
'
- f'{"".join(body_rows)}
{note}'
- )
-
-
-def _render_block(block: Any) -> str:
- if not getattr(block, "visible", True):
- return ""
- btype = getattr(block, "type", None)
- if btype == "issue_group":
- return _render_issue_group(block)
- if btype == "key_value":
- return _render_key_value(block)
- if btype == "url_list":
- return _render_url_list(block)
- if btype == "issue_table":
- rows = "".join(
- f"| {_esc(iss.headline)} | {_esc(iss.url or '')} |
"
- for iss in block.issues
- )
- title = f"{_esc(block.title)}
" if block.title else ""
- return (
- f"{title}"
- )
- if btype == "paragraph":
- return f"{_esc(block.text)}
"
- if btype == "heading":
- tag = "h3" if block.level >= 3 else "h2"
- return f"<{tag}>{_esc(block.text)}{tag}>"
- if btype == "callout":
- return f''
- if btype == "markdown":
- text = re.sub(r"<[^>]+>", " ", block.text)
- return f"{_esc(text)}
"
- if btype == "metric_table":
- cols = block.columns
- if not cols:
- return ""
- head = "".join(f"{_esc(c.label)} | " for c in cols)
- body = ""
- for row in block.rows:
- body += "" + "".join(
- f'| {_esc(row.get(c.key, ""))} | ' for c in cols
- ) + "
"
- return f''
- if btype in {"spacer", "kpi_row", "stat_grid", "score_cards"}:
- return ""
- return ""
-
-
-def _render_section(section: PdfSection) -> str:
- parts = [f'']
- parts.append(f"{_esc(section.title)}
")
- if section.source_label:
- parts.append(f'Source: {_esc(section.source_label)}
')
- for block in section.blocks:
- parts.append(_render_block(block))
- if section.truncation:
- t = section.truncation
- parts.append(
- f'Showing {t.shown} of {t.total} issues. '
- f"Export CSV or workbook for full data.
"
- )
- parts.append("")
- return "".join(parts)
-
-
-def render_html_document(doc: PdfDocument) -> str:
- """Render a PdfDocument as HTML matching the PDF export layout."""
- cover_html = _render_cover(doc.cover, doc.meta)
- sections_html = "".join(_render_section(s) for s in doc.sections)
- footer = doc.footer
- footer_text = (
- f"{footer.confidential_note} "
- f"Generated by {footer.generator} · {footer.exported_at}"
- )
- title = _esc(doc.cover.headline)
- return f"""
-
-
-
-
-{title}
-
-
-
-
-{cover_html}
-
-
-{sections_html}
-
-
-
-
-"""
diff --git a/src/website_profiling/reporting/pdf/render/reportlab.py b/src/website_profiling/reporting/pdf/render/reportlab.py
deleted file mode 100644
index 1ae98bb2..00000000
--- a/src/website_profiling/reporting/pdf/render/reportlab.py
+++ /dev/null
@@ -1,947 +0,0 @@
-"""ReportLab renderer — converts PdfDocument → PDF bytes.
-
-Layout rules:
-- Every table cell is wrapped in Paragraph (prevents column bleed/overflow).
-- Findings are rendered as stacked item blocks (issue_group), not 4-col tables.
-- LongTable + repeatRows=1 for metric/url tables.
-- Page numbers via onFirstPage / onLaterPages callbacks.
-"""
-from __future__ import annotations
-
-import html
-import io
-from typing import Any
-
-from ..document import (
- CalloutBlock,
- HeadingBlock,
- IssueGroupBlock,
- IssueTableBlock,
- KeyValueBlock,
- KpiRowBlock,
- MarkdownBlock,
- MetricTableBlock,
- ParagraphBlock,
- PdfCoverBlock,
- PdfDocument,
- PdfIssue,
- PdfMeta,
- PdfSection,
- ScoreCardsBlock,
- SpacerBlock,
- StatGridBlock,
- UrlListBlock,
-)
-from . import styles as S
-
-
-def _content_w_in() -> float:
- return S.CONTENT_WIDTH_IN
-
-
-def _col_w_in(cols: int) -> float:
- return _content_w_in() / cols
-
-
-def _content_w_pt() -> float:
- from reportlab.lib.units import inch
- return _content_w_in() * inch
-
-
-def _grid_table_style() -> Any:
- from reportlab.platypus import TableStyle
- style = TableStyle([
- ("BOX", (0, 0), (-1, -1), 0.5, _hex(S.BORDER)),
- ("INNERGRID", (0, 0), (-1, -1), 0.5, _hex(S.BORDER)),
- ("ALIGN", (0, 0), (-1, -1), "CENTER"),
- ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
- ("TOPPADDING", (0, 0), (-1, -1), 10),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 10),
- ("LEFTPADDING", (0, 0), (-1, -1), 6),
- ("RIGHTPADDING", (0, 0), (-1, -1), 6),
- ])
- return style
-
-
-def _require_reportlab() -> None:
- try:
- from reportlab.lib import colors # noqa: F401
- except ImportError as exc:
- raise RuntimeError("PDF export requires reportlab (pip install reportlab)") from exc
-
-
-# ---------------------------------------------------------------------------
-# ReportLab helpers
-# ---------------------------------------------------------------------------
-
-def _rl_colors():
- from reportlab.lib import colors
- return colors
-
-
-def _hex(color_str: str):
- return _rl_colors().HexColor(color_str)
-
-
-def _make_styles():
- from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
- base = getSampleStyleSheet()
-
- def ps(name: str, parent_name: str = "Normal", **kwargs) -> ParagraphStyle:
- return ParagraphStyle(name, parent=base[parent_name], **kwargs)
-
- return {
- "title": ps("ATitle", "Heading1", fontSize=20, textColor=_hex(S.INK),
- spaceAfter=2, leading=24, spaceBefore=0),
- "subtitle": ps("ASubtitle", fontSize=11, textColor=_hex(S.MUTED), spaceAfter=4, leading=14),
- "section": ps("ASection", "Heading2", fontSize=11, textColor=_hex(S.INK),
- spaceBefore=12, spaceAfter=4, borderPad=0),
- "subsection": ps("ASubsection", "Heading3", fontSize=10, textColor=_hex(S.INK),
- spaceBefore=6, spaceAfter=4),
- "body": ps("ABody", fontSize=9, leading=13, textColor=_hex(S.INK)),
- "body_italic": ps("ABodyI", fontSize=9, leading=13, textColor=_hex(S.MUTED), italic=True),
- "muted": ps("AMuted", fontSize=8, leading=11, textColor=_hex(S.MUTED)),
- "url": ps("AUrl", fontName="Courier", fontSize=8, leading=10,
- textColor=_hex(S.BRAND_ACCENT), wordWrap="CJK"),
- "kv_key": ps("AKvKey", fontSize=9, leading=12, textColor=_hex(S.INK), fontName="Helvetica-Bold"),
- "kv_val": ps("AKvVal", fontSize=9, leading=12, textColor=_hex(S.INK)),
- "th": ps("ATh", fontSize=8, leading=10, textColor=_hex(S.MUTED), fontName="Helvetica-Bold"),
- "td": ps("ATd", fontSize=9, leading=12, textColor=_hex(S.INK)),
- "td_url": ps("ATdUrl", fontName="Courier", fontSize=8, leading=10,
- textColor=_hex(S.BRAND_ACCENT), wordWrap="CJK"),
- "td_link": ps("ATdLink", fontSize=8, leading=11, textColor=_hex(S.BRAND_ACCENT), wordWrap="CJK"),
- "kv_desc": ps("AKvDesc", fontSize=9, leading=13, textColor=_hex(S.INK)),
- "cover_title": ps("ACoverTitle", fontSize=22, textColor=_hex("#f8fafc"),
- spaceAfter=4, leading=28, fontName="Helvetica-Bold"),
- "cover_sub": ps("ACoverSub", fontSize=11, textColor=_hex("#cbd5e1"), spaceAfter=2),
- "hero_score": ps("AHeroScore", fontSize=28, leading=32, fontName="Helvetica-Bold"),
- "hero_suffix": ps("AHeroSuffix", fontSize=10, textColor=_hex(S.MUTED), alignment=2),
- "score_value": ps("AScoreVal", fontSize=15, leading=18, fontName="Helvetica-Bold", alignment=1),
- "score_name": ps("AScoreName", fontSize=8, leading=11, alignment=1, spaceAfter=2),
- "score_meta": ps("AScoreMeta", fontSize=7, leading=9, textColor=_hex(S.MUTED), alignment=1),
- "stat_value": ps("AStatVal", fontSize=18, leading=20, fontName="Helvetica-Bold", alignment=1),
- "stat_label": ps("AStatLabel", fontSize=8, leading=10, textColor=_hex(S.MUTED), alignment=1),
- "cover_meta": ps("ACoverMetaLine", fontSize=9, textColor=_hex(S.MUTED), spaceAfter=10, leading=12),
- "badge": ps("ABadge", fontSize=8, leading=10, fontName="Helvetica-Bold"),
- "footer": ps("AFooter", fontSize=7, textColor=_hex(S.MUTED), leading=9),
- "issue_headline": ps("AIssHeadline", fontSize=9, leading=12,
- textColor=_hex(S.INK), fontName="Helvetica-Bold"),
- "issue_rec": ps("AIssRec", fontSize=8, leading=11, textColor=_hex(S.MUTED), italic=True),
- "callout_info": ps("ACalloutInfo", fontSize=9, leading=12,
- textColor=_hex(S.BRAND_ACCENT), leftIndent=8),
- "callout_warn": ps("ACalloutWarn", fontSize=9, leading=12,
- textColor=_hex(S.FAIR), leftIndent=8),
- "callout_critical": ps("ACalloutCrit", fontSize=9, leading=12,
- textColor=_hex(S.CRITICAL_FG), leftIndent=8),
- "exec_body": ps("AExecBody", fontSize=10, leading=15, textColor=_hex(S.INK), spaceAfter=4),
- "exec_subhead": ps("AExecSub", fontSize=8, leading=11, textColor=_hex(S.MUTED),
- fontName="Helvetica-Bold", spaceBefore=6, spaceAfter=3),
- "exec_bullet": ps("AExecBullet", fontSize=9, leading=13, textColor=_hex(S.INK), leftIndent=10),
- "exec_source": ps("AExecSource", fontSize=7, leading=9, textColor=_hex(S.BRAND_ACCENT),
- fontName="Helvetica-Bold", spaceAfter=4),
- "section_lead": ps("ASectionLead", fontSize=8, leading=11, textColor=_hex(S.MUTED), spaceAfter=6),
- "td_site": ps("ATdSite", fontSize=8, leading=10, textColor=_hex(S.MUTED), italic=True),
- }
-
-
-def _p(text: str, style) -> Any:
- """Plain-text paragraph — content is HTML-escaped."""
- from reportlab.platypus import Paragraph
- return Paragraph(html.escape(str(text)), style)
-
-
-def _p_html(markup: str, style) -> Any:
- """Markup paragraph — caller must escape user content before embedding tags."""
- from reportlab.platypus import Paragraph
- return Paragraph(str(markup), style)
-
-
-def _safe_p(text: str, style, fallback: str = "—") -> Any:
- return _p(text if text else fallback, style)
-
-
-def _table_style_base():
- from reportlab.platypus import TableStyle
- return TableStyle([
- ("BACKGROUND", (0, 0), (-1, 0), _hex(S.HEADER_BG)),
- ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
- ("FONTSIZE", (0, 0), (-1, -1), 9),
- ("GRID", (0, 0), (-1, -1), 0.3, _hex(S.BORDER)),
- ("VALIGN", (0, 0), (-1, -1), "TOP"),
- ("TOPPADDING", (0, 0), (-1, -1), 4),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 4),
- ("LEFTPADDING", (0, 0), (-1, -1), 5),
- ("RIGHTPADDING", (0, 0), (-1, -1), 5),
- ])
-
-
-def _page_callback(canvas, doc, footer_text: str) -> None:
- from reportlab.lib.units import inch
- page_w, _ = doc.pagesize
- canvas.saveState()
- canvas.setFont("Helvetica", 7)
- canvas.setFillColor(_hex(S.MUTED))
- canvas.drawString(0.55 * inch, 0.35 * inch, footer_text)
- page_num = f"Page {doc.page}"
- canvas.drawRightString(page_w - 0.55 * inch, 0.35 * inch, page_num)
- canvas.restoreState()
-
-
-# ---------------------------------------------------------------------------
-# Block renderers — each returns a list of flowables
-# ---------------------------------------------------------------------------
-
-def _render_heading(block: HeadingBlock, st: dict) -> list:
- from reportlab.platypus import Spacer
- style = st["section"] if block.level == 2 else st["subsection"]
- return [_p(block.text, style), Spacer(1, 2)]
-
-
-def _render_paragraph(block: ParagraphBlock, st: dict) -> list:
- style = st["body_italic"] if block.italic else st["body"]
- return [_p(block.text, style)]
-
-
-def _render_callout(block: CalloutBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import Spacer, Table, TableStyle
- style_map = {"info": st["callout_info"], "warn": st["callout_warn"], "critical": st["callout_critical"]}
- bg_map = {"info": "#eff6ff", "warn": S.FAIR_BG, "critical": S.CRITICAL_BG}
- s = style_map.get(block.severity, st["body"])
- bg = bg_map.get(block.severity, "#eff6ff")
- cell = [[_p(block.text, s)]]
- tbl = Table(cell, colWidths=[_content_w_in() * inch])
- tbl.setStyle(TableStyle([
- ("BACKGROUND", (0, 0), (-1, -1), _hex(bg)),
- ("LEFTPADDING", (0, 0), (-1, -1), 10),
- ("RIGHTPADDING", (0, 0), (-1, -1), 8),
- ("TOPPADDING", (0, 0), (-1, -1), 6),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
- ("BOX", (0, 0), (-1, -1), 2, _hex(S.BRAND_ACCENT)),
- ]))
- return [tbl, Spacer(1, 4)]
-
-
-def _render_spacer(block: SpacerBlock, _st: dict) -> list:
- from reportlab.platypus import Spacer
- return [Spacer(1, block.height_pt)]
-
-
-def _render_kpi_row(block: KpiRowBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import Spacer, Table, TableStyle
- if not block.items:
- return []
- n = len(block.items)
- w = _col_w_in(n)
- row_data = [[_p_html(f"{html.escape(i.value)}
{html.escape(i.label)}", st["body"]) for i in block.items]]
- tbl = Table(row_data, colWidths=[w * inch] * n)
- tbl.setStyle(TableStyle([
- ("BOX", (0, 0), (-1, -1), 0.3, _hex(S.BORDER)),
- ("INNERGRID", (0, 0), (-1, -1), 0.3, _hex(S.BORDER)),
- ("ALIGN", (0, 0), (-1, -1), "CENTER"),
- ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
- ("TOPPADDING", (0, 0), (-1, -1), 6),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
- ("BACKGROUND", (0, 0), (-1, -1), _hex(S.SURFACE_MUTED)),
- ]))
- return [tbl, Spacer(1, 8)]
-
-
-def _render_stat_grid(block: StatGridBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import Spacer, Table
- if not block.chips:
- return []
- # Never build more table cells than declared column widths: if a block has
- # more chips than columns, widen the grid to fit them (ReportLab errors at
- # build time on a cell/colWidths mismatch). Unchanged when chips <= columns.
- n = max(block.columns, len(block.chips))
- col_w = _col_w_in(n)
- row: list = []
- for chip in block.chips:
- fg, _bg = S.PRIORITY_TONES.get(chip.tone, (S.INK, S.SURFACE_MUTED))
- val_style = ParagraphStyle_compat(st["stat_value"], textColor=_hex(fg))
- row.append(_cell_stack([(chip.value, val_style), (chip.label, st["stat_label"])], col_w))
- while len(row) < n:
- row.append("")
- tbl = Table([row], colWidths=[col_w * inch] * n, rowHeights=[0.62 * inch])
- ts = _grid_table_style()
- for i, chip in enumerate(block.chips):
- _fg, bg = S.PRIORITY_TONES.get(chip.tone, (S.INK, S.SURFACE_MUTED))
- ts.add("BACKGROUND", (i, 0), (i, 0), _hex(bg))
- tbl.setStyle(ts)
- return [tbl, Spacer(1, 12)]
-
-
-def ParagraphStyle_compat(base_style, **overrides):
- """Clone a ParagraphStyle with attribute overrides."""
- from reportlab.lib.styles import ParagraphStyle
- return ParagraphStyle(
- f"{base_style.name}_override",
- parent=base_style,
- **overrides,
- )
-
-
-def _section_heading(text: str, st: dict) -> list:
- from reportlab.platypus import HRFlowable, Spacer
- return [
- _p(text, st["section"]),
- HRFlowable(
- width=_content_w_pt(),
- thickness=0.5,
- color=_hex(S.BORDER),
- spaceBefore=0,
- spaceAfter=8,
- ),
- ]
-
-
-def _cell_stack(rows: list[tuple[str, Any]], col_w_in: float):
- """Borderless vertically stacked paragraphs for a grid cell."""
- from reportlab.lib.units import inch
- from reportlab.platypus import Table, TableStyle
- data = [[_p(text, style)] for text, style in rows]
- tbl = Table(data, colWidths=[col_w_in * inch])
- tbl.setStyle(TableStyle([
- ("ALIGN", (0, 0), (-1, -1), "CENTER"),
- ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
- ("LEFTPADDING", (0, 0), (-1, -1), 2),
- ("RIGHTPADDING", (0, 0), (-1, -1), 2),
- ("TOPPADDING", (0, 0), (-1, -1), 0),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 0),
- ]))
- return tbl
-
-
-def _data_table_style() -> Any:
- from reportlab.platypus import TableStyle
- return TableStyle([
- ("BOX", (0, 0), (-1, -1), 0.5, _hex(S.BORDER)),
- ("LINEBELOW", (0, 0), (-1, -2), 0.35, _hex(S.BORDER)),
- ("VALIGN", (0, 0), (-1, -1), "TOP"),
- ("TOPPADDING", (0, 0), (-1, -1), 9),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 9),
- ("LEFTPADDING", (0, 0), (-1, -1), 10),
- ("RIGHTPADDING", (0, 0), (-1, -1), 10),
- ])
-
-
-def _apply_row_zebra(ts: Any, row_count: int, start_row: int = 0) -> None:
- for r in range(start_row, start_row + row_count):
- bg = S.SURFACE_MUTED if (r - start_row) % 2 else "#ffffff"
- ts.add("BACKGROUND", (0, r), (-1, r), _hex(bg))
-
-
-def _http_status_badge(code: str, st: dict) -> Any:
- from reportlab.lib.units import inch
- from reportlab.platypus import Table, TableStyle
- c = str(code or "").strip()
- if c == "200":
- fg, bg = S.GOOD, S.GOOD_BG
- elif c.startswith("3"):
- fg, bg = S.FAIR, S.FAIR_BG
- elif c and c[0] in "45":
- fg, bg = S.POOR, S.POOR_BG
- else:
- fg, bg = S.MUTED, S.SURFACE_MUTED
- badge_style = ParagraphStyle_compat(st["badge"], textColor=_hex(fg), fontSize=8)
- label = c or "—"
- tbl = Table([[ _p(label, badge_style) ]], colWidths=[0.52 * inch])
- tbl.setStyle(TableStyle([
- ("BACKGROUND", (0, 0), (-1, -1), _hex(bg)),
- ("ALIGN", (0, 0), (-1, -1), "CENTER"),
- ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
- ("TOPPADDING", (0, 0), (-1, -1), 3),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 3),
- ("BOX", (0, 0), (-1, -1), 0.4, _hex(fg)),
- ]))
- return tbl
-
-
-def _render_key_value(block: KeyValueBlock, st: dict) -> list:
- layout = getattr(block, "layout", "default") or "default"
- if layout == "audit":
- return _render_audit_kv(block, st)
- if layout == "glossary":
- return _render_glossary_kv(block, st)
- return _render_default_kv(block, st)
-
-
-def _render_default_kv(block: KeyValueBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import LongTable, Spacer
- if not block.rows:
- return []
- data = [[_p(k, st["kv_key"]), _p(v, st["kv_val"])] for k, v in block.rows]
- kv_key_w = _content_w_in() * 0.30
- kv_val_w = _content_w_in() - kv_key_w
- tbl = LongTable(data, colWidths=[kv_key_w * inch, kv_val_w * inch], repeatRows=0)
- ts = _table_style_base()
- from reportlab.platypus import TableStyle
- ts.add("BACKGROUND", (0, 0), (-1, -1), _hex(S.SURFACE_MUTED))
- ts.add("BACKGROUND", (0, 0), (0, -1), _hex(S.HEADER_BG))
- ts.add("FONTNAME", (0, 0), (0, -1), "Helvetica-Bold")
- tbl.setStyle(ts)
- return [tbl, Spacer(1, 6)]
-
-
-def _render_audit_kv(block: KeyValueBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import LongTable, Spacer
- if not block.rows:
- return []
- kv_key_w = 1.65
- kv_val_w = _content_w_in() - kv_key_w
- data = [[_p(k, st["kv_key"]), _p(v, st["kv_val"])] for k, v in block.rows]
- tbl = LongTable(data, colWidths=[kv_key_w * inch, kv_val_w * inch], repeatRows=0)
- ts = _data_table_style()
- _apply_row_zebra(ts, len(block.rows))
- tbl.setStyle(ts)
- return [tbl, Spacer(1, 10)]
-
-
-def _render_glossary_kv(block: KeyValueBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import LongTable, Spacer
- if not block.rows:
- return []
- term_w = 1.55
- desc_w = _content_w_in() - term_w
- data = [[_p(k, st["kv_key"]), _p(v, st["kv_desc"])] for k, v in block.rows]
- tbl = LongTable(data, colWidths=[term_w * inch, desc_w * inch], repeatRows=0)
- ts = _data_table_style()
- for r in range(len(block.rows)):
- ts.add("BACKGROUND", (0, r), (0, r), _hex(S.HEADER_BG))
- val_bg = "#ffffff" if r % 2 == 0 else S.SURFACE_MUTED
- ts.add("BACKGROUND", (1, r), (1, r), _hex(val_bg))
- tbl.setStyle(ts)
- return [tbl, Spacer(1, 10)]
-
-
-def _render_score_cards(block: ScoreCardsBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import Spacer, Table
- if not block.cards:
- return []
- cols = S.GRID_COLS
- col_w = _col_w_in(cols)
- grid_rows: list[list] = []
- row: list = []
- for card in block.cards:
- score_color = S.SCORE_TONES.get(card.tone, S.MUTED)
- val_style = ParagraphStyle_compat(st["score_value"], textColor=_hex(score_color))
- issue_label = f"{card.issue_count} issue{'s' if card.issue_count != 1 else ''}"
- row.append(_cell_stack([
- (card.score or "—", val_style),
- (card.name, st["score_name"]),
- (issue_label, st["score_meta"]),
- ], col_w))
- if len(row) == cols:
- grid_rows.append(row)
- row = []
- if row:
- while len(row) < cols:
- row.append("")
- grid_rows.append(row)
- tbl = Table(grid_rows, colWidths=[col_w * inch] * cols, rowHeights=[0.78 * inch] * len(grid_rows))
- ts = _grid_table_style()
- for r_idx, grid_row in enumerate(grid_rows):
- for c_idx in range(cols):
- if c_idx < len(grid_row) and grid_row[c_idx] != "":
- ts.add("BACKGROUND", (c_idx, r_idx), (c_idx, r_idx), _hex(S.SURFACE_MUTED))
- tbl.setStyle(ts)
- return [tbl, Spacer(1, 12)]
-
-
-def _url_list_table_style(col_count: int) -> Any:
- from reportlab.platypus import TableStyle
- ts = TableStyle([
- ("BACKGROUND", (0, 0), (-1, 0), _hex(S.HEADER_BG)),
- ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
- ("FONTSIZE", (0, 0), (-1, -1), 9),
- ("TEXTCOLOR", (0, 0), (-1, 0), _hex(S.MUTED)),
- ("BOX", (0, 0), (-1, -1), 0.5, _hex(S.BORDER)),
- ("LINEBELOW", (0, 0), (-1, 0), 0.8, _hex(S.BORDER)),
- ("LINEBELOW", (0, 1), (-1, -1), 0.35, _hex(S.BORDER)),
- ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
- ("TOPPADDING", (0, 0), (-1, -1), 8),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 8),
- ("LEFTPADDING", (0, 0), (-1, -1), 10),
- ("RIGHTPADDING", (0, 0), (-1, -1), 10),
- ])
- if col_count >= 2:
- ts.add("ALIGN", (1, 0), (1, -1), "CENTER")
- return ts
-
-
-def _render_url_list(block: UrlListBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import LongTable, Spacer
- if not block.rows:
- return []
-
- show_title = getattr(block, "show_title", True)
- if show_title:
- header = [_p("URL", st["th"]), _p("Status", st["th"]), _p("Title", st["th"])]
- status_w = 0.72
- title_w = 1.85
- url_w = _content_w_in() - status_w - title_w
- col_widths = [url_w * inch, status_w * inch, title_w * inch]
- else:
- header = [_p("URL", st["th"]), _p("Status", st["th"])]
- status_w = 0.72
- url_w = _content_w_in() - status_w
- col_widths = [url_w * inch, status_w * inch]
-
- data: list = [header]
- for r in block.rows:
- url_cell = _safe_p(r.get("url", ""), st["td_link"])
- status_cell = _http_status_badge(str(r.get("status", "")), st)
- if show_title:
- title = str(r.get("title") or "").strip()
- data.append([url_cell, status_cell, _p(title, st["td"]) if title else _p("—", st["td_site"])])
- else:
- data.append([url_cell, status_cell])
-
- tbl = LongTable(data, colWidths=col_widths, repeatRows=1)
- ts = _url_list_table_style(len(col_widths))
- # Zebra only data rows (skip header)
- for r in range(1, len(data)):
- bg = S.SURFACE_MUTED if (r - 1) % 2 else "#ffffff"
- ts.add("BACKGROUND", (0, r), (-1, r), _hex(bg))
- tbl.setStyle(ts)
-
- parts: list = [tbl]
- if block.truncation:
- t = block.truncation
- note = f"Showing {t.shown} of {t.total} URLs. Export CSV/workbook for full inventory."
- parts.append(Spacer(1, 4))
- parts.append(_p(note, st["muted"]))
- parts.append(Spacer(1, 10))
- return parts
-
-
-def _render_metric_table(block: MetricTableBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import LongTable, Spacer
-
- if not block.columns or not block.rows:
- return []
-
- _width_map = {"narrow": 0.75, "medium": 1.5, "wide": 2.5, "url": 2.0}
- total_cols = len(block.columns)
- available = _content_w_in()
- col_widths = [_width_map.get(c.width, 1.5) * inch for c in block.columns]
- # Scale to available width
- total_specified = sum(col_widths)
- if total_specified > available * inch:
- scale = (available * inch) / total_specified
- col_widths = [w * scale for w in col_widths]
-
- header = [_p(c.label, st["th"]) for c in block.columns]
- data: list = [header]
- for r in block.rows:
- cell_style = lambda col: st["td_url"] if col.width == "url" else st["td"]
- data.append([_safe_p(str(r.get(c.key, "")), cell_style(c)) for c in block.columns])
-
- tbl = LongTable(data, colWidths=col_widths, repeatRows=1 if block.repeat_header else 0)
- tbl.setStyle(_table_style_base())
- parts: list = [tbl]
- if block.truncation:
- t = block.truncation
- note = f"Showing {t.shown} of {t.total} rows. Full data in {', '.join(t.continue_in)}."
- parts.append(Spacer(1, 3))
- parts.append(_p(note, st["muted"]))
- parts.append(Spacer(1, 8))
- return parts
-
-
-def _priority_badge(priority: str, st: dict) -> Any:
- fg, bg = S.PRIORITY_TONES.get(priority, (S.INK, S.SURFACE_MUTED))
- from reportlab.lib.units import inch
- from reportlab.platypus import Table, TableStyle
- badge_style = ParagraphStyle_compat(st["badge"], textColor=_hex(fg), fontSize=7)
- cell = [[_p(priority.upper(), badge_style)]]
- tbl = Table(cell, colWidths=[0.62 * inch])
- tbl.setStyle(TableStyle([
- ("BACKGROUND", (0, 0), (-1, -1), _hex(bg)),
- ("ALIGN", (0, 0), (-1, -1), "CENTER"),
- ("TOPPADDING", (0, 0), (-1, -1), 3),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 3),
- ("LEFTPADDING", (0, 0), (-1, -1), 2),
- ("RIGHTPADDING", (0, 0), (-1, -1), 2),
- ("BOX", (0, 0), (-1, -1), 0.5, _hex(fg)),
- ]))
- return tbl
-
-
-def _issue_location_cell(issue: PdfIssue, st: dict) -> Any:
- if issue.path:
- return _p(issue.path, st["td_url"])
- if issue.url:
- return _p(issue.url, st["td_url"])
- return _p("Site-wide", st["td_site"])
-
-
-def _top_issues_table_style():
- from reportlab.platypus import TableStyle
- return TableStyle([
- ("BACKGROUND", (0, 0), (-1, 0), _hex(S.HEADER_BG)),
- ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
- ("FONTSIZE", (0, 0), (-1, -1), 9),
- ("TEXTCOLOR", (0, 0), (-1, 0), _hex(S.MUTED)),
- ("LINEBELOW", (0, 0), (-1, 0), 0.8, _hex(S.BORDER)),
- ("LINEBELOW", (0, 1), (-1, -1), 0.35, _hex(S.BORDER)),
- ("ROWBACKGROUNDS", (0, 1), (-1, -1), [_hex("#ffffff"), _hex(S.SURFACE_MUTED)]),
- ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
- ("ALIGN", (0, 0), (0, -1), "CENTER"),
- ("TOPPADDING", (0, 0), (-1, -1), 8),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 8),
- ("LEFTPADDING", (0, 0), (-1, -1), 8),
- ("RIGHTPADDING", (0, 0), (-1, -1), 8),
- ("BOX", (0, 0), (-1, -1), 0.5, _hex(S.BORDER)),
- ])
-
-
-def _render_executive_panel(cover: PdfCoverBlock, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import Spacer, Table, TableStyle
-
- rows: list[list] = []
- if cover.executive_source:
- rows.append([_p(f"Source · {cover.executive_source}", st["exec_source"])])
- if cover.executive_summary:
- rows.append([_p(cover.executive_summary, st["exec_body"])])
- if cover.priorities_list:
- rows.append([_p("Recommended priorities", st["exec_subhead"])])
- for i, pri in enumerate(cover.priorities_list[:6], 1):
- rows.append([_p(f"{i}. {pri}", st["exec_bullet"])])
-
- if not rows:
- return []
-
- content_w = _content_w_in()
- inner = Table(rows, colWidths=[content_w * inch])
- inner.setStyle(TableStyle([
- ("LEFTPADDING", (0, 0), (-1, -1), 0),
- ("RIGHTPADDING", (0, 0), (-1, -1), 0),
- ("TOPPADDING", (0, 0), (-1, -1), 2),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 2),
- ("VALIGN", (0, 0), (-1, -1), "TOP"),
- ]))
-
- panel = Table([[inner]], colWidths=[content_w * inch])
- panel.setStyle(TableStyle([
- ("BACKGROUND", (0, 0), (-1, -1), _hex(S.SURFACE_MUTED)),
- ("LINEBEFORE", (0, 0), (0, -1), 3, _hex(S.BRAND_ACCENT)),
- ("BOX", (0, 0), (-1, -1), 0.5, _hex(S.BORDER)),
- ("LEFTPADDING", (0, 0), (-1, -1), 14),
- ("RIGHTPADDING", (0, 0), (-1, -1), 12),
- ("TOPPADDING", (0, 0), (-1, -1), 10),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 10),
- ]))
- return [panel, Spacer(1, 14)]
-
-
-def _render_top_issues_table(issues: list[PdfIssue], st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import LongTable, Spacer
-
- if not issues:
- return []
-
- content_w = _content_w_in()
- pri_w = 0.78
- loc_w = 1.55
- issue_w = content_w - pri_w - loc_w
-
- header = [
- _p("Priority", st["th"]),
- _p("Issue", st["th"]),
- _p("Location", st["th"]),
- ]
- rows: list = [header]
- for iss in issues:
- rows.append([
- _priority_badge(iss.priority, st),
- _p(iss.headline, st["td"]),
- _issue_location_cell(iss, st),
- ])
-
- tbl = LongTable(
- rows,
- colWidths=[pri_w * inch, issue_w * inch, loc_w * inch],
- repeatRows=1,
- )
- tbl.setStyle(_top_issues_table_style())
- return [tbl, Spacer(1, 10)]
-
-
-def _render_single_issue(issue: PdfIssue, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import Spacer, Table, TableStyle
- fg, bg = S.PRIORITY_TONES.get(issue.priority, (S.INK, S.SURFACE_MUTED))
-
- lines: list = [[_p(issue.headline, st["issue_headline"])]]
- if issue.related_urls:
- max_show = 10
- for url in issue.related_urls[:max_show]:
- lines.append([_p(f"• {url}", st["url"])])
- extra = len(issue.related_urls) - max_show
- if extra > 0:
- lines.append([_p(f"• … and {extra} more (see CSV export)", st["muted"])])
- elif issue.url:
- lines.append([_p(issue.url, st["url"])])
- if issue.recommendation:
- lines.append([_p(f"Fix: {issue.recommendation}", st["issue_rec"])])
-
- inner = Table(lines, colWidths=[(_content_w_in() - 0.3) * inch])
- inner.setStyle(TableStyle([
- ("LEFTPADDING", (0, 0), (-1, -1), 8),
- ("RIGHTPADDING", (0, 0), (-1, -1), 4),
- ("TOPPADDING", (0, 0), (-1, -1), 1),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 2),
- ("VALIGN", (0, 0), (-1, -1), "TOP"),
- ]))
-
- outer = Table([[inner]], colWidths=[_content_w_in() * inch])
- outer.setStyle(TableStyle([
- ("LINEBEFORE", (0, 0), (0, -1), 3, _hex(fg)),
- ("BACKGROUND", (0, 0), (-1, -1), _hex(bg)),
- ("TOPPADDING", (0, 0), (-1, -1), 5),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 5),
- ("VALIGN", (0, 0), (-1, -1), "TOP"),
- ]))
- return [outer, Spacer(1, 6)]
-
-
-def _render_issue_group(block: IssueGroupBlock, st: dict) -> list:
- from reportlab.platypus import Spacer
- parts: list = []
- parts.append(_p(block.group_label, st["subsection"]))
-
- if block.render_as == "compact_table":
- parts.extend(_render_issue_table_compact(block.issues, st))
- else:
- for issue in block.issues:
- parts.extend(_render_single_issue(issue, st))
-
- if block.truncation:
- t = block.truncation
- note = f"Showing {t.shown} of {t.total}. Full list in {', '.join(t.continue_in)}."
- parts.append(_p(note, st["muted"]))
-
- parts.append(Spacer(1, 8))
- return parts
-
-
-def _render_issue_table_compact(issues: list[PdfIssue], st: dict) -> list:
- """Two-column Issue | URL table (priority is already in the group heading)."""
- from reportlab.lib.units import inch
- from reportlab.platypus import LongTable, Spacer
- header = [_p("Issue", st["th"]), _p("URL", st["th"])]
- data: list = [header]
- for iss in issues:
- data.append([
- _p(iss.headline, st["td"]),
- _safe_p(iss.url or "", st["td_url"]),
- ])
- issue_w = _content_w_in() * 0.52
- url_w = _content_w_in() - issue_w
- tbl = LongTable(data, colWidths=[issue_w * inch, url_w * inch], repeatRows=1)
- tbl.setStyle(_table_style_base())
- return [tbl, Spacer(1, 4)]
-
-
-def _render_issue_table(block: IssueTableBlock, st: dict) -> list:
- parts: list = []
- if block.title:
- parts.append(_p(block.title, st["subsection"]))
- parts.extend(_render_issue_table_compact(block.issues, st))
- if block.truncation:
- t = block.truncation
- note = f"Showing {t.shown} of {t.total}. Full list in {', '.join(t.continue_in)}."
- parts.append(_p(note, st["muted"]))
- return parts
-
-
-def _render_markdown(block: MarkdownBlock, st: dict) -> list:
- import re
- from reportlab.platypus import Spacer
- # Strip HTML-like markdown tags to plain text for safety
- text = re.sub(r"<[^>]+>", " ", block.text)
- return [_p(text, st["body"]), Spacer(1, 4)]
-
-
-BLOCK_RENDERERS = {
- "heading": _render_heading,
- "paragraph": _render_paragraph,
- "callout": _render_callout,
- "spacer": _render_spacer,
- "kpi_row": _render_kpi_row,
- "stat_grid": _render_stat_grid,
- "key_value": _render_key_value,
- "score_cards": _render_score_cards,
- "url_list": _render_url_list,
- "metric_table": _render_metric_table,
- "issue_group": _render_issue_group,
- "issue_table": _render_issue_table,
- "markdown": _render_markdown,
-}
-
-
-def _flowables_for_block(block: Any, st: dict) -> list:
- btype = getattr(block, "type", None)
- if not getattr(block, "visible", True):
- return []
- renderer = BLOCK_RENDERERS.get(btype)
- if renderer is None:
- return []
- return renderer(block, st)
-
-
-# ---------------------------------------------------------------------------
-# Cover renderer
-# ---------------------------------------------------------------------------
-
-def _render_cover(cover: PdfCoverBlock, meta: PdfMeta, st: dict) -> list:
- from reportlab.lib.units import inch
- from reportlab.platypus import Spacer, Table, TableStyle
- parts: list = []
-
- content_w = _content_w_in()
- score_col = 1.35
- title_col = content_w - score_col
-
- score_color = S.SCORE_TONES.get(cover.hero.band, S.MUTED)
- score_display = cover.hero.score or "—"
- score_style = ParagraphStyle_compat(
- st["hero_score"], textColor=_hex(score_color), alignment=1, fontSize=32, leading=36,
- )
- suffix_style = ParagraphStyle_compat(st["hero_suffix"], alignment=1)
-
- score_block = Table(
- [[_p(score_display, score_style)], [_p("/100", suffix_style)]],
- colWidths=[score_col * inch],
- )
- score_block.setStyle(TableStyle([
- ("ALIGN", (0, 0), (-1, -1), "CENTER"),
- ("VALIGN", (0, 0), (-1, -1), "TOP"),
- ("LEFTPADDING", (0, 0), (-1, -1), 0),
- ("RIGHTPADDING", (0, 0), (-1, -1), 0),
- ("TOPPADDING", (0, 0), (-1, -1), 0),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 0),
- ]))
-
- title_row = Table(
- [[_p(cover.headline, st["title"]), score_block]],
- colWidths=[title_col * inch, score_col * inch],
- )
- title_row.setStyle(TableStyle([
- ("VALIGN", (0, 0), (-1, -1), "TOP"),
- ("ALIGN", (0, 0), (0, 0), "LEFT"),
- ("ALIGN", (1, 0), (1, 0), "RIGHT"),
- ("LEFTPADDING", (0, 0), (-1, -1), 0),
- ("RIGHTPADDING", (0, 0), (-1, -1), 0),
- ("TOPPADDING", (0, 0), (-1, -1), 0),
- ("BOTTOMPADDING", (0, 0), (-1, -1), 0),
- ]))
- parts.append(title_row)
- parts.append(_p(cover.subtitle, st["subtitle"]))
-
- counts = meta.issue_counts
- total = sum(counts.values())
- meta_line = (
- f"Report generated {meta.generated_at} · {total} findings "
- f"(Critical {counts.get('critical', 0)}, High {counts.get('high', 0)}, "
- f"Medium {counts.get('medium', 0)}, Low {counts.get('low', 0)})"
- )
- parts.append(_p(meta_line, st["cover_meta"]))
-
- parts.extend(_flowables_for_block(cover.priority_strip, st))
-
- if cover.category_scores.cards:
- parts.extend(_section_heading("Category scores", st))
- parts.extend(_render_score_cards(cover.category_scores, st))
-
- if cover.executive_summary or cover.priorities_list:
- parts.extend(_section_heading("Executive summary", st))
- parts.extend(_render_executive_panel(cover, st))
-
- if cover.top_issues:
- parts.extend(_section_heading("Top traffic-impacting issues", st))
- parts.append(_p(
- "Ranked by severity and traffic impact — address critical and high items first.",
- st["section_lead"],
- ))
- parts.extend(_render_top_issues_table(cover.top_issues, st))
-
- return parts
-
-
-# ---------------------------------------------------------------------------
-# Section renderer
-# ---------------------------------------------------------------------------
-
-def _render_section(section: PdfSection, st: dict) -> list:
- from reportlab.platypus import PageBreak, Spacer
- parts: list = []
- if section.page_break_before:
- parts.append(PageBreak())
- parts.extend(_section_heading(section.title, st))
- if section.source_label:
- parts.append(_p(f"Source: {section.source_label}", st["muted"]))
- for block in section.blocks:
- parts.extend(_flowables_for_block(block, st))
- if section.truncation:
- t = section.truncation
- note = f"Showing {t.shown} of {t.total} issues. Export CSV or workbook for full data."
- parts.append(_p(note, st["muted"]))
- parts.append(Spacer(1, 4))
- return parts
-
-
-# ---------------------------------------------------------------------------
-# Main entry point
-# ---------------------------------------------------------------------------
-
-def render_pdf_document(doc: PdfDocument) -> bytes:
- _require_reportlab()
-
- from reportlab.lib.pagesizes import letter
- from reportlab.lib.units import inch
- from reportlab.platypus import PageBreak, SimpleDocTemplate
-
- buf = io.BytesIO()
- footer_text = (
- f"{doc.footer.confidential_note} "
- f"Generated by {doc.footer.generator} · {doc.footer.exported_at}"
- )
-
- pdf_doc = SimpleDocTemplate(
- buf,
- pagesize=letter,
- topMargin=0.65 * inch,
- bottomMargin=0.65 * inch,
- leftMargin=0.65 * inch,
- rightMargin=0.65 * inch,
- title=doc.cover.headline,
- author=doc.footer.generator,
- )
-
- st = _make_styles()
- story: list = []
-
- story.extend(_render_cover(doc.cover, doc.meta, st))
- story.append(PageBreak())
-
- for section in doc.sections:
- story.extend(_render_section(section, st))
-
- def on_page(canvas, d):
- _page_callback(canvas, d, footer_text)
-
- pdf_doc.build(story, onFirstPage=on_page, onLaterPages=on_page)
- return buf.getvalue()
diff --git a/src/website_profiling/reporting/pdf/render/styles.py b/src/website_profiling/reporting/pdf/render/styles.py
deleted file mode 100644
index ab9951e2..00000000
--- a/src/website_profiling/reporting/pdf/render/styles.py
+++ /dev/null
@@ -1,54 +0,0 @@
-"""ReportLab style constants mirroring the HTML CSS design tokens."""
-from __future__ import annotations
-
-# Colour palette — mirrors _report_html_styles() CSS variables
-INK = "#0f172a"
-MUTED = "#64748b"
-BORDER = "#e2e8f0"
-SURFACE_MUTED = "#f8fafc"
-BRAND = "#0b0f19"
-BRAND_ACCENT = "#2563eb"
-
-GOOD = "#059669"
-GOOD_BG = "#ecfdf5"
-FAIR = "#d97706"
-FAIR_BG = "#fffbeb"
-POOR = "#dc2626"
-POOR_BG = "#fef2f2"
-
-CRITICAL_FG = "#991b1b"
-CRITICAL_BG = "#fee2e2"
-HIGH_FG = "#c2410c"
-HIGH_BG = "#ffedd5"
-MEDIUM_FG = "#a16207"
-MEDIUM_BG = "#fef3c7"
-LOW_FG = "#475569"
-LOW_BG = "#f1f5f9"
-
-HEADER_BG = "#f1f5f9"
-
-# Column widths (inches) for common patterns
-COL_NARROW = 0.75
-COL_MEDIUM = 1.5
-COL_WIDE = 2.5
-COL_URL = 2.0
-
-# Letter page with 0.65" margins — keep all flowables on this width for alignment
-PAGE_MARGIN_IN = 0.65
-PAGE_WIDTH_IN = 8.5
-CONTENT_WIDTH_IN = PAGE_WIDTH_IN - 2 * PAGE_MARGIN_IN # 7.2
-GRID_COLS = 4
-
-PRIORITY_TONES = {
- "critical": (CRITICAL_FG, CRITICAL_BG),
- "high": (HIGH_FG, HIGH_BG),
- "medium": (MEDIUM_FG, MEDIUM_BG),
- "low": (LOW_FG, LOW_BG),
-}
-
-SCORE_TONES = {
- "score-good": GOOD,
- "score-fair": FAIR,
- "score-poor": POOR,
- "score-na": MUTED,
-}
diff --git a/src/website_profiling/reporting/report_metadata.py b/src/website_profiling/reporting/report_metadata.py
index 36690a22..67d79d30 100644
--- a/src/website_profiling/reporting/report_metadata.py
+++ b/src/website_profiling/reporting/report_metadata.py
@@ -163,9 +163,9 @@ def _build_report_metadata(
if lighthouse_summary:
sources.append("lighthouse")
if google_data:
- if google_data.get("gsc") or google_data.get("gsc_summary"):
+ if google_data.get("gsc"):
sources.append("search_console")
- if google_data.get("ga4") or google_data.get("ga4_summary"):
+ if google_data.get("ga4"):
sources.append("analytics")
if gsc_links_data and "search_console" not in sources:
sources.append("search_console")
diff --git a/src/website_profiling/tools/audit_tools/backlinks/backlinks.py b/src/website_profiling/tools/audit_tools/backlinks/backlinks.py
index b322c275..781c7099 100644
--- a/src/website_profiling/tools/audit_tools/backlinks/backlinks.py
+++ b/src/website_profiling/tools/audit_tools/backlinks/backlinks.py
@@ -118,20 +118,20 @@ def get_backlinks_velocity(conn: Connection, ctx: AuditToolContext, args: dict[s
return {"error": "property_id is required"}
limit = parse_limit(args.get("limit"), 52, 52)
cur = conn.execute(
- """SELECT captured_at, referring_domains, top_domains
+ """SELECT fetched_at, referring_domains, top_domains
FROM gsc_links_snapshots
WHERE property_id = %s
- ORDER BY captured_at ASC
+ ORDER BY fetched_at ASC
LIMIT %s""",
(int(scoped.property_id), limit),
)
snapshots = []
for row in cur.fetchall() or []:
- captured = row["captured_at"] if hasattr(row, "keys") else row[0]
+ fetched = row["fetched_at"] if hasattr(row, "keys") else row[0]
domains = row["referring_domains"] if hasattr(row, "keys") else row[1]
top = row["top_domains"] if hasattr(row, "keys") else row[2]
snapshots.append({
- "captured_at": captured.isoformat() if hasattr(captured, "isoformat") else str(captured or ""),
+ "captured_at": fetched.isoformat() if hasattr(fetched, "isoformat") else str(fetched or ""),
"referring_domains": domains,
"top_domains": top,
})
diff --git a/src/website_profiling/tools/audit_tools/compare/compare_list_tools.py b/src/website_profiling/tools/audit_tools/compare/compare_list_tools.py
index 62f7c1fb..2db041bb 100644
--- a/src/website_profiling/tools/audit_tools/compare/compare_list_tools.py
+++ b/src/website_profiling/tools/audit_tools/compare/compare_list_tools.py
@@ -154,7 +154,7 @@ def list_compare_traffic_losers(conn: Connection, ctx: AuditToolContext, args: d
delta = cur_clicks - base_clicks
if delta >= 0:
continue
- url = str(cur_row.get("page") or cur_row.get("url") or key)
+ url = str(cur_row.get("page") or key)
losers.append({
"url": url,
"clicks_current": cur_clicks,
diff --git a/src/website_profiling/tools/audit_tools/export/export_tools.py b/src/website_profiling/tools/audit_tools/export/export_tools.py
index fa8d77ec..448c7dc7 100644
--- a/src/website_profiling/tools/audit_tools/export/export_tools.py
+++ b/src/website_profiling/tools/audit_tools/export/export_tools.py
@@ -11,24 +11,19 @@
save_artifact,
)
from ...export_compare import export_compare_issues_csv
-from ...export_audit import (
- export_audit_csv,
- export_audit_html,
- export_audit_json,
- export_audit_pdf,
-)
+from ...export_audit import export_audit_csv, export_audit_json
+from ....clients.file_service import fetch_report_pdf
from .._slice import parse_limit
from ..compare.compare_helpers import load_compare_pair
from ..context import AuditToolContext
-_EXPORT_FORMATS = {"pdf", "html", "csv", "json"}
+_EXPORT_FORMATS = {"pdf", "csv", "json"}
_MIME = {
"pdf": "application/pdf",
- "html": "text/html; charset=utf-8",
"csv": "text/csv; charset=utf-8",
"json": "application/json; charset=utf-8",
}
-_EXT = {"pdf": "pdf", "html": "html", "csv": "csv", "json": "json"}
+_EXT = {"pdf": "pdf", "csv": "csv", "json": "json"}
_LIST_EXPORT_ALLOWLIST = frozenset({
"list_issues",
@@ -214,18 +209,17 @@ def export_audit_report(conn: Connection, ctx: AuditToolContext, args: dict[str,
if fmt not in _EXPORT_FORMATS:
return {"error": f"format must be one of: {', '.join(sorted(_EXPORT_FORMATS))}"}
report_id = scoped.report_id
+ profile = str(args.get("profile") or "standard").strip().lower()
try:
if fmt == "pdf":
- data = export_audit_pdf(report_id)
+ data = fetch_report_pdf(report_id, profile=profile)
filename = f"audit-export.{_EXT[fmt]}"
return {
**_artifact_from_bytes(data, filename=filename, mime_type=_MIME[fmt], extra={"format": fmt, "report_id": report_id}),
"format": fmt,
"report_id": report_id,
}
- if fmt == "html":
- data = export_audit_html(report_id)
- elif fmt == "csv":
+ if fmt == "csv":
data = export_audit_csv(report_id)
else:
data = export_audit_json(report_id)
@@ -296,8 +290,7 @@ def export_list_as_csv(conn: Connection, ctx: AuditToolContext, args: dict[str,
def list_export_formats(_conn: Connection, _ctx: AuditToolContext, _args: dict[str, Any]) -> dict[str, Any]:
return {
"formats": [
- {"tool": "export_audit_report", "format": "pdf", "description": "Full audit PDF deliverable"},
- {"tool": "export_audit_report", "format": "html", "description": "Full audit HTML preview/print"},
+ {"tool": "export_audit_report", "format": "pdf", "description": "Full audit PDF deliverable (FileService)"},
{"tool": "export_audit_report", "format": "csv", "description": "Full audit CSV (URLs + issues)"},
{"tool": "export_audit_report", "format": "json", "description": "Full audit JSON payload"},
{"tool": "export_compare_csv", "format": "csv", "description": "Issue added/removed diff between two reports"},
@@ -309,7 +302,7 @@ def list_export_formats(_conn: Connection, _ctx: AuditToolContext, _args: dict[s
"Compare this report to report 38 as CSV",
],
"notes": [
- "PDF requires reportlab (pip install reportlab)",
+ "PDF requires FileService (FILE_SERVICE_URL; see services/FileService/)",
"Artifacts expire after 24 hours",
"Chat UI shows download buttons after export tools run",
],
diff --git a/src/website_profiling/tools/audit_tools/google/google.py b/src/website_profiling/tools/audit_tools/google/google.py
index 74df39f9..9b3f9789 100644
--- a/src/website_profiling/tools/audit_tools/google/google.py
+++ b/src/website_profiling/tools/audit_tools/google/google.py
@@ -146,7 +146,7 @@ def get_gsc_ctr_opportunity_pages(conn: Connection, ctx: AuditToolContext, args:
if not data:
return {"error": "no google data found", "pages": [], "total": 0, "truncated": False}
gsc = data.get("gsc") if isinstance(data.get("gsc"), dict) else {}
- pages = gsc.get("pages") or gsc.get("top_pages") or []
+ pages = gsc.get("top_pages") or []
if not isinstance(pages, list):
pages = []
try:
diff --git a/src/website_profiling/tools/audit_tools/google/google_lists.py b/src/website_profiling/tools/audit_tools/google/google_lists.py
index 3f9a4944..051ba2f3 100644
--- a/src/website_profiling/tools/audit_tools/google/google_lists.py
+++ b/src/website_profiling/tools/audit_tools/google/google_lists.py
@@ -124,7 +124,7 @@ def list_gsc_ctr_underperformers(conn: Connection, ctx: AuditToolContext, args:
expected = industry_ctr(pos)
if ctr > 0 and ctr < min(site_median * 0.7, expected * 0.7):
under.append({
- "page": row.get("page") or row.get("url"),
+ "page": row.get("page"),
"clicks": row.get("clicks"),
"impressions": row.get("impressions"),
"ctr": row.get("ctr"),
diff --git a/src/website_profiling/tools/audit_tools/tool_catalog.py b/src/website_profiling/tools/audit_tools/tool_catalog.py
index 896640ea..1238e6e4 100644
--- a/src/website_profiling/tools/audit_tools/tool_catalog.py
+++ b/src/website_profiling/tools/audit_tools/tool_catalog.py
@@ -244,7 +244,7 @@ def _tool(name: str, description: str, properties: dict[str, Any], required: lis
_tool(
"export_audit_report",
"Export full audit report as PDF, HTML, CSV, or JSON. Returns download artifact metadata.",
- {"property_id": _PID, "report_id": _RID, "format": {"type": "string", "enum": ["pdf", "html", "csv", "json"]}},
+ {"property_id": _PID, "report_id": _RID, "format": {"type": "string", "enum": ["pdf", "csv", "json"]}},
),
_tool(
"export_compare_csv",
diff --git a/src/website_profiling/tools/export_audit.py b/src/website_profiling/tools/export_audit.py
index 771e701c..51c90eef 100644
--- a/src/website_profiling/tools/export_audit.py
+++ b/src/website_profiling/tools/export_audit.py
@@ -1,4 +1,4 @@
-"""Export audit payload to CSV, JSON, HTML (preview/print), and PDF."""
+"""Export audit payload to CSV and JSON."""
from __future__ import annotations
import csv
@@ -10,20 +10,8 @@
from .export_audit_data import (
_executive_export_data,
_executive_source_label,
- _format_report_date,
- _issue_priority_counts,
_issue_recommendation,
_issues_rows,
- _overall_score,
- _priority_sort_key,
- _score_band,
- _summary_lines,
-)
-from .export_audit_html import (
- _category_cards_html,
- _executive_summary_html,
- _priority_stats_html,
- _report_html_styles,
)
@@ -84,31 +72,3 @@ def export_audit_csv(report_id: Optional[int] = None) -> str:
def export_audit_json(report_id: Optional[int] = None) -> str:
payload = _load_payload(report_id)
return json.dumps(payload, indent=2, default=str)
-
-
-def export_audit_html(report_id: Optional[int] = None, profile: str = "standard") -> str:
- """Export audit report as HTML preview matching the PDF layout."""
- from ..reporting.pdf import build_pdf_document
- from ..reporting.pdf.options import PdfBuildOptions
- from ..reporting.pdf.render.html import render_html_document
-
- payload = _load_payload(report_id)
- opts = PdfBuildOptions(profile=profile, report_id=report_id) # type: ignore[arg-type]
- doc = build_pdf_document(payload, opts)
- return render_html_document(doc)
-
-
-def export_audit_pdf(report_id: Optional[int] = None, profile: str = "standard") -> bytes:
- """Export audit report as a formatted PDF using the PdfDocument pipeline.
-
- Args:
- report_id: Specific report ID to load (None = latest).
- profile: "executive" | "standard" | "full" (default "standard").
- """
- from ..reporting.pdf import build_pdf_document, render_pdf_document
- from ..reporting.pdf.options import PdfBuildOptions
-
- payload = _load_payload(report_id)
- opts = PdfBuildOptions(profile=profile, report_id=report_id) # type: ignore[arg-type]
- doc = build_pdf_document(payload, opts)
- return render_pdf_document(doc)
diff --git a/src/website_profiling/tools/export_audit_data.py b/src/website_profiling/tools/export_audit_data.py
index e6d28a07..d52a9077 100644
--- a/src/website_profiling/tools/export_audit_data.py
+++ b/src/website_profiling/tools/export_audit_data.py
@@ -71,20 +71,11 @@ def _executive_export_data(payload: dict[str, Any]) -> dict[str, Any]:
if isinstance(raw_top, list):
top_issues = [i for i in raw_top if isinstance(i, dict)][:8]
- legacy_recs = payload.get("recommendations") or []
- legacy_list: list[str] = []
- if isinstance(legacy_recs, list):
- legacy_list = [str(r).strip() for r in legacy_recs if str(r).strip()]
-
- if not summary and legacy_list:
- summary = "\n".join(f"• {r}" for r in legacy_list[:12])
-
return {
"summary": summary,
"priorities": priorities,
"top_issues": top_issues,
"source": source,
- "legacy_recommendations": legacy_list,
}
diff --git a/src/website_profiling/tools/export_audit_html.py b/src/website_profiling/tools/export_audit_html.py
deleted file mode 100644
index e3b61dbd..00000000
--- a/src/website_profiling/tools/export_audit_html.py
+++ /dev/null
@@ -1,119 +0,0 @@
-"""Audit export HTML generation."""
-from __future__ import annotations
-
-import html
-from typing import Any, Optional
-
-from ..reporting.terminology import category_display_name
-from .export_audit_data import (
- _GLOSSARY_ROWS,
- _ISSUE_LIMIT_HTML,
- _ISSUE_LIMIT_PDF,
- _LINK_LIMIT,
- _executive_export_data,
- _executive_source_label,
- _format_report_date,
- _issue_priority_counts,
- _issues_rows,
- _overall_score,
- _priority_sort_key,
- _score_band,
- _summary_lines,
-)
-
-def _executive_summary_html(payload: dict[str, Any]) -> str:
- data = _executive_export_data(payload)
- if not data["summary"] and not data["priorities"] and not data["top_issues"]:
- return ""
-
- parts: list[str] = ['Executive summary
']
- if data["source"]:
- parts.append(
- f'Source: {html.escape(_executive_source_label(data["source"]))}
'
- )
- if data["summary"]:
- summary_html = html.escape(data["summary"]).replace("\n", "
")
- parts.append(f'')
-
- if data["priorities"]:
- pri_items = "".join(f"{html.escape(p)}" for p in data["priorities"][:8])
- parts.append(f"Priorities
")
-
- if data["top_issues"]:
- rows = ""
- for iss in data["top_issues"]:
- pri = str(iss.get("priority") or "").lower()
- badge_cls = f"badge-{pri}" if pri in {"critical", "high", "medium", "low"} else "badge-low"
- clicks = iss.get("gsc_clicks")
- clicks_txt = ""
- if clicks is not None:
- try:
- if float(clicks) > 0:
- clicks_txt = f' · {int(float(clicks))} GSC clicks'
- except (TypeError, ValueError):
- pass
- rows += (
- ""
- f"| {html.escape(str(iss.get('priority') or ''))} | "
- f"{html.escape(str(iss.get('message') or ''))} | "
- f"{html.escape(str(iss.get('url') or ''))} | "
- f"{html.escape(clicks_txt.lstrip(' · ') if clicks_txt else '—')} | "
- "
"
- )
- parts.append(
- "Top traffic-impacting issues
"
- ''
- "| Priority | Issue | URL | GSC clicks | "
- f"
{rows}
"
- )
-
- parts.append("")
- return "".join(parts)
-
-
-def _category_cards_html(categories: Any) -> str:
- cards: list[str] = []
- for cat in categories or []:
- if not isinstance(cat, dict):
- continue
- name = html.escape(category_display_name(str(cat.get("name") or "Category")))
- score_val: float | None = None
- if cat.get("score") is not None:
- try:
- score_val = float(cat["score"])
- except (TypeError, ValueError):
- score_val = None
- score_txt, score_cls = _score_band(score_val)
- issue_n = len(cat.get("issues") or [])
- cards.append(
- f''
- f'{score_txt}
'
- f'{name}
'
- f'{issue_n} issue{"s" if issue_n != 1 else ""}
'
- f""
- )
- return "".join(cards) or 'No category scores available.
'
-
-
-def _priority_stats_html(counts: dict[str, int]) -> str:
- labels = (
- ("critical", "Critical"),
- ("high", "High"),
- ("medium", "Medium"),
- ("low", "Low"),
- )
- parts: list[str] = []
- for key, label in labels:
- n = counts.get(key, 0)
- parts.append(
- f''
- f'{n}'
- f'{label}'
- f"
"
- )
- return "".join(parts)
-
-
-def _report_html_styles() -> str:
- from ..reporting.pdf.render.html import html_styles
- return html_styles()
diff --git a/src/website_profiling/tools/export_crawl_workbook.py b/src/website_profiling/tools/export_crawl_workbook.py
deleted file mode 100644
index 6fd46d1d..00000000
--- a/src/website_profiling/tools/export_crawl_workbook.py
+++ /dev/null
@@ -1,106 +0,0 @@
-"""Export crawl workbook as ZIP of CSV sheets."""
-from __future__ import annotations
-
-import csv
-import io
-import json
-import zipfile
-from typing import Any
-
-
-def _csv_bytes(rows: list[dict[str, Any]], columns: list[str]) -> bytes:
- buf = io.StringIO()
- writer = csv.DictWriter(buf, fieldnames=columns, extrasaction="ignore")
- writer.writeheader()
- for row in rows:
- writer.writerow({k: row.get(k, "") for k in columns})
- return buf.getvalue().encode("utf-8")
-
-
-def _parse_custom_fields(raw: Any) -> dict[str, str]:
- if raw is None:
- return {}
- if isinstance(raw, dict):
- return {str(k): str(v) for k, v in raw.items()}
- text = str(raw).strip()
- if not text:
- return {}
- try:
- parsed = json.loads(text)
- except json.JSONDecodeError:
- return {}
- if not isinstance(parsed, dict):
- return {}
- return {str(k): str(v) for k, v in parsed.items()}
-
-
-def _custom_field_rows(links: list[Any]) -> tuple[list[dict[str, Any]], list[str]]:
- rows: list[dict[str, Any]] = []
- field_names: set[str] = set()
- for row in links:
- if not isinstance(row, dict):
- continue
- url = row.get("url")
- custom_extract = row.get("custom_extract")
- fields = _parse_custom_fields(row.get("custom_fields"))
- if not url or (not custom_extract and not fields):
- continue
- field_names.update(fields.keys())
- rows.append({"url": url, "custom_extract": custom_extract or "", **fields})
- columns = ["url", "custom_extract", *sorted(field_names)]
- return rows, columns
-
-
-def build_crawl_workbook_zip(report_payload: dict[str, Any]) -> bytes:
- """Build ZIP containing Internal URLs, Links, Redirects, Issues CSVs."""
- mem = io.BytesIO()
- with zipfile.ZipFile(mem, "w", zipfile.ZIP_DEFLATED) as zf:
- links = report_payload.get("links") or []
- if isinstance(links, list) and links:
- url_cols = [
- "url", "status", "title", "meta_description", "h1",
- "canonical_url", "inlinks", "outlinks", "depth", "word_count",
- ]
- zf.writestr("internal_urls.csv", _csv_bytes(links, url_cols))
-
- link_edges = report_payload.get("link_edges") or []
- if isinstance(link_edges, list) and link_edges:
- edge_cols = [
- "from_url", "to_url", "anchor_text", "rel",
- "is_nofollow", "is_sponsored", "is_ugc", "link_type", "position",
- ]
- zf.writestr("links.csv", _csv_bytes(link_edges, edge_cols))
-
- redirects = report_payload.get("redirects") or []
- if isinstance(redirects, list) and redirects:
- zf.writestr(
- "redirects.csv",
- _csv_bytes(redirects, ["url", "message", "priority", "recommendation"]),
- )
-
- issue_rows: list[dict[str, Any]] = []
- for cat in report_payload.get("categories") or []:
- if not isinstance(cat, dict):
- continue
- cat_name = cat.get("name") or cat.get("id") or ""
- for iss in cat.get("issues") or []:
- if isinstance(iss, dict):
- issue_rows.append({**iss, "category": cat_name})
- if issue_rows:
- zf.writestr(
- "issues.csv",
- _csv_bytes(
- issue_rows,
- [
- "category", "priority", "message", "url",
- "impact_score", "gsc_clicks", "gsc_impressions", "ga4_sessions",
- "recommendation",
- ],
- ),
- )
-
- custom_rows, custom_cols = _custom_field_rows(links if isinstance(links, list) else [])
- if custom_rows:
- zf.writestr("custom_fields.csv", _csv_bytes(custom_rows, custom_cols))
-
- return mem.getvalue()
diff --git a/src/website_profiling/worker/__init__.py b/src/website_profiling/worker/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/website_profiling/worker/__main__.py b/src/website_profiling/worker/__main__.py
new file mode 100644
index 00000000..f83cfde6
--- /dev/null
+++ b/src/website_profiling/worker/__main__.py
@@ -0,0 +1,7 @@
+"""Entry point: python -m website_profiling.worker"""
+from __future__ import annotations
+
+from .loop import run_worker_loop
+
+if __name__ == "__main__":
+ run_worker_loop()
diff --git a/src/website_profiling/worker/loop.py b/src/website_profiling/worker/loop.py
new file mode 100644
index 00000000..39f9b0fb
--- /dev/null
+++ b/src/website_profiling/worker/loop.py
@@ -0,0 +1,52 @@
+"""Worker main loop: poll pending jobs and run them one at a time."""
+from __future__ import annotations
+
+import logging
+import os
+import signal
+import time
+
+from website_profiling.db.pipeline_jobs import try_claim_pending_job
+from website_profiling.db.pool import db_session
+
+from .runner import run_job
+
+logger = logging.getLogger("website_profiling.worker")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+_POLL_INTERVAL = float(os.getenv("WP_WORKER_POLL_INTERVAL", "1.0"))
+
+_running = True
+
+
+def _handle_sigterm(signum: int, frame: object) -> None:
+    """Signal handler that asks the polling loop to stop.
+
+    It only clears the module-level ``_running`` flag, which ``run_worker_loop``
+    checks at the top of each iteration; nothing is interrupted here.
+
+    Args:
+        signum: Number of the signal that was delivered (logged only).
+        frame: Interrupted stack frame; unused.
+    """
+    global _running
+    _running = False
+    logger.info("Worker received signal %s, shutting down after current job.", signum)
+
+
+def run_worker_loop() -> None:
+    """Poll for pending pipeline jobs and run them one after another until signalled.
+
+    SIGTERM and SIGINT are both routed to ``_handle_sigterm``. Each iteration
+    tries to claim one job, passing this process's PID to
+    ``try_claim_pending_job``. A failed poll or an empty queue waits
+    ``_POLL_INTERVAL`` seconds; a claimed job is run immediately and the next
+    poll follows without a delay.
+    """
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        signal.signal(sig, _handle_sigterm)
+
+    worker_pid = os.getpid()
+    logger.info("Pipeline worker started (PID %s, poll interval %.1fs).", worker_pid, _POLL_INTERVAL)
+
+    while _running:
+        try:
+            with db_session() as conn:
+                job = try_claim_pending_job(conn, worker_pid)
+        except Exception as exc:
+            logger.warning("Worker DB poll error: %s", exc)
+            time.sleep(_POLL_INTERVAL)
+            continue
+
+        if not job:
+            # Nothing to claim: back off before polling again.
+            time.sleep(_POLL_INTERVAL)
+            continue
+
+        logger.info("Running job %s (command=%r).", job["id"], job.get("command"))
+        try:
+            run_job(job)
+        except Exception as exc:
+            # A failing job must not take the worker down; log it and keep polling.
+            logger.error("Unhandled error in job %s: %s", job["id"], exc, exc_info=True)
+
+    logger.info("Worker exiting cleanly.")
diff --git a/src/website_profiling/worker/runner.py b/src/website_profiling/worker/runner.py
new file mode 100644
index 00000000..f7f72d7d
--- /dev/null
+++ b/src/website_profiling/worker/runner.py
@@ -0,0 +1,134 @@
+"""Subprocess runner: spawn the audit CLI and pump output to the DB."""
+from __future__ import annotations
+
+import os
+import subprocess
+import sys
+import threading
+import time
+from typing import Any
+
+from website_profiling.db.pipeline_jobs import append_job_log, check_flags, finish_job
+from website_profiling.db.pool import db_session
+
+from .signals import cancel_subprocess, pause_subprocess
+
+
+def _get_spawn_env(property_id: Any = None) -> dict[str, str]:
+    """Build the environment for the spawned `python -m src` process (cf. pipelineSpawnEnv.ts).
+
+    Starts from a copy of the current environment and then sets:
+
+    * ``WEBSITE_PROFILING_ROOT`` - the existing value, else the directory four
+      levels above this file;
+    * ``DATA_DIR`` - the existing value, else ``<root>/data``;
+    * ``PYTHONPATH`` - ``<root>/src`` prepended to any inherited value;
+    * ``PYTHONIOENCODING`` / ``PYTHONUTF8`` - forced to UTF-8;
+    * ``WP_PROPERTY_ID`` - only when ``property_id`` is not None.
+
+    Args:
+        property_id: Optional property identifier; stringified into the env.
+
+    Returns:
+        A new dict; ``os.environ`` itself is not modified.
+    """
+    repo_root = os.environ.get("WEBSITE_PROFILING_ROOT")
+    if repo_root is None:
+        # Walk up four directory levels from this file.
+        repo_root = os.path.abspath(__file__)
+        for _ in range(4):
+            repo_root = os.path.dirname(repo_root)
+
+    env = os.environ.copy()
+    env["WEBSITE_PROFILING_ROOT"] = repo_root
+    env["DATA_DIR"] = os.environ.get("DATA_DIR", os.path.join(repo_root, "data"))
+
+    src_path = os.path.join(repo_root, "src")
+    inherited = env.get("PYTHONPATH", "")
+    if inherited:
+        env["PYTHONPATH"] = f"{src_path}{os.pathsep}{inherited}"
+    else:
+        env["PYTHONPATH"] = src_path
+
+    env["PYTHONIOENCODING"] = "utf-8"
+    env["PYTHONUTF8"] = "1"
+    if property_id is not None:
+        env["WP_PROPERTY_ID"] = str(property_id)
+    return env
+
+
+def _pump_output(proc: subprocess.Popen, job_id: str) -> None:  # type: ignore[type-arg]
+    """Copy the subprocess's stdout and stderr into the job's DB log, line by line.
+
+    One daemon thread per stream reads until EOF and appends every line with
+    ``append_job_log``, opening a ``db_session`` per line. The call returns
+    once both streams are exhausted.
+
+    Logging is best-effort: a failed DB write drops that line but the stream
+    keeps being read, so the pipe is still drained. The first failure on each
+    stream is reported through the worker logger instead of being swallowed.
+
+    Args:
+        proc: Running subprocess whose ``stdout``/``stderr`` are read.
+        job_id: Identifier of the job whose log receives the output.
+    """
+    import logging
+
+    log = logging.getLogger("website_profiling.worker")
+
+    def _pump_stream(stream: Any) -> None:
+        if stream is None:
+            # The stream was not captured; there is nothing to read.
+            return
+        write_failed = False
+        while True:
+            line = stream.readline()
+            if not line:
+                break
+            text = line if isinstance(line, str) else line.decode("utf-8", errors="replace")
+            try:
+                with db_session() as conn:
+                    append_job_log(conn, job_id, text)
+            except Exception as exc:
+                # Warn once per stream so an unavailable DB does not produce one
+                # warning per output line.
+                if not write_failed:
+                    write_failed = True
+                    log.warning("Could not append output of job %s to the DB log: %s", job_id, exc)
+
+    t_out = threading.Thread(target=_pump_stream, args=(proc.stdout,), daemon=True)
+    t_err = threading.Thread(target=_pump_stream, args=(proc.stderr,), daemon=True)
+    t_out.start()
+    t_err.start()
+    t_out.join()
+    t_err.join()
+
+
+def run_job(job: dict) -> None:
+    """Run one claimed pipeline job as a `python -m src` subprocess and record its outcome.
+
+    While the child runs, the job's cancel/pause flags are read from the DB
+    about once per second (a failed read counts as "no flags set"):
+
+    * cancel - the child is killed and the job is finished as "error" with the
+      message "Cancelled by user";
+    * pause - ``pause_subprocess`` is invoked once; if the child then exits
+      with code 0 the job is finished as "paused".
+
+    In every other case the job is finished as "success" (exit code 0) or
+    "error" (any other exit code). A failure to start the child is recorded as
+    "error" with exit code -1. Resuming a paused job is not handled here.
+
+    Args:
+        job: Job row; reads ``"id"`` and, optionally, ``"command"`` and
+            ``"property_id"``.
+    """
+    job_id: str = job["id"]
+    command: str | None = job.get("command")
+    property_id = job.get("property_id")
+
+    repo_root = os.environ.get("WEBSITE_PROFILING_ROOT", "")
+    python_exe = os.environ.get("PYTHON", sys.executable)
+
+    args = [python_exe, "-m", "src"]
+    if command:
+        # Plain whitespace split: an argument containing spaces cannot be quoted.
+        args.extend(command.split())
+
+    env = _get_spawn_env(property_id)
+
+    try:
+        proc = subprocess.Popen(
+            args,
+            cwd=repo_root or None,
+            env=env,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            bufsize=1,
+            universal_newlines=True,
+        )
+    except Exception as exc:
+        with db_session() as conn:
+            finish_job(conn, job_id, "error", -1, str(exc))
+        return
+
+    pump_thread = threading.Thread(target=_pump_output, args=(proc, job_id), daemon=True)
+    pump_thread.start()
+
+    paused = False
+
+    while proc.poll() is None:
+        time.sleep(1.0)
+        try:
+            with db_session() as conn:
+                cancel, pause = check_flags(conn, job_id)
+        except Exception:
+            cancel, pause = False, False
+
+        if cancel:
+            cancel_subprocess(proc)
+            proc.wait()
+            pump_thread.join(timeout=5)
+            with db_session() as conn:
+                finish_job(conn, job_id, "error", -1, "Cancelled by user")
+            return
+
+        if pause and not paused:
+            # Signal the child only once, however long the flag stays set.
+            pause_subprocess(proc)
+            paused = True
+
+    proc.wait()
+    pump_thread.join(timeout=10)
+
+    exit_code = proc.returncode
+
+    if paused and exit_code == 0:
+        with db_session() as conn:
+            # Hand the stored truncation flag to finish_job; one query suffices.
+            row = conn.execute(
+                "SELECT log_truncated FROM pipeline_jobs WHERE id = %s::uuid", (job_id,)
+            ).fetchone()
+            log_truncated = bool((row or {}).get("log_truncated"))
+            finish_job(conn, job_id, "paused", exit_code, log_truncated=log_truncated)
+        return
+
+    status = "success" if exit_code == 0 else "error"
+    error = None if exit_code == 0 else f"Process exited with code {exit_code}"
+    with db_session() as conn:
+        finish_job(conn, job_id, status, exit_code, error)
diff --git a/src/website_profiling/worker/signals.py b/src/website_profiling/worker/signals.py
new file mode 100644
index 00000000..402151c6
--- /dev/null
+++ b/src/website_profiling/worker/signals.py
@@ -0,0 +1,34 @@
+"""Cancel and pause signal helpers for the pipeline worker."""
+from __future__ import annotations
+
+import os
+import subprocess
+import sys
+import tempfile
+
+
+def cancel_subprocess(proc: subprocess.Popen) -> None:  # type: ignore[type-arg]
+    """Forcefully stop ``proc`` by calling ``Popen.kill()``.
+
+    A ``ProcessLookupError`` is treated as "nothing left to kill" and ignored.
+    """
+    try:
+        proc.kill()
+    except ProcessLookupError:
+        return
+
+
+def pause_subprocess(proc: subprocess.Popen) -> None:  # type: ignore[type-arg]
+    """Ask the child process to pause.
+
+    On non-Windows platforms SIGUSR1 is sent to ``proc.pid``. On Windows a file
+    named ``wp_pause_<pid>.flag`` containing ``pause`` is written to the temp
+    directory instead. Both paths are best-effort: a missing process or a
+    failed write is ignored.
+    """
+    if sys.platform != "win32":
+        import signal
+
+        try:
+            os.kill(proc.pid, signal.SIGUSR1)
+        except ProcessLookupError:
+            pass
+        return
+
+    # Windows: write a flag file the Python worker checks.
+    flag_path = os.path.join(tempfile.gettempdir(), f"wp_pause_{proc.pid}.flag")
+    try:
+        with open(flag_path, "w") as handle:
+            handle.write("pause")
+    except OSError:
+        pass
diff --git a/tests/README.md b/tests/README.md
index 664a432a..7d5f45c6 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -25,6 +25,26 @@ tests/content_studio/
test_tools.py # deterministic analyze tools
```
+## API integration (`tests/api/`)
+
+FastAPI routes are omitted from the core coverage gate (see `.coveragerc`). Use HTTP integration tests against a real Postgres instead:
+
+```
+tests/api/
+ conftest.py # TestClient + ephemeral property fixture
+ test_api_integration.py # @pytest.mark.integration — full route smoke + CRUD
+ test_content_drafts_list.py
+ test_report_loader_list.py
+```
+
+Requires `DATABASE_URL` (same as other `@pytest.mark.integration` tests). Run:
+
+```bash
+pytest tests/api/test_api_integration.py -m integration --no-cov
+```
+
+These tests catch response-shape regressions (camelCase vs snake_case), dict_row SQL bugs, and wrong column names that unit mocks miss.
+
## Core (everything else)
Remaining `tests/test_*.py` files cover the core gate (100% on all packages except `reporting/`, `tools/`, and other omits in `.coveragerc`).
diff --git a/tests/api/conftest.py b/tests/api/conftest.py
new file mode 100644
index 00000000..41f077a6
--- /dev/null
+++ b/tests/api/conftest.py
@@ -0,0 +1,56 @@
+"""Shared fixtures for FastAPI integration tests (requires PostgreSQL)."""
+from __future__ import annotations
+
+import os
+import uuid
+from collections.abc import Iterator
+from typing import Any
+
+import pytest
+from fastapi.testclient import TestClient
+
+from website_profiling.api.deps import get_db
+from website_profiling.api.main import app
+from website_profiling.db.pool import db_session
+
+
+def _database_url_configured() -> bool:
+    """Return True when DATABASE_URL is set to something other than whitespace."""
+    raw = os.environ.get("DATABASE_URL")
+    if not raw:
+        return False
+    return raw.strip() != ""
+
+
+@pytest.fixture(scope="session")
+def require_database_url() -> None:
+    """Session-scoped guard: skip the requesting test when DATABASE_URL is not configured."""
+    if _database_url_configured():
+        return
+    pytest.skip("DATABASE_URL not set — start Postgres and run alembic upgrade head")
+
+
+def _override_get_db() -> Iterator[Any]:
+    """Stand-in for the ``get_db`` dependency that yields a connection from ``db_session``."""
+    with db_session() as connection:
+        yield connection
+
+
+@pytest.fixture
+def api_client(require_database_url: None) -> Iterator[TestClient]:
+    """Yield a ``TestClient`` for ``app`` with ``get_db`` overridden by ``_override_get_db``.
+
+    Requesting ``require_database_url`` means tests using this client are
+    skipped when DATABASE_URL is not set. The dependency overrides are cleared
+    on teardown even if entering or leaving the ``TestClient`` context raises,
+    so a broken startup/shutdown cannot leak the override into later tests.
+    """
+    app.dependency_overrides[get_db] = _override_get_db
+    try:
+        with TestClient(app) as client:
+            yield client
+    finally:
+        app.dependency_overrides.clear()
+
+
+@pytest.fixture
+def test_property(require_database_url: None) -> Iterator[dict[str, Any]]:
+    """Create a throwaway property row for one test and delete it afterwards.
+
+    The fixture is function-scoped, so every test that requests it gets its own
+    row under a unique ``api-int-<hex>.example`` domain, removed once that test
+    has finished.
+
+    Yields:
+        ``{"id": <property id>, "domain": <domain>, "name": "API Integration Test"}``.
+    """
+    from website_profiling.db.property_store import delete_property, upsert_property_by_domain
+
+    domain = f"api-int-{uuid.uuid4().hex[:12]}.example"
+    with db_session() as conn:
+        property_id = upsert_property_by_domain(
+            conn,
+            "API Integration Test",
+            domain,
+            f"https://{domain}",
+        )
+    payload = {"id": property_id, "domain": domain, "name": "API Integration Test"}
+    yield payload
+    with db_session() as conn:
+        delete_property(conn, property_id)
diff --git a/tests/api/test_api_integration.py b/tests/api/test_api_integration.py
new file mode 100644
index 00000000..83d238af
--- /dev/null
+++ b/tests/api/test_api_integration.py
@@ -0,0 +1,357 @@
+"""FastAPI HTTP integration tests — exercises real routes against PostgreSQL.
+
+These catch response-shape regressions and dict_row bugs that unit tests miss.
+Requires DATABASE_URL (same as other @pytest.mark.integration tests).
+"""
+from __future__ import annotations
+
+import uuid
+from typing import Any
+from unittest.mock import patch
+
+import pytest
+from fastapi.testclient import TestClient
+
+from website_profiling.db.pool import db_session
+
+
+pytestmark = pytest.mark.integration
+
+
+def test_health(api_client: TestClient) -> None:
+ """GET /api/health responds 200 with ``ok`` true and ``database`` reported as "up"."""
+ res = api_client.get("/api/health")
+ assert res.status_code == 200
+ body = res.json()
+ assert body["ok"] is True
+ assert body["database"] == "up"
+
+
+def test_report_meta_response_shape(api_client: TestClient) -> None:
+ """GET /api/report/meta returns ``reports`` and ``crawlRuns``; report rows use snake_case keys."""
+ res = api_client.get("/api/report/meta")
+ assert res.status_code == 200
+ body = res.json()
+ assert "reports" in body
+ assert "crawlRuns" in body
+ assert isinstance(body["reports"], list)
+ for row in body["reports"]:
+ assert "canonical_domain" in row
+ assert "site_name" in row
+ assert "generated_at" in row
+ assert "canonicalDomain" not in row
+
+
+def test_properties_crud_and_ops(api_client: TestClient) -> None:
+ """Property lifecycle over HTTP: create, list, detail, ops write/read-back, preset update, delete."""
+ domain = f"api-prop-{uuid.uuid4().hex[:10]}.example"
+ create = api_client.post(
+ "/api/properties",
+ json={"name": "Props API", "canonical_domain": domain, "site_url": f"https://{domain}"},
+ )
+ assert create.status_code == 201
+ created = create.json()
+ property_id = int(created["id"])
+ assert created["canonical_domain"] == domain
+
+ try:
+ listing = api_client.get("/api/properties")
+ assert listing.status_code == 200
+ ids = {p["id"] for p in listing.json()["properties"]}
+ assert property_id in ids
+
+ detail = api_client.get(f"/api/properties/{property_id}")
+ assert detail.status_code == 200
+ assert detail.json()["canonical_domain"] == domain
+
+ ops_put = api_client.put(
+ f"/api/properties/{property_id}/ops",
+ json={
+ "scheduleCron": "0 9 * * 1",
+ "alertWebhookUrl": "https://hooks.example/alert",
+ "alertEmail": "ops@example.com",
+ },
+ )
+ assert ops_put.status_code == 200
+ assert ops_put.json()["ok"] is True
+
+ ops_get = api_client.get(f"/api/properties/{property_id}/ops")
+ assert ops_get.status_code == 200
+ ops = ops_get.json()
+ assert ops["schedule_cron"] == "0 9 * * 1"
+ assert ops["alert_webhook_url"] == "https://hooks.example/alert"
+ assert ops["alert_email"] == "ops@example.com"
+
+ preset_put = api_client.put(
+ f"/api/properties/{property_id}/preset",
+ json={"preset": "quick"},
+ )
+ assert preset_put.status_code == 200
+ assert preset_put.json()["default_crawl_preset"] == "quick"
+ finally:
+ deleted = api_client.delete(f"/api/properties/{property_id}")
+ assert deleted.status_code == 200
+ assert deleted.json()["ok"] is True
+
+
+def test_property_google_status_shape(api_client: TestClient, test_property: dict[str, Any]) -> None:
+ """Per-property Google status exposes the expected camelCase keys and echoes the property id."""
+ property_id = int(test_property["id"])
+ res = api_client.get(f"/api/properties/{property_id}/google/status")
+ assert res.status_code == 200
+ body = res.json()
+ for key in (
+ "connected",
+ "authMode",
+ "gscSiteUrl",
+ "ga4PropertyId",
+ "dateRangeDays",
+ "hasClientId",
+ "lastFetchedAt",
+ "propertyId",
+ ):
+ assert key in body
+ assert body["propertyId"] == property_id
+
+
+def test_integrations_google_status(api_client: TestClient) -> None:
+ """GET /api/integrations/google/status includes ``hasClientId`` and ``lastFetchedAt``."""
+ res = api_client.get("/api/integrations/google/status")
+ assert res.status_code == 200
+ body = res.json()
+ assert "hasClientId" in body
+ assert "lastFetchedAt" in body
+
+
+def test_pipeline_and_llm_config_wrappers(api_client: TestClient) -> None:
+ """Both /api/pipeline-config and /api/llm-config wrap their payload in a ``state`` dict."""
+ pipe = api_client.get("/api/pipeline-config")
+ assert pipe.status_code == 200
+ pipe_body = pipe.json()
+ assert "state" in pipe_body
+ assert isinstance(pipe_body["state"], dict)
+
+ llm = api_client.get("/api/llm-config")
+ assert llm.status_code == 200
+ llm_body = llm.json()
+ assert "state" in llm_body
+ assert isinstance(llm_body["state"], dict)
+
+
+def test_content_drafts_full_crud(api_client: TestClient, test_property: dict[str, Any]) -> None:
+ """Content drafts over HTTP: list, create, list again (snake_case fields), detail, patch, delete."""
+ property_id = int(test_property["id"])
+
+ empty = api_client.get("/api/content-drafts", params={"propertyId": property_id})
+ assert empty.status_code == 200
+ assert isinstance(empty.json()["drafts"], list)
+
+ create = api_client.post(
+ "/api/content-drafts",
+ json={
+ "propertyId": property_id,
+ "title": "Integration draft",
+ "target_keyword": "seo audit",
+ },
+ )
+ assert create.status_code == 200
+ draft_id = int(create.json()["id"])
+
+ listed = api_client.get("/api/content-drafts", params={"propertyId": property_id})
+ assert listed.status_code == 200
+ drafts = listed.json()["drafts"]
+ match = next((d for d in drafts if d["id"] == draft_id), None)
+ assert match is not None
+ assert match["property_id"] == property_id
+ assert match["target_keyword"] == "seo audit"
+
+ detail = api_client.get(f"/api/content-drafts/{draft_id}")
+ assert detail.status_code == 200
+ assert detail.json()["draft"]["title"] == "Integration draft"
+
+ patched = api_client.patch(
+ f"/api/content-drafts/{draft_id}",
+ json={"title": "Updated draft", "body_html": "Hello
"},
+ )
+ assert patched.status_code == 200
+ assert patched.json()["draft"]["title"] == "Updated draft"
+
+ removed = api_client.delete(f"/api/content-drafts/{draft_id}")
+ assert removed.status_code == 200
+ assert removed.json()["ok"] is True
+
+
+def test_dashboards_crud(api_client: TestClient, test_property: dict[str, Any]) -> None:
+ """Dashboards over HTTP: create (201), list, rename via PUT, then delete."""
+ property_id = int(test_property["id"])
+
+ create = api_client.post(
+ "/api/dashboards",
+ json={
+ "propertyId": property_id,
+ "name": "Integration dashboard",
+ "layoutJson": {"version": 2, "widgets": [], "slicers": []},
+ },
+ )
+ assert create.status_code == 201
+ dashboard = create.json()["dashboard"]
+ dashboard_id = int(dashboard["id"])
+ assert dashboard["propertyId"] == property_id
+ assert dashboard["name"] == "Integration dashboard"
+
+ listed = api_client.get("/api/dashboards", params={"propertyId": property_id})
+ assert listed.status_code == 200
+ ids = {d["id"] for d in listed.json()["dashboards"]}
+ assert dashboard_id in ids
+
+ updated = api_client.put(
+ f"/api/dashboards/{dashboard_id}",
+ json={"propertyId": property_id, "name": "Renamed dashboard"},
+ )
+ assert updated.status_code == 200
+ assert updated.json()["dashboard"]["name"] == "Renamed dashboard"
+
+ deleted = api_client.delete(
+ f"/api/dashboards/{dashboard_id}",
+ params={"propertyId": property_id},
+ )
+ assert deleted.status_code == 200
+ assert deleted.json()["ok"] is True
+
+
+def test_saved_filters_crud(api_client: TestClient, test_property: dict[str, Any]) -> None:
+ """Saved filters over HTTP: upsert a uniquely named filter, list it, delete it with a JSON body."""
+ property_id = int(test_property["id"])
+ filter_name = f"filter-{uuid.uuid4().hex[:8]}"
+
+ upsert = api_client.post(
+ "/api/filters",
+ json={
+ "propertyId": property_id,
+ "name": filter_name,
+ "filterJson": {"status": ["200"]},
+ },
+ )
+ assert upsert.status_code == 200
+ assert upsert.json()["ok"] is True
+
+ listed = api_client.get("/api/filters", params={"propertyId": property_id})
+ assert listed.status_code == 200
+ names = {f["name"] for f in listed.json()["filters"]}
+ assert filter_name in names
+
+ deleted = api_client.request(
+ "DELETE",
+ "/api/filters",
+ json={"propertyId": property_id, "name": filter_name},
+ )
+ assert deleted.status_code == 200
+ assert deleted.json()["ok"] is True
+
+
+def test_issue_status_upsert_and_list(api_client: TestClient, test_property: dict[str, Any]) -> None:
+ """Issue status over HTTP: list, upsert one issue via PUT, then find its message in the list."""
+ property_id = int(test_property["id"])
+
+ empty = api_client.get("/api/issues/status", params={"propertyId": property_id})
+ assert empty.status_code == 200
+ assert isinstance(empty.json()["issues"], list)
+
+ upsert = api_client.put(
+ "/api/issues/status",
+ json={
+ "propertyId": property_id,
+ "message": "Missing meta description",
+ "status": "open",
+ "url": "https://example.com/page",
+ "priority": "Medium",
+ },
+ )
+ assert upsert.status_code == 200
+ issue = upsert.json()["issue"]
+ assert issue["propertyId"] == property_id
+ assert issue["status"] == "open"
+ assert issue["message"] == "Missing meta description"
+
+ listed = api_client.get("/api/issues/status", params={"propertyId": property_id})
+ assert listed.status_code == 200
+ messages = {i["message"] for i in listed.json()["issues"]}
+ assert "Missing meta description" in messages
+
+
+def test_portfolio_delete_crawl_run(api_client: TestClient, test_property: dict[str, Any]) -> None:
+ """Create a crawl run in the DB, delete it via DELETE /api/portfolio/delete, confirm the row is gone."""
+ property_id = int(test_property["id"])
+ with db_session() as conn:
+ from website_profiling.db.crawl_store import create_crawl_run
+
+ crawl_run_id = create_crawl_run(
+ conn,
+ start_url=f"https://{test_property['domain']}",
+ property_id=property_id,
+ )
+
+ res = api_client.request(
+ "DELETE",
+ "/api/portfolio/delete",
+ json={"crawlRunId": crawl_run_id},
+ )
+ assert res.status_code == 200
+ assert res.json()["ok"] is True
+
+ with db_session() as conn:
+ cur = conn.execute("SELECT id FROM crawl_runs WHERE id = %s", (crawl_run_id,))
+ assert cur.fetchone() is None
+
+
+def test_properties_resolve(api_client: TestClient, test_property: dict[str, Any]) -> None:
+ """GET /api/properties/resolve maps a start URL on the fixture's domain back to that property."""
+ res = api_client.get(
+ "/api/properties/resolve",
+ params={"startUrl": f"https://{test_property['domain']}/"},
+ )
+ assert res.status_code == 200
+ body = res.json()
+ assert body["id"] == test_property["id"]
+ assert body["canonical_domain"] == test_property["domain"]
+
+
+def test_ollama_status_response_shape(api_client: TestClient) -> None:
+ """With the Ollama model fetch and LLM config read patched, /api/ollama/status reports the configured model as installed and tool-capable."""
+ fake_models = [
+ {
+ "name": "llama3.2",
+ "source": "local",
+ "installed": True,
+ "capabilities": ["tools"],
+ "billing": "free_local",
+ "requires_subscription": False,
+ }
+ ]
+ with (
+ patch(
+ "website_profiling.llm.ollama_catalog.fetch_ollama_models",
+ return_value={
+ "ok": True,
+ "baseUrl": "http://127.0.0.1:11434",
+ "models": fake_models,
+ "cloudCatalogOk": True,
+ "localOk": True,
+ },
+ ),
+ patch(
+ "website_profiling.db.config_store.read_llm_config",
+ return_value={"llm_model": "llama3.2", "llm_base_url": "http://127.0.0.1:11434"},
+ ),
+ ):
+ res = api_client.get("/api/ollama/status")
+
+ assert res.status_code == 200
+ body = res.json()
+ assert body["ok"] is True
+ assert body["configuredModel"] == "llama3.2"
+ assert body["modelInstalled"] is True
+ assert body["supportsTools"] is True
+ assert isinstance(body["models"], list)
+ assert len(body["models"]) == 1
+
+
+def test_backlinks_velocity_empty(api_client: TestClient, test_property: dict[str, Any]) -> None:
+ """GET /api/backlinks/velocity for a fresh property returns 200 with a ``snapshots`` list."""
+ res = api_client.get(
+ "/api/backlinks/velocity",
+ params={"propertyId": test_property["id"]},
+ )
+ assert res.status_code == 200
+ assert isinstance(res.json()["snapshots"], list)
+
+
+def test_report_payload_not_found(api_client: TestClient) -> None:
+ """GET /api/report/payload with report id 999999999 (presumed not to exist) answers 404."""
+ res = api_client.get("/api/report/payload", params={"reportId": 999999999})
+ assert res.status_code == 404
diff --git a/tests/api/test_content_drafts_list.py b/tests/api/test_content_drafts_list.py
new file mode 100644
index 00000000..4b7d3f04
--- /dev/null
+++ b/tests/api/test_content_drafts_list.py
@@ -0,0 +1,30 @@
+"""Content drafts list must work with psycopg dict_row (pool default)."""
+from __future__ import annotations
+
+from website_profiling.db.content_draft_store import list_content_drafts
+from website_profiling.db.pool import db_session
+
+
+def test_list_content_drafts_with_rows() -> None:  # inserts one draft and reads it back through list_content_drafts
+ with db_session() as conn:
+ cur = conn.execute("SELECT id FROM properties LIMIT 1")  # any existing property will do
+ row = cur.fetchone()
+ assert row is not None  # fails when the properties table is empty
+ property_id = int(row["id"])  # key access: the row is used as a mapping here
+
+ conn.execute(  # drop any existing 'Dict row test' draft for this property first
+ "DELETE FROM content_drafts WHERE property_id = %s AND title = 'Dict row test'",
+ (property_id,),
+ )
+ conn.execute(  # insert the single draft the assertions below look for
+ """INSERT INTO content_drafts (property_id, title, target_keyword)
+ VALUES (%s, 'Dict row test', 'seo')""",
+ (property_id,),
+ )
+ conn.commit()  # NOTE(review): the inserted row is committed and not removed afterwards
+
+ drafts = list_content_drafts(conn, property_id)
+ assert len(drafts) >= 1
+ draft = next(d for d in drafts if d["title"] == "Dict row test")  # StopIteration if the inserted draft is missing
+ assert draft["property_id"] == property_id
+ assert draft["target_keyword"] == "seo"
diff --git a/tests/api/test_report_loader_list.py b/tests/api/test_report_loader_list.py
new file mode 100644
index 00000000..a7699d13
--- /dev/null
+++ b/tests/api/test_report_loader_list.py
@@ -0,0 +1,29 @@
+"""Tests for report_loader list_reports field naming (snake_case for frontend)."""
+from __future__ import annotations
+
+from unittest.mock import MagicMock
+
+from website_profiling.api.services.report_loader import list_reports
+
+
+def test_list_reports_uses_snake_case_keys() -> None:  # list_reports must emit snake_case keys and a string generated_at
+ row = {
+ "id": 7,
+ "canonical_domain": "example.com",
+ "site_name": "Example",
+ "generated_at": MagicMock(isoformat=lambda: "2026-01-01T00:00:00+00:00"),  # stand-in whose isoformat() returns a fixed string
+ }
+ conn = MagicMock()
+ conn.execute.return_value.fetchall.return_value = [row]  # conn.execute(...).fetchall() yields the single row
+
+ reports = list_reports(conn)
+ assert len(reports) == 1
+ assert reports[0] == {  # exact match: no extra keys are tolerated
+ "id": 7,
+ "canonical_domain": "example.com",
+ "site_name": "Example",
+ "generated_at": "2026-01-01T00:00:00+00:00",
+ }
+ assert "canonicalDomain" not in reports[0]  # camelCase variants must be absent
+ assert "siteName" not in reports[0]
+ assert "generatedAt" not in reports[0]
diff --git a/tests/clients/test_file_service.py b/tests/clients/test_file_service.py
new file mode 100644
index 00000000..6f2bcd93
--- /dev/null
+++ b/tests/clients/test_file_service.py
@@ -0,0 +1,65 @@
+"""Tests for FileService HTTP client."""
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from website_profiling.clients import file_service
+
+
+def test_fetch_report_pdf_success() -> None:  # a 200 response's content is returned unchanged
+ mock_resp = MagicMock()
+ mock_resp.status_code = 200
+ mock_resp.content = b"%PDF-1.4"
+ with patch.object(file_service.requests, "get", return_value=mock_resp) as mock_get:  # stub requests.get as referenced by the client module
+ data = file_service.fetch_report_pdf(42, profile="standard")
+ assert data == b"%PDF-1.4"
+ mock_get.assert_called_once()
+ assert "/v1/reports/42/pdf" in mock_get.call_args[0][0]  # first positional argument of the get call
+
+
+def test_fetch_report_pdf_not_found() -> None:  # a 404 response surfaces as FileNotFoundError
+ mock_resp = MagicMock()
+ mock_resp.status_code = 404
+ mock_resp.text = "missing"
+ with patch.object(file_service.requests, "get", return_value=mock_resp):
+ with pytest.raises(FileNotFoundError):
+ file_service.fetch_report_pdf(99)
+
+
+def test_fetch_report_pdf_upstream_error() -> None:  # a 502 response surfaces as RuntimeError
+ mock_resp = MagicMock()
+ mock_resp.status_code = 502
+ mock_resp.text = "bad gateway"
+ with patch.object(file_service.requests, "get", return_value=mock_resp):
+ with pytest.raises(RuntimeError, match="502"):  # the error message must contain the status code
+ file_service.fetch_report_pdf(1)
+
+
+def test_fetch_report_pdf_requires_report_id() -> None:  # a None report id is rejected
+ with pytest.raises(ValueError, match="report_id"):  # the message must name the offending argument
+ file_service.fetch_report_pdf(None)
+
+
+def test_fetch_report_workbook_success() -> None:  # a 200 response's content is returned for the workbook route
+ mock_resp = MagicMock()
+ mock_resp.status_code = 200
+ mock_resp.content = b"PK\x03\x04"  # ZIP local-file-header signature
+ with patch.object(file_service.requests, "get", return_value=mock_resp) as mock_get:
+ data = file_service.fetch_report_workbook(7)
+ assert data.startswith(b"PK")
+ assert "/v1/reports/7/workbook" in mock_get.call_args[0][0]  # first positional argument of the get call
+
+
+def test_fetch_report_pdf_network_error() -> None:  # connection failures surface as RuntimeError
+ import requests  # local import: only this test needs the exception class
+
+ with patch.object(file_service.requests, "get", side_effect=requests.ConnectionError("refused")):  # the stubbed get raises instead of returning
+ with pytest.raises(RuntimeError, match="File service unreachable"):
+ file_service.fetch_report_pdf(1)
+
+
+def test_fetch_report_workbook_requires_report_id() -> None:  # a None report id is rejected
+ with pytest.raises(ValueError, match="report_id"):  # the message must name the offending argument
+ file_service.fetch_report_workbook(None)
diff --git a/tests/db_test_fakes.py b/tests/db_test_fakes.py
index 60aa5004..63bfe8ff 100644
--- a/tests/db_test_fakes.py
+++ b/tests/db_test_fakes.py
@@ -37,6 +37,7 @@ class FakeConn:
def __init__(self) -> None:
self.executed: list[tuple[str, tuple[Any, ...] | None]] = []
self.commits = 0
+ self.rollbacks = 0
self._next_cursor: FakeCursor | None = None
self._cursor_queue: list[FakeCursor] = []
@@ -62,6 +63,9 @@ def cursor(self) -> Iterator[FakeCursor]:
def commit(self) -> None:
self.commits += 1
+ def rollback(self) -> None:  # fake rollback: mirrors commit() and only counts the call
+ self.rollbacks += 1
+
@contextmanager
def transaction(self) -> Iterator[None]:
yield None
diff --git a/tests/llm/test_ollama_catalog.py b/tests/llm/test_ollama_catalog.py
new file mode 100644
index 00000000..94903ccf
--- /dev/null
+++ b/tests/llm/test_ollama_catalog.py
@@ -0,0 +1,24 @@
+"""Ollama catalog merge and model lookup."""
+from __future__ import annotations
+
+from website_profiling.llm.ollama_catalog import (
+ merge_ollama_models,
+ model_is_configured,
+ models_support_tools,
+)
+
+
+def test_merge_ollama_models_prefers_installed_local() -> None:  # the local entry keeps its installed flag and capabilities after merging
+ local = [{"name": "llama3.2", "source": "local", "installed": True, "capabilities": ["tools"]}]
+ cloud = [{"name": "llama3.2:cloud", "source": "cloud", "installed": False}]  # differently named, not installed
+ merged = merge_ollama_models(local, cloud)
+ assert len(merged) >= 1
+ entry = next(m for m in merged if m["name"] == "llama3.2")  # StopIteration if the local entry was dropped
+ assert entry["installed"] is True
+ assert entry["capabilities"] == ["tools"]
+
+
+def test_model_is_configured_case_insensitive() -> None:  # the lookup must ignore letter case in model names
+ models = [{"name": "Llama3.2", "source": "local", "installed": True}]  # capitalised name, no "capabilities" key
+ assert model_is_configured(models, "llama3.2") is True
+ assert models_support_tools(models) is False  # the only entry declares no capabilities
diff --git a/tests/reporting/test_indexation_coverage.py b/tests/reporting/test_indexation_coverage.py
index a7bc498c..634f8baf 100644
--- a/tests/reporting/test_indexation_coverage.py
+++ b/tests/reporting/test_indexation_coverage.py
@@ -27,16 +27,14 @@ def test_gsc_page_urls_extracts_top_pages() -> None:
"gsc": {
"top_pages": [
{"page": "https://example.com/x"},
- {"url": "https://example.com/y"},
+ {"page": "https://example.com/y"},
]
}
}
- assert len(_gsc_page_urls(google)) == 2
-
-
-def test_gsc_page_urls_legacy_pages_fallback() -> None:
- google = {"gsc": {"pages": [{"page": "https://example.com/x"}]}}
- assert _gsc_page_urls(google) == ["https://example.com/x"]
+ assert _gsc_page_urls(google) == [
+ "https://example.com/x",
+ "https://example.com/y",
+ ]
@patch("website_profiling.reporting.indexation.discover_sitemap_urls")
diff --git a/tests/reporting/test_pdf_branch_coverage.py b/tests/reporting/test_pdf_branch_coverage.py
deleted file mode 100644
index cd8d1c2c..00000000
--- a/tests/reporting/test_pdf_branch_coverage.py
+++ /dev/null
@@ -1,434 +0,0 @@
-"""Branch-coverage tests for the PDF pipeline (adapters, normalize, renderers)."""
-from __future__ import annotations
-
-from unittest.mock import patch
-
-import pytest
-
-pytest.importorskip("reportlab")
-
-from website_profiling.reporting.pdf.adapters.appendix import adapt_appendix
-from website_profiling.reporting.pdf.adapters.findings import adapt_findings
-from website_profiling.reporting.pdf.builder import build_pdf_document
-from website_profiling.reporting.pdf.document import (
- SCHEMA_VERSION,
- CalloutBlock,
- HeadingBlock,
- IssueGroupBlock,
- IssueTableBlock,
- KeyValueBlock,
- KpiItem,
- KpiRowBlock,
- MarkdownBlock,
- MetricTableBlock,
- ParagraphBlock,
- PdfCoverBlock,
- PdfDocument,
- PdfFooterBlock,
- PdfIssue,
- PdfMeta,
- PdfScoreHero,
- PdfSection,
- PdfTruncation,
- ScoreCard,
- ScoreCardsBlock,
- SpacerBlock,
- StatChip,
- StatGridBlock,
- TableColumn,
- UrlListBlock,
-)
-from website_profiling.reporting.pdf.normalize import (
- _extract_path,
- _is_lighthouse_row,
- _strip_url_from_headline,
- normalize_issue_for_pdf,
-)
-from website_profiling.reporting.pdf.options import PdfBuildOptions, PdfLimits
-from website_profiling.reporting.pdf.render.html import (
- _render_executive_panel as _html_render_executive_panel,
- _render_stat_grid as _html_render_stat_grid,
- _render_score_cards as _html_render_score_cards,
- _render_block as _html_render_block,
- render_html_document,
-)
-from website_profiling.reporting.pdf.render.reportlab import (
- _flowables_for_block,
- _make_styles,
- _p,
- _p_html,
- _render_executive_panel as _rl_render_executive_panel,
- _render_top_issues_table,
- _safe_p,
- render_pdf_document,
-)
-
-
-def _row(message: str, **kwargs) -> dict:
- base = {
- "category": "Technical SEO",
- "priority": "high",
- "message": message,
- "url": "",
- "recommendation": "Fix it",
- }
- base.update(kwargs)
- return base
-
-
-def _issue(**kwargs) -> PdfIssue:
- defaults = {
- "id": "iss001",
- "priority": "high",
- "category": "Technical SEO",
- "headline": "Sample issue",
- "url": "https://example.com/a",
- "path": "/a",
- "recommendation": "Fix it",
- }
- defaults.update(kwargs)
- return PdfIssue(**defaults)
-
-
-def _minimal_cover(**kwargs) -> PdfCoverBlock:
- defaults = {
- "headline": "Site Audit — example.com",
- "subtitle": "Technical SEO Audit Report",
- "hero": PdfScoreHero(score="80", band="score-good", label="Overall health score"),
- "priority_strip": StatGridBlock(
- id="cover.priority",
- chips=[StatChip(label="High", value="1", tone="high")],
- columns=4,
- ),
- "category_scores": ScoreCardsBlock(
- id="cover.scores",
- cards=[ScoreCard(name="Technical SEO", score="80", issue_count=1, tone="score-good")],
- ),
- }
- defaults.update(kwargs)
- return PdfCoverBlock(**defaults)
-
-
-def _minimal_meta() -> PdfMeta:
- return PdfMeta(
- report_id=1,
- property="example.com",
- report_title="Technical SEO Audit Report",
- generated_at="18 June 2026",
- exported_at="18 June 2026, 12:00 UTC",
- data_sources=["crawl"],
- health_score=80,
- issue_counts={"critical": 0, "high": 1, "medium": 0, "low": 0},
- )
-
-
-def _exhaustive_document() -> PdfDocument:
- """Synthetic document exercising every block type and renderer edge path."""
- related = [f"https://example.com/p{i}" for i in range(15)]
- issue_with_urls = _issue(
- headline="Collapsed duplicate",
- related_urls=related,
- url=None,
- recommendation="Consolidate",
- )
- compact_group = IssueGroupBlock(
- id="findings.compact",
- group_label="Medium — compact table",
- issues=[_issue(headline="Compact row", url="https://example.com/c")],
- render_as="compact_table",
- truncation=PdfTruncation(shown=1, total=5),
- )
- list_group = IssueGroupBlock(
- id="findings.list",
- group_label="High — list",
- issues=[issue_with_urls, _issue(url=None, path=None, headline="Site-wide issue")],
- )
- return PdfDocument(
- schema_version=SCHEMA_VERSION,
- document_kind="audit",
- meta=_minimal_meta(),
- cover=_minimal_cover(
- executive_summary="Executive overview text.",
- executive_source="deterministic",
- priorities_list=["Priority one", "Priority two"],
- top_issues=[_issue(priority="critical", headline="Critical item")],
- ),
- sections=[
- PdfSection(
- id="blocks.all",
- section_key="core",
- title="All block types",
- priority=10,
- page_break_before=True,
- source_label="crawl",
- truncation=PdfTruncation(shown=2, total=10),
- blocks=[
- HeadingBlock(id="h2", text="Section heading", level=2),
- HeadingBlock(id="h3", text="Sub heading", level=3),
- ParagraphBlock(id="p", text="Body paragraph"),
- ParagraphBlock(id="pi", text="Italic note", italic=True),
- CalloutBlock(id="c-info", text="Info callout", severity="info"),
- CalloutBlock(id="c-warn", text="Warn callout", severity="warn"),
- CalloutBlock(id="c-crit", text="Critical callout", severity="critical"),
- SpacerBlock(id="sp", height_pt=4),
- KpiRowBlock(id="kpi", items=[KpiItem(label="Pages", value="42")]),
- StatGridBlock(id="stat", chips=[], columns=4),
- ScoreCardsBlock(id="scores", cards=[]),
- KeyValueBlock(id="kv-default", rows=[("Key", "Value")], layout="default"),
- KeyValueBlock(id="kv-empty", rows=[]),
- MetricTableBlock(
- id="metrics",
- columns=[
- TableColumn(key="url", label="URL", width="url"),
- TableColumn(key="val", label="Value", width="wide"),
- ],
- rows=[{"url": "https://example.com", "val": "1"}],
- truncation=PdfTruncation(shown=1, total=3),
- ),
- MetricTableBlock(id="metrics-empty", columns=[], rows=[]),
- UrlListBlock(
- id="urls",
- rows=[
- {"url": "https://example.com", "status": "200", "title": "Home"},
- {"url": "https://example.com/old", "status": "301", "title": ""},
- {"url": "https://example.com/missing", "status": "404", "title": "Missing"},
- {"url": "https://example.com/error", "status": "500", "title": "Error"},
- {"url": "https://example.com/unknown", "status": "", "title": ""},
- ],
- truncation=PdfTruncation(shown=5, total=12),
- ),
- UrlListBlock(id="urls-notitle", rows=[{"url": "https://x.com", "status": "200"}], show_title=False),
- UrlListBlock(id="urls-empty", rows=[]),
- list_group,
- compact_group,
- IssueTableBlock(
- id="issue-table",
- title="Issue table",
- issues=[_issue(headline="Table row")],
- truncation=PdfTruncation(shown=1, total=4),
- ),
- MarkdownBlock(id="md", text="Bold markdown snippet"),
- ParagraphBlock(id="hidden", text="hidden", visible=False),
- ],
- ),
- ],
- footer=PdfFooterBlock(exported_at="18 June 2026, 12:00 UTC"),
- )
-
-
-class TestNormalizeBranches:
- def test_strip_url_trailing_slash_variant(self):
- class _Msg(str):
- def replace(self, old, new="", count=-1):
- if old == "https://example.com/page":
- return str(self)
- return super().replace(old, new, count)
-
- url = "https://example.com/page"
- msg = _Msg("Not crawled: https://example.com/page/")
- assert url not in _strip_url_from_headline(msg, url)
-
- def test_extract_path_parse_error(self, monkeypatch):
- def boom(_url):
- raise ValueError("bad url")
-
- monkeypatch.setattr(
- "website_profiling.reporting.pdf.normalize.urlparse",
- boom,
- )
- assert _extract_path("https://example.com") is None
-
- def test_lighthouse_tag_detection(self):
- is_lh, audit_id = _is_lighthouse_row("generic message", ["lighthouse"])
- assert is_lh is True
- assert audit_id == ""
-
- def test_redirect_headline_shortening(self):
- issue = normalize_issue_for_pdf(_row("redirect: 301 to https://example.com/new"))
- assert issue.headline == "301 redirect"
- assert "redirect" in issue.tags
-
- def test_lighthouse_prefix_stripped(self):
- issue = normalize_issue_for_pdf(_row("lighthouse: Long cache lifetime"))
- assert issue.headline == "Long cache lifetime"
-
- def test_axe_headline_truncated_at_sentence(self):
- long_body = "A" * 50 + ". " + "B" * 60
- issue = normalize_issue_for_pdf(_row(f"axe: {long_body}"))
- assert issue.headline.endswith(".")
- assert len(issue.headline) < len(long_body)
-
-
-class TestAdapterAndBuilderBranches:
- def test_appendix_disabled(self):
- payload = {"links": [{"url": "https://example.com", "status": "200"}]}
- assert adapt_appendix(payload, PdfBuildOptions(include_appendix=False)) == []
-
- def test_findings_empty_groups_after_normalize(self):
- payload = {
- "categories": [{"name": "Tech", "issues": [_row("issue one")]}],
- }
- with patch(
- "website_profiling.reporting.pdf.adapters.findings.group_issues_for_pdf",
- return_value=[],
- ):
- assert adapt_findings(payload, PdfBuildOptions()) == []
-
- def test_findings_section_truncation_when_over_limit(self):
- issues = [_row(f"issue {i}") for i in range(30)]
- payload = {"categories": [{"name": "Tech", "issues": issues}]}
- opts = PdfBuildOptions(limits=PdfLimits(issues_total=5, issues_per_group=5))
- sections = adapt_findings(payload, opts)
- assert sections[0].truncation is not None
- assert sections[0].truncation.total == 30
-
- def test_builder_skips_non_dict_categories_and_bad_scores(self):
- payload = {
- "site_name": "example.com",
- "categories": [
- "bad",
- {"name": "Tech", "score": "not-a-number", "issues": []},
- ],
- "links": [],
- }
- doc = build_pdf_document(payload)
- names = [c.name for c in doc.cover.category_scores.cards]
- assert names == ["Tech"]
-
- def test_builder_prefers_url_for_duplicate_headlines(self):
- payload = {
- "site_name": "example.com",
- "categories": [{
- "name": "Tech",
- "score": 80,
- "issues": [
- _row("Missing title", url=""),
- _row("Missing title", url="https://example.com/page"),
- ],
- }],
- "links": [],
- }
- doc = build_pdf_document(payload)
- assert doc.cover.top_issues[0].url == "https://example.com/page"
-
- def test_builder_skips_unknown_section_adapters(self):
- payload = {"site_name": "example.com", "categories": [], "links": []}
- doc = build_pdf_document(payload, PdfBuildOptions(sections=["missing", "core"]))
- assert any(s.id == "core.audit_details" for s in doc.sections)
-
- def test_options_custom_sections_override_profile(self):
- opts = PdfBuildOptions(profile="full", sections=["core"])
- assert opts.effective_sections() == ["core"]
-
-
-class TestRendererBranches:
- def test_html_renders_all_block_types(self):
- html = render_html_document(_exhaustive_document())
- assert "All block types" in html
- assert "status-3xx" in html
- assert "status-4xx" in html
- assert "status-5xx" in html
- assert "status-other" in html
- assert "issue-card" in html
- assert "compact_table" not in html # render_as is not echoed; table headers are
- assert "IssueURL | " in html
- assert "Fix:" in html
- assert "and 5 more" in html
- assert "Source: crawl" in html
- assert "Showing 2 of 10 issues" in html
-
- def test_pdf_renders_all_block_types(self):
- pdf = render_pdf_document(_exhaustive_document())
- assert pdf[:4] == b"%PDF"
- assert len(pdf) > 2_000
-
- def test_reportlab_helper_functions(self):
- st = _make_styles()
- assert _p("plain", st["body"]) is not None
- assert _p_html("markup", st["body"]) is not None
- assert _safe_p("", st["body"]) is not None
-
- def test_reportlab_empty_executive_and_top_issues(self):
- st = _make_styles()
- cover = _minimal_cover(
- executive_summary=None,
- executive_source=None,
- priorities_list=[],
- top_issues=[],
- )
- assert _rl_render_executive_panel(cover, st) == []
- assert _render_top_issues_table([], st) == []
-
- def test_reportlab_stat_grid_more_chips_than_columns(self):
- # chips > columns must not crash: colWidths must match the cell count.
- st = _make_styles()
- block = StatGridBlock(
- id="s",
- columns=2,
- chips=[
- StatChip(label="A", value="1", tone="high"),
- StatChip(label="B", value="2", tone="medium"),
- StatChip(label="C", value="3", tone="low"),
- ],
- )
- out = _flowables_for_block(block, st)
- assert out # renders a table flowable instead of raising at build time
-
- def test_reportlab_empty_optional_blocks(self):
- st = _make_styles()
- assert _flowables_for_block(KpiRowBlock(id="k", items=[]), st) == []
- assert _flowables_for_block(StatGridBlock(id="s", chips=[]), st) == []
- assert _flowables_for_block(ScoreCardsBlock(id="sc", cards=[]), st) == []
- assert _flowables_for_block(KeyValueBlock(id="kv", rows=[]), st) == []
- assert _flowables_for_block(UrlListBlock(id="u", rows=[]), st) == []
- assert _flowables_for_block(MetricTableBlock(id="m", columns=[], rows=[]), st) == []
- assert _flowables_for_block(ParagraphBlock(id="h", text="x", visible=False), st) == []
-
- def test_html_empty_cover_fragments(self):
- doc = PdfDocument(
- schema_version=SCHEMA_VERSION,
- document_kind="audit",
- meta=_minimal_meta(),
- cover=_minimal_cover(
- top_issues=[],
- executive_summary=None,
- executive_source=None,
- priorities_list=[],
- priority_strip=StatGridBlock(id="cover.priority", chips=[], columns=4),
- category_scores=ScoreCardsBlock(id="cover.scores", cards=[]),
- ),
- sections=[],
- footer=PdfFooterBlock(exported_at="now"),
- )
- html = render_html_document(doc)
- assert "Top traffic-impacting issues" not in html
- assert "Category scores" not in html
-
- def test_html_renderer_empty_helpers(self):
- cover = _minimal_cover(executive_summary=None, priorities_list=[])
- assert _html_render_executive_panel(cover) == ""
- assert _html_render_stat_grid(StatGridBlock(id="s", chips=[], columns=4)) == ""
- assert _html_render_score_cards(ScoreCardsBlock(id="sc", cards=[])) == ""
-
- class _Unknown:
- type = "unknown"
- visible = True
-
- assert _html_render_block(_Unknown()) == ""
-
- def test_reportlab_empty_kv_and_scaled_metric_table(self):
- st = _make_styles()
- assert _flowables_for_block(KeyValueBlock(id="a", rows=[], layout="audit"), st) == []
- assert _flowables_for_block(KeyValueBlock(id="g", rows=[], layout="glossary"), st) == []
- wide = MetricTableBlock(
- id="wide",
- columns=[TableColumn(key=f"c{i}", label=f"C{i}", width="wide") for i in range(8)],
- rows=[{f"c{i}": "x" for i in range(8)}],
- )
- assert _flowables_for_block(wide, st)
-
- class _Unknown:
- type = "not_registered"
- visible = True
-
- assert _flowables_for_block(_Unknown(), st) == []
diff --git a/tests/reporting/test_pdf_builder.py b/tests/reporting/test_pdf_builder.py
deleted file mode 100644
index 09b9cffd..00000000
--- a/tests/reporting/test_pdf_builder.py
+++ /dev/null
@@ -1,229 +0,0 @@
-"""Tests for build_pdf_document — document structure and metadata."""
-from __future__ import annotations
-
-import pytest
-
-from website_profiling.reporting.pdf.builder import build_pdf_document
-from website_profiling.reporting.pdf.document import SCHEMA_VERSION, IssueGroupBlock, KeyValueBlock, ScoreCardsBlock
-from website_profiling.reporting.pdf.options import PdfBuildOptions
-
-
-def _base_payload(**overrides) -> dict:
- p = {
- "site_name": "test.example",
- "report_generated_at": "2026-06-18T04:38:27+00:00",
- "categories": [
- {
- "name": "Technical SEO",
- "score": 79,
- "issues": [
- {
- "priority": "high",
- "message": "URL in sitemap but not crawled: https://test.example/page",
- "url": "https://test.example/page",
- "recommendation": "Review sitemap",
- },
- {
- "priority": "medium",
- "message": "Missing canonical URL.",
- "url": "https://test.example/llms.txt",
- "recommendation": "Add canonical",
- },
- ],
- },
- {
- "name": "Mobile SEO",
- "score": 90,
- "issues": [
- {
- "priority": "critical",
- "message": "2 page(s) missing viewport meta tag.",
- "url": "",
- "recommendation": "Add viewport",
- }
- ],
- },
- ],
- "links": [
- {"url": "https://test.example", "status": "200", "title": "Home"},
- {"url": "https://test.example/about", "status": "301", "title": "About"},
- ],
- "report_meta": {"data_sources": ["crawl", "lighthouse"]},
- }
- p.update(overrides)
- return p
-
-
-class TestDocumentSchema:
- def test_schema_version(self):
- doc = build_pdf_document(_base_payload())
- assert doc.schema_version == SCHEMA_VERSION
-
- def test_document_kind_audit(self):
- doc = build_pdf_document(_base_payload())
- assert doc.document_kind == "audit"
-
- def test_meta_property(self):
- doc = build_pdf_document(_base_payload())
- assert doc.meta.property == "test.example"
-
- def test_meta_issue_counts(self):
- doc = build_pdf_document(_base_payload())
- assert doc.meta.issue_counts["critical"] == 1
- assert doc.meta.issue_counts["high"] == 1
- assert doc.meta.issue_counts["medium"] == 1
- assert doc.meta.issue_counts["low"] == 0
-
- def test_meta_health_score_present(self):
- doc = build_pdf_document(_base_payload())
- assert doc.meta.health_score is not None
- assert 0 <= doc.meta.health_score <= 100
-
- def test_footer_generated(self):
- doc = build_pdf_document(_base_payload())
- assert doc.footer.exported_at
-
-
-class TestCover:
- def test_cover_headline(self):
- doc = build_pdf_document(_base_payload())
- assert doc.cover.headline == "Site Audit — test.example"
-
- def test_cover_priority_strip_chips(self):
- doc = build_pdf_document(_base_payload())
- chips = {c.label: c.value for c in doc.cover.priority_strip.chips}
- assert chips["Critical"] == "1"
- assert chips["High"] == "1"
- assert chips["Medium"] == "1"
- assert chips["Low"] == "0"
-
- def test_cover_category_scores(self):
- doc = build_pdf_document(_base_payload())
- names = [c.name for c in doc.cover.category_scores.cards]
- assert "Technical SEO" in names
- assert "Mobile SEO" in names
-
- def test_cover_top_issues_capped(self):
- payload = _base_payload()
- doc = build_pdf_document(payload, PdfBuildOptions(limits=type("L", (), {"top_issues_cover": 2,
- "issues_total": 120, "issues_per_group": 25, "urls_sample": 20,
- "metric_table_rows": 15, "gsc_queries": 10, "keyword_rows": 15, "diagnostic_items": 20})()))
- assert len(doc.cover.top_issues) <= 2
-
- def test_cover_top_issues_critical_first(self):
- doc = build_pdf_document(_base_payload())
- if len(doc.cover.top_issues) >= 2:
- assert doc.cover.top_issues[0].priority == "critical"
-
- def test_cover_executive_summary_present(self):
- payload = _base_payload(executive_summary={
- "source": "deterministic",
- "summary": "Looks good overall.",
- "priorities": ["Fix viewport"],
- })
- doc = build_pdf_document(payload)
- assert doc.cover.executive_summary == "Looks good overall."
- assert doc.cover.priorities_list == ["Fix viewport"]
-
- def test_cover_executive_summary_none_when_missing(self):
- doc = build_pdf_document(_base_payload())
- # no executive_summary in base payload
- assert doc.cover.executive_summary is None or doc.cover.executive_summary == ""
-
-
-class TestSections:
- def test_standard_has_findings(self):
- doc = build_pdf_document(_base_payload())
- section_ids = [s.id for s in doc.sections]
- assert "findings" in section_ids
-
- def test_standard_has_audit_details(self):
- doc = build_pdf_document(_base_payload())
- section_ids = [s.id for s in doc.sections]
- assert "core.audit_details" in section_ids
-
- def test_category_scores_on_cover_not_in_sections(self):
- doc = build_pdf_document(_base_payload())
- section_ids = [s.id for s in doc.sections]
- assert "core.category_scores" not in section_ids
- assert len(doc.cover.category_scores.cards) >= 1
-
- def test_standard_has_url_sample(self):
- doc = build_pdf_document(_base_payload())
- section_ids = [s.id for s in doc.sections]
- assert "appendix.urls" in section_ids
-
- def test_standard_has_glossary(self):
- doc = build_pdf_document(_base_payload())
- section_ids = [s.id for s in doc.sections]
- assert "appendix.glossary" in section_ids
-
- def test_sections_sorted_by_priority(self):
- doc = build_pdf_document(_base_payload())
- priorities = [s.priority for s in doc.sections]
- assert priorities == sorted(priorities)
-
- def test_findings_section_has_issue_group_blocks(self):
- doc = build_pdf_document(_base_payload())
- findings = next(s for s in doc.sections if s.id == "findings")
- assert any(isinstance(b, IssueGroupBlock) for b in findings.blocks)
-
- def test_findings_starts_on_new_page_via_cover_break(self):
- doc = build_pdf_document(_base_payload())
- # Cover ends with explicit page break; findings section should not double-break
- findings = next(s for s in doc.sections if s.id == "findings")
- assert findings.page_break_before is False
-
- def test_url_sample_truncation(self):
- links = [{"url": f"https://x.com/p{i}", "status": "200", "title": f"P{i}"} for i in range(30)]
- payload = _base_payload(links=links)
- doc = build_pdf_document(payload, PdfBuildOptions())
- url_section = next(s for s in doc.sections if s.id == "appendix.urls")
- url_block = url_section.blocks[0]
- assert len(url_block.rows) == 20 # default limit
- assert url_block.truncation is not None
- assert url_block.truncation.total == 30
-
- def test_executive_profile_only_cover_sections(self):
- doc = build_pdf_document(_base_payload(), PdfBuildOptions(profile="executive"))
- # executive profile sections = ["core"] only
- section_keys = {s.section_key for s in doc.sections}
- assert "findings" not in [s.id for s in doc.sections]
-
- def test_no_findings_section_when_no_issues(self):
- payload = _base_payload()
- payload["categories"] = [{"name": "Technical SEO", "score": 100, "issues": []}]
- doc = build_pdf_document(payload)
- section_ids = [s.id for s in doc.sections]
- assert "findings" not in section_ids
-
- def test_issues_normalized_url_dedup(self):
- doc = build_pdf_document(_base_payload())
- findings = next(s for s in doc.sections if s.id == "findings")
- all_issues = []
- for blk in findings.blocks:
- if isinstance(blk, IssueGroupBlock):
- all_issues.extend(blk.issues)
- sitemap_issue = next(
- (i for i in all_issues if i.headline == "In sitemap, not crawled"), None
- )
- assert sitemap_issue is not None
- # URL must not be embedded in the headline
- if sitemap_issue.url:
- assert sitemap_issue.url not in sitemap_issue.headline
-
-
-class TestEmptyPayload:
- def test_empty_categories(self):
- doc = build_pdf_document({"site_name": "empty.test", "categories": [], "links": []})
- assert doc.cover.headline == "Site Audit — empty.test"
- assert doc.meta.health_score is None
-
- def test_empty_links_no_url_section(self):
- doc = build_pdf_document({"site_name": "empty.test", "categories": [], "links": []})
- ids = [s.id for s in doc.sections]
- assert "appendix.urls" not in ids
-
- def test_missing_keys_no_crash(self):
- doc = build_pdf_document({})
- assert doc.document_kind == "audit"
diff --git a/tests/reporting/test_pdf_normalize.py b/tests/reporting/test_pdf_normalize.py
deleted file mode 100644
index 4cdb4c39..00000000
--- a/tests/reporting/test_pdf_normalize.py
+++ /dev/null
@@ -1,208 +0,0 @@
-"""Unit tests for PDF issue normalization and grouping."""
-from __future__ import annotations
-
-import pytest
-
-from website_profiling.reporting.pdf.normalize import (
- collapse_duplicate_issues,
- group_issues_for_pdf,
- normalize_issue_for_pdf,
-)
-
-
-def _row(message: str, url: str = "", priority: str = "high", category: str = "Technical SEO",
- recommendation: str = "Fix it") -> dict:
- return {
- "category": category,
- "priority": priority,
- "message": message,
- "url": url,
- "recommendation": recommendation,
- "llm_recommendation": "",
- }
-
-
-class TestNormalizeIssue:
- def test_url_dedup_from_message(self):
- """URL embedded in message should be stripped from headline."""
- issue = normalize_issue_for_pdf(_row(
- message="URL in sitemap but not crawled: https://codefrydev.in/2048",
- url="https://codefrydev.in/2048",
- ))
- assert "https://codefrydev.in/2048" not in issue.headline
- assert issue.headline == "In sitemap, not crawled"
- assert issue.url == "https://codefrydev.in/2048"
-
- def test_url_dedup_no_change_when_url_blank(self):
- issue = normalize_issue_for_pdf(_row(
- message="2 page(s) missing viewport meta tag.",
- url="",
- ))
- assert "viewport" in issue.headline
- assert issue.url is None
-
- def test_lighthouse_cache_insight_label(self):
- issue = normalize_issue_for_pdf(_row(message="cache-insight:", url="https://example.com"))
- assert issue.headline == "Serve assets with efficient cache policy"
- assert "lighthouse" in issue.tags
-
- def test_lighthouse_color_contrast_label(self):
- issue = normalize_issue_for_pdf(_row(message="color-contrast:", url="https://example.com"))
- assert issue.headline == "Background and foreground colors lack sufficient contrast"
-
- def test_unknown_lighthouse_id_fallback(self):
- """Unknown audit ids should be title-cased as fallback."""
- issue = normalize_issue_for_pdf(_row(message="my-custom-check:", url="https://example.com"))
- assert issue.headline == "My Custom Check"
-
- def test_plain_message_unchanged(self):
- issue = normalize_issue_for_pdf(_row(message="Missing H1 on homepage.", url=""))
- assert issue.headline == "Missing H1 on homepage."
-
- def test_recommendation_included(self):
- issue = normalize_issue_for_pdf(_row(message="issue", recommendation="Do this"))
- assert issue.recommendation == "Do this"
-
- def test_recommendation_excluded(self):
- issue = normalize_issue_for_pdf(_row(message="issue", recommendation="Do this"),
- include_recommendation=False)
- assert issue.recommendation is None
-
- def test_sitemap_tag_applied(self):
- issue = normalize_issue_for_pdf(_row(message="URL in sitemap but not crawled: https://x.com/p",
- url="https://x.com/p"))
- assert "sitemap" in issue.tags
-
- def test_path_extracted_from_url(self):
- issue = normalize_issue_for_pdf(_row(message="issue", url="https://example.com/blog/post"))
- assert issue.path == "/blog/post"
-
- def test_path_none_when_url_blank(self):
- issue = normalize_issue_for_pdf(_row(message="issue", url=""))
- assert issue.path is None
-
- def test_unique_id_generated(self):
- r = _row(message="Missing title", url="https://example.com")
- issue = normalize_issue_for_pdf(r)
- assert len(issue.id) == 12
-
- def test_same_row_same_id(self):
- r = _row(message="Missing title", url="https://example.com")
- i1 = normalize_issue_for_pdf(r)
- i2 = normalize_issue_for_pdf(r)
- assert i1.id == i2.id
-
- def test_different_rows_different_id(self):
- r1 = _row(message="Missing title", url="https://example.com")
- r2 = _row(message="Missing title", url="https://other.com")
- assert normalize_issue_for_pdf(r1).id != normalize_issue_for_pdf(r2).id
-
- def test_generic_cwv_recommendation_shortened(self):
- generic = (
- "See Performance (Core Web Vitals) in this audit, "
- "or re-run Lighthouse from Run audit."
- )
- issue = normalize_issue_for_pdf(_row(message="largest-contentful-paint:", recommendation=generic))
- assert issue.recommendation == "Review Lighthouse audit details for this page."
-
-
-class TestCollapseDuplicates:
- def test_merges_same_headline_and_fix(self):
- rows = [
- _row("URL in sitemap but not crawled: https://a.com/1", url="https://a.com/1"),
- _row("URL in sitemap but not crawled: https://a.com/2", url="https://a.com/2"),
- ]
- issues = [normalize_issue_for_pdf(r) for r in rows]
- collapsed = collapse_duplicate_issues(issues)
- assert len(collapsed) == 1
- assert collapsed[0].related_urls == ["https://a.com/1", "https://a.com/2"]
- assert "(2 URLs)" in collapsed[0].headline
-
- def test_keeps_distinct_recommendations_separate(self):
- rows = [
- _row("issue", url="https://a.com/1", recommendation="Fix A"),
- _row("issue", url="https://a.com/2", recommendation="Fix B"),
- ]
- issues = [normalize_issue_for_pdf(r) for r in rows]
- assert len(collapse_duplicate_issues(issues)) == 2
-
- def test_collapse_in_grouping(self):
- rows = [
- _row(f"URL in sitemap but not crawled: https://a.com/{i}", url=f"https://a.com/{i}")
- for i in range(5)
- ]
- issues = [normalize_issue_for_pdf(r) for r in rows]
- groups = group_issues_for_pdf(issues)
- assert len(groups[0].issues) == 1
- assert len(groups[0].issues[0].related_urls) == 5
-
-
-class TestGroupIssues:
- def _make_issues(self, specs):
- result = []
- for priority, category, msg in specs:
- row = _row(message=msg, priority=priority, category=category)
- result.append(normalize_issue_for_pdf(row))
- return result
-
- def test_single_priority_single_group(self):
- issues = self._make_issues([("critical", "Mobile SEO", "Missing viewport")])
- groups = group_issues_for_pdf(issues)
- assert len(groups) == 1
- assert groups[0].id == "findings.critical"
- assert len(groups[0].issues) == 1
-
- def test_groups_sorted_critical_first(self):
- issues = self._make_issues([
- ("low", "Tech", "thing"),
- ("critical", "Mobile", "viewport"),
- ("high", "Technical SEO", "sitemap"),
- ])
- groups = group_issues_for_pdf(issues)
- priorities = [g.id.split(".")[1] for g in groups]
- assert priorities[0] == "critical"
- assert priorities[1] == "high"
- assert priorities[-1] == "low"
-
- def test_subgroup_by_category_when_many(self):
- # More than _SUBGROUP_THRESHOLD (8) issues in one priority → sub-groups by category
- issues = self._make_issues(
- [("high", f"Cat{i % 3}", f"Issue {i}") for i in range(12)]
- )
- groups = group_issues_for_pdf(issues)
- # Should have multiple sub-groups under high
- ids = [g.id for g in groups]
- assert any("." in id and id.startswith("findings.high.") for id in ids)
-
- def test_truncation_applied(self):
- issues = self._make_issues([("low", "Tech", f"issue {i}") for i in range(30)])
- groups = group_issues_for_pdf(issues, issues_per_group=10)
- low_group = next(g for g in groups if "low" in g.id)
- assert low_group.truncation is not None
- assert low_group.truncation.shown == 10
- assert low_group.truncation.total == 30
-
- def test_total_cap_respected(self):
- issues = self._make_issues([("medium", "Tech", f"m{i}") for i in range(200)])
- groups = group_issues_for_pdf(issues, issues_total=50)
- total_shown = sum(len(g.issues) for g in groups)
- assert total_shown <= 50
-
- def test_empty_input_returns_empty(self):
- assert group_issues_for_pdf([]) == []
-
- def test_group_label_includes_count(self):
- issues = self._make_issues([("critical", "Mobile", "viewport")])
- groups = group_issues_for_pdf(issues)
- assert "1 issue" in groups[0].group_label
-
- def test_list_for_all_groups(self):
- issues = self._make_issues([("low", "Tech", f"x{i}") for i in range(15)])
- groups = group_issues_for_pdf(issues, issues_per_group=20)
- low_group = next(g for g in groups if "low" in g.id)
- assert low_group.render_as == "list"
-
- def test_list_for_small_group(self):
- issues = self._make_issues([("critical", "Mobile", f"x{i}") for i in range(3)])
- groups = group_issues_for_pdf(issues)
- assert groups[0].render_as == "list"
diff --git a/tests/reporting/test_pdf_render.py b/tests/reporting/test_pdf_render.py
deleted file mode 100644
index 25cef69e..00000000
--- a/tests/reporting/test_pdf_render.py
+++ /dev/null
@@ -1,300 +0,0 @@
-"""Smoke and content regression tests for the PDF renderer.
-
-These tests verify:
- 1. Render produces valid PDF bytes.
- 2. PDF text contains expected content and does NOT contain the old broken patterns.
- 3. The export_audit.export_audit_pdf() entry point is backward-compatible.
-"""
-from __future__ import annotations
-
-import pytest
-
-pytest.importorskip("reportlab")
-
-from website_profiling.reporting.pdf.builder import build_pdf_document
-from website_profiling.reporting.pdf.render import render_pdf_document
-from website_profiling.reporting.pdf.options import PdfBuildOptions
-
-
-def _rich_payload() -> dict:
- return {
- "site_name": "codefrydev.in",
- "report_generated_at": "2026-06-18T04:38:27+00:00",
- "report_meta": {
- "data_sources": ["crawl", "lighthouse", "search_console"],
- "crawl_scope": {
- "pages_crawled": 15,
- "max_pages_configured": 15,
- "crawl_limited": True,
- "render_mode": "javascript",
- "js_concurrency": 3,
- },
- },
- "categories": [
- {
- "name": "Technical SEO",
- "score": 79,
- "issues": [
- {
- "priority": "high",
- "message": "URL in sitemap but not crawled: https://codefrydev.in/2048",
- "url": "https://codefrydev.in/2048",
- "recommendation": "Add the page to the crawl scope.",
- },
- {
- "priority": "medium",
- "message": "Missing canonical URL.",
- "url": "https://codefrydev.in/llms.txt",
- "recommendation": "Add .",
- },
- ],
- },
- {
- "name": "Core Web Vitals",
- "score": 100,
- "issues": [
- {
- "priority": "high",
- "message": "cache-insight:",
- "url": "https://codefrydev.in",
- "recommendation": "Add Cache-Control headers.",
- },
- {
- "priority": "high",
- "message": "color-contrast:",
- "url": "https://codefrydev.in",
- "recommendation": "Increase contrast ratio to 4.5:1.",
- },
- ],
- },
- {
- "name": "Accessibility & markup",
- "score": 69,
- "issues": [
- {
- "priority": "medium",
- "message": (
- "axe: Ensure the contrast between foreground and background "
- "colors meets WCAG 2 AA minimum contrast ra"
- ),
- "url": "https://codefrydev.in",
- "recommendation": "Raise text contrast.",
- }
- ],
- },
- {
- "name": "Mobile SEO",
- "score": 90,
- "issues": [
- {
- "priority": "critical",
- "message": "2 page(s) missing viewport meta tag.",
- "url": "",
- "recommendation": "Add .",
- }
- ],
- },
- {
- "name": "Security",
- "score": 75,
- "issues": [
- {
- "priority": "medium",
- "message": "X-Content-Type-Options header not set.",
- "url": "https://codefrydev.in",
- "recommendation": "Add nosniff header.",
- },
- {
- "priority": "medium",
- "message": "X-Frame-Options header not set.",
- "url": "https://codefrydev.in",
- "recommendation": "Add X-Frame-Options: DENY.",
- },
- ],
- },
- ],
- "links": [
- {"url": "https://codefrydev.in", "status": "200", "title": "CodeFryDev"},
- {"url": "https://codefrydev.in/games", "status": "301", "title": "Games"},
- {"url": "https://codefrydev.in/about-us", "status": "301", "title": "About Us"},
- ],
- "summary": {"total_urls": 15},
- "status_counts": {"301": 12, "200": 3},
- "executive_summary": {
- "source": "deterministic",
- "summary": "Overall health is 87/100. Critical gap: viewport meta missing on 2 pages.",
- "priorities": ["Fix missing viewport meta", "Expand crawl scope to cover sitemap URLs"],
- "top_issues": [
- {"priority": "critical", "message": "Missing viewport meta tag", "url": ""},
- ],
- },
- }
-
-
-@pytest.fixture(scope="module")
-def rendered_pdf() -> bytes:
- payload = _rich_payload()
- doc = build_pdf_document(payload, PdfBuildOptions(profile="standard"))
- return render_pdf_document(doc)
-
-
-class TestPdfSmoke:
- def test_returns_bytes(self, rendered_pdf):
- assert isinstance(rendered_pdf, bytes)
-
- def test_pdf_header(self, rendered_pdf):
- assert rendered_pdf[:4] == b"%PDF"
-
- def test_non_trivial_size(self, rendered_pdf):
- assert len(rendered_pdf) > 1_000
-
- def test_executive_profile_renders(self):
- payload = _rich_payload()
- doc = build_pdf_document(payload, PdfBuildOptions(profile="executive"))
- pdf = render_pdf_document(doc)
- assert pdf[:4] == b"%PDF"
-
- def test_empty_payload_renders(self):
- doc = build_pdf_document({"site_name": "empty", "categories": [], "links": []})
- pdf = render_pdf_document(doc)
- assert pdf[:4] == b"%PDF"
-
-
-class TestPdfContent:
- """Verify content in the PdfDocument model (document level, not raw PDF bytes).
-
- Content assertions live here because the ReportLab output is FlateDecode
- compressed. We test the document model which is what drives the render.
- """
-
- def _get_doc(self):
- return build_pdf_document(_rich_payload(), PdfBuildOptions(profile="standard"))
-
- def test_site_name_in_cover_headline(self):
- doc = self._get_doc()
- assert "codefrydev.in" in doc.cover.headline
-
- def test_no_ellipsis_truncation_in_issue_headlines(self):
- """The new normalizer must NOT add '...' truncation that the old renderer applied."""
- from website_profiling.reporting.pdf.document import IssueGroupBlock
- doc = self._get_doc()
- findings = next(s for s in doc.sections if s.id == "findings")
- for blk in findings.blocks:
- if isinstance(blk, IssueGroupBlock):
- for issue in blk.issues:
- assert not issue.headline.endswith("..."), (
- f"Headline has hard '...' truncation from old code: {issue.headline!r}"
- )
-
- def test_lighthouse_label_expanded_in_headline(self):
- """cache-insight: should be expanded to human label, not left as bare audit id."""
- from website_profiling.reporting.pdf.document import IssueGroupBlock
- doc = self._get_doc()
- findings = next(s for s in doc.sections if s.id == "findings")
- for blk in findings.blocks:
- if isinstance(blk, IssueGroupBlock):
- for issue in blk.issues:
- assert issue.headline != "cache-insight:", (
- f"Lighthouse audit id was not expanded: {issue.headline!r}"
- )
-
- def test_url_not_duplicated_in_headline(self):
- """Sitemap URLs embedded in message should not appear in headline."""
- from website_profiling.reporting.pdf.document import IssueGroupBlock
- doc = self._get_doc()
- findings = next(s for s in doc.sections if s.id == "findings")
- for blk in findings.blocks:
- if isinstance(blk, IssueGroupBlock):
- for issue in blk.issues:
- if issue.url:
- assert issue.url not in issue.headline, (
- f"URL {issue.url!r} duplicated in headline {issue.headline!r}"
- )
-
- def test_glossary_section_present(self):
- doc = self._get_doc()
- from website_profiling.reporting.pdf.document import KeyValueBlock
- gloss_section = next(s for s in doc.sections if s.id == "appendix.glossary")
- gloss_block = next(b for b in gloss_section.blocks if isinstance(b, KeyValueBlock))
- keys = [row[0] for row in gloss_block.rows]
- assert "Crawl" in keys
-
-
-class TestHtmlPreviewParity:
- def test_html_renders_from_same_document(self):
- from website_profiling.reporting.pdf.render.html import render_html_document
- payload = _rich_payload()
- doc = build_pdf_document(payload, PdfBuildOptions(profile="standard"))
- html_out = render_html_document(doc)
- assert "Site Audit — codefrydev.in" in html_out
- assert "Executive summary" in html_out
- assert "Top traffic-impacting issues" in html_out
- assert "Findings" in html_out
- assert "Audit details" in html_out
- assert "class=\"issue-card" in html_out
- assert "cover-head" in html_out
- assert "grid-table stat-grid" in html_out
-
- """Ensure export_audit.export_audit_pdf() remains backward-compatible."""
-
- def test_backward_compat_no_args(self, monkeypatch):
- from website_profiling.tools import export_audit
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: _rich_payload())
- pdf = export_audit.export_audit_pdf()
- assert isinstance(pdf, bytes)
- assert pdf[:4] == b"%PDF"
-
- def test_backward_compat_report_id(self, monkeypatch):
- from website_profiling.tools import export_audit
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: _rich_payload())
- pdf = export_audit.export_audit_pdf(report_id=42)
- assert pdf[:4] == b"%PDF"
-
- def test_profile_param_standard(self, monkeypatch):
- from website_profiling.tools import export_audit
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: _rich_payload())
- pdf = export_audit.export_audit_pdf(profile="standard")
- assert pdf[:4] == b"%PDF"
-
- def test_profile_param_executive(self, monkeypatch):
- from website_profiling.tools import export_audit
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: _rich_payload())
- pdf = export_audit.export_audit_pdf(profile="executive")
- assert pdf[:4] == b"%PDF"
-
- def test_requires_reportlab(self, monkeypatch):
- from website_profiling.tools import export_audit
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: _rich_payload())
-
- import builtins
- real_import = builtins.__import__
-
- def fake_import(name, *args, **kwargs):
- if name == "reportlab" or name.startswith("reportlab."):
- raise ImportError("no reportlab")
- return real_import(name, *args, **kwargs)
-
- with pytest.MonkeyPatch().context() as mp:
- mp.setattr(builtins, "__import__", fake_import)
- with pytest.raises(RuntimeError, match="reportlab"):
- export_audit.export_audit_pdf()
-
- def test_large_payload_no_crash(self, monkeypatch):
- from website_profiling.tools import export_audit
- issues = [
- {
- "priority": "low",
- "message": "x" * 150,
- "url": "https://example.com/" + ("path/" * 20),
- "recommendation": "fix",
- }
- for _ in range(90)
- ]
- payload = {
- "site_name": "Truncate PDF",
- "categories": [{"name": "Technical SEO", "score": 80, "issues": issues}],
- "links": [],
- }
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: payload)
- pdf = export_audit.export_audit_pdf()
- assert pdf[:4] == b"%PDF"
diff --git a/tests/reporting/test_reporting_gaps.py b/tests/reporting/test_reporting_gaps.py
index b184f1cf..4a7c1e7a 100644
--- a/tests/reporting/test_reporting_gaps.py
+++ b/tests/reporting/test_reporting_gaps.py
@@ -37,7 +37,7 @@ def test_issue_impact_enriches_and_sorts():
]
google = {
"gsc": {"top_pages": [{"page": "https://example.com/page", "clicks": 3, "impressions": 100}]},
- "ga4": {"pages": [{"path": "/page", "sessions": 2}]},
+ "ga4": {"top_pages": [{"path": "/page", "sessions": 2}]},
}
enrich_categories_with_traffic_impact(categories, google)
assert categories[0]["issues"][0]["impact_score"] > categories[0]["issues"][1]["impact_score"]
@@ -55,7 +55,7 @@ def test_issue_impact_skips_homepage_ga4_path():
categories = [{
"issues": [{"url": "https://example.com/about", "priority": "Medium"}],
}]
- google = {"ga4": {"pages": [{"path": "/", "sessions": 999}]}}
+ google = {"ga4": {"top_pages": [{"path": "/", "sessions": 999}]}}
enrich_categories_with_traffic_impact(categories, google)
issue = categories[0]["issues"][0]
assert issue["ga4_sessions"] == 0
@@ -77,7 +77,7 @@ def test_issue_impact_handles_invalid_rows():
],
{
"gsc": {"top_pages": ["bad", {"page": "", "clicks": 1}, {"page": "https://example.com/x", "clicks": 2, "impressions": 5}]},
- "ga4": {"pages": ["bad", {"path": "", "sessions": 1}, {"path": "/x", "sessions": 4}]},
+ "ga4": {"top_pages": ["bad", {"path": "", "sessions": 1}, {"path": "/x", "sessions": 4}]},
},
)
assert sort_issues_by_impact(
diff --git a/tests/test_analysis_crawl_stores_edge_unit.py b/tests/test_analysis_crawl_stores_edge_unit.py
index 29a79f22..3394c116 100644
--- a/tests/test_analysis_crawl_stores_edge_unit.py
+++ b/tests/test_analysis_crawl_stores_edge_unit.py
@@ -259,12 +259,12 @@ def __getitem__(self, key):
batch = CrawlConn()
cs.write_crawl_batch(batch, [(1, "u", "200", "t", "static", _json_val({}))], 1) # type: ignore[arg-type]
- rconn = CrawlConn(fetchall=[{"url": "u", "data": {}}])
- df = cs._read_crawl_rows(rconn, 1, include_fetch_method=True) # type: ignore[arg-type]
+ rconn = CrawlConn(fetchall=[{"url": "u", "fetch_method": "static", "data": {}}])
+ df = cs.read_crawl(rconn, run_id=1) # type: ignore[arg-type]
assert df.iloc[0]["fetch_method"] == "static"
rconn2 = CrawlConn(fetchall=[{"url": "u", "data": {"fetch_method": "rendered"}}])
- df2 = cs._read_crawl_rows(rconn2, 1, include_fetch_method=False) # type: ignore[arg-type]
+ df2 = cs.read_crawl(rconn2, run_id=1) # type: ignore[arg-type]
assert df2.iloc[0]["fetch_method"] == "rendered"
nconn = CrawlConn()
diff --git a/tests/test_commands_config_stores_edge_unit.py b/tests/test_commands_config_stores_edge_unit.py
index ceab5a6b..b78539fb 100644
--- a/tests/test_commands_config_stores_edge_unit.py
+++ b/tests/test_commands_config_stores_edge_unit.py
@@ -639,18 +639,23 @@ class AlwaysBoom(CrawlConn):
def execute(self, sql, params=None):
raise RuntimeError("x")
- assert cs.read_crawl(AlwaysBoom()).empty # type: ignore[arg-type]
+ with pytest.raises(RuntimeError, match="x"):
+ cs.read_crawl(AlwaysBoom()) # type: ignore[arg-type]
rconn = CrawlConn(fetchall=[{"url": "u", "fetch_method": "rendered", "data": {}}])
monkeypatch.setattr(cs, "get_latest_crawl_run_id", lambda _c: None)
df = cs.read_crawl(rconn, run_id=None) # type: ignore[arg-type]
assert df.iloc[0]["fetch_method"] == "rendered"
- rconn2 = CrawlConn(fetchall=[{"url": "u", "data": {"fetch_method": "static"}}])
+ rconn2 = CrawlConn(fetchall=[{"url": "u", "fetch_method": "static", "data": {}}])
monkeypatch.setattr(cs, "get_latest_crawl_run_id", lambda _c: 2)
- df2 = cs._read_crawl_rows(rconn2, 2, include_fetch_method=False) # type: ignore[arg-type]
+ df2 = cs.read_crawl(rconn2, run_id=2) # type: ignore[arg-type]
assert df2.iloc[0]["fetch_method"] == "static"
+ rconn3 = CrawlConn(fetchall=[{"url": "u", "data": {}}])
+ df3 = cs.read_crawl(rconn3, run_id=2) # type: ignore[arg-type]
+ assert df3.iloc[0]["fetch_method"] == "static"
+
nconn = CrawlConn()
monkeypatch.setattr(cs, "get_latest_crawl_run_id", lambda _c: None)
cs.write_nodes(nconn, pd.DataFrame([{"url": "https://a.com", "count": 1}]), crawl_run_id=None) # type: ignore[arg-type]
@@ -971,7 +976,7 @@ def execute(self, sql, params=None):
)
rconn = CrawlConn(fetchall=[{"url": "u", "fetch_method": None, "data": {}}])
- df = cs._read_crawl_rows(rconn, 1, include_fetch_method=True) # type: ignore[arg-type]
+ df = cs.read_crawl(rconn, run_id=1) # type: ignore[arg-type]
assert df.iloc[0]["fetch_method"] == "static"
nconn = CrawlConn()
diff --git a/tests/test_common_analysis_commands_db_unit.py b/tests/test_common_analysis_commands_db_unit.py
index 82644927..f5aa3c98 100644
--- a/tests/test_common_analysis_commands_db_unit.py
+++ b/tests/test_common_analysis_commands_db_unit.py
@@ -819,37 +819,14 @@ def test_config_store_read_write_pipeline(monkeypatch) -> None:
def test_crawl_store_branches(monkeypatch) -> None:
from website_profiling.db import crawl_store as cs
- # create_crawl_run fallback without render_mode
- conn = CrawlConn(fetchone={"id": 3}, boom_execute=True)
- conn.boom_execute = False
-
- class BoomFirst(CrawlConn):
- def execute(self, sql, params=None):
- self.executed.append((sql, params))
- if "render_mode" in sql:
- raise RuntimeError("no column")
- return super().execute(sql, params)
-
- conn2 = BoomFirst(fetchone={"id": 4})
- assert cs.create_crawl_run(conn2, start_url="https://a.com", render_mode="js") == 4 # type: ignore[arg-type]
+ conn = CrawlConn(fetchone={"id": 3})
+ assert cs.create_crawl_run(conn, start_url="https://a.com", render_mode="js") == 3 # type: ignore[arg-type]
assert cs.get_latest_crawl_run_id(CrawlConn(boom_execute=True)) is None # type: ignore[arg-type]
info_conn = CrawlConn(fetchone={"created_at": "t", "start_url": "u", "render_mode": "static"})
assert cs.get_crawl_run_info(info_conn, 1)["render_mode"] == "static" # type: ignore[arg-type]
- # fallback query without render_mode
- class RenderBoom(CrawlConn):
- def execute(self, sql, params=None):
- self.executed.append((sql, params))
- if "render_mode" in sql:
- raise RuntimeError("no render_mode")
- if "FROM crawl_runs WHERE" in sql:
- return FakeCursor(fetchone_value={"created_at": "t", "start_url": "u"})
- return super().execute(sql, params)
-
- assert cs.get_crawl_run_info(RenderBoom(), 1)["start_url"] == "u" # type: ignore[arg-type]
-
row = pd.Series({"url": "https://a.com", "status": float("nan"), "n": 1})
out = cs._df_row_to_crawl_json(row)
assert out["status"] is None
@@ -866,30 +843,12 @@ def execute(self, sql, params=None):
df = pd.DataFrame([{"url": "https://a.com/", "status": 200}])
cs.write_crawl(wconn, df, crawl_run_id=None) # type: ignore[arg-type]
- # legacy insert fallback
- def boom_executemany(conn, sql, params, **kwargs):
- if "fetch_method" in sql:
- raise RuntimeError("legacy")
- from website_profiling.db._common import _executemany as real
-
- return real(conn, sql, params, page_size=kwargs.get("page_size", 500))
-
- monkeypatch.setattr(cs, "_executemany", boom_executemany)
from website_profiling.db._common import _json_val
cs._write_crawl_rows(wconn, [(1, "u", "200", "t", "static", _json_val({}))]) # type: ignore[arg-type]
- # read_crawl fallback without fetch_method
- rconn = CrawlConn(fetchall=[{"url": "u", "data": {"viewport_present": "true"}}])
-
- class FailFirst(CrawlConn):
- def execute(self, sql, params=None):
- if "fetch_method" in sql:
- raise RuntimeError("no fm")
- return super().execute(sql, params)
-
monkeypatch.setattr(cs, "get_latest_crawl_run_id", lambda _c: 1)
- df_read = cs.read_crawl(FailFirst(fetchall=[{"url": "u", "data": {}}]), run_id=1) # type: ignore[arg-type]
+ df_read = cs.read_crawl(CrawlConn(fetchall=[{"url": "u", "fetch_method": "static", "data": {}}]), run_id=1) # type: ignore[arg-type]
assert "fetch_method" in df_read.columns
# write_edges no run id, no latest
diff --git a/tests/test_competitor_gap_store.py b/tests/test_competitor_gap_store.py
index 6e6af39f..3dcdf72e 100644
--- a/tests/test_competitor_gap_store.py
+++ b/tests/test_competitor_gap_store.py
@@ -1,14 +1,12 @@
"""Tests for competitor keyword gap store."""
from __future__ import annotations
-import json
import os
from unittest.mock import MagicMock, patch
import pytest
from website_profiling.integrations.keywords.competitor_gap_store import (
- _migrate_legacy_config_if_empty,
merge_competitor_keyword_import,
read_competitor_keyword_gap,
write_competitor_keyword_gap,
@@ -68,16 +66,9 @@ def test_read_returns_empty_when_data_not_list() -> None:
assert read_competitor_keyword_gap(conn, 2) == []
-def test_read_migrates_when_row_missing() -> None:
+def test_read_returns_empty_when_row_missing() -> None:
conn = _mock_conn_with_row(None)
- legacy = [{"keyword": "legacy", "competitor": "old.com"}]
- with patch(
- "website_profiling.integrations.keywords.competitor_gap_store._migrate_legacy_config_if_empty",
- return_value=legacy,
- ) as migrate:
- out = read_competitor_keyword_gap(conn, 9)
- migrate.assert_called_once_with(conn, 9)
- assert out == legacy
+ assert read_competitor_keyword_gap(conn, 9) == []
def test_read_returns_empty_on_db_error() -> None:
@@ -86,47 +77,6 @@ def test_read_returns_empty_on_db_error() -> None:
assert read_competitor_keyword_gap(conn, 1) == []
-def test_migrate_legacy_empty_config() -> None:
- conn = MagicMock()
- with patch(
- "website_profiling.db.config_store.read_pipeline_config",
- return_value=({}, []),
- ):
- assert _migrate_legacy_config_if_empty(conn, 1) == []
-
-
-def test_migrate_legacy_parses_and_writes() -> None:
- conn = MagicMock()
- rows = [{"keyword": "kw", "competitor": "c.com"}]
- raw = json.dumps(rows)
- with patch(
- "website_profiling.db.config_store.read_pipeline_config",
- return_value=({"competitor_keyword_gap_json": raw}, []),
- ):
- out = _migrate_legacy_config_if_empty(conn, 4)
- assert out == rows
- conn.execute.assert_called()
- conn.commit.assert_called()
-
-
-def test_migrate_legacy_ignores_non_list_json() -> None:
- conn = MagicMock()
- with patch(
- "website_profiling.db.config_store.read_pipeline_config",
- return_value=({"competitor_keyword_gap_json": json.dumps({"bad": True})}, []),
- ):
- assert _migrate_legacy_config_if_empty(conn, 1) == []
-
-
-def test_migrate_legacy_returns_empty_on_error() -> None:
- conn = MagicMock()
- with patch(
- "website_profiling.db.config_store.read_pipeline_config",
- side_effect=RuntimeError("fail"),
- ):
- assert _migrate_legacy_config_if_empty(conn, 1) == []
-
-
def _require_database_url() -> None:
if not (os.environ.get("DATABASE_URL") or "").strip():
pytest.skip("DATABASE_URL not set")
@@ -155,12 +105,6 @@ def roundtrip_property_id() -> int:
return _integration_property_id("competitor-gap-roundtrip.example")
-@pytest.fixture
-def migrate_property_id() -> int:
- _require_database_url()
- return _integration_property_id("competitor-gap-migrate.example")
-
-
@pytest.mark.integration
def test_competitor_gap_db_roundtrip(roundtrip_property_id: int) -> None:
from website_profiling.db import db_session
@@ -192,38 +136,3 @@ def test_competitor_gap_db_roundtrip(roundtrip_property_id: int) -> None:
assert len(merged3) == 2
assert {r["keyword"] for r in merged3} == {"kw2", "new-kw"}
assert read_competitor_keyword_gap(conn, roundtrip_property_id) == merged3
-
-
-@pytest.mark.integration
-def test_migrate_legacy_config_from_pipeline(migrate_property_id: int) -> None:
- from website_profiling.db import db_session
-
- legacy_rows = [{"keyword": "from-config", "competitor": "legacy.com"}]
- with db_session() as conn:
- conn.execute(
- """
- INSERT INTO pipeline_config (key, value, is_unknown, updated_at)
- VALUES (%s, %s, false, now())
- ON CONFLICT (key) DO UPDATE SET
- value = EXCLUDED.value,
- is_unknown = false,
- updated_at = now()
- """,
- ("competitor_keyword_gap_json", json.dumps(legacy_rows)),
- )
- conn.execute(
- "DELETE FROM competitor_keyword_gap WHERE property_id = %s",
- (migrate_property_id,),
- )
- conn.commit()
- rows = read_competitor_keyword_gap(conn, migrate_property_id)
- assert rows == legacy_rows
- conn.execute(
- "DELETE FROM pipeline_config WHERE key = %s",
- ("competitor_keyword_gap_json",),
- )
- conn.execute(
- "DELETE FROM competitor_keyword_gap WHERE property_id = %s",
- (migrate_property_id,),
- )
- conn.commit()
diff --git a/tests/test_crawl_gap_coverage.py b/tests/test_crawl_gap_coverage.py
index 2468ae01..9f9867f5 100644
--- a/tests/test_crawl_gap_coverage.py
+++ b/tests/test_crawl_gap_coverage.py
@@ -214,20 +214,6 @@ def test_write_and_read_link_edges(monkeypatch) -> None:
assert cs.read_link_edges(CrawlConn(), run_id=None) == [] # type: ignore[arg-type]
-def test_create_crawl_run_discovery_mode_fallback() -> None:
- from website_profiling.db import crawl_store as cs
-
- class BoomDisc(CrawlConn):
- def execute(self, sql, params=None):
- self.executed.append((sql, params))
- if "discovery_mode" in sql:
- raise RuntimeError("no discovery_mode")
- return super().execute(sql, params)
-
- conn = BoomDisc(fetchone={"id": 12})
- assert cs.create_crawl_run(conn, start_url="https://a.com", discovery_mode="list") == 12 # type: ignore[arg-type]
-
-
def test_create_crawl_run_raises_when_all_statements_fail() -> None:
from website_profiling.db import crawl_store as cs
diff --git a/tests/test_db_pipeline_jobs_unit.py b/tests/test_db_pipeline_jobs_unit.py
new file mode 100644
index 00000000..3d07d46d
--- /dev/null
+++ b/tests/test_db_pipeline_jobs_unit.py
@@ -0,0 +1,298 @@
+"""Unit tests for website_profiling.db.pipeline_jobs using FakeConn."""
+from __future__ import annotations
+
+import pytest
+import sys
+import os
+sys.path.insert(0, os.path.dirname(__file__))
+from db_test_fakes import FakeConn, FakeCursor
+
+from website_profiling.db.pipeline_jobs import (
+ PIPELINE_LOG_MAX,
+ PIPELINE_LOG_TRIM,
+ _trim_log,
+ append_job_log,
+ cancel_job_in_db,
+ check_flags,
+ enqueue_job,
+ finish_job,
+ get_active_job,
+ get_job,
+ list_jobs,
+ reconcile_stale_jobs,
+ set_cancel_flag,
+ set_pause_flag,
+ try_claim_pending_job,
+)
+
+
+# ── _trim_log ─────────────────────────────────────────────────────────────────
+
+def test_trim_log_no_truncation():
+    """Appending to a short log concatenates the text and reports no truncation."""
+    combined, was_truncated = _trim_log("hello", " world")
+    assert was_truncated is False
+    assert combined == "hello world"
+
+
+def test_trim_log_truncation():
+    """Appending to a log of PIPELINE_LOG_MAX chars yields PIPELINE_LOG_TRIM chars and a truncated flag."""
+    oversized = PIPELINE_LOG_MAX * "x"
+    trimmed, was_truncated = _trim_log(oversized, "extra")
+    assert was_truncated is True
+    assert len(trimmed) == PIPELINE_LOG_TRIM
+
+
+# ── enqueue_job ───────────────────────────────────────────────────────────────
+
+def test_enqueue_job_success(monkeypatch):
+    """enqueue_job returns True and commits once when the statement yields a row."""
+    # Replace the stale-job sweep with a no-op so only the enqueue statement matters.
+    monkeypatch.setattr(
+        "website_profiling.db.pipeline_jobs.reconcile_stale_jobs",
+        lambda c: 0,
+    )
+    fake_conn = FakeConn()
+    # A returned row is the "enqueued" signal.
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value={"id": "abc-123"}))
+
+    enqueued = enqueue_job(fake_conn, "abc-123", "crawl", None, None)
+
+    assert enqueued is True
+    assert fake_conn.commits == 1
+
+
+def test_enqueue_job_already_running(monkeypatch):
+    """enqueue_job returns False when the statement yields no row."""
+    monkeypatch.setattr(
+        "website_profiling.db.pipeline_jobs.reconcile_stale_jobs",
+        lambda c: 0,
+    )
+    fake_conn = FakeConn()
+    # No row back: treated as "a job is already running".
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=None))
+
+    enqueued = enqueue_job(fake_conn, "abc-123", "crawl", None, None)
+
+    assert enqueued is False
+
+
+# ── try_claim_pending_job ─────────────────────────────────────────────────────
+
+def test_try_claim_pending_job_returns_job():
+    """A row returned by the claim statement is handed back and the claim is committed."""
+    pending_row = {
+        "id": "job-1",
+        "job_type": "crawl",
+        "command": None,
+        "property_id": None,
+    }
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=pending_row))
+
+    claimed = try_claim_pending_job(fake_conn, worker_pid=1234)
+
+    assert claimed is not None
+    assert claimed["id"] == "job-1"
+    assert claimed["job_type"] == "crawl"
+    assert fake_conn.commits == 1
+
+
+def test_try_claim_pending_job_returns_none():
+    """With no row available to claim, try_claim_pending_job returns None."""
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=None))
+    claimed = try_claim_pending_job(fake_conn, worker_pid=1234)
+    assert claimed is None
+
+
+# ── append_job_log ────────────────────────────────────────────────────────────
+
+def test_append_job_log_no_row():
+    """When the job row is missing, append_job_log returns False and rolls back once."""
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=None))
+
+    truncated = append_job_log(fake_conn, "job-1", "some output")
+
+    assert truncated is False
+    assert fake_conn.rollbacks == 1
+
+
+def test_append_job_log_appends_successfully():
+    """Appending a small chunk to an existing log commits once and reports no truncation."""
+    existing_row = {"log_text": "existing", "log_truncated": False}
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=existing_row))
+    fake_conn.set_next_cursor(FakeCursor())  # UPDATE
+
+    truncated = append_job_log(fake_conn, "job-1", " more")
+
+    assert truncated is False  # not truncated
+    assert fake_conn.commits == 1
+
+
+def test_append_job_log_error_calls_rollback():
+    """A failure on the FOR UPDATE statement surfaces to the caller as RuntimeError.
+
+    NOTE(review): despite the name, nothing here asserts on rollbacks, and
+    test_append_job_log_error_before_rollback pins ``rollbacks == 0`` for the
+    same failure. Confirm which behaviour is intended.
+    """
+
+    class _FailOnLock(FakeConn):
+        def execute(self, sql: str, params=None):  # type: ignore[override]
+            self.executed.append((sql, params))
+            if "FOR UPDATE" not in sql:
+                return FakeCursor()
+            raise RuntimeError("db error")
+
+    failing_conn = _FailOnLock()
+    with pytest.raises(RuntimeError):
+        append_job_log(failing_conn, "job-1", "chunk")
+
+
+def test_append_job_log_error_before_rollback():
+    """FOR UPDATE errors propagate without attempting rollback."""
+
+    class _FailOnLock(FakeConn):
+        def execute(self, sql: str, params=None):  # type: ignore[override]
+            self.executed.append((sql, params))
+            if "FOR UPDATE" not in sql:
+                return FakeCursor()
+            raise RuntimeError("db error")
+
+    failing_conn = _FailOnLock()
+    with pytest.raises(RuntimeError, match="db error"):
+        append_job_log(failing_conn, "job-1", "chunk")
+    # Checked after the context manager exits so the assertion actually runs.
+    assert failing_conn.rollbacks == 0
+
+
+# ── finish_job ────────────────────────────────────────────────────────────────
+
+def test_finish_job_without_log_truncated():
+    """Without a log_truncated argument the first statement does not mention that column."""
+    fake_conn = FakeConn()
+    finish_job(fake_conn, "job-1", "completed", 0)
+    assert fake_conn.commits == 1
+    first_statement = fake_conn.executed[0]
+    assert "log_truncated" not in first_statement[0]
+
+
+def test_finish_job_with_log_truncated():
+    """Passing log_truncated=True makes the first statement reference that column."""
+    fake_conn = FakeConn()
+    finish_job(fake_conn, "job-1", "error", 1, error="oops", log_truncated=True)
+    assert fake_conn.commits == 1
+    first_statement = fake_conn.executed[0]
+    assert "log_truncated" in first_statement[0]
+
+
+# ── check_flags ───────────────────────────────────────────────────────────────
+
+def test_check_flags_returns_false_when_no_row():
+    """A missing job row reads as neither cancel nor pause requested."""
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=None))
+
+    cancel_requested, pause_requested = check_flags(fake_conn, "job-1")
+
+    assert cancel_requested is False
+    assert pause_requested is False
+
+
+def test_check_flags_returns_values():
+    """check_flags returns the row's cancel and pause flags, in that order."""
+    flag_row = {"cancel_requested": True, "pause_requested": False}
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=flag_row))
+
+    cancel_requested, pause_requested = check_flags(fake_conn, "job-1")
+
+    assert cancel_requested is True
+    assert pause_requested is False
+
+
+# ── set_cancel_flag / set_pause_flag ─────────────────────────────────────────
+
+def test_set_cancel_flag():
+    """set_cancel_flag commits once and issues SQL that mentions cancel_requested."""
+    fake_conn = FakeConn()
+    set_cancel_flag(fake_conn, "job-1")
+    assert fake_conn.commits == 1
+    statements = (stmt for stmt, _ in fake_conn.executed)
+    assert any("cancel_requested" in stmt for stmt in statements)
+
+
+def test_set_pause_flag():
+    """set_pause_flag commits once and issues SQL that mentions pause_requested."""
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value={"id": "job-1"}))
+    set_pause_flag(fake_conn, "job-1")
+    assert fake_conn.commits == 1
+    statements = (stmt for stmt, _ in fake_conn.executed)
+    assert any("pause_requested" in stmt for stmt in statements)
+
+
+# ── reconcile_stale_jobs ──────────────────────────────────────────────────────
+
+def test_reconcile_stale_jobs():
+    """Smoke test: reconcile_stale_jobs on a default FakeConn returns an int."""
+    reconciled = reconcile_stale_jobs(FakeConn())
+    assert isinstance(reconciled, int)
+
+
+def test_reconcile_stale_jobs_commits_when_updated():
+    """One stale pending job plus one stale running job gives a count of 2 and at least one commit."""
+    stale_pending = FakeCursor(fetchall_value=[{"id": "j1"}])  # first SELECT
+    stale_running = FakeCursor(fetchall_value=[{"id": "j2"}])  # second SELECT
+    fake_conn = FakeConn()
+    for queued in (stale_pending, stale_running):
+        fake_conn.set_next_cursor(queued)
+
+    reconciled = reconcile_stale_jobs(fake_conn)
+
+    assert reconciled == 2
+    assert fake_conn.commits >= 1
+
+
+# ── get_job ───────────────────────────────────────────────────────────────────
+
+def test_get_job_returns_none_when_not_found():
+    """get_job returns None when the lookup yields no row."""
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=None))
+    assert get_job(fake_conn, "no-such-job") is None
+
+
+def test_get_job_returns_dict():
+    """get_job returns a mapping carrying the stored job id when the row exists."""
+    job_row = {
+        "id": "job-1",
+        "job_type": "crawl",
+        "status": "completed",
+        "command": None,
+        "property_id": None,
+        "config_hash": None,
+        "started_at": None,
+        "finished_at": None,
+        "exit_code": 0,
+        "error_text": None,
+        "log_text": "",
+        "log_truncated": False,
+        "cancel_requested": False,
+        "pause_requested": False,
+        "worker_pid": None,
+    }
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=job_row))
+
+    fetched = get_job(fake_conn, "job-1")
+
+    assert fetched is not None
+    assert fetched["id"] == "job-1"
+
+
+# ── list_jobs ─────────────────────────────────────────────────────────────────
+
+def test_list_jobs_returns_empty():
+    """list_jobs returns an empty list when the query yields no rows."""
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchall_value=[]))
+    assert list_jobs(fake_conn, limit=5) == []
+
+
+# ── get_active_job ────────────────────────────────────────────────────────────
+
+def test_get_active_job_returns_none_when_no_active():
+    """get_active_job returns None when the query yields no row."""
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=None))
+    assert get_active_job(fake_conn) is None
+
+
+# ── cancel_job_in_db ──────────────────────────────────────────────────────────
+
+def test_cancel_job_in_db_not_found():
+    """Cancelling an unknown job id returns False."""
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=None))
+    cancelled = cancel_job_in_db(fake_conn, "no-such-job")
+    assert cancelled is False
+
+
+def test_cancel_job_in_db_already_finished():
+    """Cancelling a job whose UPDATE matches no row returns False.
+
+    NOTE(review): the fake setup is identical to test_cancel_job_in_db_not_found;
+    only the job id differs.
+    """
+    fake_conn = FakeConn()
+    # The UPDATE returns no row because the job is already finished (status not in pending/running)
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=None))
+    cancelled = cancel_job_in_db(fake_conn, "job-1")
+    assert cancelled is False
+
+
+def test_cancel_job_in_db_running():
+    """Cancelling returns True when the statement yields a running job's row."""
+    running_row = {"status": "running", "worker_pid": 99}
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=running_row))
+    cancelled = cancel_job_in_db(fake_conn, "job-1")
+    assert cancelled is True
diff --git a/tests/test_db_store_coverage_gaps.py b/tests/test_db_store_coverage_gaps.py
new file mode 100644
index 00000000..e2ee573f
--- /dev/null
+++ b/tests/test_db_store_coverage_gaps.py
@@ -0,0 +1,426 @@
+"""Unit tests for db store modules that lost coverage after legacy-path removal."""
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from unittest.mock import patch
+
+import pytest
+
+from tests.db_test_fakes import FakeConn, FakeCursor
+
+
+def _dt() -> datetime:
+    """Return a fixed timezone-aware timestamp (2024-06-01 12:00:00 UTC) for fixtures."""
+    fixed = datetime(year=2024, month=6, day=1, hour=12, minute=0, second=0, tzinfo=timezone.utc)
+    return fixed
+
+
+def test_config_store_llm_full_and_app_settings() -> None:
+    """Cover config_store's LLM-config reader and app-setting read/write helpers.
+
+    Checks the populated, empty and erroring paths of ``read_llm_config_full``
+    and ``read_app_setting``, then that ``write_app_setting`` commits once and
+    targets the ``app_settings`` table.
+    """
+    from website_profiling.db.config_store import (
+        read_app_setting,
+        read_llm_config_full,
+        write_app_setting,
+    )
+
+    # Defined once and reused by both error-path checks below.
+    class BoomConn(FakeConn):
+        def execute(self, sql, params=None):
+            raise RuntimeError("db down")
+
+    conn = FakeConn()
+    conn.set_next_cursor(
+        FakeCursor(
+            fetchall_value=[
+                {"key": "model", "value": "gpt", "is_secret": True},
+            ]
+        )
+    )
+    rows = read_llm_config_full(conn)
+    assert rows == [{"key": "model", "value": "gpt", "is_secret": True}]
+
+    assert read_llm_config_full(FakeConn()) == []  # type: ignore[arg-type]
+
+    # A failing connection yields an empty list rather than an exception.
+    assert read_llm_config_full(BoomConn()) == []  # type: ignore[arg-type]
+
+    conn2 = FakeConn()
+    conn2.set_next_cursor(FakeCursor(fetchone_value={"value": "on"}))
+    assert read_app_setting(conn2, "feature") == "on"
+
+    conn3 = FakeConn()
+    conn3.set_next_cursor(FakeCursor(fetchone_value=None))
+    assert read_app_setting(conn3, "missing") is None
+
+    # A failing connection yields None rather than an exception.
+    assert read_app_setting(BoomConn(), "x") is None  # type: ignore[arg-type]
+
+    wconn = FakeConn()
+    write_app_setting(wconn, "k", "v")
+    assert wconn.commits == 1
+    assert "app_settings" in wconn.executed[0][0]
+
+
+def test_portfolio_store_deletes() -> None:
+    """Exercise the three portfolio delete helpers against queued fake cursors."""
+    from website_profiling.db.portfolio_store import (
+        delete_portfolio_crawl_run,
+        delete_portfolio_item,
+        delete_portfolio_report,
+    )
+
+    def _primed(*cursors):
+        # Build a FakeConn whose next cursors are served in the given order.
+        fake = FakeConn()
+        for queued in cursors:
+            fake.set_next_cursor(queued)
+        return fake
+
+    # A returned row means the report was deleted.
+    report_conn = _primed(FakeCursor(fetchone_value={"id": 1}))
+    assert delete_portfolio_report(report_conn, 1) is True
+
+    # No row back means nothing was deleted.
+    run_conn = _primed(FakeCursor(fetchone_value=None))
+    assert delete_portfolio_crawl_run(run_conn, 9) is False
+
+    item_conn = _primed(
+        FakeCursor(fetchone_value={"id": 2}),
+        FakeCursor(fetchone_value={"id": 3}),
+    )
+    assert delete_portfolio_item(item_conn, report_id=2, crawl_run_id=3) is True
+
+
+def test_dashboard_store_crud() -> None:
+    """Run list/get/create/update/delete from dashboard_store against fake cursors."""
+    from website_profiling.db.dashboard_store import (
+        create_dashboard,
+        delete_dashboard,
+        get_dashboard,
+        list_dashboards,
+        update_dashboard,
+    )
+
+    def _primed(*cursors):
+        # Build a FakeConn whose next cursors are served in the given order.
+        fake = FakeConn()
+        for queued in cursors:
+            fake.set_next_cursor(queued)
+        return fake
+
+    stamp = datetime(2024, 1, 1, tzinfo=timezone.utc)
+    dash_row = {
+        "id": 1,
+        "property_id": 5,
+        "name": "Dash",
+        "layout_json": {"widgets": []},
+        "is_default": False,
+        "created_at": stamp,
+        "updated_at": stamp,
+    }
+
+    # list
+    list_conn = _primed(FakeCursor(fetchall_value=[dash_row]))
+    dashboards = list_dashboards(list_conn, 5)
+    assert dashboards[0]["name"] == "Dash"
+
+    # get: hit, then miss
+    hit_conn = _primed(FakeCursor(fetchone_value=dash_row))
+    assert get_dashboard(hit_conn, 1, 5)["id"] == 1
+    miss_conn = _primed(FakeCursor(fetchone_value=None))
+    assert get_dashboard(miss_conn, 99, 5) is None
+
+    # create
+    create_conn = _primed(FakeCursor(fetchone_value=dash_row))
+    created = create_dashboard(create_conn, 5, "New", {"a": 1})
+    assert created["propertyId"] == 5
+    assert create_conn.commits == 1
+
+    # update: the first cursor serves the "clear defaults" statement
+    default_row = {**dash_row, "is_default": True}
+    update_conn = _primed(FakeCursor(), FakeCursor(fetchone_value=default_row))
+    updated = update_dashboard(
+        update_conn,
+        1,
+        5,
+        name="Renamed",
+        layout_json={"b": 2},
+        is_default=True,
+    )
+    assert updated and updated["isDefault"] is True
+
+    # delete
+    delete_conn = _primed(FakeCursor(fetchone_value={"id": 1}))
+    assert delete_dashboard(delete_conn, 1, 5) is True
+
+
+def test_issue_status_store() -> None:
+    """Cover fingerprinting, listing, upserting and the error paths of issue_status_store."""
+    from website_profiling.db.issue_status_store import (
+        issue_fingerprint,
+        list_issue_status,
+        upsert_issue_status,
+    )
+
+    fingerprint = issue_fingerprint("msg", "https://ex.com", "cat")
+    assert len(fingerprint) == 32
+
+    status_row = {
+        "id": 1,
+        "property_id": 2,
+        "report_id": 3,
+        "issue_fingerprint": fingerprint,
+        "category_id": "cat",
+        "message": "msg",
+        "url": "https://ex.com",
+        "priority": "High",
+        "status": "open",
+        "assignee": None,
+        "note": None,
+        "updated_at": _dt(),
+    }
+
+    list_conn = FakeConn()
+    list_conn.set_next_cursor(FakeCursor(fetchall_value=[status_row]))
+    listed = list_issue_status(list_conn, 2)
+    assert listed[0]["status"] == "open"
+
+    upsert_conn = FakeConn()
+    upsert_conn.set_next_cursor(FakeCursor(fetchone_value=status_row))
+    upserted = upsert_issue_status(
+        upsert_conn,
+        property_id=2,
+        message="msg",
+        status="fixed",
+        url="https://ex.com",
+        category_id="cat",
+    )
+    # The result mirrors the row the fake cursor returns, not the requested status.
+    assert upserted["status"] == "open"
+
+    # An unknown status value is rejected.
+    with pytest.raises(ValueError, match="invalid status"):
+        upsert_issue_status(
+            FakeConn(),
+            property_id=1,
+            message="x",
+            status="bogus",
+        )
+
+    # No row back from the upsert is reported as a failure.
+    empty_conn = FakeConn()
+    empty_conn.set_next_cursor(FakeCursor(fetchone_value=None))
+    with pytest.raises(RuntimeError, match="upsert failed"):
+        upsert_issue_status(empty_conn, property_id=1, message="x", status="open")
+
+
+def test_content_draft_store_paths() -> None:
+    """Exercise list/get/create/update/delete in content_draft_store against fake cursors."""
+    from website_profiling.db.content_draft_store import (
+        create_content_draft,
+        delete_content_draft,
+        get_content_draft,
+        list_content_drafts,
+        update_content_draft,
+    )
+
+    list_row = {
+        "id": 1,
+        "property_id": 2,
+        "title": "T",
+        "target_keyword": "kw",
+        "landing_url": None,
+        "status": "draft",
+        "grade_score": 88.5,
+        "created_at": "2024-01-01",
+        "updated_at": "2024-01-01",
+    }
+    detail_row = {
+        **list_row,
+        # NOTE(review): this literal was split across two lines (a syntax error),
+        # apparently by markup stripping; "<p>x</p>" is the presumed original.
+        "body_html": "<p>x</p>",
+        "title_tag": "tag",
+        "meta_description": "desc",
+        "grade_snapshot": {"score": 80},
+    }
+
+    conn = FakeConn()
+    conn.set_next_cursor(FakeCursor(fetchall_value=[list_row]))
+    drafts = list_content_drafts(conn, 2)
+    assert drafts[0]["grade_score"] == 88.5
+
+    # A NULL grade must come through as None.
+    none_grade_row = {**list_row, "grade_score": None}
+    conn_none = FakeConn()
+    conn_none.set_next_cursor(FakeCursor(fetchall_value=[none_grade_row]))
+    assert list_content_drafts(conn_none, 2)[0]["grade_score"] is None
+
+    conn2 = FakeConn()
+    conn2.set_next_cursor(FakeCursor(fetchone_value=detail_row))
+    assert get_content_draft(conn2, 1)["title"] == "T"
+
+    conn3 = FakeConn()
+    conn3.set_next_cursor(FakeCursor(fetchone_value={"id": 10}))
+    assert create_content_draft(conn3, 2, title="New") == 10
+
+    conn4 = FakeConn()
+    conn4.set_next_cursor(FakeCursor(fetchone_value=detail_row))
+    patched = update_content_draft(
+        conn4,
+        1,
+        {
+            "title": "Updated",
+            "target_keyword": "new-kw",
+            "landing_url": "https://ex.com",
+            "status": "published",
+            "body_html": "",
+            "title_tag": "t",
+            "meta_description": "m",
+            "grade_score": 90.0,
+            "grade_snapshot": {"a": 1},
+        },
+    )
+    # The result mirrors the row the fake cursor returns, hence the old title.
+    assert patched and patched["title"] == "T"
+
+    # An empty patch still returns the draft.
+    conn5 = FakeConn()
+    conn5.set_next_cursor(FakeCursor(fetchone_value=detail_row))
+    assert update_content_draft(conn5, 1, {})["id"] == 1
+
+    conn6 = FakeConn()
+    conn6.set_next_cursor(FakeCursor(fetchone_value={"id": 1}))
+    assert delete_content_draft(conn6, 1) is True
+
+
+def test_markdown_store_list_crawl_runs() -> None:
+    """list_markdown_crawl_runs returns rows with created_at ISO-formatted, with or without a property filter."""
+    from website_profiling.db.markdown_store import list_markdown_crawl_runs
+
+    stamp = _dt()
+    run_row = {
+        "id": 7,
+        "created_at": stamp,
+        "start_url": "https://ex.com",
+        "html_page_count": 3,
+        "markdown_page_count": 2,
+    }
+
+    # Filtered by property.
+    filtered_conn = FakeConn()
+    filtered_conn.set_next_cursor(FakeCursor(fetchall_value=[run_row]))
+    filtered = list_markdown_crawl_runs(filtered_conn, property_id=1)
+    first = filtered[0]
+    assert first["html_page_count"] == 3
+    assert first["created_at"] == stamp.isoformat()
+
+    # No property filter.
+    unfiltered_conn = FakeConn()
+    unfiltered_conn.set_next_cursor(FakeCursor(fetchall_value=[run_row]))
+    unfiltered = list_markdown_crawl_runs(unfiltered_conn)
+    assert len(unfiltered) == 1
+
+
+def test_property_store_ops_and_google() -> None:
+ """Exercise property_store lookup, ops, delete, crawl-preset and Google-credential helpers against FakeConn."""
+ from website_profiling.db.property_store import (
+ authorize_property_crawl,
+ apply_property_google_credentials_patch,
+ delete_property,
+ disconnect_property_google,
+ get_property_google_public_status,
+ get_property_google_status,
+ get_property_id_by_domain,
+ get_property_ops,
+ resolve_property_id_for_page,
+ update_property_crawl_preset,
+ update_property_ops,
+ )
+
+ # Row shared by every lookup below; cursors are consumed in the order they are queued.
+ prop_row = {
+ "id": 1,
+ "name": "ex.com",
+ "canonical_domain": "ex.com",
+ "site_url": "https://ex.com",
+ "gsc_site_url": "https://ex.com/",
+ "ga4_property_id": "123",
+ "google_auth_mode": "oauth",
+ "google_refresh_token": "tok",
+ "google_connected_at": _dt(),
+ "google_connected_email": "a@ex.com",
+ "google_date_range_days": 28,
+ "default_crawl_preset": None,
+ "crawl_authorized_at": None,
+ }
+
+ # An empty domain resolves to None.
+ assert get_property_id_by_domain(FakeConn(), "") is None
+
+ # An upper-case domain still resolves to the row's id.
+ conn = FakeConn()
+ conn.set_next_cursor(FakeCursor(fetchone_value=prop_row))
+ assert get_property_id_by_domain(conn, "EX.COM") == 1
+
+ # resolve_property_id_for_page: explicit property id string.
+ conn2 = FakeConn()
+ conn2.set_next_cursor(FakeCursor(fetchone_value=prop_row))
+ assert resolve_property_id_for_page(conn2, "https://ex.com/page", property_id_str="1") == 1
+
+ # resolve_property_id_for_page: explicit domain string.
+ conn3 = FakeConn()
+ conn3.set_next_cursor(FakeCursor(fetchone_value=prop_row))
+ assert resolve_property_id_for_page(conn3, "https://ex.com", domain_str="ex.com") == 1
+
+ # resolve_property_id_for_page: page URL only.
+ conn4 = FakeConn()
+ conn4.set_next_cursor(FakeCursor(fetchone_value=prop_row))
+ assert resolve_property_id_for_page(conn4, "https://ex.com/path") == 1
+
+ # A non-integer property id string resolves to None.
+ assert resolve_property_id_for_page(FakeConn(), "https://ex.com", property_id_str="not-int") is None
+
+ # No matching row resolves to None.
+ conn4b = FakeConn()
+ conn4b.set_next_cursor(FakeCursor(fetchone_value=None))
+ assert resolve_property_id_for_page(conn4b, "https://unknown.com/page") is None
+
+ # First queued lookup misses, second hits; the call still resolves to 1.
+ # NOTE(review): presumably a fallback from domain_str to the page URL's host - confirm in property_store.
+ conn4c = FakeConn()
+ conn4c.set_next_cursor(FakeCursor(fetchone_value=None))
+ conn4c.set_next_cursor(FakeCursor(fetchone_value=prop_row))
+ assert resolve_property_id_for_page(
+ conn4c,
+ "https://ex.com/page",
+ domain_str="missing.com",
+ ) == 1
+
+ # get_property_ops: missing row gives None.
+ conn5 = FakeConn()
+ conn5.set_next_cursor(FakeCursor(fetchone_value=None))
+ assert get_property_ops(conn5, 99) is None
+
+ # get_property_ops: the fake row here is a tuple, and its first element surfaces as schedule_cron.
+ conn6 = FakeConn()
+ conn6.set_next_cursor(FakeCursor(fetchone_value=("cron", "hook", "email")))
+ ops = get_property_ops(conn6, 1)
+ assert ops["schedule_cron"] == "cron"
+
+ uconn = FakeConn()
+ update_property_ops(uconn, 1, schedule_cron="0 0 * * *", alert_webhook_url=None, alert_email="a@ex.com")
+ assert uconn.commits == 1
+
+ dconn = FakeConn()
+ dconn.set_next_cursor(FakeCursor(fetchone_value={"id": 1}))
+ assert delete_property(dconn, 1) is True
+
+ # Two write helpers on the same connection: one commit each.
+ pconn = FakeConn()
+ update_property_crawl_preset(pconn, 1, "starter")
+ authorize_property_crawl(pconn, 1)
+ assert pconn.commits == 2
+
+ # Public Google status for a missing property reports connected == False.
+ conn7 = FakeConn()
+ conn7.set_next_cursor(FakeCursor(fetchone_value=None))
+ missing = get_property_google_public_status(conn7, 404)
+ assert missing["connected"] is False
+
+ # A full credentials patch commits once.
+ gconn = FakeConn()
+ apply_property_google_credentials_patch(
+ gconn,
+ 1,
+ gsc_site_url="https://ex.com/",
+ ga4_property_id="999",
+ date_range_days=14,
+ auth_mode="oauth",
+ connected_email="user@ex.com",
+ refresh_token="new-token",
+ )
+ assert gconn.commits == 1
+
+ # The ga4_property_id value "G-ABC123" is rejected with a ValueError.
+ with pytest.raises(ValueError, match="Analytics property ID"):
+ apply_property_google_credentials_patch(
+ FakeConn(),
+ 1,
+ ga4_property_id="G-ABC123",
+ )
+
+ # A patch with no fields is rejected.
+ with pytest.raises(ValueError, match="No valid fields"):
+ apply_property_google_credentials_patch(FakeConn(), 1)
+
+ dconn2 = FakeConn()
+ disconnect_property_google(dconn2, 1)
+ assert dconn2.commits == 1
+
+ # Full Google status, with both collaborators patched at their defining modules.
+ conn8 = FakeConn()
+ conn8.set_next_cursor(FakeCursor(fetchone_value=prop_row))
+ with patch(
+ "website_profiling.db.google_app_store.read_google_app_settings",
+ return_value={"client_id": "cid"},
+ ), patch(
+ "website_profiling.integrations.google.store.read_last_google_fetched_at_for_property",
+ return_value="2024-01-01",
+ ):
+ status = get_property_google_status(conn8, 1)
+ assert status and status["hasClientId"] is True
+ assert status["lastFetchedAt"] == "2024-01-01"
+
+ # A missing property gives None.
+ conn9 = FakeConn()
+ conn9.set_next_cursor(FakeCursor(fetchone_value=None))
+ assert get_property_google_status(conn9, 1) is None
diff --git a/tests/test_historical_keywords_crawl_store_unit.py b/tests/test_historical_keywords_crawl_store_unit.py
index b4bc4966..5a37a1c6 100644
--- a/tests/test_historical_keywords_crawl_store_unit.py
+++ b/tests/test_historical_keywords_crawl_store_unit.py
@@ -252,7 +252,7 @@ def test_crawl_store_core_helpers(monkeypatch):
{
"RETURNING id": _Cursor(row={"id": 11}),
"ORDER BY id DESC LIMIT 1": _Cursor(row={"id": 9}),
- "WHERE id = %s": _Cursor(row={"created_at": "now", "start_url": "https://a.com"}),
+ "WHERE id = %s": _Cursor(row={"created_at": "now", "start_url": "https://a.com", "render_mode": "static"}),
}
)
assert cs.create_crawl_run(conn, "https://a.com") == 11 # type: ignore[arg-type]
diff --git a/tests/test_issue_impact.py b/tests/test_issue_impact.py
index 1ce5975e..8a0a24bd 100644
--- a/tests/test_issue_impact.py
+++ b/tests/test_issue_impact.py
@@ -19,7 +19,7 @@ def test_enrich_and_sort_issues():
{"message": "high traffic", "url": "https://ex.com/b", "priority": "Medium"},
],
}]
- google = {"gsc": {"pages": [
+ google = {"gsc": {"top_pages": [
{"page": "https://ex.com/b", "clicks": 50, "impressions": 1000},
]}}
enrich_categories_with_traffic_impact(categories, google)
diff --git a/tests/test_link_edges.py b/tests/test_link_edges.py
index 1d17a8de..aad8f935 100644
--- a/tests/test_link_edges.py
+++ b/tests/test_link_edges.py
@@ -148,32 +148,3 @@ def test_parse_links_backward_compat():
title, links = parse_links("https://example.com", html)
assert title == ""
assert links == {"https://example.com/a", "https://example.com/b"}
-
-
-def test_workbook_links_csv_columns():
- from website_profiling.tools.export_crawl_workbook import build_crawl_workbook_zip
- import zipfile
- import io
-
- payload = {
- "link_edges": [
- {
- "from_url": "https://example.com/",
- "to_url": "https://example.com/about",
- "anchor_text": "About",
- "rel": "nofollow",
- "is_nofollow": True,
- "is_sponsored": False,
- "is_ugc": False,
- "link_type": "internal",
- "position": "content",
- }
- ]
- }
- raw = build_crawl_workbook_zip(payload)
- with zipfile.ZipFile(io.BytesIO(raw)) as zf:
- header = zf.read("links.csv").decode("utf-8").splitlines()[0]
- assert "from_url" in header
- assert "anchor_text" in header
- assert "is_nofollow" in header
- assert "position" in header
diff --git a/tests/test_pipeline_lighthouse_url_selection.py b/tests/test_pipeline_lighthouse_url_selection.py
index 2f244dbb..a4e6c12f 100644
--- a/tests/test_pipeline_lighthouse_url_selection.py
+++ b/tests/test_pipeline_lighthouse_url_selection.py
@@ -46,7 +46,7 @@ def test_select_lighthouse_urls_from_gsc_ranks_by_clicks() -> None:
google = {
"gsc": {
- "pages": [
+ "top_pages": [
{"page": "https://a.com/low", "clicks": 1},
{"page": "https://a.com/high", "clicks": 50},
{"page": "https://a.com/missing", "clicks": 99},
@@ -62,7 +62,7 @@ def test_select_lighthouse_urls_from_gsc_ranks_by_clicks() -> None:
def test_select_lighthouse_urls_from_gsc_falls_back_to_crawl() -> None:
from website_profiling.commands.pipeline_cmd import select_lighthouse_urls_from_gsc
- google = {"gsc": {"pages": [{"page": "https://other.com", "clicks": 99}]}}
+ google = {"gsc": {"top_pages": [{"page": "https://other.com", "clicks": 99}]}}
assert select_lighthouse_urls_from_gsc(google, ["https://a.com/a", "https://a.com/b"], 1) == [
"https://a.com/a",
]
@@ -73,7 +73,7 @@ def test_select_lighthouse_urls_from_gsc_skips_bad_rows() -> None:
google = {
"gsc": {
- "pages": [
+ "top_pages": [
"bad-row",
{"page": "", "clicks": 5},
{"page": "https://a.com/x", "clicks": "not-a-number"},
diff --git a/tests/test_property_store_unit.py b/tests/test_property_store_unit.py
index 6de15bf9..ef008b55 100644
--- a/tests/test_property_store_unit.py
+++ b/tests/test_property_store_unit.py
@@ -159,3 +159,27 @@ def test_list_properties_public() -> None:
assert len(rows) == 1
assert rows[0]["google_connected"] is True
assert rows[0]["google_connected_at"] == dt.isoformat()
+
+
+def test_resolve_property_id_for_page_hostname() -> None:
+    """A page URL alone resolves to the matching property id; an empty URL resolves to None."""
+    from website_profiling.db.property_store import resolve_property_id_for_page
+
+    # Every Google-related column is unset except the date range.
+    unset_columns = (
+        "gsc_site_url",
+        "ga4_property_id",
+        "google_auth_mode",
+        "google_refresh_token",
+        "google_connected_at",
+        "google_connected_email",
+    )
+    property_row = {
+        "id": 12,
+        "name": "ex.com",
+        "canonical_domain": "ex.com",
+        "site_url": "https://ex.com",
+        **{column: None for column in unset_columns},
+        "google_date_range_days": 28,
+        "default_crawl_preset": None,
+        "crawl_authorized_at": None,
+    }
+
+    fake_conn = FakeConn()
+    fake_conn.set_next_cursor(FakeCursor(fetchone_value=property_row))
+    resolved = resolve_property_id_for_page(fake_conn, "https://ex.com/about")
+    assert resolved == 12
+
+    assert resolve_property_id_for_page(FakeConn(), "") is None
diff --git a/tests/test_storage_bulk.py b/tests/test_storage_bulk.py
index 776976d8..874dfe90 100644
--- a/tests/test_storage_bulk.py
+++ b/tests/test_storage_bulk.py
@@ -29,6 +29,7 @@ def test_write_crawl_bulk_round_trip(pg_conn):
f"https://example.com/page-{i}",
"200",
f"Page {i}",
+ "static",
Json({"status": "200", "title": f"Page {i}"}),
)
)
diff --git a/tests/tools/test_audit_tools_coverage.py b/tests/tools/test_audit_tools_coverage.py
index 41a95b25..2dbd670b 100644
--- a/tests/tools/test_audit_tools_coverage.py
+++ b/tests/tools/test_audit_tools_coverage.py
@@ -455,7 +455,7 @@ def _read_pair(_conn: MagicMock, rid: int) -> dict:
bl_conn = MagicMock()
bl_conn.execute = MagicMock(return_value=MagicMock(
- fetchall=MagicMock(return_value=[{"captured_at": datetime.now(timezone.utc), "referring_domains": 3, "top_domains": []}]),
+ fetchall=MagicMock(return_value=[{"fetched_at": datetime.now(timezone.utc), "referring_domains": 3, "top_domains": []}]),
))
assert bl_mod.get_backlinks_velocity(bl_conn, ctx, {})["count"] == 1
diff --git a/tests/tools/test_audit_tools_expansion.py b/tests/tools/test_audit_tools_expansion.py
index 51dc9700..9a574921 100644
--- a/tests/tools/test_audit_tools_expansion.py
+++ b/tests/tools/test_audit_tools_expansion.py
@@ -154,6 +154,18 @@ def test_geo_tools_mocked(conn: MagicMock, ctx: AuditToolContext) -> None:
with patch.object(Ctx, "load_payload", return_value=_payload()), patch.object(Ctx, "load_crawl_df", return_value=_crawl_df()), patch(
"website_profiling.tools.audit_tools.geo.geo_tools._fetch_llms_txt",
return_value={"found": False},
+ ), patch(
+ "website_profiling.tools.audit_tools.geo.geo_tools._score_robots_ai_access",
+ return_value={"robots_score": 5},
+ ), patch(
+ "website_profiling.tools.audit_tools.geo.geo_tools._score_meta_signals",
+ return_value={"meta_score": 5},
+ ), patch(
+ "website_profiling.tools.audit_tools.geo.geo_tools._score_freshness_signals",
+ return_value={"freshness_score": 4},
+ ), patch(
+ "website_profiling.tools.audit_tools.geo.geo_tools._fetch_ai_discovery",
+ return_value={"discovery_score": 2, "found_count": 0, "endpoints": {}},
):
geo = dispatch_tool("get_geo_readiness_score", {}, context=ctx, conn=conn)
assert 0 <= geo["geo_readiness_score"] <= 100
diff --git a/tests/tools/test_audit_tools_expansion_coverage.py b/tests/tools/test_audit_tools_expansion_coverage.py
index cc368743..a84121c0 100644
--- a/tests/tools/test_audit_tools_expansion_coverage.py
+++ b/tests/tools/test_audit_tools_expansion_coverage.py
@@ -2,6 +2,7 @@
from __future__ import annotations
import json
+from contextlib import contextmanager
from unittest.mock import MagicMock, patch
import pandas as pd
@@ -127,6 +128,37 @@ def _crawl_df() -> pd.DataFrame:
])
+@contextmanager
+def _patch_geo_readiness_http(
+    *,
+    llms_found: bool = True,
+    robots_score: int = 9,
+    robots_side_effect: Exception | None = None,
+):
+    """Stub the live-HTTP helpers in geo_mod so get_geo_readiness_score stays off the network.
+
+    Args:
+        llms_found: Value reported as ``found`` by the stubbed ``_fetch_llms_txt``;
+            when true the stub also carries a ``depth_score`` of 12.
+        robots_score: ``robots_score`` returned by the stubbed ``_score_robots_ai_access``.
+        robots_side_effect: When given, the robots stub raises this instead of returning.
+    """
+    depth = {"depth_score": 12} if llms_found else {}
+    llms_payload = {"found": llms_found, "depth": depth}
+
+    # Either raise from the robots helper or return a fixed score, never both.
+    if robots_side_effect is None:
+        robots_kwargs = {"return_value": {"robots_score": robots_score}}
+    else:
+        robots_kwargs = {"side_effect": robots_side_effect}
+
+    discovery_payload = {"discovery_score": 4, "found_count": 2, "endpoints": {}}
+
+    with (
+        patch.object(geo_mod, "_fetch_llms_txt", return_value=llms_payload),
+        patch.object(geo_mod, "_score_robots_ai_access", **robots_kwargs),
+        patch.object(geo_mod, "_score_meta_signals", return_value={"meta_score": 7}),
+        patch.object(geo_mod, "_score_freshness_signals", return_value={"freshness_score": 4}),
+        patch.object(geo_mod, "_fetch_ai_discovery", return_value=discovery_payload),
+    ):
+        yield
+
+
def test_payload_extras_edge_paths(conn: MagicMock, ctx: Ctx) -> None:
with patch.object(Ctx, "load_payload", return_value=None):
assert pe_mod.get_rich_results_summary(conn, ctx, {})["missing"] is True
@@ -236,9 +268,8 @@ def test_geo_tools_paths(conn: MagicMock, ctx: Ctx) -> None:
payload = {"ner_site_summary": {"entities": ["Acme", "Widgets"]}}
with patch.object(Ctx, "load_payload", return_value=payload), patch.object(
Ctx, "load_crawl_df", return_value=_crawl_df(),
- ), patch.object(Ctx, "resolve_property_domain", return_value="ex.com"), patch(
- "website_profiling.tools.audit_tools.geo.geo_tools._fetch_llms_txt",
- return_value={"found": True},
+ ), patch.object(Ctx, "resolve_property_domain", return_value="ex.com"), _patch_geo_readiness_http(
+ llms_found=True,
):
geo = geo_mod.get_geo_readiness_score(conn, ctx, {})
assert 0 <= geo["geo_readiness_score"] <= 100
@@ -282,7 +313,7 @@ def test_geo_tools_paths(conn: MagicMock, ctx: Ctx) -> None:
empty_geo = pd.DataFrame([{"url": "https://ex.com/e", "status": "404", "page_analysis": "{}"}])
with patch.object(Ctx, "load_payload", return_value={}), patch.object(Ctx, "load_crawl_df", return_value=empty_geo), patch.object(
Ctx, "resolve_property_domain", return_value="ex.com",
- ), patch("website_profiling.tools.audit_tools.geo.geo_tools._fetch_llms_txt", return_value={"found": False}):
+ ), _patch_geo_readiness_http(llms_found=False):
geo_empty = geo_mod.get_geo_readiness_score(conn, ctx, {})
assert geo_empty["components"]["schema_coverage"] == 0
@@ -292,11 +323,9 @@ def test_geo_readiness_survives_http_task_exception(conn: MagicMock, ctx: Ctx) -
# must degrade to a 0 sub-score, not crash the whole composite score.
with patch.object(Ctx, "load_payload", return_value={}), patch.object(
Ctx, "load_crawl_df", return_value=_crawl_df(),
- ), patch.object(Ctx, "resolve_property_domain", return_value="ex.com"), patch(
- "website_profiling.tools.audit_tools.geo.geo_tools._fetch_llms_txt", return_value={"found": False},
- ), patch(
- "website_profiling.tools.audit_tools.geo.geo_tools._score_robots_ai_access",
- side_effect=RuntimeError("boom"),
+ ), patch.object(Ctx, "resolve_property_domain", return_value="ex.com"), _patch_geo_readiness_http(
+ llms_found=False,
+ robots_side_effect=RuntimeError("boom"),
):
result = geo_mod.get_geo_readiness_score(conn, ctx, {})
assert 0 <= result["geo_readiness_score"] <= 100
@@ -309,11 +338,12 @@ def test_google_ctr_and_keywords(conn: MagicMock, ctx: Ctx) -> None:
gsc_data = {
"gsc": {
- "pages": [
+ "top_pages": [
{"page": "https://ex.com/a", "impressions": 500, "position": 5, "ctr": "0.5%"},
"skip",
{"page": "https://ex.com/b", "impressions": 50, "position": 10, "ctr": "5%"},
{"page": "https://ex.com/c", "impressions": 200, "position": "bad"},
+ {"page": "https://ex.com/d", "impressions": 500, "position": 0, "ctr": "0.5%"},
],
},
}
@@ -326,10 +356,10 @@ def test_google_ctr_and_keywords(conn: MagicMock, ctx: Ctx) -> None:
ctr_kw = kw_mod.list_keywords_ctr_opportunity(conn, ctx, {})
assert ctr_kw["total"] >= 1
- with patch.object(Ctx, "load_google", return_value={"gsc": {"pages": "bad"}}):
+ with patch.object(Ctx, "load_google", return_value={"gsc": {"top_pages": "bad"}}):
assert google_mod.get_gsc_ctr_opportunity_pages(conn, ctx, {})["total"] == 0
- high_ctr = {"gsc": {"pages": [{"page": "https://ex.com/good", "impressions": 1000, "position": 3, "ctr": "15%"}]}}
+ high_ctr = {"gsc": {"top_pages": [{"page": "https://ex.com/good", "impressions": 1000, "position": 3, "ctr": "15%"}]}}
with patch.object(Ctx, "load_google", return_value=high_ctr):
assert google_mod.get_gsc_ctr_opportunity_pages(conn, ctx, {})["total"] == 0
@@ -511,7 +541,7 @@ def test_expansion_coverage_gaps(conn: MagicMock, ctx: Ctx) -> None:
with patch.object(Ctx, "load_crawl_df", return_value=pag_df):
assert pe_mod.get_pagination_audit_summary(conn, ctx, {})["pages_with_rel_next"] == 1
- low_ctr = {"gsc": {"pages": [{"page": "https://ex.com/low", "impressions": 500, "position": 5, "ctr": 0.001}]}}
+ low_ctr = {"gsc": {"top_pages": [{"page": "https://ex.com/low", "impressions": 500, "position": 5, "ctr": 0.001}]}}
with patch.object(Ctx, "load_google", return_value=low_ctr):
assert google_mod.get_gsc_ctr_opportunity_pages(conn, ctx, {})["total"] == 1
diff --git a/tests/tools/test_export_audit.py b/tests/tools/test_export_audit.py
index 5198380c..3a95f087 100644
--- a/tests/tools/test_export_audit.py
+++ b/tests/tools/test_export_audit.py
@@ -1,12 +1,10 @@
"""Tests for audit export helpers."""
from __future__ import annotations
-import pytest
-
from website_profiling.tools import export_audit
-def test_export_html_contains_site_name(monkeypatch):
+def test_export_csv_contains_site_name(monkeypatch):
payload = {
"site_name": "Example Corp",
"report_generated_at": "2026-01-01",
@@ -27,22 +25,19 @@ def test_export_html_contains_site_name(monkeypatch):
}
monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: payload)
- html_out = export_audit.export_audit_html()
- assert "Example Corp" in html_out
- assert "Missing title" in html_out
- assert "Data source glossary" in html_out
+ csv_out = export_audit.export_audit_csv()
+ assert "Example Corp" in csv_out
+ assert "Missing title" in csv_out
-def test_export_pdf_returns_bytes(monkeypatch):
- pytest.importorskip("reportlab")
- payload = {"site_name": "PDF Test", "categories": [], "links": []}
+def test_export_json_returns_payload(monkeypatch):
+ payload = {"site_name": "JSON Test", "categories": [], "links": []}
monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: payload)
- pdf = export_audit.export_audit_pdf()
- assert isinstance(pdf, bytes)
- assert pdf[:4] == b"%PDF"
+ json_out = export_audit.export_audit_json()
+ assert '"JSON Test"' in json_out
-def test_export_html_executive_summary_and_llm_recommendation(monkeypatch):
+def test_export_csv_executive_summary_and_llm_recommendation(monkeypatch):
payload = {
"site_name": "Exec Site",
"report_generated_at": "2026-06-01",
@@ -50,14 +45,6 @@ def test_export_html_executive_summary_and_llm_recommendation(monkeypatch):
"source": "ai_insights",
"summary": "Overall health is strong with two high-priority gaps.",
"priorities": ["Fix canonical tags on /blog/", "Reduce LCP on homepage"],
- "top_issues": [
- {
- "priority": "high",
- "message": "Slow LCP",
- "url": "https://exec.example/",
- "gsc_clicks": 120,
- }
- ],
},
"categories": [
{
@@ -76,12 +63,7 @@ def test_export_html_executive_summary_and_llm_recommendation(monkeypatch):
"links": [],
}
monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: payload)
- html_out = export_audit.export_audit_html()
csv_out = export_audit.export_audit_csv()
- assert "Executive summary" in html_out
- assert "AI insights" in html_out
- assert "Fix canonical tags on /blog/" in html_out
- assert "Top traffic-impacting issues" in html_out
- assert "Compress hero image" in html_out
assert "# Executive summary" in csv_out
assert "llm_recommendation" in csv_out
+ assert "Compress hero image" in csv_out
diff --git a/tests/tools/test_export_audit_coverage.py b/tests/tools/test_export_audit_coverage.py
index 061eaeb6..fbb511a0 100644
--- a/tests/tools/test_export_audit_coverage.py
+++ b/tests/tools/test_export_audit_coverage.py
@@ -6,6 +6,17 @@
import pytest
from website_profiling.tools import export_audit
+from website_profiling.tools.export_audit_data import (
+ _executive_export_data,
+ _executive_source_label,
+ _format_report_date,
+ _issue_recommendation,
+ _issues_rows,
+ _overall_score,
+ _priority_sort_key,
+ _score_band,
+ _summary_lines,
+)
def _rich_payload() -> dict:
@@ -34,17 +45,6 @@ def _rich_payload() -> dict:
"url": "https://example.com/top",
"gsc_clicks": "bad",
},
- {
- "priority": "medium",
- "message": "Zero clicks",
- "url": "https://example.com/zero",
- "gsc_clicks": 0,
- },
- {
- "priority": "high",
- "message": "x" * 120,
- "url": "https://example.com/" + ("segment/" * 15),
- },
],
},
"categories": [
@@ -56,27 +56,17 @@ def _rich_payload() -> dict:
],
"links": [
{"url": "https://example.com/ok", "status": "200", "title": "OK", "inlinks": 3, "word_count": 100},
- {"url": "https://example.com/redirect", "status": "301", "title": "Redir"},
- {"url": "https://example.com/missing", "status": "404", "title": ""},
- {"url": "https://example.com/error", "status": "500", "title": "Err"},
- {"url": "https://example.com/custom", "status": "200", "custom_extract": "CEF"},
"not-a-dict",
],
"report_meta": {
"data_sources": ["Crawl", "GSC"],
"google_fetched_at": "2026-06-06",
- "export_logo_url": "https://cdn.example/logo.png",
"crawl_scope": {
"pages_crawled": 50,
"max_pages_configured": 100,
"crawl_limited": True,
"render_mode": "javascript",
"js_concurrency": 4,
- "browser_diagnostics": {
- "pages_with_console_errors": 2,
- "total_console_errors": 5,
- "pages_with_page_errors": 1,
- },
},
},
"summary": {
@@ -113,79 +103,35 @@ def test_load_payload_success_and_missing() -> None:
def test_helper_functions_cover_branches() -> None:
payload = _rich_payload()
- rows = export_audit._issues_rows(payload)
+ rows = _issues_rows(payload)
assert len(rows) >= 4
- legacy = export_audit._executive_export_data({"recommendations": ["Only legacy"]})
- assert "Only legacy" in legacy["summary"]
-
- assert export_audit._executive_source_label("ai_insights") == "AI insights"
- assert export_audit._executive_source_label("deterministic") == "Measured + Search Console"
- assert export_audit._executive_source_label("custom") == "custom"
- assert export_audit._executive_source_label("") == "Audit data"
-
- html_block = export_audit._executive_summary_html(payload)
- assert "Executive summary" in html_block
- assert "Top traffic-impacting issues" in html_block
-
- assert export_audit._format_report_date("") == "—"
- assert export_audit._format_report_date("not-a-date") == "not-a-date"
- assert "2026" in export_audit._format_report_date("2026-06-07T12:00:00")
+ legacy = _executive_export_data({"recommendations": ["Only legacy"]})
+ assert legacy["summary"] == ""
- assert export_audit._overall_score({"categories": []}) is None
- assert export_audit._overall_score(payload) == 70
+ assert _executive_source_label("ai_insights") == "AI insights"
+ assert _executive_source_label("deterministic") == "Measured + Search Console"
+ assert _executive_source_label("custom") == "custom"
+ assert _executive_source_label("") == "Audit data"
- assert export_audit._score_band(None) == ("—", "score-na")
- assert export_audit._score_band(85)[1] == "score-good"
- assert export_audit._score_band(65)[1] == "score-fair"
- assert export_audit._score_band(40)[1] == "score-poor"
+ assert _format_report_date("") == "—"
+ assert _format_report_date("not-a-date") == "not-a-date"
+ assert "2026" in _format_report_date("2026-06-07T12:00:00")
- cards = export_audit._category_cards_html(payload["categories"])
- assert "Technical SEO" in cards
- assert export_audit._category_cards_html([]).startswith(" None:
- from website_profiling.tools.export_audit_html import (
- _executive_summary_html,
- _priority_stats_html,
- _report_html_styles,
- )
-
- assert _executive_summary_html({}) == ""
- assert _executive_summary_html({"executive_summary": {}}) == ""
- clicks_payload = {
- "executive_summary": {
- "top_issues": [
- {
- "priority": "high",
- "message": "Traffic issue",
- "url": "https://example.com/hot",
- "gsc_clicks": 42,
- }
- ]
- }
- }
- html_block = _executive_summary_html(clicks_payload)
- assert "42" in html_block
- assert "GSC clicks" in html_block
-
- stats = _priority_stats_html({"critical": 1, "high": 2, "medium": 0, "low": 3})
- assert "stat-critical" in stats
- assert "Critical" in stats
-
- styles = _report_html_styles()
- assert isinstance(styles, str)
- assert len(styles) > 0
-
-
-def test_summary_lines_includes_scope_and_diagnostics() -> None:
- lines = dict(export_audit._summary_lines(_rich_payload()))
+def test_summary_lines_includes_scope() -> None:
+ lines = dict(_summary_lines(_rich_payload()))
assert lines["Property"] == "Coverage Site"
assert "pages crawled" in lines["Crawl scope"]
assert "JavaScript rendering" in lines["Crawl scope"]
- assert "Browser diagnostics" in lines
assert "Google data fetched" in lines
assert "HTTP status mix" in lines
assert lines["Critical issues"] == "55"
@@ -202,86 +148,66 @@ def test_summary_lines_auto_and_static_render_modes() -> None:
}
}
}
- auto_lines = dict(export_audit._summary_lines(auto_scope))
+ auto_lines = dict(_summary_lines(auto_scope))
assert "auto rendering" in auto_lines["Crawl scope"]
static_scope = {
"report_meta": {"crawl_scope": {"pages_crawled": 5, "static_html_only": True}}
}
- static_lines = dict(export_audit._summary_lines(static_scope))
+ static_lines = dict(_summary_lines(static_scope))
assert "static HTML only" in static_lines["Crawl scope"]
def test_issue_recommendation_prefers_llm_when_distinct() -> None:
- rec, llm = export_audit._issue_recommendation(
+ rec, llm = _issue_recommendation(
{"recommendation": "Rule", "llm_recommendation": "LLM fix"}
)
assert rec == "LLM fix"
assert llm == "LLM fix"
-def test_export_json_csv_and_truncated_html(monkeypatch) -> None:
- payload = _rich_payload()
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: payload)
-
- json_out = export_audit.export_audit_json()
- assert '"Coverage Site"' in json_out
-
- csv_out = export_audit.export_audit_csv()
- assert "data_sources" in csv_out
- assert "Measured + Search Console" in csv_out
-
- html_out = export_audit.export_audit_html()
- assert "Site Audit — Coverage Site" in html_out
- assert "Showing 120 of" in html_out
- assert "Audit details" in html_out
- assert "Data source glossary" in html_out
- assert "Crawled URLs (sample)" in html_out
+def test_priority_sort_key_unknown_priority() -> None:
+ assert _priority_sort_key({"priority": "unknown"}) == 9
-def test_export_pdf_full_branches(monkeypatch) -> None:
- pytest.importorskip("reportlab")
- payload = _rich_payload()
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: payload)
-
- pdf = export_audit.export_audit_pdf()
- assert pdf[:4] == b"%PDF"
-
-
-def test_export_pdf_truncates_long_issue_lists(monkeypatch) -> None:
- pytest.importorskip("reportlab")
- issues = [
- {
- "priority": "low",
- "message": "x" * 150,
- "url": "https://example.com/" + ("path/" * 20),
- "recommendation": "fix",
- }
- for _ in range(90)
- ]
+def test_summary_lines_browser_diagnostics() -> None:
payload = {
- "site_name": "Truncate PDF",
- "categories": [{"name": "Technical SEO", "score": 80, "issues": issues}],
- "links": [],
+ "report_meta": {
+ "crawl_scope": {
+ "pages_crawled": 10,
+ "browser_diagnostics": {
+ "pages_with_console_errors": 2,
+ "total_console_errors": 5,
+ "pages_with_page_errors": 1,
+ },
+ }
+ }
}
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: payload)
- pdf = export_audit.export_audit_pdf()
- assert pdf[:4] == b"%PDF"
+ lines = dict(_summary_lines(payload))
+ assert "Browser diagnostics" in lines
+ assert "console errors" in lines["Browser diagnostics"]
-def test_export_pdf_requires_reportlab(monkeypatch) -> None:
- payload = {"site_name": "No PDF", "categories": [], "links": []}
- monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: payload)
+def test_issue_priority_counts() -> None:
+ from website_profiling.tools.export_audit_data import _issue_priority_counts
- import builtins
+ counts = _issue_priority_counts([
+ {"priority": "critical"},
+ {"priority": "High"},
+ {"priority": "unknown"},
+ ])
+ assert counts["critical"] == 1
+ assert counts["high"] == 1
+ assert counts["medium"] == 0
- real_import = builtins.__import__
- def fake_import(name, *args, **kwargs):
- if name == "reportlab.lib" or name.startswith("reportlab."):
- raise ImportError("no reportlab")
- return real_import(name, *args, **kwargs)
+def test_export_json_and_csv(monkeypatch) -> None:
+ payload = _rich_payload()
+ monkeypatch.setattr(export_audit, "_load_payload", lambda _rid=None: payload)
+
+ json_out = export_audit.export_audit_json()
+ assert '"Coverage Site"' in json_out
- with patch("builtins.__import__", side_effect=fake_import):
- with pytest.raises(RuntimeError, match="PDF export requires reportlab"):
- export_audit.export_audit_pdf()
+ csv_out = export_audit.export_audit_csv()
+ assert "data_sources" in csv_out
+ assert "Measured + Search Console" in csv_out
diff --git a/tests/tools/test_export_tools_coverage.py b/tests/tools/test_export_tools_coverage.py
index 4afabc78..ee789e78 100644
--- a/tests/tools/test_export_tools_coverage.py
+++ b/tests/tools/test_export_tools_coverage.py
@@ -36,13 +36,6 @@ def test_export_tools_formats(conn: MagicMock, ctx: Ctx, tmp_path, monkeypatch)
assert dispatch_tool("export_list_as_csv", {}, context=ctx, conn=conn)["error"]
assert dispatch_tool("export_list_as_csv", {"tool_name": "nope"}, context=ctx, conn=conn)["error"]
- with patch.object(Ctx, "load_payload", return_value=payload), patch(
- "website_profiling.tools.audit_tools.export.export_tools.export_audit_html",
- return_value="",
- ):
- out = dispatch_tool("export_audit_report", {"format": "html"}, context=ctx, conn=conn)
- assert out.get("artifact_id")
-
with patch.object(Ctx, "load_payload", return_value=payload), patch(
"website_profiling.tools.audit_tools.export.export_tools.export_audit_json",
return_value="{}",
@@ -51,7 +44,7 @@ def test_export_tools_formats(conn: MagicMock, ctx: Ctx, tmp_path, monkeypatch)
assert out.get("format") == "json"
with patch.object(Ctx, "load_payload", return_value=payload), patch(
- "website_profiling.tools.audit_tools.export.export_tools.export_audit_pdf",
+ "website_profiling.tools.audit_tools.export.export_tools.fetch_report_pdf",
side_effect=FileNotFoundError,
):
assert dispatch_tool("export_audit_report", {"format": "pdf"}, context=ctx, conn=conn)["error"]
@@ -77,8 +70,8 @@ def test_export_tools_formats(conn: MagicMock, ctx: Ctx, tmp_path, monkeypatch)
def test_export_audit_report_paths(conn: MagicMock, ctx: Ctx, tmp_path, monkeypatch) -> None:
monkeypatch.setenv("DATA_DIR", str(tmp_path))
with patch.object(Ctx, "load_payload", return_value=_payload()), patch(
- "website_profiling.tools.audit_tools.export.export_tools.export_audit_pdf",
- return_value=b"%PDF",
+ "website_profiling.tools.audit_tools.export.export_tools.fetch_report_pdf",
+ return_value=b"%PDF-1.4",
):
pdf_out = dispatch_tool("export_audit_report", {"format": "pdf"}, context=ctx, conn=conn)
assert pdf_out.get("format") == "pdf"
diff --git a/tests/tools/test_export_workbook.py b/tests/tools/test_export_workbook.py
deleted file mode 100644
index 1c8f9c01..00000000
--- a/tests/tools/test_export_workbook.py
+++ /dev/null
@@ -1,63 +0,0 @@
-"""Tests for crawl workbook ZIP export."""
-from __future__ import annotations
-
-import zipfile
-import io
-
-from website_profiling.tools.export_crawl_workbook import build_crawl_workbook_zip
-
-
-def test_build_workbook_zip_contains_issues_csv():
- payload = {
- "links": [{"url": "https://ex.com/", "status": "200", "title": "Home", "inlinks": 1, "outlinks": 2}],
- "categories": [{"name": "SEO", "issues": [{"message": "Missing title", "url": "https://ex.com/x", "priority": "High"}]}],
- "link_edges": [{"from_url": "https://ex.com/", "to_url": "https://ex.com/x", "anchor_text": "x", "rel": "", "is_nofollow": False, "link_type": "internal"}],
- }
- raw = build_crawl_workbook_zip(payload)
- with zipfile.ZipFile(io.BytesIO(raw)) as zf:
- names = set(zf.namelist())
- assert "internal_urls.csv" in names
- assert "issues.csv" in names
- assert "links.csv" in names
-
-
-def test_build_workbook_zip_custom_fields_columns():
- payload = {
- "links": [
- {
- "url": "https://ex.com/p",
- "custom_extract": "SKU-1",
- "custom_fields": '{"price":"9.99","sku":"SKU-1"}',
- }
- ],
- }
- raw = build_crawl_workbook_zip(payload)
- with zipfile.ZipFile(io.BytesIO(raw)) as zf:
- body = zf.read("custom_fields.csv").decode("utf-8")
- assert "price" in body
- assert "9.99" in body
- assert "SKU-1" in body
-
-
-def test_build_workbook_zip_all_sheets():
- payload = {
- "links": [
- {
- "url": "https://ex.com/",
- "status": "200",
- "title": "Home",
- "inlinks": 1,
- "outlinks": 2,
- "custom_extract": "x",
- "custom_fields": '{"a":"1"}',
- }
- ],
- "categories": [{"name": "SEO", "issues": [{"message": "x", "url": "https://ex.com/y", "priority": "High"}]}],
- "link_edges": [{"from_url": "https://ex.com/", "to_url": "https://ex.com/y", "anchor_text": "y", "rel": "", "is_nofollow": False, "link_type": "internal"}],
- "redirects": [{"url": "https://ex.com/old", "message": "301", "priority": "Low", "recommendation": "fix"}],
- }
- raw = build_crawl_workbook_zip(payload)
- with zipfile.ZipFile(io.BytesIO(raw)) as zf:
- names = set(zf.namelist())
- assert "custom_fields.csv" in names
- assert "redirects.csv" in names
diff --git a/tests/tools/test_geo_parity.py b/tests/tools/test_geo_parity.py
index 35cd7144..44969d36 100644
--- a/tests/tools/test_geo_parity.py
+++ b/tests/tools/test_geo_parity.py
@@ -4,6 +4,7 @@
from unittest.mock import MagicMock, patch
import pytest
+import requests
# ---------------------------------------------------------------------------
@@ -74,6 +75,45 @@ def test_score_robots_no_domain() -> None:
assert result["checked"] is False
+def test_score_meta_signals_request_error() -> None:
+ with patch(
+ "website_profiling.tools.audit_tools.geo.geo_tools.requests.get",
+ side_effect=requests.RequestException("network"),
+ ):
+ result = _score_meta_signals("example.com")
+ assert result["meta_score"] == 0
+ assert result["checked"] is False
+
+
+def test_score_freshness_request_errors() -> None:
+ with patch(
+ "website_profiling.tools.audit_tools.geo.geo_tools.requests.get",
+ side_effect=requests.RequestException("network"),
+ ):
+ result = _score_freshness_signals("example.com")
+ assert result["freshness_score"] == 0
+ assert result["checked"] is True
+ assert result["has_sitemap"] is False
+ assert result["has_rss_atom_feed"] is False
+
+
+def test_score_robots_ai_access_tier_scoring() -> None:
+ robots = "User-agent: *\nDisallow: /\n"
+ mock_resp = MagicMock()
+ mock_resp.status_code = 200
+ mock_resp.text = robots
+ with patch(
+ "website_profiling.tools.audit_tools.geo.geo_tools.requests.get",
+ return_value=mock_resp,
+ ):
+ result = _score_robots_ai_access("example.com")
+ assert result["checked"] is True
+ assert result["robots_score"] == 0
+ assert result["citation_bots_score"] == 0
+ assert result["search_bots_score"] == 0
+ assert result["training_bots_score"] == 0
+
+
def test_fetch_ai_discovery_no_domain() -> None:
result = _fetch_ai_discovery("")
assert result["found_count"] == 0
diff --git a/tests/tools/test_tools_branch_coverage.py b/tests/tools/test_tools_branch_coverage.py
index 78ec9049..9ac2a9ed 100644
--- a/tests/tools/test_tools_branch_coverage.py
+++ b/tests/tools/test_tools_branch_coverage.py
@@ -13,7 +13,6 @@
from website_profiling.tools import export_artifacts
from website_profiling.tools.audit_tools import _slice, dispatch_tool
from website_profiling.tools.audit_tools.context import AuditToolContext as Ctx
-from website_profiling.tools.export_crawl_workbook import build_crawl_workbook_zip
@pytest.fixture
@@ -387,19 +386,6 @@ def test_export_artifacts_workbook_and_custom(tmp_path, monkeypatch, conn: Magic
export_artifacts.delete_artifact(aid)
assert not meta_path.exists()
- from website_profiling.tools import export_crawl_workbook as wb_mod
-
- assert wb_mod._parse_custom_fields({"price": 9.99}) == {"price": "9.99"}
- assert wb_mod._parse_custom_fields("{bad") == {}
- assert wb_mod._parse_custom_fields("[]") == {}
-
- raw = build_crawl_workbook_zip({
- "links": [{"url": "https://ex.com/p", "custom_fields": '{"price":"9.99"}'}],
- "categories": ["bad", {"name": "SEO", "issues": ["bad", {"message": "x", "priority": "Low"}]}],
- })
- with zipfile.ZipFile(io.BytesIO(raw)) as zf:
- assert "custom_fields.csv" in zf.namelist()
-
def test_tools_remaining_branch_coverage(conn: MagicMock, ctx: Ctx, tmp_path, monkeypatch) -> None:
from website_profiling.tools.audit_tools.backlinks import backlinks as bl_mod
@@ -421,7 +407,6 @@ def test_tools_remaining_branch_coverage(conn: MagicMock, ctx: Ctx, tmp_path, mo
from website_profiling.tools.audit_tools.report import report as report_mod
from website_profiling.tools.audit_tools.report import report_extras as rex_mod
from website_profiling.tools.audit_tools.security import security as sec_mod
- from website_profiling.tools import export_crawl_workbook as wb_mod
monkeypatch.setenv("DATA_DIR", str(tmp_path))
@@ -612,10 +597,6 @@ def test_tools_remaining_branch_coverage(conn: MagicMock, ctx: Ctx, tmp_path, mo
portfolio = llm_mod.get_portfolio_summary(conn, ctx, {})
assert portfolio["properties"][0]["issue_counts"] == {}
- assert wb_mod._parse_custom_fields(" ") == {}
- rows, cols = wb_mod._custom_field_rows([{"url": "", "custom_fields": '{"a":"1"}'}, {"custom_extract": "x"}])
- assert rows == [] and cols
-
assert export_artifacts.read_artifact_bytes("00000000-0000-0000-0000-000000000000") is None
aid = export_artifacts.save_artifact(b"x", filename="y.bin", mime_type="application/octet-stream")["artifact_id"]
with patch("website_profiling.tools.export_artifacts.os.remove", side_effect=OSError("denied")):
@@ -721,6 +702,3 @@ def test_tools_remaining_branch_coverage(conn: MagicMock, ctx: Ctx, tmp_path, mo
], "image_inventory_summary": {"unoptimized_min_kb": 200}}
with patch.object(Ctx, "load_payload", return_value=mixed_inv):
assert img_mod.list_unoptimized_images(conn, ctx, {})["total"] == 0
-
- rows, _ = wb_mod._custom_field_rows(["bad", {"url": "https://ex.com", "custom_fields": '{"a":"1"}'}])
- assert len(rows) == 1
diff --git a/web/app/api/ai/fix-suggestion/route.ts b/web/app/api/ai/fix-suggestion/route.ts
index 78ed2a8f..4d5714f3 100644
--- a/web/app/api/ai/fix-suggestion/route.ts
+++ b/web/app/api/ai/fix-suggestion/route.ts
@@ -1,84 +1,10 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { spawn } from 'child_process';
-import { getRepoRoot, getPipelineSpawnEnv } from '@/server/pipelineSpawnEnv';
-import { resolvePythonExecutable, parsePythonJsonStdout } from '@/server/resolvePython';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
+import { forbiddenIfNotLocal } from '@/server/localOnly';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-const PYTHON_SCRIPT = `
-import json, sys
-from website_profiling.llm.fix_suggestions import generate_fix_suggestion
-payload = json.load(sys.stdin)
-print(json.dumps(generate_fix_suggestion(payload, refresh=bool(payload.get("refresh")))))
-`;
-
-/**
- * POST /api/ai/fix-suggestion — on-demand LLM fix for any audit surface.
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
- let body: {
- source?: string;
- message?: string;
- url?: string;
- refresh?: boolean;
- context?: Record;
- priority?: string;
- category?: string;
- recommendation?: string;
- type?: string;
- };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
- const message = String(body.message || '').trim();
- if (!message) {
- return NextResponse.json({ error: 'message required' }, { status: 400 });
- }
-
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
- const payload = {
- source: body.source || 'issue',
- message,
- url: body.url,
- refresh: body.refresh,
- context: body.context,
- priority: body.priority,
- category: body.category,
- recommendation: body.recommendation,
- type: body.type,
- };
-
- return new Promise((resolve) => {
- const proc = spawn(pythonExe, ['-c', PYTHON_SCRIPT], {
- cwd: repoRoot,
- env: getPipelineSpawnEnv(repoRoot),
- shell: false,
- });
- let stdout = '';
- proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
- proc.stdin?.write(JSON.stringify(payload));
- proc.stdin?.end();
- proc.on('error', () => {
- clearTimeout(timer);
- resolve(NextResponse.json({ error: 'Fix suggestion failed: could not start Python process' }, { status: 500 }));
- });
- proc.on('close', (code) => {
- clearTimeout(timer);
- const parsed = parsePythonJsonStdout(stdout);
- if (code === 0 && parsed) {
- resolve(NextResponse.json(parsed));
- return;
- }
- resolve(NextResponse.json({ error: 'Fix suggestion failed' }, { status: 500 }));
- });
- const timer = setTimeout(() => {
- try { proc.kill(); } catch { /* ignore */ }
- resolve(NextResponse.json({ error: 'Fix suggestion timed out after 90s' }, { status: 504 }));
- }, 90_000);
- });
+ const denied = forbiddenIfNotLocal(request); if (denied) return denied; return proxyToFastAPI(request, '/api/ai/fix-suggestion');
};
diff --git a/web/app/api/alerts/check/route.ts b/web/app/api/alerts/check/route.ts
index 8a7428e3..1d765baa 100644
--- a/web/app/api/alerts/check/route.ts
+++ b/web/app/api/alerts/check/route.ts
@@ -1,71 +1,12 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import { spawn } from 'child_process';
-import path from 'path';
-import { resolvePythonExecutable, formatPythonSpawnError } from '@/server/resolvePython';
-import { getRepoRoot } from '@/server/pipelineSpawnEnv';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/**
- * POST /api/alerts/check?propertyId= — run health alert rules and optional webhook dispatch.
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
- const propertyId = Number(request.nextUrl.searchParams.get('propertyId') || '0');
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
-
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
- const script = `
-import json, sys
-from website_profiling.tools.alert_checker import check_all_alerts, dispatch_webhook, dispatch_email
-from website_profiling.db.storage import db_session
-from website_profiling.db._common import _row_field
-
-property_id = int(sys.argv[1])
-alerts = check_all_alerts(property_id)
-webhook_sent = False
-email_sent = False
-with db_session() as conn:
- cur = conn.execute(
- "SELECT alert_webhook_url, alert_email FROM properties WHERE id = %s",
- (property_id,),
- )
- row = cur.fetchone()
- url = (_row_field(row, "alert_webhook_url", index=0) or "") if row else ""
- email = (_row_field(row, "alert_email", index=1) or "") if row else ""
- payload = {"property_id": property_id, "alerts": alerts}
- if url and alerts:
- webhook_sent = dispatch_webhook(url, payload)
- if email and alerts:
- email_sent = dispatch_email(email, payload)
-print(json.dumps({"alerts": alerts, "webhook_sent": webhook_sent, "email_sent": email_sent}))
-`;
-
- return new Promise((resolve) => {
- const proc = spawn(pythonExe, ['-c', script, String(propertyId)], {
- cwd: repoRoot,
- shell: false,
- });
- let stdout = '';
- proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
- proc.on('error', (err: Error) => {
- resolve(NextResponse.json({ error: formatPythonSpawnError(err, pythonExe, repoRoot) }, { status: 500 }));
- });
- proc.on('close', (code) => {
- try {
- const parsed = JSON.parse(stdout.trim() || '{}');
- resolve(NextResponse.json(parsed, { status: code === 0 ? 200 : 500 }));
- } catch {
- resolve(NextResponse.json({ error: stdout.trim() || 'Alert check failed' }, { status: 500 }));
- }
- });
- });
+ return proxyToFastAPI(request, '/api/alerts/check');
};
diff --git a/web/app/api/app-settings/route.ts b/web/app/api/app-settings/route.ts
index 1b28bd8d..ec9a44a1 100644
--- a/web/app/api/app-settings/route.ts
+++ b/web/app/api/app-settings/route.ts
@@ -1,61 +1,18 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import { loadAppSetting, saveAppSetting } from '@/server/appSettings';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
+export const dynamic = 'force-dynamic';
-/** GET /api/app-settings?key= — Returns { key, value } or { key, value: null }. */
export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
- const key = request.nextUrl.searchParams.get('key');
- if (!key || typeof key !== 'string' || key.trim() === '') {
- return NextResponse.json({ error: 'Missing key query parameter' }, { status: 400 });
- }
-
- try {
- const value = await loadAppSetting(key.trim());
- return NextResponse.json({ key: key.trim(), value });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/app-settings');
};
-/** PUT /api/app-settings — Body: { key: string; value: string } */
export const PUT: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
- let body: unknown;
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
- }
-
- if (
- typeof body !== 'object' ||
- body === null ||
- typeof (body as Record).key !== 'string' ||
- typeof (body as Record).value !== 'string'
- ) {
- return NextResponse.json({ error: 'Body must be { key: string; value: string }' }, { status: 400 });
- }
-
- const { key, value } = body as { key: string; value: string };
-
- if (key.trim() === '') {
- return NextResponse.json({ error: 'key must not be empty' }, { status: 400 });
- }
-
- try {
- await saveAppSetting(key.trim(), value);
- return NextResponse.json({ ok: true });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/app-settings');
};
diff --git a/web/app/api/backlinks/competitor-import/route.ts b/web/app/api/backlinks/competitor-import/route.ts
index 151289ab..7e223d22 100644
--- a/web/app/api/backlinks/competitor-import/route.ts
+++ b/web/app/api/backlinks/competitor-import/route.ts
@@ -1,73 +1,15 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
+import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuth } from '@/server/auth';
-import { spawn } from 'child_process';
-import { getRepoRoot, getPipelineSpawnEnv } from '@/server/pipelineSpawnEnv';
-import { resolvePythonExecutable, parsePythonJsonStdout } from '@/server/resolvePython';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/**
- * POST /api/backlinks/competitor-import
- * Body: { competitor, csvText, ourDomains?: string[] }
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
+ const denied = forbiddenIfNotLocal(request);
+ if (denied) return denied;
const authDenied = requireApiAuth(request);
if (authDenied) return authDenied;
-
- let body: { competitor?: string; csvText?: string; ourDomains?: string[] };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
- const competitor = String(body.competitor || '').trim();
- const csvText = String(body.csvText || '');
- if (!competitor || !csvText.trim()) {
- return NextResponse.json({ error: 'competitor and csvText required' }, { status: 400 });
- }
-
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
- const script = `
-import json, sys
-from website_profiling.integrations.google.competitor_links import (
- parse_referring_domains_from_csv,
- build_competitor_domain_gap,
-)
-payload = json.load(sys.stdin)
-refs = parse_referring_domains_from_csv(payload.get("csvText") or "")
-our = set(payload.get("ourDomains") or [])
-print(json.dumps(build_competitor_domain_gap(our, payload.get("competitor") or "", refs)))
-`;
-
- return new Promise((resolve) => {
- const proc = spawn(pythonExe, ['-c', script], {
- cwd: repoRoot,
- env: getPipelineSpawnEnv(repoRoot),
- shell: false,
- });
- let stdout = '';
- proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
- proc.stdin?.write(
- JSON.stringify({
- competitor,
- csvText,
- ourDomains: body.ourDomains || [],
- }),
- );
- proc.stdin?.end();
- proc.on('error', () => {
- resolve(NextResponse.json({ error: 'Import failed: could not start Python process' }, { status: 500 }));
- });
- proc.on('close', (code) => {
- const parsed = parsePythonJsonStdout(stdout);
- if (code === 0 && parsed) {
- resolve(NextResponse.json({ gap: parsed }));
- return;
- }
- resolve(NextResponse.json({ error: 'Competitor backlink import failed' }, { status: 500 }));
- });
- });
+ return proxyToFastAPI(request, '/api/backlinks/competitor-import');
};
diff --git a/web/app/api/backlinks/third-party-import/route.ts b/web/app/api/backlinks/third-party-import/route.ts
index fe1ef914..e93189a8 100644
--- a/web/app/api/backlinks/third-party-import/route.ts
+++ b/web/app/api/backlinks/third-party-import/route.ts
@@ -1,92 +1,15 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { spawn } from 'child_process';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
+import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuth } from '@/server/auth';
-import { getRepoRoot, getPipelineSpawnEnv } from '@/server/pipelineSpawnEnv';
-import { resolvePythonExecutable, parsePythonJsonStdout } from '@/server/resolvePython';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/**
- * POST /api/backlinks/third-party-import
- * Body: { propertyId, provider: 'moz'|'majestic', csvText, ourDomains?: string[] }
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
+ const denied = forbiddenIfNotLocal(request);
+ if (denied) return denied;
const authDenied = requireApiAuth(request);
if (authDenied) return authDenied;
-
- let body: {
- propertyId?: number;
- provider?: string;
- csvText?: string;
- ourDomains?: string[];
- };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- const propertyId = Number(body.propertyId || 0);
- const provider = String(body.provider || 'moz').trim().toLowerCase();
- const csvText = String(body.csvText || '');
- if (!propertyId || !csvText.trim()) {
- return NextResponse.json({ error: 'propertyId and csvText required' }, { status: 400 });
- }
- if (provider !== 'moz' && provider !== 'majestic') {
- return NextResponse.json({ error: 'provider must be moz or majestic' }, { status: 400 });
- }
-
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
- const script = `
-import json, sys
-from website_profiling.integrations.links.third_party_csv import build_third_party_overlay
-from website_profiling.integrations.google.gsc_links_store import import_third_party_links_overlay
-from website_profiling.db.storage import db_session
-
-payload = json.load(sys.stdin)
-property_id = int(payload["propertyId"])
-overlay = build_third_party_overlay(
- payload.get("provider") or "moz",
- payload.get("csvText") or "",
- payload.get("ourDomains") or [],
-)
-with db_session() as conn:
- result = import_third_party_links_overlay(conn, property_id, overlay)
-print(json.dumps(result))
-`;
-
- return new Promise((resolve) => {
- const proc = spawn(pythonExe, ['-c', script], {
- cwd: repoRoot,
- env: getPipelineSpawnEnv(repoRoot),
- shell: false,
- });
- let stdout = '';
- let stderr = '';
- proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
- proc.stderr?.on('data', (c: Buffer | string) => { stderr += c.toString(); });
- proc.stdin?.write(
- JSON.stringify({
- propertyId,
- provider,
- csvText,
- ourDomains: body.ourDomains || [],
- }),
- );
- proc.stdin?.end();
- proc.on('error', () => {
- resolve(NextResponse.json({ error: 'Import failed: could not start Python process' }, { status: 500 }));
- });
- proc.on('close', (code) => {
- const parsed = parsePythonJsonStdout(stdout);
- if (code === 0 && parsed) {
- resolve(NextResponse.json(parsed));
- return;
- }
- resolve(NextResponse.json({ error: 'Third-party backlink import failed' }, { status: 500 }));
- });
- });
+ return proxyToFastAPI(request, '/api/backlinks/third-party-import');
};
diff --git a/web/app/api/backlinks/velocity/route.ts b/web/app/api/backlinks/velocity/route.ts
index ad116ffd..b2a13048 100644
--- a/web/app/api/backlinks/velocity/route.ts
+++ b/web/app/api/backlinks/velocity/route.ts
@@ -1,41 +1,9 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { withDb } from '@/server/db';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import type { ApiRouteHandler } from '@/types/api';
export const dynamic = 'force-dynamic';
-/**
- * GET /api/backlinks/velocity?propertyId=
- */
export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
- const propertyId = Number(request.nextUrl.searchParams.get('propertyId') || '0');
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
-
- try {
- const snapshots = await withDb(async (client) => {
- const cur = await client.query<{
- captured_at: Date;
- referring_domains: number;
- top_domains: unknown;
- }>(
- `SELECT captured_at, referring_domains, top_domains
- FROM gsc_links_snapshots
- WHERE property_id = $1
- ORDER BY captured_at ASC
- LIMIT 52`,
- [propertyId],
- );
- return cur.rows.map((row) => ({
- capturedAt: row.captured_at.toISOString(),
- referringDomains: row.referring_domains,
- topDomains: row.top_domains,
- }));
- });
- return NextResponse.json({ snapshots });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg, snapshots: [] }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/backlinks/velocity');
};
diff --git a/web/app/api/chat/artifacts/[id]/route.ts b/web/app/api/chat/artifacts/[id]/route.ts
index 336162ec..4933b267 100644
--- a/web/app/api/chat/artifacts/[id]/route.ts
+++ b/web/app/api/chat/artifacts/[id]/route.ts
@@ -1,34 +1,15 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { spawn } from 'child_process';
-import path from 'path';
+/**
+ * GET /api/chat/artifacts/[id] — retrieve an AI-generated artifact file via FastAPI.
+ */
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuthForChat } from '@/server/auth';
-import { resolvePythonExecutable, formatPythonSpawnError } from '@/server/resolvePython';
import type { ApiRouteHandlerWithParams } from '@/types/api';
export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-const REPO_ROOT = process.env.WEBSITE_PROFILING_ROOT || path.resolve(process.cwd(), '..');
-
-const ARTIFACT_SCRIPT = `
-import json
-import sys
-from website_profiling.tools.export_artifacts import read_artifact_bytes
-aid = sys.argv[1]
-result = read_artifact_bytes(aid)
-if not result:
- print(json.dumps({"error": "not found"}))
-else:
- meta, data = result
- import base64
- print(json.dumps({
- "filename": meta.get("filename"),
- "mime_type": meta.get("mime_type"),
- "data_base64": base64.b64encode(data).decode("ascii"),
- }))
-`;
-
export const GET: ApiRouteHandlerWithParams<{ id: string }> = async (
request: NextRequest,
context: { params: Promise<{ id: string }> },
@@ -37,69 +18,6 @@ export const GET: ApiRouteHandlerWithParams<{ id: string }> = async (
if (denied) return denied;
const authDenied = requireApiAuthForChat(request);
if (authDenied) return authDenied;
-
const { id } = await context.params;
- if (!id || !/^[a-f0-9-]{36}$/.test(id)) {
- return NextResponse.json({ error: 'Invalid artifact id' }, { status: 400 });
- }
-
- const python = resolvePythonExecutable(process.env.PYTHON, REPO_ROOT);
-
- return new Promise((resolve) => {
- const proc = spawn(python, ['-c', ARTIFACT_SCRIPT, id], {
- cwd: REPO_ROOT,
- env: {
- ...process.env,
- PYTHONPATH: path.join(REPO_ROOT, 'src'),
- PYTHONIOENCODING: 'utf-8',
- },
- });
- let out = '';
- let err = '';
- proc.stdout.on('data', (c: Buffer | string) => {
- out += c.toString();
- });
- proc.stderr.on('data', (c) => {
- err += c.toString();
- });
- proc.on('error', (spawnErr: Error) => {
- resolve(NextResponse.json({ error: formatPythonSpawnError(spawnErr, python, REPO_ROOT) }, { status: 500 }));
- });
- proc.on('close', (code) => {
- if (code !== 0) {
- resolve(NextResponse.json({ error: err.trim() || 'Artifact read failed' }, { status: 500 }));
- return;
- }
- try {
- const parsed = JSON.parse(out.trim()) as {
- error?: string;
- filename?: string;
- mime_type?: string;
- data_base64?: string;
- };
- if (parsed.error || !parsed.data_base64) {
- resolve(NextResponse.json({ error: 'Artifact not found' }, { status: 404 }));
- return;
- }
- const body = Buffer.from(parsed.data_base64, 'base64');
- const rawName = parsed.filename || 'export.bin';
- // Sanitize the ASCII fallback (strip non-printable/quote/slash chars so
- // a CR/LF or quote can't break or inject the header) and provide an
- // RFC 5987 filename* for the full UTF-8 name.
- const asciiName =
- rawName.replace(/[^\x20-\x7e]/g, '_').replace(/["\\/]/g, '_') || 'export.bin';
- const mime = parsed.mime_type || 'application/octet-stream';
- resolve(
- new NextResponse(body, {
- headers: {
- 'Content-Type': mime,
- 'Content-Disposition': `attachment; filename="${asciiName}"; filename*=UTF-8''${encodeURIComponent(rawName)}`,
- },
- }),
- );
- } catch {
- resolve(NextResponse.json({ error: 'Invalid artifact response' }, { status: 500 }));
- }
- });
- });
+ return proxyToFastAPI(request, `/api/chat/artifacts/${encodeURIComponent(id)}`); // id is caller-supplied and no longer format-checked here; keep it a single path segment
};
diff --git a/web/app/api/chat/route.ts b/web/app/api/chat/route.ts
index f48b8d48..ded08f56 100644
--- a/web/app/api/chat/route.ts
+++ b/web/app/api/chat/route.ts
@@ -1,384 +1,20 @@
+/**
+ * POST /api/chat — stream agent response via FastAPI SSE.
+ * FastAPI runs the Python agent directly and streams text/event-stream.
+ */
import { type NextRequest } from 'next/server';
-import { spawn } from 'child_process';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuthForChat } from '@/server/auth';
-import { getPipelineSpawnEnv, getRepoRoot } from '@/server/pipelineSpawnEnv';
-import { formatPythonSpawnError, resolvePythonExecutable } from '@/server/resolvePython';
-import {
- appendChatMessage,
- getChatMessages,
- getChatSession,
- messagesForAgentContext,
- updateChatSessionTitle,
-} from '@/server/chatDb';
-import { loadLlmConfig } from '@/server/llmConfig';
import type { ApiRouteHandler } from '@/types/api';
-import type { ChatNarrative } from '@/types/chatNarrative';
export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-const DEFAULT_CHAT_TIMEOUT_MS = 120_000;
-const OLLAMA_MIN_TIMEOUT_MS = 300_000;
-
-async function resolveChatTimeoutMs(): Promise {
- try {
- const cfg = await loadLlmConfig();
- const provider = String(cfg.state.llm_provider || 'none');
- const timeoutS = Number(cfg.state.llm_timeout_s) || 120;
- const baseMs = Math.max(timeoutS, 30) * 1000;
- if (provider === 'ollama') {
- return Math.max(baseMs, OLLAMA_MIN_TIMEOUT_MS);
- }
- return baseMs;
- } catch {
- return DEFAULT_CHAT_TIMEOUT_MS;
- }
-}
-
-interface ChatBody {
- sessionId?: number;
- message?: string;
- propertyId?: number;
- reportId?: number;
-}
-
-function sseLine(event: string, data: Record): string {
- return `event: ${event}\ndata: ${JSON.stringify(data)}\n\n`;
-}
-
-function buildPersistedAssistantContent(
- assistantText: string,
- toolEvents: Array<{ name: string; args?: Record; result?: Record }>,
- narrative: ChatNarrative | null,
- sawError: boolean,
- lastErrorMessage: string,
-): string | null {
- if (narrative) {
- if (toolEvents.length > 0) {
- return 'Tool results from this turn are shown below.';
- }
- return '';
- }
- const text = assistantText.trim();
- if (text) return text;
- if (toolEvents.length > 0) {
- return sawError
- ? 'Tool results were saved from this turn. The assistant did not produce a final summary.'
- : 'Tool results from this turn are shown below.';
- }
- if (sawError && lastErrorMessage.trim()) {
- return lastErrorMessage.trim();
- }
- return null;
-}
-
-/** POST /api/chat — stream agent response via SSE. */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
const authDenied = requireApiAuthForChat(request);
if (authDenied) return authDenied;
-
- let body: ChatBody;
- try {
- body = await request.json();
- } catch {
- return new Response(JSON.stringify({ error: 'Invalid JSON' }), { status: 400 });
- }
-
- const sessionId = Number(body.sessionId || 0);
- const propertyId = Number(body.propertyId || 0);
- const message = String(body.message || '').trim();
- const reportId = body.reportId != null ? Number(body.reportId) : undefined;
-
- if (!sessionId || !propertyId || !message) {
- return new Response(
- JSON.stringify({ error: 'sessionId, propertyId, and message are required' }),
- { status: 400 },
- );
- }
-
- const session = await getChatSession(sessionId);
- if (!session || session.property_id !== propertyId) {
- return new Response(JSON.stringify({ error: 'session not found' }), { status: 404 });
- }
-
- await appendChatMessage(sessionId, 'user', message);
-
- const history = await getChatMessages(sessionId);
- const agentMessages = messagesForAgentContext(history, 20);
-
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
- const stdinPayload = JSON.stringify({
- messages: agentMessages,
- property_id: propertyId,
- report_id: Number.isFinite(reportId) ? reportId : undefined,
- });
-
- const chatTimeoutMs = await resolveChatTimeoutMs();
- const timeoutSec = Math.round(chatTimeoutMs / 1000);
-
- // Track the spawned child so we can kill it if the client disconnects
- // (ReadableStream.cancel) instead of leaking it until the timeout fires.
- let activeProc: ReturnType | null = null;
- let activeKillTimer: ReturnType | null = null;
- let cancelled = false;
-
- const cancelChild = () => {
- cancelled = true;
- const p = activeProc;
- if (!p) return;
- try {
- p.kill('SIGTERM');
- activeKillTimer = setTimeout(() => {
- try {
- p.kill('SIGKILL');
- } catch {
- /* already exited */
- }
- }, 2000);
- (activeKillTimer as { unref?: () => void }).unref?.();
- } catch {
- /* already exited */
- }
- };
-
- const stream = new ReadableStream({
- start(controller) {
- const encoder = new TextEncoder();
- let assistantText = '';
- let buffer = '';
- let stderrAcc = '';
- let lastErrorMessage = '';
- let narrative: ChatNarrative | null = null;
- const toolEvents: Array<{
- name: string;
- args?: Record;
- result?: Record;
- }> = [];
- let sawError = false;
- let timedOut = false;
- let closed = false;
- let exitCode: number | null = null;
-
- const closeStream = () => {
- if (closed) return;
- closed = true;
- try {
- controller.close();
- } catch {
- /* stream may already be closed (client disconnect, timeout race) */
- }
- };
-
- const push = (event: string, data: Record) => {
- if (closed) return;
- if (event === 'error') {
- sawError = true;
- lastErrorMessage = String(data.message || 'Agent error');
- }
- try {
- controller.enqueue(encoder.encode(sseLine(event, data)));
- } catch {
- closed = true;
- }
- };
-
- const proc = spawn(
- pythonExe,
- ['-m', 'src', 'chat', '--stdin-json'],
- {
- cwd: repoRoot,
- env: getPipelineSpawnEnv(repoRoot, propertyId),
- shell: false,
- },
- );
- activeProc = proc;
-
- const timer = setTimeout(() => {
- timedOut = true;
- try {
- proc.kill();
- } catch {
- /* ignore */
- }
- push('error', { message: `Chat timed out after ${timeoutSec}s` });
- closeStream();
- }, chatTimeoutMs);
-
- // Without an error listener, an EPIPE/ERR_STREAM_DESTROYED on the stdin
- // pipe (child exits before reading) would surface as an unhandled stream
- // error and crash the Node process instead of a clean chat error.
- proc.stdin?.on('error', (err: Error) => {
- clearTimeout(timer);
- push('error', { message: `Failed to send request to assistant: ${err.message}` });
- closeStream();
- });
- proc.stdin?.write(stdinPayload);
- proc.stdin?.end();
-
- proc.stdout?.on('data', (chunk: Buffer) => {
- buffer += chunk.toString();
- const lines = buffer.split('\n');
- buffer = lines.pop() || '';
- for (const line of lines) {
- const trimmed = line.trim();
- if (!trimmed) continue;
- try {
- const evt = JSON.parse(trimmed) as {
- type?: string;
- text?: string;
- message?: string;
- phase?: string;
- detail?: string;
- name?: string;
- args?: Record;
- result?: Record;
- narrative?: ChatNarrative;
- };
- if (evt.type === 'token' && evt.text) {
- assistantText += evt.text;
- push('token', { text: evt.text });
- } else if (evt.type === 'status') {
- push('status', {
- phase: evt.phase || 'working',
- detail: evt.detail || evt.message || '',
- });
- } else if (evt.type === 'tool_start') {
- toolEvents.push({
- name: String(evt.name || ''),
- args: evt.args || {},
- });
- push('tool_start', evt as Record);
- } else if (evt.type === 'tool_end') {
- const name = String(evt.name || '');
- const existing = toolEvents.findIndex((t) => t.name === name && t.result == null);
- if (existing >= 0) {
- toolEvents[existing] = {
- ...toolEvents[existing],
- result: evt.result || {},
- };
- } else {
- toolEvents.push({ name, result: evt.result || {} });
- }
- push('tool_end', evt as Record);
- } else if (evt.type === 'narrative' && evt.narrative) {
- narrative = evt.narrative;
- push('narrative', { narrative: evt.narrative });
- } else if (evt.type === 'done') {
- if (evt.message) {
- assistantText = evt.message;
- }
- push('done', { message: evt.message || '' });
- } else if (evt.type === 'partial_done' && evt.message) {
- assistantText = evt.message;
- push('partial_done', { message: evt.message });
- } else if (evt.type === 'error') {
- push('error', { message: evt.message || 'Agent error' });
- }
- } catch {
- /* ignore non-JSON log lines */
- }
- }
- });
-
- proc.stderr?.on('data', (chunk: Buffer) => {
- stderrAcc += chunk.toString();
- if (stderrAcc.length > 8000) {
- stderrAcc = stderrAcc.slice(-8000);
- }
- });
-
- proc.on('error', (err: Error) => {
- clearTimeout(timer);
- push('error', { message: formatPythonSpawnError(err, pythonExe, repoRoot) });
- closeStream();
- });
-
- proc.on('close', async (code: number | null) => {
- clearTimeout(timer);
- if (activeKillTimer) {
- clearTimeout(activeKillTimer);
- activeKillTimer = null;
- }
- // On client cancel we drop the partial turn (the user navigated away);
- // on timeout the error was already streamed.
- if (timedOut || cancelled) return;
- exitCode = code;
-
- if (!sawError && !assistantText.trim() && !narrative) {
- const stderrLine = stderrAcc
- .split('\n')
- .map((l) => l.trim())
- .find((l) => l && !l.startsWith('['));
- const fallback =
- stderrLine ||
- (exitCode != null && exitCode !== 0
- ? `Assistant process exited with code ${exitCode}.`
- : 'No response from the assistant.');
- push('error', { message: fallback });
- } else if (!sawError && exitCode != null && exitCode !== 0) {
- const stderrLine = stderrAcc
- .split('\n')
- .map((l) => l.trim())
- .find((l) => l && !l.startsWith('['));
- if (stderrLine) {
- push('error', { message: stderrLine });
- }
- }
-
- const contentToSave = buildPersistedAssistantContent(
- assistantText,
- toolEvents,
- narrative,
- sawError,
- lastErrorMessage,
- );
-
- if (contentToSave !== null || narrative || toolEvents.length > 0) {
- try {
- const toolResultPayload =
- toolEvents.length || narrative || (sawError && lastErrorMessage)
- ? {
- ...(toolEvents.length ? { tool_events: toolEvents } : {}),
- ...(narrative ? { narrative } : {}),
- ...(sawError && lastErrorMessage ? { agent_error: lastErrorMessage } : {}),
- }
- : null;
- await appendChatMessage(
- sessionId,
- 'assistant',
- contentToSave ?? '',
- {
- toolResult: toolResultPayload,
- },
- );
- if (session.title === 'New chat') {
- const title = message.slice(0, 60) + (message.length > 60 ? '…' : '');
- await updateChatSessionTitle(sessionId, title);
- }
- } catch {
- /* persistence failure should not break stream */
- }
- }
-
- closeStream();
- });
- },
- cancel() {
- // Client disconnected mid-stream (reload/navigate/abort): terminate the
- // agent process so it does not keep holding the LLM connection/CPU.
- cancelChild();
- },
- });
-
- return new Response(stream, {
- headers: {
- 'Content-Type': 'text/event-stream',
- 'Cache-Control': 'no-cache',
- Connection: 'keep-alive',
- },
- });
+ return proxyToFastAPI(request, '/api/chat/');
};
diff --git a/web/app/api/chat/sessions/[id]/messages/route.ts b/web/app/api/chat/sessions/[id]/messages/route.ts
index 59a5725c..c2b2ebb8 100644
--- a/web/app/api/chat/sessions/[id]/messages/route.ts
+++ b/web/app/api/chat/sessions/[id]/messages/route.ts
@@ -1,13 +1,15 @@
-import { NextResponse, type NextRequest } from 'next/server';
+/**
+ * GET /api/chat/sessions/[id]/messages — get chat session messages via FastAPI.
+ */
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuthForChat } from '@/server/auth';
-import { getChatMessages, getChatSession } from '@/server/chatDb';
import type { ApiRouteHandler } from '@/types/api';
export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/** GET /api/chat/sessions/[id]/messages?propertyId= */
export const GET: ApiRouteHandler = async (
request: NextRequest,
context?: { params?: Promise<{ id: string }> },
@@ -16,28 +18,6 @@ export const GET: ApiRouteHandler = async (
if (denied) return denied;
const authDenied = requireApiAuthForChat(request);
if (authDenied) return authDenied;
-
const params = context?.params ? await context.params : { id: '' };
- const sessionId = Number(params.id || '0');
- if (!sessionId) {
- return NextResponse.json({ error: 'invalid session id' }, { status: 400 });
- }
- const propertyId = Number(request.nextUrl.searchParams.get('propertyId') || '0');
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
-
- try {
- // Scope conversation history to the caller's property to avoid leaking
- // another property's messages by enumerating session ids.
- const session = await getChatSession(sessionId);
- if (!session || session.property_id !== propertyId) {
- return NextResponse.json({ error: 'session not found' }, { status: 404 });
- }
- const messages = await getChatMessages(sessionId);
- return NextResponse.json({ messages });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, `/api/chat/sessions/${encodeURIComponent(params.id)}/messages`); // params.id is caller-supplied; keep it a single path segment
};
diff --git a/web/app/api/chat/sessions/[id]/route.ts b/web/app/api/chat/sessions/[id]/route.ts
index 0ed68a00..330116f3 100644
--- a/web/app/api/chat/sessions/[id]/route.ts
+++ b/web/app/api/chat/sessions/[id]/route.ts
@@ -1,13 +1,15 @@
-import { NextResponse, type NextRequest } from 'next/server';
+/**
+ * GET/DELETE /api/chat/sessions/[id] — get or delete a chat session via FastAPI.
+ */
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuth, requireApiAuthForChat } from '@/server/auth';
-import { deleteChatSession, getChatSession } from '@/server/chatDb';
import type { ApiRouteHandler } from '@/types/api';
export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/** GET /api/chat/sessions/[id] */
export const GET: ApiRouteHandler = async (
request: NextRequest,
context?: { params?: Promise<{ id: string }> },
@@ -16,59 +18,18 @@ export const GET: ApiRouteHandler = async (
if (denied) return denied;
const authDenied = requireApiAuthForChat(request);
if (authDenied) return authDenied;
-
const params = context?.params ? await context.params : { id: '' };
- const sessionId = Number(params.id || '0');
- if (!sessionId) {
- return NextResponse.json({ error: 'invalid session id' }, { status: 400 });
- }
-
- try {
- const session = await getChatSession(sessionId);
- if (!session) {
- return NextResponse.json({ error: 'session not found' }, { status: 404 });
- }
- return NextResponse.json({ session });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, `/api/chat/sessions/${encodeURIComponent(params.id)}`); // params.id is caller-supplied; keep it a single path segment
};
-/** DELETE /api/chat/sessions/[id]?propertyId= */
export const DELETE: ApiRouteHandler = async (
request: NextRequest,
context?: { params?: Promise<{ id: string }> },
): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
- // Deleting a session is a destructive mutation: require a non-read-only role.
const authDenied = requireApiAuth(request);
if (authDenied) return authDenied;
-
const params = context?.params ? await context.params : { id: '' };
- const sessionId = Number(params.id || '0');
- if (!sessionId) {
- return NextResponse.json({ error: 'invalid session id' }, { status: 400 });
- }
- const propertyId = Number(request.nextUrl.searchParams.get('propertyId') || '0');
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
-
- try {
- // Scope the delete to the caller's property (consistent with POST /api/chat).
- const session = await getChatSession(sessionId);
- if (!session || session.property_id !== propertyId) {
- return NextResponse.json({ error: 'session not found' }, { status: 404 });
- }
- const deleted = await deleteChatSession(sessionId);
- if (!deleted) {
- return NextResponse.json({ error: 'session not found' }, { status: 404 });
- }
- return NextResponse.json({ ok: true });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, `/api/chat/sessions/${encodeURIComponent(params.id)}`); // params.id is caller-supplied; keep it a single path segment on this destructive route
};
diff --git a/web/app/api/chat/sessions/route.ts b/web/app/api/chat/sessions/route.ts
index 0a3cace5..badf4e24 100644
--- a/web/app/api/chat/sessions/route.ts
+++ b/web/app/api/chat/sessions/route.ts
@@ -1,59 +1,27 @@
-import { NextResponse, type NextRequest } from 'next/server';
+/**
+ * GET/POST /api/chat/sessions — list or create chat sessions via FastAPI.
+ */
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuthForChat } from '@/server/auth';
-import { createChatSession, listChatSessions } from '@/server/chatDb';
import type { ApiRouteHandler } from '@/types/api';
export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/** GET /api/chat/sessions?propertyId= — list chat sessions for a property. */
export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
const authDenied = requireApiAuthForChat(request);
if (authDenied) return authDenied;
-
- const propertyId = Number(request.nextUrl.searchParams.get('propertyId') || '0');
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
-
- try {
- const sessions = await listChatSessions(propertyId);
- return NextResponse.json({ sessions });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/chat/sessions');
};
-/** POST /api/chat/sessions — create session { propertyId, title? }. */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
- // Chat (incl. starting a session) is intentionally available to the
- // read-only client role; only destructive deletes are restricted (see DELETE).
const authDenied = requireApiAuthForChat(request);
if (authDenied) return authDenied;
-
- let body: { propertyId?: number; title?: string };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- const propertyId = Number(body.propertyId || 0);
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
-
- try {
- const id = await createChatSession(propertyId, body.title);
- return NextResponse.json({ id, propertyId, title: body.title?.trim() || 'New chat' });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/chat/sessions');
};
diff --git a/web/app/api/compare/export/route.ts b/web/app/api/compare/export/route.ts
index 5d609b5c..35da28ee 100644
--- a/web/app/api/compare/export/route.ts
+++ b/web/app/api/compare/export/route.ts
@@ -1,70 +1,10 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { withDb } from '@/server/db';
-import { buildIssueDeltas } from '@/lib/reportCompareExtras';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
+import { forbiddenIfNotLocal } from '@/server/localOnly';
import type { ApiRouteHandler } from '@/types/api';
-import type { ReportCategory, ReportPayload } from '@/types/report';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-function csvEscape(value: string): string {
- if (/[",\n]/.test(value)) return `"${value.replace(/"/g, '""')}"`;
- return value;
-}
-
-/**
- * POST /api/compare/export — CSV diff between two report ids.
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
- let body: { reportIdA?: number; reportIdB?: number };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- const reportIdA = Number(body.reportIdA || 0);
- const reportIdB = Number(body.reportIdB || 0);
- if (!reportIdA || !reportIdB) {
- return NextResponse.json({ error: 'reportIdA and reportIdB required' }, { status: 400 });
- }
-
- try {
- const [payloadA, payloadB] = await withDb(async (client) => {
- const rows = await Promise.all(
- [reportIdA, reportIdB].map(async (id) => {
- const cur = await client.query<{ data: ReportPayload }>(
- 'SELECT data FROM report_payload WHERE id = $1',
- [id],
- );
- return cur.rows[0]?.data ?? { categories: [] as ReportCategory[] };
- }),
- );
- return rows;
- });
-
- const deltas = buildIssueDeltas(payloadA, payloadB);
- const lines = ['change,category,priority,url,message,recommendation'];
-
- for (const row of deltas) {
- const change = row.kind === 'new' ? 'added' : 'removed';
- lines.push(
- [change, row.category, row.priority, row.url, row.message, '']
- .map((v) => csvEscape(String(v)))
- .join(','),
- );
- }
-
- const csv = `${lines.join('\n')}\n`;
- return new NextResponse(csv, {
- status: 200,
- headers: {
- 'Content-Type': 'text/csv; charset=utf-8',
- 'Content-Disposition': `attachment; filename="audit-compare-${reportIdA}-vs-${reportIdB}.csv"`,
- },
- });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ const denied = forbiddenIfNotLocal(request); if (denied) return denied; return proxyToFastAPI(request, '/api/compare/export');
};
diff --git a/web/app/api/content-drafts/[id]/route.ts b/web/app/api/content-drafts/[id]/route.ts
index 8e20da21..d5ae755f 100644
--- a/web/app/api/content-drafts/[id]/route.ts
+++ b/web/app/api/content-drafts/[id]/route.ts
@@ -1,41 +1,19 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuth } from '@/server/auth';
-import {
- deleteContentDraft,
- getContentDraft,
- updateContentDraft,
- type UpdateContentDraftInput,
-} from '@/server/contentDraftDb';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/** GET /api/content-drafts/[id] */
export const GET: ApiRouteHandler = async (
request: NextRequest,
context?: { params?: Promise<{ id: string }> },
): Promise => {
const params = context?.params ? await context.params : { id: '' };
- const draftId = Number(params.id || '0');
- if (!draftId) {
- return NextResponse.json({ error: 'invalid draft id' }, { status: 400 });
- }
-
- try {
- const draft = await getContentDraft(draftId);
- if (!draft) {
- return NextResponse.json({ error: 'draft not found' }, { status: 404 });
- }
- return NextResponse.json({ draft });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, `/api/content-drafts/${encodeURIComponent(params.id)}`); // keep id a single path segment
};
-/** PATCH /api/content-drafts/[id] */
export const PATCH: ApiRouteHandler = async (
request: NextRequest,
context?: { params?: Promise<{ id: string }> },
@@ -44,34 +22,10 @@ export const PATCH: ApiRouteHandler = async (
if (denied) return denied;
const authDenied = requireApiAuth(request);
if (authDenied) return authDenied;
-
const params = context?.params ? await context.params : { id: '' };
- const draftId = Number(params.id || '0');
- if (!draftId) {
- return NextResponse.json({ error: 'invalid draft id' }, { status: 400 });
- }
-
- let body: UpdateContentDraftInput;
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- try {
- const existing = await getContentDraft(draftId);
- if (!existing) {
- return NextResponse.json({ error: 'draft not found' }, { status: 404 });
- }
- const draft = await updateContentDraft(draftId, body);
- return NextResponse.json({ draft });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, `/api/content-drafts/${encodeURIComponent(params.id)}`); // keep id a single path segment
};
-/** DELETE /api/content-drafts/[id] */
export const DELETE: ApiRouteHandler = async (
request: NextRequest,
context?: { params?: Promise<{ id: string }> },
@@ -80,21 +34,6 @@ export const DELETE: ApiRouteHandler = async (
if (denied) return denied;
const authDenied = requireApiAuth(request);
if (authDenied) return authDenied;
-
const params = context?.params ? await context.params : { id: '' };
- const draftId = Number(params.id || '0');
- if (!draftId) {
- return NextResponse.json({ error: 'invalid draft id' }, { status: 400 });
- }
-
- try {
- const ok = await deleteContentDraft(draftId);
- if (!ok) {
- return NextResponse.json({ error: 'draft not found' }, { status: 404 });
- }
- return NextResponse.json({ ok: true });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, `/api/content-drafts/${encodeURIComponent(params.id)}`); // keep id a single path segment
};
diff --git a/web/app/api/content-drafts/route.ts b/web/app/api/content-drafts/route.ts
index f16f10a5..e860a4a6 100644
--- a/web/app/api/content-drafts/route.ts
+++ b/web/app/api/content-drafts/route.ts
@@ -1,55 +1,19 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuth } from '@/server/auth';
-import {
- createContentDraft,
- listContentDrafts,
- type CreateContentDraftInput,
-} from '@/server/contentDraftDb';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/** GET /api/content-drafts?propertyId= — list drafts for a property. */
export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
- const propertyId = Number(request.nextUrl.searchParams.get('propertyId') || '0');
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
- try {
- const drafts = await listContentDrafts(propertyId);
- return NextResponse.json({ drafts });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/content-drafts');
};
-/** POST /api/content-drafts — create a new draft. */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
const authDenied = requireApiAuth(request);
if (authDenied) return authDenied;
-
- let body: CreateContentDraftInput & { propertyId?: number };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- const propertyId = Number(body.propertyId || 0);
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
-
- try {
- const id = await createContentDraft(propertyId, body);
- return NextResponse.json({ id, propertyId });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/content-drafts');
};
diff --git a/web/app/api/content/analyze/route.ts b/web/app/api/content/analyze/route.ts
index 4c6861eb..02ef08ed 100644
--- a/web/app/api/content/analyze/route.ts
+++ b/web/app/api/content/analyze/route.ts
@@ -1,104 +1,15 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { spawn } from 'child_process';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuth } from '@/server/auth';
-import { getRepoRoot, getPipelineSpawnEnv } from '@/server/pipelineSpawnEnv';
-import { resolvePythonExecutable, parsePythonJsonStdout } from '@/server/resolvePython';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/**
- * POST /api/content/analyze — SEO score + rule/AI suggestions (one-click analyzer).
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
const authDenied = requireApiAuth(request);
if (authDenied) return authDenied;
-
- let body: {
- propertyId?: number;
- keyword?: string;
- bodyHtml?: string;
- titleTag?: string;
- metaDescription?: string;
- landingUrl?: string;
- title?: string;
- useAi?: boolean;
- refresh?: boolean;
- };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- const keyword = String(body.keyword || '').trim();
- if (!keyword) {
- return NextResponse.json({ error: 'keyword required' }, { status: 400 });
- }
-
- const propertyId = Number(body.propertyId || 0) || null;
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
- const script = `
-import json, sys
-from website_profiling.content_studio.ai_suggest import analyze_content_draft
-payload = json.load(sys.stdin)
-pid = payload.get("propertyId")
-print(json.dumps(analyze_content_draft(
- int(pid) if pid else None,
- payload.get("keyword", ""),
- payload.get("bodyHtml", ""),
- payload.get("titleTag", ""),
- payload.get("metaDescription", ""),
- payload.get("landingUrl"),
- use_ai=bool(payload.get("useAi")),
- refresh=bool(payload.get("refresh")),
- title=payload.get("title", ""),
-)))
-`;
-
- return new Promise((resolve) => {
- const proc = spawn(pythonExe, ['-c', script], {
- cwd: repoRoot,
- env: getPipelineSpawnEnv(repoRoot),
- shell: false,
- });
- let stdout = '';
- proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
- proc.stdin?.write(
- JSON.stringify({
- propertyId,
- keyword,
- bodyHtml: body.bodyHtml || '',
- titleTag: body.titleTag || '',
- metaDescription: body.metaDescription || '',
- landingUrl: body.landingUrl || null,
- title: body.title || '',
- useAi: body.useAi === true,
- refresh: body.refresh === true,
- }),
- );
- proc.stdin?.end();
- proc.on('error', () => {
- clearTimeout(timer);
- resolve(NextResponse.json({ error: 'Analyze failed: could not start Python' }, { status: 500 }));
- });
- proc.on('close', (code) => {
- clearTimeout(timer);
- const parsed = parsePythonJsonStdout(stdout);
- if (code === 0 && parsed) {
- resolve(NextResponse.json({ analysis: parsed }));
- return;
- }
- resolve(NextResponse.json({ error: 'Content analyze failed' }, { status: 500 }));
- });
- const timer = setTimeout(() => {
- try { proc.kill(); } catch { /* ignore */ }
- resolve(NextResponse.json({ error: 'Analyze timed out after 90s' }, { status: 504 }));
- }, 90_000);
- });
+ return proxyToFastAPI(request, '/api/content/analyze');
};
diff --git a/web/app/api/content/score/route.ts b/web/app/api/content/score/route.ts
index 22ad4073..28a34e13 100644
--- a/web/app/api/content/score/route.ts
+++ b/web/app/api/content/score/route.ts
@@ -1,89 +1,9 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { spawn } from 'child_process';
-import { getRepoRoot, getPipelineSpawnEnv } from '@/server/pipelineSpawnEnv';
-import { resolvePythonExecutable, parsePythonJsonStdout } from '@/server/resolvePython';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/**
- * POST /api/content/score
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
- let body: {
- propertyId?: number;
- keyword?: string;
- bodyHtml?: string;
- titleTag?: string;
- metaDescription?: string;
- landingUrl?: string;
- };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- const keyword = String(body.keyword || '').trim();
- if (!keyword) {
- return NextResponse.json({ error: 'keyword required' }, { status: 400 });
- }
-
- const propertyId = Number(body.propertyId || 0) || null;
-
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
- const script = `
-import json, sys
-from website_profiling.content_studio.score import score_content_draft
-payload = json.load(sys.stdin)
-pid = payload.get("propertyId")
-print(json.dumps(score_content_draft(
- int(pid) if pid else None,
- payload.get("keyword", ""),
- payload.get("bodyHtml", ""),
- payload.get("titleTag", ""),
- payload.get("metaDescription", ""),
- payload.get("landingUrl"),
-)))
-`;
-
- return new Promise((resolve) => {
- const proc = spawn(pythonExe, ['-c', script], {
- cwd: repoRoot,
- env: getPipelineSpawnEnv(repoRoot),
- shell: false,
- });
- let stdout = '';
- proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
- proc.stdin?.write(
- JSON.stringify({
- propertyId,
- keyword,
- bodyHtml: body.bodyHtml || '',
- titleTag: body.titleTag || '',
- metaDescription: body.metaDescription || '',
- landingUrl: body.landingUrl || null,
- }),
- );
- proc.stdin?.end();
- proc.on('error', () => {
- clearTimeout(timer);
- resolve(NextResponse.json({ error: 'Content score failed: could not start Python process' }, { status: 500 }));
- });
- proc.on('close', (code) => {
- clearTimeout(timer);
- const parsed = parsePythonJsonStdout(stdout);
- if (code === 0 && parsed) {
- resolve(NextResponse.json({ score: parsed }));
- return;
- }
- resolve(NextResponse.json({ error: 'Content score failed' }, { status: 500 }));
- });
- const timer = setTimeout(() => {
- try { proc.kill(); } catch { /* ignore */ }
- resolve(NextResponse.json({ error: 'Content score timed out after 30s' }, { status: 504 }));
- }, 30_000);
- });
+ return proxyToFastAPI(request, '/api/content/score');
};
diff --git a/web/app/api/content/wizard/route.ts b/web/app/api/content/wizard/route.ts
index 43b3030b..5fad8841 100644
--- a/web/app/api/content/wizard/route.ts
+++ b/web/app/api/content/wizard/route.ts
@@ -1,90 +1,15 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { spawn } from 'child_process';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
import { requireApiAuth } from '@/server/auth';
-import { getRepoRoot, getPipelineSpawnEnv } from '@/server/pipelineSpawnEnv';
-import { resolvePythonExecutable, parsePythonJsonStdout } from '@/server/resolvePython';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-const VALID_STEPS = new Set(['intents', 'content_types', 'tones', 'titles', 'outline', 'draft', 'research']);
-
-/**
- * POST /api/content/wizard — one step of the guided-draft wizard.
- * Body: { step, keyword, locale?, intent?, contentType?, tone?, title?, outline? }
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
const authDenied = requireApiAuth(request);
if (authDenied) return authDenied;
-
- let body: Record;
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- const step = String(body.step || '').trim();
- if (!VALID_STEPS.has(step)) {
- return NextResponse.json({ error: 'Invalid wizard step' }, { status: 400 });
- }
-
- const payload = {
- keyword: String(body.keyword || '').trim(),
- locale: String(body.locale || 'en-US'),
- intent: String(body.intent || ''),
- contentType: String(body.contentType || ''),
- tone: String(body.tone || ''),
- title: String(body.title || ''),
- outline: Array.isArray(body.outline) ? body.outline : [],
- };
-
- // The draft step writes a full article and can be slow on local models.
- const timeoutMs = step === 'draft' ? 180_000 : 60_000;
-
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
- const script = `
-import json, sys
-from website_profiling.content_studio.wizard import run_wizard_step
-payload = json.load(sys.stdin)
-print(json.dumps(run_wizard_step(payload.get("step", ""), payload.get("payload") or {})))
-`;
-
- return new Promise((resolve) => {
- const proc = spawn(pythonExe, ['-c', script], {
- cwd: repoRoot,
- env: getPipelineSpawnEnv(repoRoot),
- shell: false,
- });
- let stdout = '';
- proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
- proc.stdin?.write(JSON.stringify({ step, payload }));
- proc.stdin?.end();
- proc.on('error', () => {
- clearTimeout(timer);
- resolve(NextResponse.json({ error: 'Wizard failed: could not start Python' }, { status: 500 }));
- });
- proc.on('close', (code) => {
- clearTimeout(timer);
- const parsed = parsePythonJsonStdout(stdout);
- if (code === 0 && parsed) {
- if (parsed.ok === false) {
- resolve(NextResponse.json({ error: parsed.error || 'Wizard step failed' }, { status: 400 }));
- return;
- }
- resolve(NextResponse.json({ result: parsed }));
- return;
- }
- resolve(NextResponse.json({ error: 'Wizard step failed' }, { status: 500 }));
- });
- const timer = setTimeout(() => {
- try { proc.kill(); } catch { /* ignore */ }
- resolve(NextResponse.json({ error: `Wizard step timed out after ${Math.round(timeoutMs / 1000)}s` }, { status: 504 }));
- }, timeoutMs);
- });
+ return proxyToFastAPI(request, '/api/content/wizard');
};
diff --git a/web/app/api/crawl/browser-status/route.ts b/web/app/api/crawl/browser-status/route.ts
index 87725239..75864cf3 100644
--- a/web/app/api/crawl/browser-status/route.ts
+++ b/web/app/api/crawl/browser-status/route.ts
@@ -1,98 +1,12 @@
-import { spawn } from 'child_process';
-import { NextResponse } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import { getPipelineSpawnEnv, getRepoRoot } from '@/server/pipelineSpawnEnv';
-import { formatPythonSpawnError, resolvePythonExecutable } from '@/server/resolvePython';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-const CHECK_SCRIPT =
- 'from website_profiling.crawl.fetchers import ensure_browser_deps; import json; print(json.dumps(ensure_browser_deps()))';
-
-/** First-time Playwright/Chromium install can take a few minutes. */
-const CHECK_TIMEOUT_MS = 180_000;
-
-/**
- * GET /api/crawl/browser-status
- * Returns whether Playwright and Chromium are available for JS/auto crawls.
- */
-export const GET: ApiRouteHandler = async (request): Promise => {
+export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
-
- return new Promise((resolve) => {
- let stdout = '';
- let stderr = '';
- const proc = spawn(pythonExe, ['-c', CHECK_SCRIPT], {
- cwd: repoRoot,
- env: getPipelineSpawnEnv(),
- shell: false,
- });
-
- const appendStdout = (chunk: Buffer | string): void => {
- stdout += chunk.toString();
- };
- const appendStderr = (chunk: Buffer | string): void => {
- stderr += chunk.toString();
- };
- proc.stdout?.on('data', appendStdout);
- proc.stderr?.on('data', appendStderr);
-
- const finish = (payload: { ok: boolean; message?: string; error?: string }, status = 200) => {
- resolve(NextResponse.json(payload, { status }));
- };
-
- proc.on('error', (err: Error) => {
- finish({
- ok: false,
- message: formatPythonSpawnError(err, pythonExe, repoRoot),
- error: err.message,
- });
- });
-
- proc.on('close', (code: number | null) => {
- if (code !== 0) {
- finish({
- ok: false,
- message:
- stderr.trim() ||
- 'JavaScript crawl requires Playwright and Chromium. Install: pip install -r requirements.txt.',
- error: stderr.trim() || `exit ${code}`,
- });
- return;
- }
- try {
- const line = stdout.trim().split('\n').filter(Boolean).pop() || '{}';
- const parsed = JSON.parse(line) as { ok?: boolean; message?: string };
- finish({
- ok: Boolean(parsed.ok),
- message: parsed.message,
- });
- } catch {
- finish({
- ok: false,
- message: 'Could not parse browser status from Python.',
- error: stdout.slice(-500) || stderr.slice(-500),
- });
- }
- });
-
- setTimeout(() => {
- try {
- proc.kill();
- } catch {
- /* ignore */
- }
- finish({
- ok: false,
- message: 'Browser status check timed out.',
- error: 'timeout',
- });
- }, CHECK_TIMEOUT_MS);
- });
+ return proxyToFastAPI(request, '/api/crawl/browser-status');
};
diff --git a/web/app/api/crawl/page-html/route.ts b/web/app/api/crawl/page-html/route.ts
index 71f34003..095defde 100644
--- a/web/app/api/crawl/page-html/route.ts
+++ b/web/app/api/crawl/page-html/route.ts
@@ -1,75 +1,18 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import { withReportDb } from '@/server/reportDb';
-import { deletePageHtmlForRun, listCrawlPageHtmlRuns } from '@/lib/loadReportDb';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-type DeleteBody = {
- crawlRunId?: number | null;
-};
-
-/**
- * GET /api/crawl/page-html?limit=30
- * Lists recent crawl runs with stored HTML stats.
- */
export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
- const limitRaw = Number(request.nextUrl.searchParams.get('limit') || '30');
- const limit = Number.isFinite(limitRaw) ? Math.min(100, Math.max(1, limitRaw)) : 30;
-
- try {
- const runs = await withReportDb((client) => listCrawlPageHtmlRuns(client, { limit }));
- return NextResponse.json({ runs });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg, runs: [] }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/crawl/page-html');
};
-/**
- * DELETE /api/crawl/page-html
- * Body: { crawlRunId: number }
- * Removes raw HTML for one crawl run; crawl results and reports are kept.
- */
export const DELETE: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
- let body: DeleteBody = {};
- try {
- body = (await request.json()) as DeleteBody;
- } catch {
- const crawlRunIdRaw = request.nextUrl.searchParams.get('crawlRunId');
- if (crawlRunIdRaw) body.crawlRunId = Number(crawlRunIdRaw);
- }
-
- const crawlRunId =
- body.crawlRunId != null && Number.isFinite(Number(body.crawlRunId))
- ? Number(body.crawlRunId)
- : null;
-
- if (crawlRunId == null) {
- return NextResponse.json({ error: 'crawlRunId is required' }, { status: 400 });
- }
-
- try {
- const deletedPages = await withReportDb((client) => deletePageHtmlForRun(client, crawlRunId));
- if (deletedPages === 0) {
- return NextResponse.json({
- ok: true,
- crawlRunId,
- deletedPages: 0,
- message: 'No stored HTML found for this crawl run.',
- });
- }
- return NextResponse.json({ ok: true, crawlRunId, deletedPages });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/crawl/page-html');
};
diff --git a/web/app/api/dashboards/[id]/route.ts b/web/app/api/dashboards/[id]/route.ts
index 1a390c7c..6e980bc0 100644
--- a/web/app/api/dashboards/[id]/route.ts
+++ b/web/app/api/dashboards/[id]/route.ts
@@ -1,108 +1,36 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import { getDashboard, updateDashboard, deleteDashboard } from '@/server/dashboardsDb';
import type { ApiRouteHandlerWithParams } from '@/types/api';
-import type { DashboardDoc } from '@/types/dashboard';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-type Params = { id: string };
-
-/**
- * GET /api/dashboards/[id]?propertyId=
- * Returns a single dashboard.
- */
-export const GET: ApiRouteHandlerWithParams = async (
+export const GET: ApiRouteHandlerWithParams<{ id: string }> = async (
request: NextRequest,
- { params },
+ { params }: { params: Promise<{ id: string }> },
): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
const { id } = await params;
- const dashboardId = Number(id);
- const propertyId = Number(new URL(request.url).searchParams.get('propertyId') || 0);
-
- if (!dashboardId || !propertyId) {
- return NextResponse.json({ error: 'id and propertyId required' }, { status: 400 });
- }
-
- try {
- const dashboard = await getDashboard(dashboardId, propertyId);
- if (!dashboard) return NextResponse.json({ error: 'Not found' }, { status: 404 });
- return NextResponse.json({ dashboard });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, `/api/dashboards/${encodeURIComponent(id)}`); // keep id a single path segment
};
-/**
- * PUT /api/dashboards/[id]
- * Body: { propertyId, name?, layoutJson?, isDefault? }
- * Partial update — only provided fields are changed.
- */
-export const PUT: ApiRouteHandlerWithParams = async (
+export const PUT: ApiRouteHandlerWithParams<{ id: string }> = async (
request: NextRequest,
- { params },
+ { params }: { params: Promise<{ id: string }> },
): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
const { id } = await params;
- const dashboardId = Number(id);
-
- let body: { propertyId?: number; name?: string; layoutJson?: DashboardDoc; isDefault?: boolean };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- const propertyId = Number(body.propertyId || 0);
- if (!dashboardId || !propertyId) {
- return NextResponse.json({ error: 'id and propertyId required' }, { status: 400 });
- }
-
- try {
- const dashboard = await updateDashboard(dashboardId, propertyId, {
- name: body.name,
- layoutJson: body.layoutJson,
- isDefault: body.isDefault,
- });
- if (!dashboard) return NextResponse.json({ error: 'Not found' }, { status: 404 });
- return NextResponse.json({ dashboard });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, `/api/dashboards/${id}`);
};
-/**
- * DELETE /api/dashboards/[id]?propertyId=
- */
-export const DELETE: ApiRouteHandlerWithParams = async (
+export const DELETE: ApiRouteHandlerWithParams<{ id: string }> = async (
request: NextRequest,
- { params },
+ { params }: { params: Promise<{ id: string }> },
): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
const { id } = await params;
- const dashboardId = Number(id);
- const propertyId = Number(new URL(request.url).searchParams.get('propertyId') || 0);
-
- if (!dashboardId || !propertyId) {
- return NextResponse.json({ error: 'id and propertyId required' }, { status: 400 });
- }
-
- try {
- const deleted = await deleteDashboard(dashboardId, propertyId);
- if (!deleted) return NextResponse.json({ error: 'Not found' }, { status: 404 });
- return NextResponse.json({ ok: true });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, `/api/dashboards/${encodeURIComponent(id)}`); // keep id a single path segment
};
diff --git a/web/app/api/dashboards/ai-generate/route.ts b/web/app/api/dashboards/ai-generate/route.ts
new file mode 100644
index 00000000..7663f346
--- /dev/null
+++ b/web/app/api/dashboards/ai-generate/route.ts
@@ -0,0 +1,115 @@
+import { NextResponse, type NextRequest } from 'next/server';
+import { forbiddenIfNotLocal } from '@/server/localOnly';
+import { fastApiBase } from '@/server/fastApiClient';
+import { DASHBOARD_CATALOG, dimensions, measures } from '@/lib/dashboard/catalog/catalog';
+import { VIZ_LABELS } from '@/lib/dashboard/viz/labels';
+import type { ApiRouteHandler } from '@/types/api';
+
+export const runtime = 'nodejs';
+export const dynamic = 'force-dynamic';
+
+const DASHSCRIPT_HELP = `
+DashScript is a lightweight formula language for dashboard widgets.
+
+MEASURE (scalar formula, produces a single number or string):
+ field("key") — value from root result by dot-path key
+ sum("col") — sum of numeric column across all rows
+ avg("col") — average
+ count() — number of rows
+ min("col") / max("col") — min / max of column
+ if(cond, thenVal, elseVal) — conditional
+ coalesce(a, b, c) — first non-null value
+ Arithmetic: + - * / (division by zero returns null)
+ Comparison: == != < <= > >=
+ Logical: && || !
+
+TRANSFORM (row pipeline, applied to rows array before rendering):
+ filter(expr) — keep rows where expr is truthy (use row column names directly)
+ sort(col, asc|desc) — sort rows by column (default asc)
+ take(N) — keep first N rows
+ skip(N) — drop first N rows
+ project(col1, col2) — keep only listed columns
+ Stages are joined with | e.g. filter(count > 0) | sort(count, desc) | take(10)
+
+Examples:
+ measure: field("health_score")
+ measure: sum("issues") / count()
+ transform: filter(severity == "critical") | sort(count, desc) | take(5)
+`.trim();
+
+/**
+ * POST /api/dashboards/ai-generate — validates { mode, prompt, toolName?, propertyId?, reportId?, current? },
+ * attaches the dashboard catalog, viz labels and DashScript help, then relays FastAPI's JSON and status.
+ */
+export const POST: ApiRouteHandler = async (request: NextRequest) => {
+  const denied = forbiddenIfNotLocal(request);
+  if (denied) return denied;
+
+  let body: {
+    mode?: string;
+    prompt?: string;
+    toolName?: string;
+    propertyId?: number;
+    reportId?: number | null;
+    current?: unknown;
+  };
+  try {
+    body = (await request.json()) ?? {}; // a literal JSON `null` parses fine; treat it as an empty body
+  } catch {
+    return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
+  }
+
+  const mode = String(body.mode || 'widget').trim().toLowerCase();
+  if (!['script', 'widget', 'dashboard'].includes(mode)) {
+    return NextResponse.json({ error: 'mode must be script, widget, or dashboard' }, { status: 400 });
+  }
+  const prompt = String(body.prompt || '').trim();
+  if (!prompt) {
+    return NextResponse.json({ error: 'prompt required' }, { status: 400 });
+  }
+
+  const payload = {
+    mode,
+    prompt,
+    toolName: String(body.toolName || '').trim() || undefined,
+    propertyId: Number(body.propertyId || 0) || undefined,
+    reportId: body.reportId != null ? Number(body.reportId) : undefined,
+    catalog: DASHBOARD_CATALOG.map((e) => ({
+      toolName: e.toolName,
+      label: e.label,
+      section: e.section,
+      fields: e.fields,
+      dimensions: dimensions(e).map((f) => ({
+        key: f.key,
+        label: f.label,
+        defaultAgg: f.defaultAgg,
+        format: f.format,
+      })),
+      measures: measures(e).map((f) => ({
+        key: f.key,
+        label: f.label,
+        defaultAgg: f.defaultAgg,
+        format: f.format,
+      })),
+      rowsPath: e.rowsPath,
+      compatibleViz: e.compatibleViz,
+    })),
+    viz_types: VIZ_LABELS,
+    dashscript_help: DASHSCRIPT_HELP,
+    current: body.current ?? null,
+  };
+
+  try {
+    const res = await fetch(`${fastApiBase()}/api/dashboards/ai-generate`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify(payload),
+      cache: 'no-store',
+    });
+    const data = (await res.json().catch(() => ({}))) as Record<string, unknown>; // non-JSON reply becomes {}
+    return NextResponse.json(data, { status: res.status });
+  } catch (e) {
+    const msg = e instanceof Error ? e.message : String(e);
+    return NextResponse.json({ error: msg || 'AI generation failed' }, { status: 500 });
+  }
+};
diff --git a/web/app/api/dashboards/route.ts b/web/app/api/dashboards/route.ts
index 2271a275..6cdc3f55 100644
--- a/web/app/api/dashboards/route.ts
+++ b/web/app/api/dashboards/route.ts
@@ -1,69 +1,18 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import {
- listDashboards,
- createDashboard,
-} from '@/server/dashboardsDb';
-import { emptyDashboard } from '@/types/dashboard';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/**
- * GET /api/dashboards?propertyId=
- * Returns all dashboards for a property ordered by updated_at DESC.
- */
export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
- const propertyId = Number(new URL(request.url).searchParams.get('propertyId') || 0);
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
-
- try {
- const dashboards = await listDashboards(propertyId);
- return NextResponse.json({ dashboards });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/dashboards');
};
-/**
- * POST /api/dashboards
- * Body: { propertyId, name?, layoutJson? }
- * Creates a new dashboard and returns it.
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
- let body: { propertyId?: number; name?: string; layoutJson?: unknown };
- try {
- body = await request.json();
- } catch {
- return NextResponse.json({ error: 'Invalid JSON' }, { status: 400 });
- }
-
- const propertyId = Number(body.propertyId || 0);
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
-
- const name = String(body.name || 'Untitled dashboard').trim() || 'Untitled dashboard';
-
- try {
- const dashboard = await createDashboard(
- propertyId,
- name,
- (body.layoutJson as ReturnType) ?? emptyDashboard(),
- );
- return NextResponse.json({ dashboard }, { status: 201 });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/dashboards');
};
diff --git a/web/app/api/filters/route.ts b/web/app/api/filters/route.ts
index 60dc53c5..6ca3b5de 100644
--- a/web/app/api/filters/route.ts
+++ b/web/app/api/filters/route.ts
@@ -1,37 +1,24 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { deleteSavedFilter, listSavedFilters, upsertSavedFilter } from '@/server/savedFiltersDb';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
+import { forbiddenIfNotLocal } from '@/server/localOnly';
+import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-export async function GET(request: NextRequest) {
- const propertyId = Number(request.nextUrl.searchParams.get('propertyId') || 0);
- if (!propertyId) {
- return NextResponse.json({ error: 'propertyId required' }, { status: 400 });
- }
- const filters = await listSavedFilters(propertyId);
- return NextResponse.json({ filters });
-}
+export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
+ const denied = forbiddenIfNotLocal(request);
+ if (denied) return denied;
+ return proxyToFastAPI(request, '/api/filters');
+};
-export async function POST(request: NextRequest) {
- const body = await request.json().catch(() => ({}));
- const propertyId = Number(body.propertyId || 0);
- const name = String(body.name || '').trim();
- const filterJson = (body.filterJson && typeof body.filterJson === 'object') ? body.filterJson : {};
- if (!propertyId || !name) {
- return NextResponse.json({ error: 'propertyId and name required' }, { status: 400 });
- }
- await upsertSavedFilter(propertyId, name, filterJson);
- return NextResponse.json({ ok: true });
-}
+export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
+ const denied = forbiddenIfNotLocal(request);
+ if (denied) return denied;
+ return proxyToFastAPI(request, '/api/filters');
+};
-export async function DELETE(request: NextRequest) {
- const body = await request.json().catch(() => ({}));
- const propertyId = Number(body.propertyId || 0);
- const name = String(body.name || '').trim();
- if (!propertyId || !name) {
- return NextResponse.json({ error: 'propertyId and name required' }, { status: 400 });
- }
- await deleteSavedFilter(propertyId, name);
- return NextResponse.json({ ok: true });
-}
+export const DELETE: ApiRouteHandler = async (request: NextRequest): Promise => {
+ const denied = forbiddenIfNotLocal(request);
+ if (denied) return denied;
+ return proxyToFastAPI(request, '/api/filters');
+};
diff --git a/web/app/api/health/route.ts b/web/app/api/health/route.ts
index 28c0a304..cb9f5a41 100644
--- a/web/app/api/health/route.ts
+++ b/web/app/api/health/route.ts
@@ -1,18 +1,10 @@
-import { NextResponse } from 'next/server';
-import { withDb } from '@/server/db';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import type { ApiRouteHandler } from '@/types/api';
export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-export const GET: ApiRouteHandler = async (): Promise => {
- try {
- await withDb(async (client) => {
- await client.query('SELECT 1');
- });
- return NextResponse.json({ ok: true, database: 'up' });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ ok: false, database: 'down', error: msg }, { status: 503 });
- }
+export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
+ return proxyToFastAPI(request, '/api/health');
};
diff --git a/web/app/api/integrations/bing/sync/route.ts b/web/app/api/integrations/bing/sync/route.ts
index 5c934209..710b9149 100644
--- a/web/app/api/integrations/bing/sync/route.ts
+++ b/web/app/api/integrations/bing/sync/route.ts
@@ -1,63 +1,10 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { spawn } from 'child_process';
-import { getRepoRoot, getPipelineSpawnEnv } from '@/server/pipelineSpawnEnv';
-import { resolvePythonExecutable, parsePythonJsonStdout, formatPythonSpawnError } from '@/server/resolvePython';
-import { loadPipelineConfigUnmasked } from '@/server/pipelineConfig';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
+import { forbiddenIfNotLocal } from '@/server/localOnly';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
export const dynamic = 'force-dynamic';
-/**
- * POST /api/integrations/bing/sync — fetch Bing Webmaster backlinks summary.
- */
-export const POST: ApiRouteHandler = async (_request: NextRequest): Promise => {
- let state: Record;
- try {
- // Must use the UNMASKED loader: the API key is passed to Python to authenticate
- // with Bing; loadPipelineConfig() would return a masked '••••' placeholder.
- const cfg = await loadPipelineConfigUnmasked();
- state = cfg.state;
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
- const apiKey = String(state.bing_webmaster_api_key || '').trim();
- const siteUrl = String(state.start_url || '').trim();
- if (!apiKey || !siteUrl) {
- return NextResponse.json(
- { error: 'Set bing_webmaster_api_key and start_url in pipeline settings.' },
- { status: 400 },
- );
- }
-
- const repoRoot = getRepoRoot();
- const pythonExe = resolvePythonExecutable(null, repoRoot);
- const script = `
-import json, sys
-from website_profiling.integrations.bing.webmaster import fetch_bing_backlinks_summary
-api_key, site_url = sys.argv[1], sys.argv[2]
-print(json.dumps(fetch_bing_backlinks_summary(api_key, site_url)))
-`;
-
- return new Promise((resolve) => {
- const proc = spawn(pythonExe, ['-c', script, apiKey, siteUrl], {
- cwd: repoRoot,
- env: getPipelineSpawnEnv(repoRoot),
- shell: false,
- });
- let stdout = '';
- proc.stdout?.on('data', (c: Buffer | string) => { stdout += c.toString(); });
- proc.on('error', (err: Error) => {
- resolve(NextResponse.json({ error: formatPythonSpawnError(err, pythonExe, repoRoot) }, { status: 500 }));
- });
- proc.on('close', (code) => {
- const parsed = parsePythonJsonStdout(stdout);
- if (code === 0 && parsed) {
- resolve(NextResponse.json(parsed));
- return;
- }
- resolve(NextResponse.json({ error: stdout.trim() || 'Bing sync failed' }, { status: 500 }));
- });
- });
+export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
+ const denied = forbiddenIfNotLocal(request); if (denied) return denied; return proxyToFastAPI(request, '/api/integrations/bing/sync');
};
diff --git a/web/app/api/integrations/google/credentials/route.ts b/web/app/api/integrations/google/credentials/route.ts
index 37e14c90..44454b50 100644
--- a/web/app/api/integrations/google/credentials/route.ts
+++ b/web/app/api/integrations/google/credentials/route.ts
@@ -1,57 +1,12 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import {
- getGoogleAppPublicStatus,
- saveGoogleAppSettings,
-} from '@/server/googleAppSettings';
-import type { ApiRouteHandler, GoogleCredentialsPostBody } from '@/types/api';
+import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
+export const dynamic = 'force-dynamic';
-const PROPERTY_ONLY_MSG =
- 'Per-site settings (GSC, GA4, refresh token) must be saved via property Integrations when a Site URL is set.';
-
-/** POST /api/integrations/google/credentials — save OAuth app Client ID/Secret to database. */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
- try {
- const body = (await request.json().catch(() => ({}))) as GoogleCredentialsPostBody;
-
- if (
- 'refreshToken' in body ||
- 'gscSiteUrl' in body ||
- 'ga4PropertyId' in body
- ) {
- return NextResponse.json({ error: PROPERTY_ONLY_MSG }, { status: 400 });
- }
-
- const patch: Parameters[0] = {};
- if (typeof body.clientId === 'string' && body.clientId.trim()) {
- patch.clientId = body.clientId.trim();
- }
- if (typeof body.clientSecret === 'string' && body.clientSecret.trim()) {
- patch.clientSecret = body.clientSecret.trim();
- }
- if (typeof body.dateRangeDays === 'number' && body.dateRangeDays > 0) {
- patch.dateRangeDays = body.dateRangeDays;
- }
- if (typeof body.developerToken === 'string' && body.developerToken.trim()) {
- patch.developerToken = body.developerToken.trim();
- }
- if (typeof body.loginCustomerId === 'string' && body.loginCustomerId.trim()) {
- patch.loginCustomerId = body.loginCustomerId.trim().replace(/-/g, '');
- }
-
- if (Object.keys(patch).length === 0) {
- return NextResponse.json({ error: 'No valid fields provided' }, { status: 400 });
- }
-
- await saveGoogleAppSettings(patch);
- const status = await getGoogleAppPublicStatus();
- return NextResponse.json({ ok: true, status });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/integrations/google/credentials');
};
diff --git a/web/app/api/integrations/google/credentials/upload/route.ts b/web/app/api/integrations/google/credentials/upload/route.ts
index 30639b49..f88000ae 100644
--- a/web/app/api/integrations/google/credentials/upload/route.ts
+++ b/web/app/api/integrations/google/credentials/upload/route.ts
@@ -1,58 +1,12 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import {
- getGoogleAppPublicStatus,
- saveGoogleAppSettings,
-} from '@/server/googleAppSettings';
-import type { ApiRouteHandler, GoogleCredentialsUploadBody, GoogleServiceAccount } from '@/types/api';
+import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
-
-function isServiceAccount(value: unknown): value is GoogleServiceAccount {
- return (
- value != null &&
- typeof value === 'object' &&
- (value as GoogleServiceAccount).type === 'service_account' &&
- typeof (value as GoogleServiceAccount).client_email === 'string' &&
- typeof (value as GoogleServiceAccount).private_key === 'string'
- );
-}
+export const dynamic = 'force-dynamic';
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
- try {
- const body = (await request.json().catch(() => ({}))) as GoogleCredentialsUploadBody;
- const raw = body.fileContent;
- if (!raw || typeof raw !== 'string') {
- return NextResponse.json({ error: 'fileContent is required' }, { status: 400 });
- }
-
- let parsed: unknown;
- try {
- parsed = JSON.parse(raw);
- } catch {
- return NextResponse.json(
- { error: "This doesn't look like a valid JSON file." },
- { status: 400 },
- );
- }
-
- if (!isServiceAccount(parsed)) {
- return NextResponse.json(
- {
- error:
- "This doesn't look like a Google service account key file. Make sure you downloaded the JSON key from Google Cloud Console > IAM & Admin > Service Accounts.",
- },
- { status: 400 },
- );
- }
-
- await saveGoogleAppSettings({ serviceAccount: parsed });
- const status = await getGoogleAppPublicStatus();
- return NextResponse.json({ ok: true, status });
- } catch (e) {
- const msg = e instanceof Error ? e.message : String(e);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/integrations/google/credentials/upload');
};
diff --git a/web/app/api/integrations/google/disconnect/route.ts b/web/app/api/integrations/google/disconnect/route.ts
index b661d8a3..6ba0472c 100644
--- a/web/app/api/integrations/google/disconnect/route.ts
+++ b/web/app/api/integrations/google/disconnect/route.ts
@@ -1,20 +1,12 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import { getGoogleAppPublicStatus } from '@/server/googleAppSettings';
import type { ApiRouteHandler } from '@/types/api';
-export const runtime = 'nodejs';
+export const dynamic = 'force-dynamic';
-/** Global disconnect is deprecated — use per-property disconnect. */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
const denied = forbiddenIfNotLocal(request);
if (denied) return denied;
-
- const status = await getGoogleAppPublicStatus();
- return NextResponse.json({
- ok: false,
- error:
- 'Disconnect Google per site: set Site URL, open Integrations, and use Disconnect on that property.',
- status,
- });
+ return proxyToFastAPI(request, '/api/integrations/google/disconnect');
};
diff --git a/web/app/api/integrations/google/keywords/by-page/route.ts b/web/app/api/integrations/google/keywords/by-page/route.ts
index dabc00c3..e03c6531 100644
--- a/web/app/api/integrations/google/keywords/by-page/route.ts
+++ b/web/app/api/integrations/google/keywords/by-page/route.ts
@@ -1,84 +1,12 @@
-import { NextResponse, type NextRequest } from 'next/server';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import { withDb } from '@/server/db';
-import { parseJsonField } from '@/server/pageGoogleData';
-import { resolvePropertyIdFromRequest } from '@/server/resolvePropertyId';
import type { ApiRouteHandler } from '@/types/api';
-import type { PoolClient } from 'pg';
-export const runtime = 'nodejs';
+export const dynamic = 'force-dynamic';
-interface KeywordRow {
- gsc_url?: string;
- [key: string]: unknown;
-}
-
-interface CannibalisationEntry {
- pages?: Array<{ url?: string }>;
- [key: string]: unknown;
-}
-
-/**
- * GET /api/integrations/google/keywords/by-page?url=...&propertyId=|domain=
- */
export const GET: ApiRouteHandler = async (request: NextRequest): Promise => {
const guard = forbiddenIfNotLocal(request);
if (guard) return guard;
-
- const { searchParams } = new URL(request.url);
- const pageUrl = (searchParams.get('url') || '').trim();
- const { propertyId, error } = await resolvePropertyIdFromRequest(
- searchParams.get('propertyId'),
- searchParams.get('domain'),
- );
-
- if (!pageUrl) {
- return NextResponse.json({ error: 'url parameter is required' }, { status: 400 });
- }
- if (error || propertyId == null) {
- return NextResponse.json({ error: error || 'propertyId or domain required' }, { status: 400 });
- }
-
- try {
- return await withDb(async (client: PoolClient) => {
- const { rows } = await client.query(
- `SELECT data FROM keyword_data
- WHERE property_id = $1
- ORDER BY id DESC LIMIT 1`,
- [propertyId],
- );
- if (!rows.length) {
- return NextResponse.json({ keywords: [], cannibalisation: [] });
- }
-
- const data = parseJsonField(rows[0].data) || {};
- const allRows = Array.isArray(data.rows) ? (data.rows as KeywordRow[]) : [];
-
- const normalizedTarget = pageUrl.toLowerCase().replace(/\/$/, '');
- const pageKeywords = allRows.filter((r) => {
- const u = (r.gsc_url || '').toLowerCase().replace(/\/$/, '');
- return u === normalizedTarget || u.includes(normalizedTarget) || normalizedTarget.includes(u);
- });
-
- const cannibRaw = Array.isArray(data.cannibalisation) ? data.cannibalisation : [];
- const cannib = (cannibRaw as CannibalisationEntry[]).filter((c) =>
- (c.pages || []).some((p) => {
- const u = (p.url || '').toLowerCase().replace(/\/$/, '');
- return u === normalizedTarget;
- }),
- );
-
- return NextResponse.json({
- url: pageUrl,
- propertyId,
- keyword_count: pageKeywords.length,
- keywords: pageKeywords,
- cannibalisation: cannib,
- fetched_at: data.fetched_at,
- });
- });
- } catch (err) {
- const msg = err instanceof Error ? err.message : String(err);
- return NextResponse.json({ error: msg }, { status: 500 });
- }
+ return proxyToFastAPI(request, '/api/integrations/google/keywords/by-page');
};
diff --git a/web/app/api/integrations/google/keywords/expand/route.ts b/web/app/api/integrations/google/keywords/expand/route.ts
index c186bd98..b6aaf92a 100644
--- a/web/app/api/integrations/google/keywords/expand/route.ts
+++ b/web/app/api/integrations/google/keywords/expand/route.ts
@@ -1,112 +1,12 @@
-import { NextResponse, type NextRequest } from 'next/server';
-import { spawn } from 'child_process';
-import path from 'path';
+import { type NextRequest } from 'next/server';
+import { proxyToFastAPI } from '@/server/proxyToFastAPI';
import { forbiddenIfNotLocal } from '@/server/localOnly';
-import { formatPythonSpawnError, resolvePythonExecutable } from '@/server/resolvePython';
-import { resolvePropertyIdFromRequest } from '@/server/resolvePropertyId';
-import type { ApiRouteHandler, KeywordExpandPostBody } from '@/types/api';
+import type { ApiRouteHandler } from '@/types/api';
export const runtime = 'nodejs';
-const WEB_CWD = process.cwd();
-const DEFAULT_REPO_ROOT =
- process.env.WEBSITE_PROFILING_ROOT || path.resolve(WEB_CWD, '..');
-
-/**
- * POST /api/integrations/google/keywords/expand
- * Body: { seeds: string[], sources?: string[] }
- */
export const POST: ApiRouteHandler = async (request: NextRequest): Promise => {
- const guard = forbiddenIfNotLocal(request);
- if (guard) return guard;
-
- let body: KeywordExpandPostBody & { propertyId?: number; domain?: string };
- try {
- body = (await request.json()) as KeywordExpandPostBody & {
- propertyId?: number;
- domain?: string;
- };
- } catch {
- return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
- }
-
- const { propertyId, error: propError } = await resolvePropertyIdFromRequest(
- body.propertyId != null ? String(body.propertyId) : null,
- body.domain ?? null,
- );
- if (propError || propertyId == null) {
- return NextResponse.json({ error: propError || 'propertyId or domain required' }, { status: 400 });
- }
-
- const seeds = Array.isArray(body?.seeds)
- ? body.seeds.filter((s): s is string => typeof s === 'string' && Boolean(s.trim())).slice(0, 30)
- : [];
-
- if (seeds.length === 0) {
- return NextResponse.json({ error: 'No seeds provided' }, { status: 400 });
- }
-
- const sources = Array.isArray(body?.sources)
- ? body.sources.filter((s): s is string => typeof s === 'string')
- : ['web', 'youtube', 'questions'];
- const repoRoot = DEFAULT_REPO_ROOT;
- const pythonExe = resolvePythonExecutable(null, repoRoot);
-
- const pyScript = [
- 'import json, sys',
- "sys.path.insert(0, '.')",
- 'from src.website_profiling.integrations.google.suggest import batch_expand',
- `seeds = ${JSON.stringify(seeds)}`,
- `sources = tuple(${JSON.stringify(sources)})`,
- 'result = batch_expand(seeds, sources=sources, max_workers=4)',
- 'print(json.dumps(result, ensure_ascii=False))',
- ].join('\n');
-
- return new Promise