From 6fc15b7c338e48a02095d63453acac665d4e5354 Mon Sep 17 00:00:00 2001
From: PrashantUnity <priyadarshisadiku@gmail.com>
Date: Thu, 25 Jun 2026 09:58:27 +0530
Subject: [PATCH] Gsc Ranking

---
 src/website_profiling/reporting/builder.py    |   8 +
 .../reporting/categories/__init__.py          |   2 +
 .../categories/search_performance.py          | 164 ++++++++++++++++++
 .../reporting/terminology.py                  |   1 +
 .../test_category_search_performance.py       | 154 ++++++++++++++++
 web/src/lib/categoryDisplayNames.ts           |   1 +
 web/src/strings.json                          |   2 +-
 7 files changed, 331 insertions(+), 1 deletion(-)
 create mode 100644 src/website_profiling/reporting/categories/search_performance.py
 create mode 100644 tests/reporting/test_category_search_performance.py

diff --git a/src/website_profiling/reporting/builder.py b/src/website_profiling/reporting/builder.py
index 8a547baf..2e6c1d16 100644
--- a/src/website_profiling/reporting/builder.py
+++ b/src/website_profiling/reporting/builder.py
@@ -589,6 +589,14 @@ def run_simple_report(
                     report_data.get("categories") or [],
                     google_data,
                 )
+                # Add a category scored from real Search Console performance.
+                # Returns None when there is no GSC search data, so the headline
+                # health average stays internal-only when Google isn't connected.
+                from .categories import category_search_performance
+
+                sp = category_search_performance(google_data.get("gsc"))
+                if sp is not None:
+                    report_data.setdefault("categories", []).append(sp)
         except Exception:
             pass
         try:
diff --git a/src/website_profiling/reporting/categories/__init__.py b/src/website_profiling/reporting/categories/__init__.py
index e9921746..70f36f69 100644
--- a/src/website_profiling/reporting/categories/__init__.py
+++ b/src/website_profiling/reporting/categories/__init__.py
@@ -14,6 +14,7 @@
     category_core_web_vitals_from_lighthouse,
     category_performance,
 )
+from .search_performance import category_search_performance
 from .security import category_security
 from .technical_seo import category_technical_seo
 from ._helpers import (
@@ -53,6 +54,7 @@
     "category_mobile",
     "category_security",
     "category_intelligence",
+    "category_search_performance",
     "_issue",
     "_sort_issues",
     "_page_analysis_dict",
diff --git a/src/website_profiling/reporting/categories/search_performance.py b/src/website_profiling/reporting/categories/search_performance.py
new file mode 100644
index 00000000..44dc4220
--- /dev/null
+++ b/src/website_profiling/reporting/categories/search_performance.py
@@ -0,0 +1,164 @@
+"""Report category: search_performance.
+
+Scored from real Google Search Console data (average position, CTR, query
+distribution, click/impression trend) — unlike the other categories, this one
+reflects how the site actually performs in Google, not internal audit heuristics.
+
+Returns ``None`` when GSC data is unavailable (Google not connected, or the
+property has no search impressions in the window) so the builder can skip it and
+the headline Site-health average stays internal-only.
+"""
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from ._helpers import (
+    _issue,
+    _score_deductions,
+    _sort_issues,
+)
+from ..terminology import CATEGORY_SEARCH_PERFORMANCE
+
+# Minimum impressions before a CTR / zero-click signal is meaningful (low-volume
+# queries have noisy CTR and shouldn't drive deductions).
+_MIN_IMPRESSIONS_FOR_CTR = 100
+_STRIKING_MIN_IMPRESSIONS = 10
+# Need at least this many daily points to split the window into halves for a trend.
+_TREND_MIN_DAYS = 6
+# A half-over-half drop below this ratio counts as a decline.
+_DECLINE_RATIO = 0.8
+
+
+def _expected_ctr(position: float) -> float:
+    """Return the rough organic CTR (percent) typical for an average position.
+
+    Bands are tried from the best rank downwards; the first band whose upper
+    bound covers ``position`` wins, and anything past position 10 falls through
+    to the 1% floor.
+    """
+    # (inclusive upper position bound, expected CTR %) — best rank first.
+    bands = ((1.5, 28.0), (2.5, 15.0), (3.5, 11.0), (5.0, 7.0), (10.0, 3.0))
+    for upper_bound, ctr_percent in bands:
+        if position <= upper_bound:
+            return ctr_percent
+    return 1.0
+
+
+def category_search_performance(gsc: Optional[dict[str, Any]]) -> Optional[dict]:
+    """Score real Google Search Console performance, or ``None`` if no GSC data."""
+    if not gsc or not isinstance(gsc, dict):
+        return None
+    summary = gsc.get("summary") or {}
+    impressions = float(summary.get("impressions") or 0)
+    if impressions <= 0:
+        return None
+
+    position = float(summary.get("position") or 0)
+    ctr = float(summary.get("ctr") or 0)  # percent (0–100)
+    # Keep only dict rows so a single malformed entry cannot abort the category.
+    top_queries = [q for q in (gsc.get("top_queries") or []) if isinstance(q, dict)]
+    daily = [d for d in (gsc.get("daily") or []) if isinstance(d, dict)]
+
+    issues: list[dict] = []
+    deductions: list[tuple[int, bool]] = []
+
+    # --- Average position: the headline ranking signal (1 = best) -------------
+    if position > 0:
+        if position > 20:
+            issues.append(_issue(
+                f"Average Google position is {position:.1f} — most queries rank beyond page 2.",
+                priority="High",
+                recommendation="Strengthen on-page relevance, internal linking, and content depth for target queries.",
+            ))
+            deductions.append((35, True))
+        elif position > 10:
+            issues.append(_issue(
+                f"Average Google position is {position:.1f} — ranking on page 2 for many queries.",
+                priority="High",
+                recommendation="Improve on-page optimisation and internal links to push key queries onto page 1.",
+            ))
+            deductions.append((20, True))
+        elif position > 3:
+            issues.append(_issue(
+                f"Average Google position is {position:.1f} — room to reach the top 3.",
+                priority="Medium",
+                recommendation="Refine titles, content, and internal links for queries ranking 4–10.",
+            ))
+            deductions.append((8, True))
+
+    # --- CTR vs. expected for the average position ----------------------------
+    if impressions >= _MIN_IMPRESSIONS_FOR_CTR and position > 0:
+        expected = _expected_ctr(position)
+        if ctr < expected * 0.6:
+            issues.append(_issue(
+                f"Click-through rate ({ctr:.1f}%) is below the ~{expected:.0f}% typical for "
+                f"average position {position:.1f}.",
+                priority="Medium",
+                recommendation="Improve titles and meta descriptions, and add structured data for richer SERP snippets.",
+            ))
+            deductions.append((10, True))
+
+    # --- Striking-distance queries (page 2: positions 11–20) ------------------
+    striking = [
+        q for q in top_queries
+        if 10 < float(q.get("position") or 0) <= 20
+        and float(q.get("impressions") or 0) >= _STRIKING_MIN_IMPRESSIONS
+    ]
+    if striking:
+        sample = ", ".join(str(q.get("query") or "") for q in striking[:3] if q.get("query"))
+        more = f" (+{len(striking) - 3} more)" if len(striking) > 3 else ""
+        issues.append(_issue(
+            f"{len(striking)} quer(y/ies) rank on page 2 (positions 11–20): {sample}{more}.",
+            priority="Medium",
+            recommendation="These are close to page 1 — add internal links and refresh content to push them up.",
+        ))
+        deductions.append((min(10, len(striking)), True))
+
+    # --- Zero-click, high-impression queries ---------------------------------
+    zero_click = [
+        q for q in top_queries
+        if float(q.get("impressions") or 0) >= _MIN_IMPRESSIONS_FOR_CTR
+        and float(q.get("clicks") or 0) == 0
+    ]
+    if zero_click:
+        sample = ", ".join(str(q.get("query") or "") for q in zero_click[:3] if q.get("query"))
+        more = f" (+{len(zero_click) - 3} more)" if len(zero_click) > 3 else ""
+        issues.append(_issue(
+            f"{len(zero_click)} quer(y/ies) get impressions but no clicks: {sample}{more}.",
+            priority="Medium",
+            recommendation="Review search intent match and rewrite titles/descriptions to earn the click.",
+        ))
+        deductions.append((min(8, len(zero_click)), True))
+
+    # --- Click / impression trend: equal halves (odd middle day dropped) -----
+    if len(daily) >= _TREND_MIN_DAYS:
+        mid = len(daily) // 2
+        first, second = daily[:mid], daily[-mid:]
+        first_clicks = sum(float(d.get("clicks") or 0) for d in first)
+        second_clicks = sum(float(d.get("clicks") or 0) for d in second)
+        first_impr = sum(float(d.get("impressions") or 0) for d in first)
+        second_impr = sum(float(d.get("impressions") or 0) for d in second)
+        if first_clicks > 0 and second_clicks < first_clicks * _DECLINE_RATIO:
+            issues.append(_issue(
+                "Search clicks are declining over the reporting window.",
+                priority="High",
+                recommendation="Investigate ranking losses or seasonality; refresh affected pages.",
+            ))
+            deductions.append((12, True))
+        elif first_impr > 0 and second_impr < first_impr * _DECLINE_RATIO:
+            issues.append(_issue(
+                "Search impressions are declining over the reporting window.",
+                priority="Medium",
+                recommendation="Check for indexing or visibility losses; expand and refresh content.",
+            ))
+            deductions.append((8, True))
+
+    score = _score_deductions(100, deductions)
+    return {
+        "id": "search_performance",
+        "name": CATEGORY_SEARCH_PERFORMANCE,
+        "score": int(score),
+        "issues": _sort_issues(issues),
+        # dict.fromkeys de-duplicates while keeping a stable, first-seen order.
+        "recommendations": list(dict.fromkeys(i["recommendation"] for i in issues if i["recommendation"])),
+    }
diff --git a/src/website_profiling/reporting/terminology.py b/src/website_profiling/reporting/terminology.py
index bd892106..2480696e 100644
--- a/src/website_profiling/reporting/terminology.py
+++ b/src/website_profiling/reporting/terminology.py
@@ -14,6 +14,7 @@
 CATEGORY_MOBILE = "Mobile SEO"
 CATEGORY_SECURITY = "Security"
 CATEGORY_CONTENT_QUALITY = "Content quality"
+CATEGORY_SEARCH_PERFORMANCE = "Search performance"
 
 # Older audits may still use legacy names — map for exports and UI fallbacks
 LEGACY_CATEGORY_DISPLAY: dict[str, str] = {
diff --git a/tests/reporting/test_category_search_performance.py b/tests/reporting/test_category_search_performance.py
new file mode 100644
index 00000000..da9f07d5
--- /dev/null
+++ b/tests/reporting/test_category_search_performance.py
@@ -0,0 +1,154 @@
+"""Unit tests for the Search performance category (real GSC-driven scoring)."""
+from __future__ import annotations
+
+from website_profiling.reporting.categories import category_search_performance
+
+
+def _daily(values: list[tuple[int, int]]) -> list[dict]:
+    """Turn (clicks, impressions) pairs into daily GSC rows dated from 2024-01-01."""
+    return [
+        {
+            "date": f"2024-01-{day:02d}",
+            "clicks": clicks,
+            "impressions": impressions,
+            "ctr": round(clicks / impressions * 100, 2) if impressions else 0.0,
+            "position": 5.0,
+        }
+        for day, (clicks, impressions) in enumerate(values, start=1)
+    ]
+
+
+# --- no data --------------------------------------------------------------
+
+
+def test_none_when_no_gsc() -> None:
+    for missing in (None, {}):
+        assert category_search_performance(missing) is None
+
+
+def test_none_when_zero_impressions() -> None:
+    empty_summary = {"clicks": 0, "impressions": 0, "ctr": 0.0, "position": 0.0}
+    assert category_search_performance({"summary": empty_summary}) is None
+
+
+# --- strong performance: full score, no issues ----------------------------
+
+
+def test_strong_rankings_scores_100_no_issues() -> None:
+    rising = _daily([(5, 100), (5, 100), (5, 100), (20, 200), (20, 200), (20, 200)])
+    brand = {"query": "brand", "clicks": 200, "impressions": 300, "ctr": 66.6, "position": 1.4}
+    payload = {
+        "summary": {"clicks": 500, "impressions": 1000, "ctr": 50.0, "position": 2.0},
+        "top_queries": [brand],
+        "top_pages": [],
+        "daily": rising,
+    }
+    result = category_search_performance(payload)
+    assert result is not None
+    assert result["id"] == "search_performance"
+    assert result["name"] == "Search performance"
+    assert result["score"] == 100
+    assert result["issues"] == []
+
+
+# --- poor performance: deductions + issues --------------------------------
+
+
+def test_poor_rankings_declining_trend_and_striking_distance() -> None:
+    falling = _daily([(20, 400), (20, 400), (20, 400), (2, 100), (2, 100), (2, 100)])
+    payload = {
+        "summary": {"clicks": 5, "impressions": 1000, "ctr": 0.5, "position": 25.0},
+        "top_queries": [
+            {"query": "q1", "clicks": 0, "impressions": 50, "ctr": 0.0, "position": 15.0},
+            {"query": "q2", "clicks": 0, "impressions": 60, "ctr": 0.0, "position": 18.0},
+            {"query": "q3", "clicks": 0, "impressions": 30, "ctr": 0.0, "position": 12.0},
+            {"query": "q4", "clicks": 0, "impressions": 200, "ctr": 0.0, "position": 30.0},
+        ],
+        "top_pages": [],
+        "daily": falling,
+    }
+    result = category_search_performance(payload)
+    assert result is not None
+    assert result["score"] < 60
+    assert "High" in {issue["priority"] for issue in result["issues"]}  # avg position > 20 and/or declining clicks
+    text = " ".join(issue["message"] for issue in result["issues"]).lower()
+    assert "page 2" in text  # striking-distance queries surfaced
+    assert "declining" in text  # trend signal surfaced
+    assert result["recommendations"]  # recommendations derived from issues
+
+
+def test_top_position_uses_high_expected_ctr() -> None:
+    # Position <= 1.5 expects ~28% CTR, so a 60% CTR must not be penalised.
+    payload = {
+        "summary": {"clicks": 600, "impressions": 1000, "ctr": 60.0, "position": 1.2},
+        "top_queries": [],
+        "top_pages": [],
+        "daily": [],
+    }
+    result = category_search_performance(payload)
+    assert result is not None
+    assert result["score"] == 100
+    assert result["issues"] == []
+
+
+def test_position_three_band_low_ctr_flagged() -> None:
+    # Position in (2.5, 3.5] expects ~11% CTR; a 2% CTR is low enough to be flagged.
+    payload = {
+        "summary": {"clicks": 20, "impressions": 1000, "ctr": 2.0, "position": 3.0},
+        "top_queries": [],
+        "top_pages": [],
+        "daily": [],
+    }
+    result = category_search_performance(payload)
+    assert result is not None
+    text = " ".join(issue["message"] for issue in result["issues"]).lower()
+    assert "click-through rate" in text
+    # An average position of exactly 3.0 is not > 3, so no position issue.
+    assert "average google position" not in text
+
+
+def test_mid_position_band_expected_ctr() -> None:
+    # Position in (5, 10] expects ~3% CTR, so 8% is healthy and earns no CTR deduction.
+    payload = {
+        "summary": {"clicks": 80, "impressions": 1000, "ctr": 8.0, "position": 8.0},
+        "top_queries": [],
+        "top_pages": [],
+        "daily": [],
+    }
+    result = category_search_performance(payload)
+    assert result is not None
+    text = " ".join(issue["message"] for issue in result["issues"]).lower()
+    assert "click-through rate" not in text
+    assert "average google position is 8.0" in text  # 4–10 band issue
+
+
+def test_page2_average_position_and_declining_impressions() -> None:
+    # Position in (10, 20] hits the page-2 branch; clicks stay flat while impressions fall.
+    payload = {
+        "summary": {"clicks": 60, "impressions": 1000, "ctr": 6.0, "position": 15.0},
+        "top_queries": [],
+        "top_pages": [],
+        "daily": _daily([(10, 400), (10, 400), (10, 400), (10, 100), (10, 100), (10, 100)]),
+    }
+    result = category_search_performance(payload)
+    assert result is not None
+    text = " ".join(issue["message"] for issue in result["issues"]).lower()
+    assert "page 2 for many queries" in text
+    assert "impressions are declining" in text
+    assert "clicks are declining" not in text
+
+
+def test_striking_distance_ignores_low_impression_queries() -> None:
+    # Position 11-20 but only a handful of impressions -> not striking.
+    noise = {"query": "noise", "clicks": 0, "impressions": 3, "ctr": 0.0, "position": 14.0}
+    payload = {
+        "summary": {"clicks": 50, "impressions": 500, "ctr": 10.0, "position": 4.0},
+        "top_queries": [noise],
+        "top_pages": [],
+        "daily": [],
+    }
+    result = category_search_performance(payload)
+    assert result is not None
+    text = " ".join(issue["message"] for issue in result["issues"]).lower()
+    # The page-2 (striking-distance) issue must be absent.
+    assert "page 2" not in text
diff --git a/web/src/lib/categoryDisplayNames.ts b/web/src/lib/categoryDisplayNames.ts
index ac08feee..e79df625 100644
--- a/web/src/lib/categoryDisplayNames.ts
+++ b/web/src/lib/categoryDisplayNames.ts
@@ -8,6 +8,7 @@ const CATEGORY_DISPLAY: Record<string, string> = {
   'Mobile SEO': 'Mobile SEO',
   Security: 'Security',
   'Content quality': 'Content quality',
+  'Search performance': 'Search performance',
   // Legacy payloads
   'HTML/Accessibility': 'Accessibility & markup',
   'HTML & Accessibility': 'Accessibility & markup',
diff --git a/web/src/strings.json b/web/src/strings.json
index 00753e64..3bd5c479 100644
--- a/web/src/strings.json
+++ b/web/src/strings.json
@@ -39,7 +39,7 @@
       },
       "healthScore": {
         "title": "Site health",
-        "body": "Score from 0–100 based on issue category weights in this audit. Higher is better. Reflects technical SEO health, not Google rankings."
+        "body": "Score from 0–100 averaged across this audit's category scores. Higher is better. Mainly reflects technical SEO health; also includes real Search Console performance (rankings, CTR, trends) when Google is connected."
       },
       "impactScore": {
         "title": "Impact score",