serpapi · dinjas · Jun 22, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,4 @@
+# Saved Google result pages are fixtures; preserve their
+# exact bytes/EOLs so the byte-for-byte expected-array match holds.
+files/*.html -text
+spec/fixtures/*.html -text
diff --git a/.rspec b/.rspec
@@ -0,0 +1,2 @@
+--require spec_helper
+--format documentation
diff --git a/.rubocop.yml b/.rubocop.yml
@@ -0,0 +1,16 @@
+AllCops:
+  TargetRubyVersion: 3.2
+  NewCops: enable
+  SuggestExtensions: false
+  Exclude:
+    - "files/**/*"
+    - "spec/fixtures/**/*"
+
+# Match the existing code (double quotes throughout).
+Style/StringLiterals:
+  EnforcedStyle: double_quotes
+
+# Long describe/context blocks are idiomatic in RSpec.
+Metrics/BlockLength:
+  Exclude:
+    - "spec/**/*"
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+source "https://rubygems.org"
+
+gem "nokogiri", "~> 1.19"
+
+group :test do
+  gem "rspec", "~> 3.13"
+end
+
+group :development, :test do
+  gem "rubocop", "~> 1.88"
+end
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -0,0 +1,84 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    ast (2.4.3)
+    diff-lcs (1.6.2)
+    json (2.19.9)
+    language_server-protocol (3.17.0.5)
+    lint_roller (1.1.0)
+    nokogiri (1.19.4-arm64-darwin)
+      racc (~> 1.4)
+    parallel (1.28.0)
+    parser (3.3.11.1)
+      ast (~> 2.4.1)
+      racc
+    prism (1.9.0)
+    racc (1.8.1)
+    rainbow (3.1.1)
+    regexp_parser (2.12.0)
+    rspec (3.13.2)
+      rspec-core (~> 3.13.0)
+      rspec-expectations (~> 3.13.0)
+      rspec-mocks (~> 3.13.0)
+    rspec-core (3.13.6)
+      rspec-support (~> 3.13.0)
+    rspec-expectations (3.13.5)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-mocks (3.13.8)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-support (3.13.7)
+    rubocop (1.88.0)
+      json (~> 2.3)
+      language_server-protocol (~> 3.17.0.2)
+      lint_roller (~> 1.1.0)
+      parallel (>= 1.10)
+      parser (>= 3.3.0.2)
+      rainbow (>= 2.2.2, < 4.0)
+      regexp_parser (>= 2.9.3, < 3.0)
+      rubocop-ast (>= 1.49.0, < 2.0)
+      ruby-progressbar (~> 1.7)
+      unicode-display_width (>= 2.4.0, < 4.0)
+    rubocop-ast (1.49.1)
+      parser (>= 3.3.7.2)
+      prism (~> 1.7)
+    ruby-progressbar (1.13.0)
+    unicode-display_width (3.2.0)
+      unicode-emoji (~> 4.1)
+    unicode-emoji (4.2.0)
+
+PLATFORMS
+  arm64-darwin-25
+
+DEPENDENCIES
+  nokogiri (~> 1.19)
+  rspec (~> 3.13)
+  rubocop (~> 1.88)
+
+CHECKSUMS
+  ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
+  diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
+  json (2.19.9) sha256=9b9025b7cdddafa38d316eca0b2358488e42d417045c1b90d216a9fefe46b79a
+  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
+  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
+  nokogiri (1.19.4-arm64-darwin) sha256=a46db9853286e6597b36ebc6953817d15acf3a299583eb3f89fdc6f91dd63527
+  parallel (1.28.0) sha256=33e6de1484baf2524792d178b0913fc8eb94c628d6cfe45599ad4458c638c970
+  parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
+  prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
+  racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
+  rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
+  regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb
+  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
+  rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
+  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
+  rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
+  rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
+  rubocop (1.88.0) sha256=e420ddf1662d0ef34bc8a2910ac4b396a7ddda0b51a708264405241734b08e0b
+  rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
+  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
+  unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
+  unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
+
+BUNDLED WITH
+  4.0.10
diff --git a/README.md b/README.md
@@ -21,8 +21,128 @@ Parse directly the HTML result page ([html file]) in this repository. No extra H
 [html file]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/van-gogh-paintings.html
 [expected array]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/expected-array.json
 
-Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed). 
+Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed).
 
 Test against 2 other similar result pages to make sure it works against different layouts. (Pages that contain the same kind of carrousel. Don't necessarily have to be paintings.)
 
 The suggested time for this challenge is 4 hours. But, you can take your time and work more on it if you want.
+
+---
+
+## Solution
+
+Parses the Knowledge Graph carousel out of a saved Google results page into an
+array of `{ name, extensions, link, image }` objects.
+
+The Van Gogh paintings case is the required deliverable and is covered. The same code handles other entity types (albums, buildings, cast) because only the *locator* changes per type.
+
+### Running it
+
+```sh
+bundle install
+bundle exec rspec     # specs
+bundle exec rubocop   # lint
+```
+
+Run it against any saved results page (prints JSON to stdout):
+
+```sh
+ruby -Ilib -rcarousel_extractor -rjson \
+  -e 'puts JSON.pretty_generate(CarouselExtractor.call(File.read(ARGV[0])))' \
+  files/van-gogh-paintings.html
+```
+
+### Output
+
+An array of symbol-keyed hashes, in the key order of `files/expected-array.json`.
+The run command above pretty-prints for readability; the byte-for-byte claim is
+about compact `to_json` — serialized that way, the Van Gogh case is identical to
+the expected output:
+
+```json
+{ "name": "The Starry Night", "extensions": ["1889"], "link": "https://www.google.com/search?...", "image": "data:image/jpeg;base64,..." }
+```
+
+`extensions` is omitted entirely when a tile has no secondary line (e.g.
+yearless paintings). The only field we *know* the meaning of is the painting's
+date (from the expected fixture); for other types `extensions` carries
+whatever the tile's second line is (year, or a character name for a cast
+carousel).
+
+### Approach / design notes
+
+- **Locate by stable schema, not styling.** Tiles are found via the Knowledge
+  Graph `data-attrid` (e.g. `kc:/visual_art/visual_artist:works`), never by
+  minified classes, `jsname`, or per-request ids, which are not stable.
+- **Allowlist of carousel tags, not "any tile container."** Scoping extraction to
+  a known carousel container keeps off-target tiles out. The Grateful Dead albums
+  page is the clearest case: alongside the 12 album tiles it carries eBay/Target
+  shopping thumbnails that *also* wrap an `<img>` in a `/search?q=…` anchor. A
+  matcher keying only on "image+text tiles linking to /search" scrapes those two
+  in as extra entries (14 instead of 12); scoping to the
+  `kc:/music/artist:albums` container ignores them. Other off-target shapes the
+  allowlist skips: a non-entity strip (`kc:/common/topic:social media presence`
+  on the Unilever page) or an *entity* carousel of a type we haven't validated
+  (`kc:/business/business_operation:founder`, a company's founders). To support a
+  new type, add its tag to `CAROUSEL_ATTRIDS` plus a fixture and a spec.
+- **Per-tile extraction depends on structure.** `name` and `extensions` come from
+  the leaf text `<div>`s under each anchor (name from the first div, falling back to
+  `img@alt`); `link` from the anchor. The split is positional — the first leaf is
+  the name and any leaf after it becomes an extension — so extra decorative text
+  in a tile would leak into `extensions`. Across the current fixtures each
+  tile has at most one secondary line, so this stays clean.
+- **Thumbnails without extra requests.** The first tiles render a placeholder
+  `<img>` whose real bytes arrive later in the page: searching the HTML for the
+  base64 string from `expected-array.json` led to `_setImagesSrc(...)` `<script>`
+  blocks that map an image id to a `data:` URI (with `\xNN` escapes to unescape).
+  Tiles past the inlined batch carry the thumbnail URL directly in `data-src`.
+  Either way the value is already in the file, so extraction makes no network
+  calls. This mixed result (inline base64 for the first tiles, in-page URLs for
+  the rest) is exactly what `expected-array.json` contains.
+
+### Verified against (spec/fixtures/)
+
+| query | carousel tag | result |
+| --- | --- | --- |
+| Van Gogh paintings | `visual_art/visual_artist:works` | exact `expected-array.json` match |
+| Grateful Dead albums | `music/artist:albums` | 12 tiles; sibling eBay/Target shopping thumbnails excluded |
+| Frank Lloyd Wright buildings | `architecture/architect:designed` | no dates (extensions omitted) |
+| Breaking Bad cast | `tv/tv_program:cast` | extensions = character |
+| Mark Gonzales skateboard art / Unilever brands | — | `[]` (organic SERP / wrong-module) |
+
+### Layout
+
+```
+lib/carousel_extractor.rb  # the extractor
+spec/                      # RSpec: exact match + per-type + negatives
+spec/fixtures/             # alternate-layout result-page captures
+files/                     # challenge-provided fixtures (html + expected-array)
+```
+
+### Notes / tradeoffs
+
+- **`extensions` is kept generic on purpose.** The expected fixture only tells
+  us what the *paintings* second line means (the date). Rather than infer
+  per-type semantics, I opted for simplicity and am passing through each tile's
+  secondary line as-is: a year for albums, a character name for cast, and nothing
+  for buildings.
+- **Allowlist over pattern/shape matching.** I'm detecting carousels by an
+  explicit set of `data-attrid` tags instead of "any container with N image+text
+  anchors."
+  Matching the shape would generalize to unseen types for free, but it also
+  risks false-positives on carousel-shaped modules that aren't entity collections.
+  I chose to limit support to KG tags we have proven we can handle.
+  Adding a new type requires adding the tag to a list, capturing an HTML
+  fixture, and adding a spec.
+- **Multiple carousels are all returned.** When a page exposes more than one
+  allowlisted carousel (e.g. a polymath who is both architect and visual artist),
+  every populated carousel's tiles are concatenated in `CAROUSEL_ATTRIDS` order.
+  A matched block with no image tiles contributes nothing, so degenerate strips
+  (like da Vinci's empty architect block) drop out for free. The result is a
+  flat, untyped array — I don't try to guess which single carousel the caller
+  wanted. In every observed real page only one allowlisted carousel is actually
+  populated, so this never changes a single-carousel result.
+- **Where I stopped.** Fixtures are all `en`/`us` desktop captures; I didn't probe
+  other locales or mobile layouts, handle "View more" expansions/pagination, or
+  dedupe repeated tiles. The locator and per-tile extraction are independent, so
+  those would slot in without reworking the core.
diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb
@@ -0,0 +1,123 @@
+# frozen_string_literal: true
+
+require "nokogiri"
+require "uri"
+
+# Extracts a Google Knowledge Graph entity carousel (paintings, albums,
+# buildings, cast, ...) into a uniform array of:
+# `{ name, extensions, link, image }`.
+# Resilience: the carousel is located by stable Knowledge Graph schema
+# (`data-attrid`) and tile structure. We don't use minified classes,
+# `jsname`, or per-request ids, because they are not stable.
+class CarouselExtractor
+  CAROUSEL_ATTRIDS = [
+    "kc:/architecture/architect:designed", # buildings
+    "kc:/music/artist:albums",             # albums
+    "kc:/tv/tv_program:cast",              # cast
+    "kc:/visual_art/visual_artist:works"   # paintings
+  ].freeze
+  GOOGLE           = "https://www.google.com"
+  HEX_ESCAPE       = /\\x([0-9a-fA-F]{2})/ # e.g. \x3d -> "="
+  # base64 thumbnails injected by:  var s='data:...';var ii=['<imgid>'];
+  THUMBNAIL_SCRIPT = %r{var s='(data:image/[^']*)';\s*var ii=\[([^\]]+)\]}
+  private_constant(*constants(false))
+
+  # Shorthand for `new(html).entries`.
+  def self.call(html) = new(html).entries
+
+  # Parses the page and indexes the script-injected thumbnails once, up front,
+  # so each tile's lookup in image_for is a plain hash read.
+  def initialize(html)
+    @doc = Nokogiri::HTML(html)
+    @thumbnails = index_inline_thumbnails
+  end
+
+  # One hash per tile anchor inside the allowlisted carousels. Anchors that
+  # entry_for rejects (no <img> or no href) are dropped by filter_map.
+  def entries
+    carousels.flat_map { |c| c.css("a").to_a }.filter_map { |anchor| entry_for(anchor) }
+  end
+
+  private
+
+  attr_reader :doc, :thumbnails
+
+  # A page may expose several allowlisted carousels at once (e.g. da Vinci is
+  # both architect and visual artist). Return one container per attrid type, in
+  # CAROUSEL_ATTRIDS order, rather than guessing which single one the caller
+  # wanted. A matched block with no image tiles contributes no entries, so
+  # degenerate carousels drop out on their own without special handling.
+  # at_css (not css) per id on purpose: Google stamps the same data-attrid on
+  # the outer module wrapper and its descendants, so css would return the same
+  # tiles many times over. The first match is the outermost wrapper.
+  def carousels
+    @carousels ||= CAROUSEL_ATTRIDS.filter_map { |id| doc.at_css(%([data-attrid="#{id}"])) }
+  end
+
+  # name and extensions come from the leaf text divs ([name, *extensions]);
+  # name falls back to img@alt. Key order matches expected-array.json so the
+  # JSON is identical byte-for-byte. Returns nil (so the caller can skip it)
+  # when the anchor has no <img> or no href, i.e. it is not a tile.
+  def entry_for(anchor)
+    image = anchor.at_css("img")
+    return unless image && anchor["href"]
+
+    leaves = text_leaves(anchor)
+    extensions = leaves.drop(1)
+
+    {
+      name: leaves.first || image["alt"],
+      **(extensions.any? ? { extensions: extensions } : {}),
+      link: URI.join(GOOGLE, anchor["href"]).to_s,
+      image: image_for(image)
+    }
+  end
+
+  # In-page thumbnail, no extra request: script-injected base64
+  # for the first tiles, else the lazy gstatic URL in data-src. The
+  # 1x1 placeholder gif is skipped.
+  def image_for(image)
+    src = image["src"]
+    src = nil if src&.start_with?("data:image/gif")
+    thumbnails[image["id"]] || image["data-src"] || src
+  end
+
+  # Pulls the single-quoted ids out of the captured `ii` list text,
+  # e.g. "'a','b'" -> ["a", "b"].
+  def image_ids(raw_ids)
+    raw_ids.scan(/'([^']+)'/).flatten
+  end
+
+  # image id => inline base64 thumbnail, parsed from the _setImagesSrc script
+  # blocks (each match pairs one data: URI with a list of image ids).
+  def index_inline_thumbnails
+    thumbnail_matches.each_with_object({}) do |(data_uri, raw_ids), map|
+      uri = unescape(data_uri)
+      image_ids(raw_ids).each { |id| map[id] = uri }
+    end
+  end
+
+  # Stripped, non-empty text of every innermost <div> under the anchor (a div
+  # that contains no other div), in the order css returns them.
+  def text_leaves(anchor)
+    anchor.css("div")
+          .reject { |d| d.at_css("div") }
+          .map { |d| d.text.strip }
+          .reject(&:empty?)
+  end
+
+  # Every [data_uri, raw_ids] capture pair that THUMBNAIL_SCRIPT finds across
+  # all <script> elements in the document.
+  def thumbnail_matches
+    doc.css("script").flat_map { |script| script.text.scan(THUMBNAIL_SCRIPT) }
+  end
+
+  # Google escapes characters like "=" as \xNN inside the script string literal.
+  # A JavaScript \xNN escape denotes the code point U+00NN, so decode it as a
+  # UTF-8 character: a bare Integer#chr would yield a binary (ASCII-8BIT) byte
+  # for \x80..\xff instead of that character. For the ASCII escapes seen in
+  # base64 data URIs the result is unchanged.
+  def unescape(data_uri)
+    data_uri.gsub(HEX_ESCAPE) { Regexp.last_match(1).hex.chr(Encoding::UTF_8) }
+  end
+end