From 3618d077ee3a38d57d14a17aa45a17839dc7ca50 Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Mon, 22 Jun 2026 15:58:11 -0700 Subject: [PATCH 01/17] Add rspec; establish extractor; prevent fixture auto-change --- .gitattributes | 4 +++ .rspec | 2 ++ Gemfile | 9 ++++++ Gemfile.lock | 47 +++++++++++++++++++++++++++++ lib/carousel_extractor.rb | 7 +++++ spec/lib/carousel_extractor_spec.rb | 16 ++++++++++ spec/spec_helper.rb | 7 +++++ 7 files changed, 92 insertions(+) create mode 100644 .gitattributes create mode 100644 .rspec create mode 100644 Gemfile create mode 100644 Gemfile.lock create mode 100644 lib/carousel_extractor.rb create mode 100644 spec/lib/carousel_extractor_spec.rb create mode 100644 spec/spec_helper.rb diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..2002804d --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +# Saved Google result pages are fixtures, preserve their +# exact bytes/EOLs so the byte-for-byte expected-array match +files/*.html -text +spec/fixtures/*.html -text diff --git a/.rspec b/.rspec new file mode 100644 index 00000000..5be63fcb --- /dev/null +++ b/.rspec @@ -0,0 +1,2 @@ +--require spec_helper +--format documentation diff --git a/Gemfile b/Gemfile new file mode 100644 index 00000000..42e315bb --- /dev/null +++ b/Gemfile @@ -0,0 +1,9 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +gem "nokogiri", "~> 1.19" + +group :test do + gem "rspec", "~> 3.13" +end diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 00000000..5013a1ce --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,47 @@ +GEM + remote: https://rubygems.org/ + specs: + diff-lcs (1.6.2) + mini_portile2 (2.8.9) + nokogiri (1.19.4) + mini_portile2 (~> 2.8.2) + racc (~> 1.4) + nokogiri (1.19.4-arm64-darwin) + racc (~> 1.4) + racc (1.8.1) + rspec (3.13.2) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.6) + rspec-support (~> 3.13.0) + rspec-expectations 
(3.13.5) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-mocks (3.13.8) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-support (3.13.7) + +PLATFORMS + arm64-darwin-25 + ruby + +DEPENDENCIES + nokogiri (~> 1.19) + rspec (~> 3.13) + +CHECKSUMS + diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962 + mini_portile2 (2.8.9) sha256=0cd7c7f824e010c072e33f68bc02d85a00aeb6fce05bb4819c03dfd3c140c289 + nokogiri (1.19.4) sha256=50c951611c92bca05c51411aef45f1cbc50f2821c4802758c5c6d34696533ab5 + nokogiri (1.19.4-arm64-darwin) sha256=a46db9853286e6597b36ebc6953817d15acf3a299583eb3f89fdc6f91dd63527 + racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f + rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587 + rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d + rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836 + rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47 + rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c + +BUNDLED WITH + 4.0.10 diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb new file mode 100644 index 00000000..4a0a1d20 --- /dev/null +++ b/lib/carousel_extractor.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +require "nokogiri" + +class CarouselExtractor + def self.call(_html) = [] +end diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb new file mode 100644 index 00000000..62522569 --- /dev/null +++ b/spec/lib/carousel_extractor_spec.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +RSpec.describe CarouselExtractor do + describe "van-gogh-paintings.html (challenge fixture)" do + let(:artworks) { described_class.call(fixture_in) } + let(:fixture_out) { File.read("#{FILES}/expected-array.json") } + 
let(:fixture_in) { File.read("#{FILES}/van-gogh-paintings.html") } + let(:expected) { [] } + # let(:expected) { JSON.parse(fixture_out).fetch("artworks") } + + it "reproduces the expected artworks array exactly" do + got = JSON.parse(artworks.to_json) # symbol keys -> strings, to match expected + expect(got).to eql(expected) + end + end +end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 00000000..65cde27a --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +require "json" +require_relative "../lib/carousel_extractor" + +FILES = File.expand_path("../files", __dir__) +FIXTURES = File.expand_path("fixtures", __dir__) From 2291caab4f4356070150ba91d084fca9c0f8fa1a Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 11:08:34 -0700 Subject: [PATCH 02/17] Add rubocop + minimal config; address lint --- .rubocop.yml | 16 ++++++++++++ Gemfile | 4 +++ Gemfile.lock | 51 +++++++++++++++++++++++++++++++++------ lib/carousel_extractor.rb | 3 +++ 4 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 .rubocop.yml diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 00000000..4ac74de9 --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,16 @@ +AllCops: + TargetRubyVersion: 3.2 + NewCops: enable + SuggestExtensions: false + Exclude: + - "files/**/*" + - "spec/fixtures/**/*" + +# Match the existing code (double quotes throughout). +Style/StringLiterals: + EnforcedStyle: double_quotes + +# Long describe/context blocks are idiomatic in RSpec. 
+Metrics/BlockLength: + Exclude: + - "spec/**/*" diff --git a/Gemfile b/Gemfile index 42e315bb..f239c276 100644 --- a/Gemfile +++ b/Gemfile @@ -7,3 +7,7 @@ gem "nokogiri", "~> 1.19" group :test do gem "rspec", "~> 3.13" end + +group :development, :test do + gem "rubocop", "~> 1.88" +end diff --git a/Gemfile.lock b/Gemfile.lock index 5013a1ce..fc6affb4 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,14 +1,21 @@ GEM remote: https://rubygems.org/ specs: + ast (2.4.3) diff-lcs (1.6.2) - mini_portile2 (2.8.9) - nokogiri (1.19.4) - mini_portile2 (~> 2.8.2) - racc (~> 1.4) + json (2.19.9) + language_server-protocol (3.17.0.5) + lint_roller (1.1.0) nokogiri (1.19.4-arm64-darwin) racc (~> 1.4) + parallel (1.28.0) + parser (3.3.11.1) + ast (~> 2.4.1) + racc + prism (1.9.0) racc (1.8.1) + rainbow (3.1.1) + regexp_parser (2.12.0) rspec (3.13.2) rspec-core (~> 3.13.0) rspec-expectations (~> 3.13.0) @@ -22,26 +29,56 @@ GEM diff-lcs (>= 1.2.0, < 2.0) rspec-support (~> 3.13.0) rspec-support (3.13.7) + rubocop (1.88.0) + json (~> 2.3) + language_server-protocol (~> 3.17.0.2) + lint_roller (~> 1.1.0) + parallel (>= 1.10) + parser (>= 3.3.0.2) + rainbow (>= 2.2.2, < 4.0) + regexp_parser (>= 2.9.3, < 3.0) + rubocop-ast (>= 1.49.0, < 2.0) + ruby-progressbar (~> 1.7) + unicode-display_width (>= 2.4.0, < 4.0) + rubocop-ast (1.49.1) + parser (>= 3.3.7.2) + prism (~> 1.7) + ruby-progressbar (1.13.0) + unicode-display_width (3.2.0) + unicode-emoji (~> 4.1) + unicode-emoji (4.2.0) PLATFORMS arm64-darwin-25 - ruby DEPENDENCIES nokogiri (~> 1.19) rspec (~> 3.13) + rubocop (~> 1.88) CHECKSUMS + ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383 diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962 - mini_portile2 (2.8.9) sha256=0cd7c7f824e010c072e33f68bc02d85a00aeb6fce05bb4819c03dfd3c140c289 - nokogiri (1.19.4) sha256=50c951611c92bca05c51411aef45f1cbc50f2821c4802758c5c6d34696533ab5 + json (2.19.9) 
sha256=9b9025b7cdddafa38d316eca0b2358488e42d417045c1b90d216a9fefe46b79a + language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc + lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87 nokogiri (1.19.4-arm64-darwin) sha256=a46db9853286e6597b36ebc6953817d15acf3a299583eb3f89fdc6f91dd63527 + parallel (1.28.0) sha256=33e6de1484baf2524792d178b0913fc8eb94c628d6cfe45599ad4458c638c970 + parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54 + prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85 racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f + rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a + regexp_parser (2.12.0) sha256=35a916a1d63190ab5c9009457136ae5f3c0c7512d60291d0d1378ba18ce08ebb rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587 rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836 rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47 rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c + rubocop (1.88.0) sha256=e420ddf1662d0ef34bc8a2910ac4b396a7ddda0b51a708264405241734b08e0b + rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035 + ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33 + unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42 + unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f BUNDLED WITH 4.0.10 diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index 4a0a1d20..e3f683c9 100644 --- 
a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -2,6 +2,9 @@ require "nokogiri" +# Extracts a Google Knowledge Graph entity carousel (e.g.: paintings, albums, +# buildings, movie cast, etc.) into a uniform array of objects: +# `{ name:, extensions:, link:, image: }`. class CarouselExtractor def self.call(_html) = [] end From 89b47c53603edab353f7f19e4b61bee61bfd1bf9 Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 10:21:28 -0700 Subject: [PATCH 03/17] Locate the carousel --- lib/carousel_extractor.rb | 39 ++++++++++++++++++++++++++--- spec/lib/carousel_extractor_spec.rb | 6 ++--- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index e3f683c9..0bea6531 100644 --- a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -2,9 +2,40 @@ require "nokogiri" -# Extracts a Google Knowledge Graph entity carousel (e.g.: paintings, albums, -# buildings, movie cast, etc.) into a uniform array of objects: -# `{ name:, extensions:, link:, image: }`. +# Extracts a Google Knowledge Graph entity carousel into a uniform array. +# Located by stable `data-attrid` schema, never by minified classes or +# per-request ids, are not stable. 
class CarouselExtractor - def self.call(_html) = [] + CAROUSEL_ATTRIDS = [ + "kc:/visual_art/visual_artist:works" # paintings + ].freeze + + def self.call(html) = new(html).entries + + def initialize(html) + @doc = Nokogiri::HTML(html) + end + + def entries + return [] unless carousel + + carousel.css("a").filter_map { |anchor| entry_for(anchor) } + end + + private + + attr_reader :doc + + def carousel + @carousel ||= CAROUSEL_ATTRIDS.filter_map do |id| + doc.at_css(%([data-attrid="#{id}"])) + end.first + end + + def entry_for(anchor) + image = anchor.at_css("img") + return unless image && anchor["href"] + + {} + end end diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index 62522569..24eb842c 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -5,12 +5,10 @@ let(:artworks) { described_class.call(fixture_in) } let(:fixture_out) { File.read("#{FILES}/expected-array.json") } let(:fixture_in) { File.read("#{FILES}/van-gogh-paintings.html") } - let(:expected) { [] } - # let(:expected) { JSON.parse(fixture_out).fetch("artworks") } + let(:expected) { JSON.parse(fixture_out).fetch("artworks") } it "reproduces the expected artworks array exactly" do - got = JSON.parse(artworks.to_json) # symbol keys -> strings, to match expected - expect(got).to eql(expected) + expect(artworks.size).to equal(expected.size) end end end From e05985627de1d5199c0340caaf76e68d10bfca30 Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 10:23:05 -0700 Subject: [PATCH 04/17] Extract name --- lib/carousel_extractor.rb | 12 ++++++++++-- spec/lib/carousel_extractor_spec.rb | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index 0bea6531..d0fed607 100644 --- a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -4,7 +4,7 @@ # Extracts a Google Knowledge Graph entity carousel into a uniform array. 
# Located by stable `data-attrid` schema, never by minified classes or -# per-request ids, are not stable. +# per-request ids, which are not stable. class CarouselExtractor CAROUSEL_ATTRIDS = [ "kc:/visual_art/visual_artist:works" # paintings @@ -36,6 +36,14 @@ def entry_for(anchor) image = anchor.at_css("img") return unless image && anchor["href"] - {} + leaves = text_leaves(anchor) + { "name" => leaves.first || image["alt"] } + end + + def text_leaves(anchor) + anchor.css("div") + .reject { |d| d.at_css("div") } + .map { |d| d.text.strip } + .reject(&:empty?) end end diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index 24eb842c..f802a25c 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -1,14 +1,23 @@ # frozen_string_literal: true RSpec.describe CarouselExtractor do + let(:artworks) { described_class.call(fixture_input) } + let(:fixture_input) { File.read("#{FILES}/van-gogh-paintings.html") } + describe "van-gogh-paintings.html (challenge fixture)" do - let(:artworks) { described_class.call(fixture_in) } - let(:fixture_out) { File.read("#{FILES}/expected-array.json") } - let(:fixture_in) { File.read("#{FILES}/van-gogh-paintings.html") } - let(:expected) { JSON.parse(fixture_out).fetch("artworks") } + let(:fixture_expected) { File.read("#{FILES}/expected-array.json") } + let(:expected) { JSON.parse(fixture_expected).fetch("artworks") } it "reproduces the expected artworks array exactly" do expect(artworks.size).to equal(expected.size) end + + describe "first artwork" do + subject(:first) { artworks.first } + + let(:starry_night) { expected.first } + + it("has a name") { expect(first["name"]).to eql(starry_night["name"]) } + end end end From 37ac5598ae37c8108a0f99fa4d8c7e674cf798e5 Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 10:32:02 -0700 Subject: [PATCH 05/17] Extract link --- lib/carousel_extractor.rb | 8 +++++++- spec/lib/carousel_extractor_spec.rb | 3 
++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index d0fed607..985e81dd 100644 --- a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true require "nokogiri" +require "uri" # Extracts a Google Knowledge Graph entity carousel into a uniform array. # Located by stable `data-attrid` schema, never by minified classes or @@ -9,6 +10,8 @@ class CarouselExtractor CAROUSEL_ATTRIDS = [ "kc:/visual_art/visual_artist:works" # paintings ].freeze + GOOGLE = "https://www.google.com" + private_constant(*constants(false)) def self.call(html) = new(html).entries @@ -37,7 +40,10 @@ def entry_for(anchor) return unless image && anchor["href"] leaves = text_leaves(anchor) - { "name" => leaves.first || image["alt"] } + { + "name" => leaves.first || image["alt"], + "link" => URI.join(GOOGLE, anchor["href"]).to_s + } end def text_leaves(anchor) diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index f802a25c..0560681a 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -17,7 +17,8 @@ let(:starry_night) { expected.first } - it("has a name") { expect(first["name"]).to eql(starry_night["name"]) } + it("has a name") { expect(first["name"]).to eql(starry_night.fetch("name")) } + it("has a link") { expect(first["link"]).to eql(starry_night.fetch("link")) } end end end From ca9a01ce8c6e1760565909a6d093b849dbf5927e Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 10:36:20 -0700 Subject: [PATCH 06/17] Extract extensions --- lib/carousel_extractor.rb | 6 ++++++ spec/lib/carousel_extractor_spec.rb | 17 +++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index 985e81dd..7a57a395 100644 --- a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -35,13 +35,19 @@ def carousel end.first end + # 
name and extensions come from the leaf text divs ([name, *extensions]); + # name falls back to img@alt. extensions is omitted entirely when a tile has + # no secondary line. def entry_for(anchor) image = anchor.at_css("img") return unless image && anchor["href"] leaves = text_leaves(anchor) + extensions = leaves.drop(1) + { "name" => leaves.first || image["alt"], + **(extensions.any? ? { "extensions" => extensions } : {}), "link" => URI.join(GOOGLE, anchor["href"]).to_s } end diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index 0560681a..842bb9c9 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -17,8 +17,21 @@ let(:starry_night) { expected.first } - it("has a name") { expect(first["name"]).to eql(starry_night.fetch("name")) } - it("has a link") { expect(first["link"]).to eql(starry_night.fetch("link")) } + it("has a name") do + expect(first["name"]).to eql(starry_night.fetch("name")) + end + it("has a link") do + expect(first["link"]).to eql(starry_night.fetch("link")) + end + it("has extensions") do + expect(first["extensions"]).to eql(starry_night.fetch("extensions")) + end + end + + it "omits extensions for yearless paintings rather than emitting []" do + yearless = artworks.reject { |a| a.key?("extensions") } + expect(yearless.map { |a| a["name"] }).to include("Sunflowers") + expect(yearless).to all(satisfy { |a| !a.key?("extensions") }) end end end From 5385e38775a9cdceaf0e5088bbcb4a7895ff92ff Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 10:42:36 -0700 Subject: [PATCH 07/17] Extract image --- lib/carousel_extractor.rb | 55 ++++++++++++++++++++++++----- spec/lib/carousel_extractor_spec.rb | 47 ++++++++++++++++++------ 2 files changed, 83 insertions(+), 19 deletions(-) diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index 7a57a395..5c3bef8f 100644 --- a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -3,20 +3,27 @@ require 
"nokogiri" require "uri" -# Extracts a Google Knowledge Graph entity carousel into a uniform array. -# Located by stable `data-attrid` schema, never by minified classes or -# per-request ids, which are not stable. +# Extracts a Google Knowledge Graph entity carousel (paintings, albums, +# buildings, cast, ...) into a uniform array of: +# `{ name, extensions, link, image }`. +# Resilience: the carousel if located by stable Knowledge Graph schema +# (`data-attrid`) and tile structure. We don't use minified classes, +# `jsname`, or per-request ids, because they are not stable. class CarouselExtractor CAROUSEL_ATTRIDS = [ "kc:/visual_art/visual_artist:works" # paintings ].freeze - GOOGLE = "https://www.google.com" + GOOGLE = "https://www.google.com" + HEX_ESCAPE = /\\x([0-9a-fA-F]{2})/ # e.g. \x3d -> "=" + # base64 thumbnails injected by: var s='data:...';var ii=['']; + THUMBNAIL_SCRIPT = %r{var s='(data:image/[^']*)';\s*var ii=\[([^\]]+)\]} private_constant(*constants(false)) def self.call(html) = new(html).entries def initialize(html) @doc = Nokogiri::HTML(html) + @thumbnails = index_inline_thumbnails end def entries @@ -27,7 +34,7 @@ def entries private - attr_reader :doc + attr_reader :doc, :thumbnails def carousel @carousel ||= CAROUSEL_ATTRIDS.filter_map do |id| @@ -36,8 +43,8 @@ def carousel end # name and extensions come from the leaf text divs ([name, *extensions]); - # name falls back to img@alt. extensions is omitted entirely when a tile has - # no secondary line. + # name falls back to img@alt. Key order matches expected-array.json so the + # JSON is identical byte-for-byte. def entry_for(anchor) image = anchor.at_css("img") return unless image && anchor["href"] @@ -48,14 +55,46 @@ def entry_for(anchor) { "name" => leaves.first || image["alt"], **(extensions.any? ? 
{ "extensions" => extensions } : {}), - "link" => URI.join(GOOGLE, anchor["href"]).to_s + "link" => URI.join(GOOGLE, anchor["href"]).to_s, + "image" => image_for(image) } end + # In-page thumbnail, no extra request: script-injected base64 + # for the first tiles, else the lazy gstatic URL in data-src. The + # 1x1 placeholder gif is skipped. + def image_for(image) + src = image["src"] + src = nil if src&.start_with?("data:image/gif") + thumbnails[image["id"]] || image["data-src"] || src + end + + def image_ids(raw_ids) + raw_ids.scan(/'([^']+)'/).flatten + end + + # image id => inline base64 thumbnail, parsed from the _setImagesSrc script + # blocks (each match pairs one data: URI with a list of image ids). + def index_inline_thumbnails + thumbnail_matches.each_with_object({}) do |(data_uri, raw_ids), map| + uri = unescape(data_uri) + image_ids(raw_ids).each { |id| map[id] = uri } + end + end + def text_leaves(anchor) anchor.css("div") .reject { |d| d.at_css("div") } .map { |d| d.text.strip } .reject(&:empty?) end + + def thumbnail_matches + doc.css("script").flat_map { |script| script.text.scan(THUMBNAIL_SCRIPT) } + end + + # Google escapes characters like "=" as \xNN inside the script string literal. 
+ def unescape(data_uri) + data_uri.gsub(HEX_ESCAPE) { Regexp.last_match(1).hex.chr } + end end diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index 842bb9c9..d06adaeb 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -3,13 +3,12 @@ RSpec.describe CarouselExtractor do let(:artworks) { described_class.call(fixture_input) } let(:fixture_input) { File.read("#{FILES}/van-gogh-paintings.html") } + let(:fixture_expected) { File.read("#{FILES}/expected-array.json") } + let(:expected) { JSON.parse(fixture_expected).fetch("artworks") } describe "van-gogh-paintings.html (challenge fixture)" do - let(:fixture_expected) { File.read("#{FILES}/expected-array.json") } - let(:expected) { JSON.parse(fixture_expected).fetch("artworks") } - it "reproduces the expected artworks array exactly" do - expect(artworks.size).to equal(expected.size) + expect(artworks).to eql(expected) end describe "first artwork" do @@ -17,14 +16,13 @@ let(:starry_night) { expected.first } - it("has a name") do - expect(first["name"]).to eql(starry_night.fetch("name")) - end - it("has a link") do - expect(first["link"]).to eql(starry_night.fetch("link")) - end + it("has a name") { expect(first["name"]).to eql(starry_night["name"]) } + it("has a link") { expect(first["link"]).to eql(starry_night["link"]) } it("has extensions") do - expect(first["extensions"]).to eql(starry_night.fetch("extensions")) + expect(first["extensions"]).to eql(starry_night["extensions"]) + end + it("has an inline base64 image") do + expect(first["image"]).to eql(starry_night["image"]) end end @@ -33,5 +31,32 @@ expect(yearless.map { |a| a["name"] }).to include("Sunflowers") expect(yearless).to all(satisfy { |a| !a.key?("extensions") }) end + + it "needs no extra HTTP requests (every image is inline data: or an in-page URL)" do + expect(artworks).to all(satisfy { |a| + a["image"].start_with?("data:image", "https://") + }) + end + end + + # Mirrors the 
per-artwork assertions from SerpApi's referenced Monet spec + # (which also hits the live API and covers the whole knowledge graph). + describe "conforms to SerpApi's referenced artwork contract" do + it "returns a non-empty artworks Array" do + expect(artworks).to be_an(Array) + expect(artworks).to_not be_empty + end + + it "first artwork has name/extensions/link/image of the expected types" do + first = artworks.first + expect(first["name"]).to be_a(String) + expect(first["name"]).to_not be_empty + expect(first["extensions"]).to be_a(Array) + expect(first["extensions"]).to_not be_empty + expect(first["link"]).to be_a(String) + expect(first["link"]).to_not be_empty + expect(first["image"]).to be_a(String) + expect(first["image"]).to_not be_empty + end end end From 5a32025d270ead126f0d8f42e814f824acc07614 Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 11:56:34 -0700 Subject: [PATCH 08/17] Add support for "kc:/music/artist:albums" --- lib/carousel_extractor.rb | 3 +- spec/fixtures/grateful_dead_albums.html | 58 ++++++++++++++++++++ spec/lib/carousel_extractor_spec.rb | 70 +++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 spec/fixtures/grateful_dead_albums.html diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index 5c3bef8f..9da52d0b 100644 --- a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -11,7 +11,8 @@ # `jsname`, or per-request ids, because they are not stable. class CarouselExtractor CAROUSEL_ATTRIDS = [ - "kc:/visual_art/visual_artist:works" # paintings + "kc:/music/artist:albums", # albums + "kc:/visual_art/visual_artist:works" # paintings ].freeze GOOGLE = "https://www.google.com" HEX_ESCAPE = /\\x([0-9a-fA-F]{2})/ # e.g. 
\x3d -> "=" diff --git a/spec/fixtures/grateful_dead_albums.html b/spec/fixtures/grateful_dead_albums.html new file mode 100644 index 00000000..4868bb29 --- /dev/null +++ b/spec/fixtures/grateful_dead_albums.html @@ -0,0 +1,58 @@ +grateful dead albums - Google Search

Search Results

Grateful Dead
Rock band
Google apps
\ No newline at end of file diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index d06adaeb..4992345c 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -59,4 +59,74 @@ expect(first["image"]).to_not be_empty end end + + describe "alternate carousel type: Grateful Dead albums" do + let(:albums) do + described_class.call(File.read("#{FIXTURES}/grateful_dead_albums.html")) + end + + it "extracts the albums carousel via the music attrid" do + expect(albums.size).to equal(12) + end + + it "fills name (from the text div) + link + image for every album" do + expect(albums).to all(satisfy do |a| + a["name"].to_s != "" && + a["link"].to_s.start_with?("https://www.google.com/search") && + a["image"].to_s.start_with?("data:image") + end) + end + + it "captures release years as extensions" do + blues = albums.find { |a| a["name"] == "Blues for Allah" } + expect(blues["extensions"]).to eql(["1975"]) + end + end + + describe "per-tile guards" do + it "drops anchors that lack an image or an href, keeping only real tiles" do + html = <<~HTML + + HTML + + expect(described_class.call(html).map { |e| e["name"] }) + .to eql(["Real Album"]) + end + + it "skips the data:image/gif placeholder in favor of the in-page data-src url" do + html = <<~HTML + + HTML + + entry = described_class.call(html).first + expect(entry["image"]).to eql("https://encrypted-tbn0.gstatic.com/images?q=tbn:lazy") + end + + it "handles an img with no src attribute, falling back to data-src" do + html = <<~HTML + + HTML + + entry = described_class.call(html).first + expect(entry["image"]).to eql("https://encrypted-tbn0.gstatic.com/images?q=tbn:srcless") + end + end end From bb4d59509984720cfeb5d28f2d74e3728cc49d1b Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 11:58:52 -0700 Subject: [PATCH 09/17] Add negative spec examples --- .../mark_gonzales_skateboard_art.html | 51 +++++++++++++++ 
spec/fixtures/unilever_brands.html | 64 +++++++++++++++++++ spec/lib/carousel_extractor_spec.rb | 12 ++++ 3 files changed, 127 insertions(+) create mode 100644 spec/fixtures/mark_gonzales_skateboard_art.html create mode 100644 spec/fixtures/unilever_brands.html diff --git a/spec/fixtures/mark_gonzales_skateboard_art.html b/spec/fixtures/mark_gonzales_skateboard_art.html new file mode 100644 index 00000000..1e05fdf0 --- /dev/null +++ b/spec/fixtures/mark_gonzales_skateboard_art.html @@ -0,0 +1,51 @@ +mark gonzales skateboard art - Google Search

Search Results

Sponsored products

Vision Mark Gonzales Modern Concave 10" Skateboard Deck, White
$89.95
The Dark Slide
The Dark Slide Vision Mark Gonzales Modern Concave 10" Skateboard Deck, Orange
$89.95
Shop app
Krooked Mark Gonz Gonzales Sweatpants 9.81" Old School Skateboard Deck
$159.99
eBay
The Dark Slide Vision Mark Gonzales "Original MG" LTD Two Tone 10" Skateboard Deck, Red/Yellow
$99.95
Shop app
Vision Mark Gonzales Modern Concave 10" Skateboard Deck, Orange
$89.95
The Dark Slide
4.5" Vision Mark Gonzales vinyl sticker. 80's Vintage style skateboard decal.
$2.95
Etsy
Google apps
Search Labs
Google Account
Jason Dinsmore
dinjas@gmail.com
\ No newline at end of file diff --git a/spec/fixtures/unilever_brands.html b/spec/fixtures/unilever_brands.html new file mode 100644 index 00000000..72c85bf8 --- /dev/null +++ b/spec/fixtures/unilever_brands.html @@ -0,0 +1,64 @@ +unilever brands - Google Search

Search Results

AI Overview
Unilever owns over 400 brands worldwide, categorized into Beauty & Wellbeing, Personal Care, Home Care, and Nutrition. Their flagship products include Dove, Hellmann's, Knorr, Axe, and Vaseline, which are household staples across North America and globally.
🧴 Beauty & Wellbeing
  • Dove & Dove Men+Care: Soaps, body washes, and hair care.
  • Paula’s Choice: Science-backed skincare and exfoliants.
  • Liquid I.V.: Hydration and wellness powders.
  • Nutrafol: Dermatologist-recommended hair growth supplements.
  • Other Brands: TRESemmé, Vaseline, Nexxus, Dermalogica, SheaMoisture, and Simple.
🧼 Personal Care
  • Our brands | Unilever
    Nutrafol. The No.1 dermatologist-recommended hair growth supplement brand in the US. OMO. Dirt Is Good. Paula's Choice. Beauty beg...
    Unilever
  • Brands | Unilever
    Domestos. Unstoppable. Dove. Change beauty into a positive experience for every woman and the next generation. Hellmann's. Knorr. ...
    Unilever
  • List of Unilever brands - Wikipedia
    Condiments and extracts. Amino – food products (Poland) Amora – French mayonnaise and dressings (France, Belgium and Morocco) Arom...
    Wikipedia
Show all
Show more
Google apps
\ No newline at end of file diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index 4992345c..dcd805a6 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -83,6 +83,18 @@ end end + describe "non-carousel / wrong-module pages (negatives: locator must not false-positive)" do + it "returns [] for an organic/ads SERP (Mark Gonzales)" do + html = File.read("#{FIXTURES}/mark_gonzales_skateboard_art.html") + expect(described_class.call(html)).to eq([]) + end + + # Unilever's only carousel-shaped module ("social media presence") isn't an entity collection. + it "returns [] when the only carousel-shaped module is not an entity collection (Unilever)" do + html = File.read("#{FIXTURES}/unilever_brands.html") + expect(described_class.call(html)).to eq([]) + end + end describe "per-tile guards" do it "drops anchors that lack an image or an href, keeping only real tiles" do html = <<~HTML From 1166dc48399c575e3f97e90cd18e95a5a238d21b Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 12:02:56 -0700 Subject: [PATCH 10/17] Add support for "kc:/architecture/architect:designed" --- lib/carousel_extractor.rb | 1 + .../frank_lloyd_wright_buildings.html | 57 +++++++++++++++++++ spec/lib/carousel_extractor_spec.rb | 28 ++++++++- 3 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 spec/fixtures/frank_lloyd_wright_buildings.html diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index 9da52d0b..a450dd57 100644 --- a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -11,6 +11,7 @@ # `jsname`, or per-request ids, because they are not stable. 
class CarouselExtractor CAROUSEL_ATTRIDS = [ + "kc:/architecture/architect:designed", # buildings "kc:/music/artist:albums", # albums "kc:/visual_art/visual_artist:works" # paintings ].freeze diff --git a/spec/fixtures/frank_lloyd_wright_buildings.html b/spec/fixtures/frank_lloyd_wright_buildings.html new file mode 100644 index 00000000..e2421236 --- /dev/null +++ b/spec/fixtures/frank_lloyd_wright_buildings.html @@ -0,0 +1,57 @@ +frank lloyd wright buildings - Google Search

Search Results

Frank Lloyd Wright
American architect and designer
Google apps
\ No newline at end of file diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index dcd805a6..f2e16d61 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -83,18 +83,42 @@ end end + describe "alternate carousel type: Frank Lloyd Wright buildings (no dates)" do + let(:buildings) { described_class.call(building_fixture) } + let(:building_fixture) do + File.read("#{FIXTURES}/frank_lloyd_wright_buildings.html") + end + + it "extracts the buildings carousel via the architect attrid" do + expect(buildings.size).to equal(12) + end + + it "omits extensions across the entire carousel" do + expect(buildings).to all(satisfy { |a| !a.key?("extensions") }) + end + + it "still fills name + link + base64 image for every building" do + expect(buildings).to all(satisfy { |a| + a["name"].to_s != "" && + a["link"].to_s.start_with?("https://www.google.com/search") && + a["image"].to_s.start_with?("data:image") + }) + end + end + describe "non-carousel / wrong-module pages (negatives: locator must not false-positive)" do it "returns [] for an organic/ads SERP (Mark Gonzales)" do html = File.read("#{FIXTURES}/mark_gonzales_skateboard_art.html") - expect(described_class.call(html)).to eq([]) + expect(described_class.call(html)).to eql([]) end # Unilever's only carousel-shaped module ("social media presence") isn't an entity collection. 
it "returns [] when the only carousel-shaped module is not an entity collection (Unilever)" do html = File.read("#{FIXTURES}/unilever_brands.html") - expect(described_class.call(html)).to eq([]) + expect(described_class.call(html)).to eql([]) end end + describe "per-tile guards" do it "drops anchors that lack an image or an href, keeping only real tiles" do html = <<~HTML From 3dcba9245f07f8d8cd297463a7cdd5fe7c6facb4 Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 12:13:31 -0700 Subject: [PATCH 11/17] Add support for "kc:/tv/tv_program:cast" --- lib/carousel_extractor.rb | 1 + spec/fixtures/breaking_bad_cast.html | 62 ++++++++++++++++++++++++++++ spec/lib/carousel_extractor_spec.rb | 30 +++++++++++--- 3 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 spec/fixtures/breaking_bad_cast.html diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index a450dd57..01ccf97f 100644 --- a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -13,6 +13,7 @@ class CarouselExtractor CAROUSEL_ATTRIDS = [ "kc:/architecture/architect:designed", # buildings "kc:/music/artist:albums", # albums + "kc:/tv/tv_program:cast", # cast "kc:/visual_art/visual_artist:works" # paintings ].freeze GOOGLE = "https://www.google.com" diff --git a/spec/fixtures/breaking_bad_cast.html b/spec/fixtures/breaking_bad_cast.html new file mode 100644 index 00000000..c11b8038 --- /dev/null +++ b/spec/fixtures/breaking_bad_cast.html @@ -0,0 +1,62 @@ +breaking bad cast - Google Search

Search Results

Breaking Bad
2008 ‧ Drama ‧ 5 seasons
Google apps
\ No newline at end of file diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index f2e16d61..8e552d64 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -6,6 +6,12 @@ let(:fixture_expected) { File.read("#{FILES}/expected-array.json") } let(:expected) { JSON.parse(fixture_expected).fetch("artworks") } + def entry_looks_valid?(entry) + entry["name"].to_s != "" && + entry["link"].to_s.start_with?("https://www.google.com/search") && + entry["image"].to_s.start_with?("data:image") + end + describe "van-gogh-paintings.html (challenge fixture)" do it "reproduces the expected artworks array exactly" do expect(artworks).to eql(expected) @@ -98,11 +104,25 @@ end it "still fills name + link + base64 image for every building" do - expect(buildings).to all(satisfy { |a| - a["name"].to_s != "" && - a["link"].to_s.start_with?("https://www.google.com/search") && - a["image"].to_s.start_with?("data:image") - }) + expect(buildings).to all(satisfy(&method(:entry_looks_valid?))) + end + end + + describe "alternate carousel type: Breaking Bad cast" do + let(:cast) { described_class.call(cast_fixture) } + let(:cast_fixture) { File.read("#{FIXTURES}/breaking_bad_cast.html") } + + it "extracts the cast carousel via the tv_program attrid" do + expect(cast.size).to eq(8) + end + + it "puts the actor in name and the character in extensions" do + cranston = cast.find { |a| a["name"] == "Bryan Cranston" } + expect(cranston["extensions"]).to eql(["Walter White"]) + end + + it "fills name + link + base64 image for every cast member" do + expect(cast).to all(satisfy(&method(:entry_looks_valid?))) end end From 097f5a84cfc202cc0ef469942b3f16c36fc5efbe Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 14:13:41 -0700 Subject: [PATCH 12/17] Fix typo; switch to symbol-keyed hashes --- lib/carousel_extractor.rb | 10 ++--- spec/lib/carousel_extractor_spec.rb | 70 ++++++++++++++--------------- 2 files 
changed, 39 insertions(+), 41 deletions(-) diff --git a/lib/carousel_extractor.rb b/lib/carousel_extractor.rb index 01ccf97f..97288f5d 100644 --- a/lib/carousel_extractor.rb +++ b/lib/carousel_extractor.rb @@ -6,7 +6,7 @@ # Extracts a Google Knowledge Graph entity carousel (paintings, albums, # buildings, cast, ...) into a uniform array of: # `{ name, extensions, link, image }`. -# Resilience: the carousel if located by stable Knowledge Graph schema +# Resilience: the carousel is located by stable Knowledge Graph schema # (`data-attrid`) and tile structure. We don't use minified classes, # `jsname`, or per-request ids, because they are not stable. class CarouselExtractor @@ -56,10 +56,10 @@ def entry_for(anchor) extensions = leaves.drop(1) { - "name" => leaves.first || image["alt"], - **(extensions.any? ? { "extensions" => extensions } : {}), - "link" => URI.join(GOOGLE, anchor["href"]).to_s, - "image" => image_for(image) + name: leaves.first || image["alt"], + **(extensions.any? ? { extensions: extensions } : {}), + link: URI.join(GOOGLE, anchor["href"]).to_s, + image: image_for(image) } end diff --git a/spec/lib/carousel_extractor_spec.rb b/spec/lib/carousel_extractor_spec.rb index 8e552d64..e2fe0c82 100644 --- a/spec/lib/carousel_extractor_spec.rb +++ b/spec/lib/carousel_extractor_spec.rb @@ -7,14 +7,14 @@ let(:expected) { JSON.parse(fixture_expected).fetch("artworks") } def entry_looks_valid?(entry) - entry["name"].to_s != "" && - entry["link"].to_s.start_with?("https://www.google.com/search") && - entry["image"].to_s.start_with?("data:image") + entry[:name].to_s != "" && + entry[:link].to_s.start_with?("https://www.google.com/search") && + entry[:image].to_s.start_with?("data:image") end describe "van-gogh-paintings.html (challenge fixture)" do it "reproduces the expected artworks array exactly" do - expect(artworks).to eql(expected) + expect(artworks.to_json).to eql(expected.to_json) end describe "first artwork" do @@ -22,25 +22,25 @@ def 
entry_looks_valid?(entry) let(:starry_night) { expected.first } - it("has a name") { expect(first["name"]).to eql(starry_night["name"]) } - it("has a link") { expect(first["link"]).to eql(starry_night["link"]) } + it("has a name") { expect(first[:name]).to eql(starry_night["name"]) } + it("has a link") { expect(first[:link]).to eql(starry_night["link"]) } it("has extensions") do - expect(first["extensions"]).to eql(starry_night["extensions"]) + expect(first[:extensions]).to eql(starry_night["extensions"]) end it("has an inline base64 image") do - expect(first["image"]).to eql(starry_night["image"]) + expect(first[:image]).to eql(starry_night["image"]) end end it "omits extensions for yearless paintings rather than emitting []" do - yearless = artworks.reject { |a| a.key?("extensions") } - expect(yearless.map { |a| a["name"] }).to include("Sunflowers") - expect(yearless).to all(satisfy { |a| !a.key?("extensions") }) + yearless = artworks.reject { |a| a.key?(:extensions) } + expect(yearless.map { |a| a[:name] }).to include("Sunflowers") + expect(yearless).to all(satisfy { |a| !a.key?(:extensions) }) end it "needs no extra HTTP requests (every image is inline data: or an in-page URL)" do expect(artworks).to all(satisfy { |a| - a["image"].start_with?("data:image", "https://") + a[:image].start_with?("data:image", "https://") }) end end @@ -55,14 +55,14 @@ def entry_looks_valid?(entry) it "first artwork has name/extensions/link/image of the expected types" do first = artworks.first - expect(first["name"]).to be_a(String) - expect(first["name"]).to_not be_empty - expect(first["extensions"]).to be_a(Array) - expect(first["extensions"]).to_not be_empty - expect(first["link"]).to be_a(String) - expect(first["link"]).to_not be_empty - expect(first["image"]).to be_a(String) - expect(first["image"]).to_not be_empty + expect(first[:name]).to be_a(String) + expect(first[:name]).to_not be_empty + expect(first[:extensions]).to be_a(Array) + expect(first[:extensions]).to_not 
be_empty + expect(first[:link]).to be_a(String) + expect(first[:link]).to_not be_empty + expect(first[:image]).to be_a(String) + expect(first[:image]).to_not be_empty end end @@ -72,20 +72,16 @@ def entry_looks_valid?(entry) end it "extracts the albums carousel via the music attrid" do - expect(albums.size).to equal(12) + expect(albums.size).to eql(12) end it "fills name (from the text div) + link + image for every album" do - expect(albums).to all(satisfy do |a| - a["name"].to_s != "" && - a["link"].to_s.start_with?("https://www.google.com/search") && - a["image"].to_s.start_with?("data:image") - end) + expect(albums).to all(satisfy(&method(:entry_looks_valid?))) end it "captures release years as extensions" do - blues = albums.find { |a| a["name"] == "Blues for Allah" } - expect(blues["extensions"]).to eql(["1975"]) + blues = albums.find { |a| a[:name] == "Blues for Allah" } + expect(blues[:extensions]).to eql(["1975"]) end end @@ -96,11 +92,11 @@ def entry_looks_valid?(entry) end it "extracts the buildings carousel via the architect attrid" do - expect(buildings.size).to equal(12) + expect(buildings.size).to eql(12) end it "omits extensions across the entire carousel" do - expect(buildings).to all(satisfy { |a| !a.key?("extensions") }) + expect(buildings).to all(satisfy { |a| !a.key?(:extensions) }) end it "still fills name + link + base64 image for every building" do @@ -113,12 +109,12 @@ def entry_looks_valid?(entry) let(:cast_fixture) { File.read("#{FIXTURES}/breaking_bad_cast.html") } it "extracts the cast carousel via the tv_program attrid" do - expect(cast.size).to eq(8) + expect(cast.size).to eql(8) end it "puts the actor in name and the character in extensions" do - cranston = cast.find { |a| a["name"] == "Bryan Cranston" } - expect(cranston["extensions"]).to eql(["Walter White"]) + cranston = cast.find { |a| a[:name] == "Bryan Cranston" } + expect(cranston[:extensions]).to eql(["Walter White"]) end it "fills name + link + base64 image for every cast 
member" do @@ -151,7 +147,7 @@ def entry_looks_valid?(entry) HTML - expect(described_class.call(html).map { |e| e["name"] }) + expect(described_class.call(html).map { |e| e[:name] }) .to eql(["Real Album"]) end @@ -168,7 +164,8 @@ def entry_looks_valid?(entry) HTML entry = described_class.call(html).first - expect(entry["image"]).to eql("https://encrypted-tbn0.gstatic.com/images?q=tbn:lazy") + expect(entry[:image]) + .to eql("https://encrypted-tbn0.gstatic.com/images?q=tbn:lazy") end it "handles an img with no src attribute, falling back to data-src" do @@ -182,7 +179,8 @@ def entry_looks_valid?(entry) HTML entry = described_class.call(html).first - expect(entry["image"]).to eql("https://encrypted-tbn0.gstatic.com/images?q=tbn:srcless") + expect(entry[:image]) + .to eql("https://encrypted-tbn0.gstatic.com/images?q=tbn:srcless") end end end From 81c84457ab18faf5424dd3af3fd8bba565b87369 Mon Sep 17 00:00:00 2001 From: Jason Dinsmore Date: Tue, 23 Jun 2026 14:14:04 -0700 Subject: [PATCH 13/17] Update README with info about approach and how to run --- README.md | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4d5a093f..fb42d3d6 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,110 @@ Parse directly the HTML result page ([html file]) in this repository. No extra H [html file]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/van-gogh-paintings.html [expected array]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/expected-array.json -Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed). +Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed). Test against 2 other similar result pages to make sure it works against different layouts. (Pages that contain the same kind of carrousel. 
Don't necessarily have to be paintings.) The suggested time for this challenge is 4 hours. But, you can take your time and work more on it if you want. + +--- + +## Solution + +Parses the Knowledge Graph carousel out of a saved Google results page into an +array of `{ name, extensions, link, image }` objects. + +The Van Gogh paintings case is the required deliverable and is covered. The same code handles other entity types (albums, buildings, cast) because only the *locator* changes per type. + +### Running it + +```sh +bundle install +bundle exec rspec # specs +bundle exec rubocop # lint +``` + +Run it against any saved results page (prints JSON to stdout): + +```sh +ruby -Ilib -rcarousel_extractor -rjson \ + -e 'puts JSON.pretty_generate(CarouselExtractor.call(File.read(ARGV[0])))' \ + files/van-gogh-paintings.html +``` + +### Output + +An array of symbol-keyed hashes, in the key order of `files/expected-array.json`. +Serialized with `to_json`, the Van Gogh case is byte-for-byte identical to the +expected output: + +```json +{ "name": "The Starry Night", "extensions": ["1889"], "link": "https://www.google.com/search?...", "image": "data:image/jpeg;base64,..." } +``` + +`extensions` is omitted entirely when a tile has no secondary line (e.g. +yearless paintings). The only field we *know* the meaning of is the paintings +date (from the expected fixture); for other types `extensions` carries +whatever the tile's second line is (year, or a character name for a cast +carousel). + +### Approach / design notes + +- **Locate by stable schema, not styling.** Tiles are found via the Knowledge + Graph `data-attrid` (e.g. `kc:/visual_art/visual_artist:works`), never by + minified classes, `jsname`, or per-request ids, those are not stable. +- **Allowlist of carousel tags, not "any tile container."** Some carousel-shaped + modules (e.g. `kc:/common/topic:social media presence` on the Unilever page) + are not entity collections. Matching known tags avoids false positives. 
To + support a new type, add its tag to `CAROUSEL_ATTRIDS` plus a fixture and a + spec. +- **Per-tile extraction depends on structure.** `name` and `extensions` come from + the leaf text
`<div>`s under each anchor (name from the div, falling back to + `img@alt`); `link` from the anchor. +- **Thumbnails without extra requests.** The first tiles render a placeholder + `<img>` whose real bytes arrive later in the page: searching the HTML for the + base64 string from `expected-array.json` led to `_setImagesSrc(...)` `