serpapi · southpawgeek · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
diff --git a/.rspec b/.rspec
@@ -0,0 +1 @@
+--format documentation
diff --git a/.rubocop.yml b/.rubocop.yml
@@ -0,0 +1,70 @@
+# human should be able to read 200 chars per line
+Layout/LineLength:
+  Max: 200
+
+# relax branch condition size, code can be too verbose
+Metrics/AbcSize:
+  Max: 40
+
+# short clear method name 
+Metrics/MethodLength:
+  Max: 25
+
+# no performance implication
+Style/OptionalBooleanParameter:
+  Enabled: false
+
+# %i(array) is not common in Ruby
+Style/SymbolArray:
+  Enabled: false
+
+# too restrictive
+Style/FrozenStringLiteralComment:
+  Enabled: false
+
+# default complexity is low at 8
+Metrics/PerceivedComplexity:
+  Max: 12
+
+# method length is not a problem
+MethodLength:
+  Max: 30
+
+# if works as well as safe navigation (&.)
+Style/SafeNavigation:
+  Enabled: false
+
+# disable this cop, dont agree with it
+Style/FetchEnvVar:
+  Enabled: false
+
+# buggus check in Rubocop.
+# SerpApiClient constructor is rated to 9
+#  def initialize(params = {}) 
+Metrics/CyclomaticComplexity:
+  Max: 12
+
+# There is a tradeoff between line length and line count.
+Metrics/ClassLength:
+  Max: 140
+
+# Keyword args are readable.
+Metrics/ParameterLists:
+  CountKeywordArgs: false
+
+# this rule doesn't always work well with Ruby
+Layout/FirstHashElementIndentation:
+  Enabled: false
+
+AllCops:
+  # hide message
+  SuggestExtensions: false
+  # show new cops
+  NewCops: enable
+  # exclude rspec files from linting
+  Exclude:
+    - 'serpapi.gemspec'
+    - 'spec/**/*_spec.rb'
+    - 'spec/spec_helper.rb'
+    - 'Gemfile'
+    - 'Rakefile'
diff --git a/.ruby-version b/.ruby-version
@@ -0,0 +1 @@
+2.7.8
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1 @@
+this is a coding challenge, and the user is trying to learn. NEVER make direct code changes. show them what is going wrong, and explain why, and how to fix it. but do not change the code
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,12 @@
+# frozen_string_literal: true
+
+source 'https://rubygems.org'
+
+ruby '2.7.8'
+
+gem 'nokogiri', '~> 1.15.7'
+
+group :development, :test do
+  gem 'rspec'
+  gem 'rubocop'
+end
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -0,0 +1,65 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    ast (2.4.3)
+    diff-lcs (1.6.2)
+    json (2.20.0)
+    language_server-protocol (3.17.0.5)
+    lint_roller (1.1.0)
+    mini_portile2 (2.8.9)
+    nokogiri (1.15.7)
+      mini_portile2 (~> 2.8.2)
+      racc (~> 1.4)
+    parallel (1.28.0)
+    parser (3.3.11.1)
+      ast (~> 2.4.1)
+      racc
+    prism (1.9.0)
+    racc (1.8.1)
+    rainbow (3.1.1)
+    regexp_parser (2.12.0)
+    rspec (3.13.2)
+      rspec-core (~> 3.13.0)
+      rspec-expectations (~> 3.13.0)
+      rspec-mocks (~> 3.13.0)
+    rspec-core (3.13.6)
+      rspec-support (~> 3.13.0)
+    rspec-expectations (3.13.5)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-mocks (3.13.8)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-support (3.13.7)
+    rubocop (1.88.0)
+      json (~> 2.3)
+      language_server-protocol (~> 3.17.0.2)
+      lint_roller (~> 1.1.0)
+      parallel (>= 1.10)
+      parser (>= 3.3.0.2)
+      rainbow (>= 2.2.2, < 4.0)
+      regexp_parser (>= 2.9.3, < 3.0)
+      rubocop-ast (>= 1.49.0, < 2.0)
+      ruby-progressbar (~> 1.7)
+      unicode-display_width (>= 2.4.0, < 4.0)
+    rubocop-ast (1.49.1)
+      parser (>= 3.3.7.2)
+      prism (~> 1.7)
+    ruby-progressbar (1.13.0)
+    unicode-display_width (3.2.0)
+      unicode-emoji (~> 4.1)
+    unicode-emoji (4.2.0)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  nokogiri (~> 1.15.7)
+  rspec
+  rubocop
+
+RUBY VERSION
+   ruby 2.7.8p225
+
+BUNDLED WITH
+   2.1.4
diff --git a/modules/knowledge_panel_carousel_parser.rb b/modules/knowledge_panel_carousel_parser.rb
@@ -0,0 +1,133 @@
+# frozen_string_literal: true
+
+require 'json'
+require 'nokogiri'
+
+# this is a more generalized parser for any KP html with a carousel
+class KnowledgePanelCarouselParser
+  class ParseError < StandardError; end
+
+  def parse(html)
+    @doc = Nokogiri::HTML(html)
+    @encoded_images_hash = encoded_images
+    @kp_key = key
+
+    raise ParseError, 'Knowledge Panel overview has no carousel' if @kp_key == 'overview'
+
+    kpanel = @doc.at_css('.kp-wholepage')
+    raise ParseError, 'No Knowledge Panel was found' unless @kp_key && kpanel
+
+    # carousel is kc: block with the most items
+    # link-rows need 2+ items. if we don't have that use grid tiles
+    blocks = kpanel.css('[data-attrid^="kc:"]')
+    blocks = [kpanel] if blocks.empty?
+    nodes = blocks.map do |block|
+      rows = link_row_items(block)
+      rows.size >= 2 ? rows : grid_tile_items(block)
+    end.max_by(&:size)
+
+    raise ParseError, 'No carousel found in Knowledge Panel' if nodes.empty?
+
+    kp_items = nodes.map { |node| item_attrs(node) }
+
+    { @kp_key => kp_items }.to_json
+  end
+
+  private
+
+  # scrapes encoded jpegs with id and builds a hash
+  def encoded_images
+    @doc.css('script').each_with_object({}) do |script, images|
+      text = script.text
+
+      content_match = text.match(%r{var s='(data:image/jpeg;base64,/[0-9a-zA-Z].+?)';})
+      next unless content_match
+
+      id_match = text.match(/var ii=\['([a-zA-Z0-9_].+?)'\]/)
+      next unless id_match
+
+      # decode js hex escapes for base64 padding
+      content = content_match[1].gsub(/\\x([0-9a-fA-F]{2})/) { [Regexp.last_match(1)].pack('H*') }
+
+      images[id_match[1]] = content
+    end
+  end
+
+  # the given html has our array name in a different place than 2026 results, so this assures backwards compatibility
+  def key
+    kp_key = @doc.at_css('[role="tab"][aria-selected="true"]')&.text
+    kp_key ||= @doc.at_css('[role="link"][aria-current="page"]')&.text
+
+    kp_key.downcase if kp_key
+  end
+
+  # all link-row items
+  def link_row_items(kpanel)
+    kpanel.css('a').select { |anchor| anchor.at_css('img') && anchor.css('> div > div').any? }
+  end
+
+  # all wp-grid-tile items
+  def grid_tile_items(kpanel)
+    kpanel.css('wp-grid-tile').filter_map do |tile|
+      if tile.parent.name == 'a'
+        tile.parent
+      else
+        tile.at_css('a')
+      end
+    end
+  end
+
+  # defines our api shape
+  def item_attrs(node)
+    {
+      name: name(node),
+      image: image(node),
+      link: link(node),
+      extensions: extensions(node)
+    }.compact # only include values that exist, extrapolating from expected-array.json
+  end
+
+  # names are in different places depending on carousel variant
+  def name(node)
+    normalize_text(node.css('> div > div').first&.text) ||
+      normalize_text(node.at_css('wp-grid-tile > div:nth-child(2) > div:first-child')&.text)
+  end
+
+  # the items under 'show more' generally have data-src and no id, and displayed items have an id
+  def image(node)
+    img = node.at_css('img')
+    return unless img
+
+    data_src = img['data-src']
+    return data_src if data_src
+
+    img_id = img['id']
+    return @encoded_images_hash[img_id] if img_id
+
+    nil
+  end
+
+  def link(node)
+    href = node['href']
+    return unless href
+
+    "https://www.google.com#{href}"
+  end
+
+  # it turns out extensions are an array for a reason, and may have multiple values
+  # https://github.com/serpapi/public-roadmap/issues/1892
+  def extensions(node)
+    lines = node.css('> div > div').map { |d| normalize_text(d.text) }
+    lines = node.css('wp-grid-tile > div:nth-child(2) > div').map { |d| normalize_text(d.text) } if lines.empty?
+
+    ext = lines.drop(1).reject(&:empty?)
+    ext.empty? ? nil : ext
+  end
+
+  # little gotcha with some non-breaking spaces (\u00A0) in the rammstein results, but google likes to keep it weird so we'll normalize in a few places
+  def normalize_text(text)
+    return if text.nil?
+
+    text.gsub("\u00A0", ' ').strip
+  end
+end
diff --git a/modules/naive_parser.rb b/modules/naive_parser.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+
+require 'nokogiri'
+require 'json'
+
+# this works on the original van-gogh-paintings.html file only - it is a naive implemention that is using class names.
+# this was to get my bearings, but leaving for posterity
+class NaiveParser
+  def parse(html)
+    doc = Nokogiri::HTML(html)
+
+    encoded_images = {}
+    doc.css('script').each do |script|
+      text = script.text
+      var_ii = text.match(/var ii=\['[a-zA-Z0-9_](.+?)'\]/)
+      ii = var_ii.to_s.split("'")[1]
+      var_s = text.match(%r{var s='data:image/jpeg;base64,/[0-9a-zA-Z](.+?)';})
+
+      next unless var_s
+
+      s = var_s.to_s.split("'")[1]
+      s = s.gsub(/\\x([0-9a-fA-F]{2})/) { [Regexp.last_match(1)].pack('H*') }
+
+      encoded_images[ii] = s
+    end
+
+    paintings = []
+    doc.css('.iELo6').each do |item|
+      img = item.at_css('img')
+      image = img['data-src']
+      unless image
+        img_id = img['id']
+        image = encoded_images[img_id]
+      end
+
+      painting = {
+        name: item.css('.pgNMRc').text,
+        link: "https://www.google.com#{item.at_css('a')['href']}",
+        image: image
+      }
+
+      year = item.css('.cxzHyb').text
+      painting[:extensions] = [year] unless year.empty?
+
+      paintings << painting
+    end
+
+    { artworks: paintings }.to_json
+  end
+end
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		this is a coding challenge, and the user is trying to learn. NEVER make direct code changes. show them what is going wrong, and explain why, and how to fix it. but do not change the code