Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--format documentation
70 changes: 70 additions & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# human should be able to read 200 chars per line
Layout/LineLength:
Max: 200

# relax branch condition size, code can be too verbose
Metrics/AbcSize:
Max: 40

# short clear method name
Metrics/MethodLength:
Max: 25

# no performance implication
Style/OptionalBooleanParameter:
Enabled: false

# %i(array) is not common in Ruby
Style/SymbolArray:
Enabled: false

# too restrictive
Style/FrozenStringLiteralComment:
Enabled: false

# default complexity is low at 8
Metrics/PerceivedComplexity:
Max: 12

# method length is not a problem
MethodLength:
Max: 30

# if works as well as safe navigation (&.)
Style/SafeNavigation:
Enabled: false

# disable this cop; we don't agree with it
Style/FetchEnvVar:
Enabled: false

# buggy check in RuboCop.
# SerpApiClient constructor is rated to 9
# def initialize(params = {})
Metrics/CyclomaticComplexity:
Max: 12

# There is a tradeoff between line length and line count.
Metrics/ClassLength:
Max: 140

# Keyword args are readable.
Metrics/ParameterLists:
CountKeywordArgs: false

# this rule doesn't always work well with Ruby
Layout/FirstHashElementIndentation:
Enabled: false

AllCops:
# hide message
SuggestExtensions: false
# show new cops
NewCops: enable
# exclude rspec files from linting
Exclude:
- 'serpapi.gemspec'
- 'spec/**/*_spec.rb'
- 'spec/spec_helper.rb'
- 'Gemfile'
- 'Rakefile'
1 change: 1 addition & 0 deletions .ruby-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2.7.8
1 change: 1 addition & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
this is a coding challenge, and the user is trying to learn. NEVER make direct code changes. show them what is going wrong, and explain why, and how to fix it. but do not change the code
12 changes: 12 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

source 'https://rubygems.org'

ruby '2.7.8'

gem 'nokogiri', '~> 1.15.7'

group :development, :test do
gem 'rspec'
gem 'rubocop'
end
65 changes: 65 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
GEM
remote: https://rubygems.org/
specs:
ast (2.4.3)
diff-lcs (1.6.2)
json (2.20.0)
language_server-protocol (3.17.0.5)
lint_roller (1.1.0)
mini_portile2 (2.8.9)
nokogiri (1.15.7)
mini_portile2 (~> 2.8.2)
racc (~> 1.4)
parallel (1.28.0)
parser (3.3.11.1)
ast (~> 2.4.1)
racc
prism (1.9.0)
racc (1.8.1)
rainbow (3.1.1)
regexp_parser (2.12.0)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.6)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.8)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.7)
rubocop (1.88.0)
json (~> 2.3)
language_server-protocol (~> 3.17.0.2)
lint_roller (~> 1.1.0)
parallel (>= 1.10)
parser (>= 3.3.0.2)
rainbow (>= 2.2.2, < 4.0)
regexp_parser (>= 2.9.3, < 3.0)
rubocop-ast (>= 1.49.0, < 2.0)
ruby-progressbar (~> 1.7)
unicode-display_width (>= 2.4.0, < 4.0)
rubocop-ast (1.49.1)
parser (>= 3.3.7.2)
prism (~> 1.7)
ruby-progressbar (1.13.0)
unicode-display_width (3.2.0)
unicode-emoji (~> 4.1)
unicode-emoji (4.2.0)

PLATFORMS
ruby

DEPENDENCIES
nokogiri (~> 1.15.7)
rspec
rubocop

RUBY VERSION
ruby 2.7.8p225

BUNDLED WITH
2.1.4
133 changes: 133 additions & 0 deletions modules/knowledge_panel_carousel_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# frozen_string_literal: true

require 'json'
require 'nokogiri'

# Generalized parser for any Knowledge Panel (KP) HTML page that contains a carousel.
#
# KnowledgePanelCarouselParser.new.parse(html) returns a JSON string shaped like
#   { "<selected tab label, downcased>": [ { name:, image:, link:, extensions: }, ... ] }
# where attributes that could not be found are omitted from each item.
class KnowledgePanelCarouselParser
  # Raised when the page has no Knowledge Panel, no carousel, or is the "overview" tab.
  class ParseError < StandardError; end

  # Prefix applied to relative hrefs.
  GOOGLE_ORIGIN = 'https://www.google.com'

  # Selectors that may hold the name of the selected tab, in lookup order.
  # The second one exists for backwards compatibility with the older page layout.
  KEY_SELECTORS = [
    '[role="tab"][aria-selected="true"]',
    '[role="link"][aria-current="page"]'
  ].freeze

  # Inline script declarations carrying a base64 jpeg and the img ids it belongs to.
  INLINE_IMAGE_DATA = %r{var s='(data:image/jpeg;base64,/[0-9a-zA-Z].+?)';}.freeze
  INLINE_IMAGE_IDS = /var ii=\['([a-zA-Z0-9_].+?)'\]/.freeze

  # JS hex escape such as \x3d (used for base64 padding).
  JS_HEX_ESCAPE = /\\x([0-9a-fA-F]{2})/.freeze

  # Parses the page and returns the carousel as a JSON string.
  #
  # @param html [String] raw HTML of the results page
  # @return [String] JSON object with a single key (the selected tab label)
  # @raise [ParseError] when the selected tab is "overview", when no tab label or
  #   no .kp-wholepage element is found, or when no carousel items are found
  def parse(html)
    @doc = Nokogiri::HTML(html)
    @encoded_images_hash = encoded_images
    @kp_key = key

    raise ParseError, 'Knowledge Panel overview has no carousel' if @kp_key == 'overview'

    kpanel = @doc.at_css('.kp-wholepage')
    raise ParseError, 'No Knowledge Panel was found' unless @kp_key && kpanel

    nodes = carousel_nodes(kpanel)
    raise ParseError, 'No carousel found in Knowledge Panel' if nodes.empty?

    { @kp_key => nodes.map { |node| item_attrs(node) } }.to_json
  end

  private

  # Picks the carousel: the kc: block with the most items.
  # Link-rows need 2+ items; if a block does not have that, its grid tiles are used.
  # When the panel has no kc: block at all, the whole panel is searched instead.
  def carousel_nodes(kpanel)
    blocks = kpanel.css('[data-attrid^="kc:"]')
    blocks = [kpanel] if blocks.empty?

    candidates = blocks.map do |block|
      rows = link_row_items(block)
      rows.size >= 2 ? rows : grid_tile_items(block)
    end
    candidates.max_by(&:size)
  end

  # Scrapes the inline base64 jpegs and builds a hash of img id => data URI.
  #
  # Every `var s=...` / `var ii=[...]` pair in a script is collected (not just the
  # first one), and an id list such as ['a','b'] registers the image under each id.
  # Pairs are matched by order of appearance within the script.
  def encoded_images
    @doc.css('script').each_with_object({}) do |script, images|
      text = script.text
      contents = text.scan(INLINE_IMAGE_DATA).flatten
      id_lists = text.scan(INLINE_IMAGE_IDS).flatten

      contents.zip(id_lists).each do |content, id_list|
        next unless id_list

        # decode js hex escapes for base64 padding
        decoded = content.gsub(JS_HEX_ESCAPE) { [Regexp.last_match(1)].pack('H*') }
        id_list.split(/'\s*,\s*'/).each { |id| images[id] = decoded }
      end
    end
  end

  # Returns the downcased label of the selected tab, or nil when none is found.
  # The label is whitespace-normalized, and a blank label falls through to the next
  # selector so the result is never an empty string.
  def key
    KEY_SELECTORS.each do |selector|
      label = normalize_text(@doc.at_css(selector)&.text)
      return label.downcase if label && !label.empty?
    end

    nil
  end

  # all link-row items: anchors that contain an img and have a "> div > div" descendant
  def link_row_items(kpanel)
    kpanel.css('a').select { |anchor| anchor.at_css('img') && anchor.css('> div > div').any? }
  end

  # all wp-grid-tile items: the tile's parent anchor, or else the first anchor inside the tile
  def grid_tile_items(kpanel)
    kpanel.css('wp-grid-tile').filter_map do |tile|
      if tile.parent.name == 'a'
        tile.parent
      else
        tile.at_css('a')
      end
    end
  end

  # defines our api shape
  def item_attrs(node)
    {
      name: name(node),
      image: image(node),
      link: link(node),
      extensions: extensions(node)
    }.compact # only include values that exist, extrapolating from expected-array.json
  end

  # names are in different places depending on carousel variant
  def name(node)
    normalize_text(node.css('> div > div').first&.text) ||
      normalize_text(node.at_css('wp-grid-tile > div:nth-child(2) > div:first-child')&.text)
  end

  # the items under 'show more' generally have data-src and no id, and displayed items have an id
  def image(node)
    img = node.at_css('img')
    return unless img

    data_src = img['data-src']
    return data_src if data_src

    img_id = img['id']
    return @encoded_images_hash[img_id] if img_id

    nil
  end

  # Builds the item link. Relative hrefs are prefixed with the Google origin;
  # hrefs that are already absolute are returned untouched.
  def link(node)
    href = node['href']
    return unless href
    return href if href.start_with?('http://', 'https://')

    "#{GOOGLE_ORIGIN}#{href}"
  end

  # it turns out extensions are an array for a reason, and may have multiple values
  # https://github.com/serpapi/public-roadmap/issues/1892
  #
  # The first line is the name, so it is dropped; returns nil when nothing is left.
  def extensions(node)
    lines = node.css('> div > div').map { |d| normalize_text(d.text) }
    lines = node.css('wp-grid-tile > div:nth-child(2) > div').map { |d| normalize_text(d.text) } if lines.empty?

    ext = lines.drop(1).reject(&:empty?)
    ext.empty? ? nil : ext
  end

  # little gotcha with some non-breaking spaces (\u00A0) in the rammstein results,
  # but google likes to keep it weird so we'll normalize in a few places
  def normalize_text(text)
    return if text.nil?

    text.tr("\u00A0", ' ').strip
  end
end
50 changes: 50 additions & 0 deletions modules/naive_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# frozen_string_literal: true

require 'nokogiri'
require 'json'

# this works on the original van-gogh-paintings.html file only - it is a naive implementation
# that relies on generated class names (.iELo6, .pgNMRc, .cxzHyb).
# this was to get my bearings, but leaving for posterity
class NaiveParser
  # Inline script declarations carrying a base64 jpeg and the img ids it belongs to.
  INLINE_IMAGE_DATA = %r{var s='(data:image/jpeg;base64,/[0-9a-zA-Z].+?)';}.freeze
  INLINE_IMAGE_IDS = /var ii=\['([a-zA-Z0-9_].+?)'\]/.freeze

  # JS hex escape such as \x3d (used for base64 padding).
  JS_HEX_ESCAPE = /\\x([0-9a-fA-F]{2})/.freeze

  # Parses the page and returns the paintings as a JSON string.
  #
  # @param html [String] raw HTML of the results page
  # @return [String] JSON of the form { "artworks": [ { name:, link:, image:, extensions: }, ... ] };
  #   `extensions` is only present when the item has a year
  def parse(html)
    doc = Nokogiri::HTML(html)
    images = inline_images(doc)
    paintings = doc.css('.iELo6').map { |item| painting(item, images) }

    { artworks: paintings }.to_json
  end

  private

  # Builds a hash of img id => data URI from the inline scripts.
  # Scripts missing either the image data or the id list are skipped, so an image
  # is never stored under a nil key.
  def inline_images(doc)
    doc.css('script').each_with_object({}) do |script, images|
      text = script.text
      data = text[INLINE_IMAGE_DATA, 1]
      ids = text[INLINE_IMAGE_IDS, 1]
      next unless data && ids

      # decode js hex escapes for base64 padding
      decoded = data.gsub(JS_HEX_ESCAPE) { [Regexp.last_match(1)].pack('H*') }
      ids.split(/'\s*,\s*'/).each { |id| images[id] = decoded }
    end
  end

  # Builds the hash for a single carousel item.
  def painting(item, images)
    entry = {
      name: item.css('.pgNMRc').text,
      link: link_for(item),
      image: image_for(item, images)
    }

    year = item.css('.cxzHyb').text
    entry[:extensions] = [year] unless year.empty?

    entry
  end

  # Link of the item's first anchor, or nil when the item has no anchor.
  def link_for(item)
    anchor = item.at_css('a')
    return unless anchor

    "https://www.google.com#{anchor['href']}"
  end

  # data-src when present, otherwise the inline image registered for the img id;
  # nil when the item has no img or no image can be resolved.
  def image_for(item, images)
    img = item.at_css('img')
    return unless img

    img['data-src'] || images[img['id']]
  end
end
Loading