From ed3e3f2923682b64f97f4089881ad62989821d94 Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Wed, 24 Jun 2026 21:21:11 +0200 Subject: [PATCH 01/15] Add and initialize RSpec --- .rspec | 1 + Gemfile | 5 +++ Gemfile.lock | 35 ++++++++++++++++ spec/spec_helper.rb | 98 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+) create mode 100644 .rspec create mode 100644 Gemfile create mode 100644 Gemfile.lock create mode 100644 spec/spec_helper.rb diff --git a/.rspec b/.rspec new file mode 100644 index 00000000..c99d2e73 --- /dev/null +++ b/.rspec @@ -0,0 +1 @@ +--require spec_helper diff --git a/Gemfile b/Gemfile new file mode 100644 index 00000000..3975ddef --- /dev/null +++ b/Gemfile @@ -0,0 +1,5 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +gem "rspec", "~> 3.13" diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 00000000..ccc2faea --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,35 @@ +GEM + remote: https://rubygems.org/ + specs: + diff-lcs (1.6.2) + rspec (3.13.2) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.6) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.5) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-mocks (3.13.8) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-support (3.13.7) + +PLATFORMS + arm64-darwin-23 + ruby + +DEPENDENCIES + rspec (~> 3.13) + +CHECKSUMS + diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962 + rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587 + rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d + rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836 + rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47 + rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c + +BUNDLED WITH + 4.0.10 diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb new file mode 100644 index 00000000..c80d44b9 --- /dev/null +++ b/spec/spec_helper.rb @@ -0,0 +1,98 @@ +# This file was generated by the `rspec --init` command. Conventionally, all +# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. +# The generated `.rspec` file contains `--require spec_helper` which will cause +# this file to always be loaded, without a need to explicitly require it in any +# files. +# +# Given that it is always loaded, you are encouraged to keep this file as +# light-weight as possible. Requiring heavyweight dependencies from this file +# will add to the boot time of your test suite on EVERY test run, even for an +# individual file that may not need all of that loaded. Instead, consider making +# a separate helper file that requires the additional dependencies and performs +# the additional setup, and require it from the spec files that actually need +# it. +# +# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration +RSpec.configure do |config| + # rspec-expectations config goes here. You can use an alternate + # assertion/expectation library such as wrong or the stdlib/minitest + # assertions if you prefer. + config.expect_with :rspec do |expectations| + # This option will default to `true` in RSpec 4. It makes the `description` + # and `failure_message` of custom matchers include text for helper methods + # defined using `chain`, e.g.: + # be_bigger_than(2).and_smaller_than(4).description + # # => "be bigger than 2 and smaller than 4" + # ...rather than: + # # => "be bigger than 2" + expectations.include_chain_clauses_in_custom_matcher_descriptions = true + end + + # rspec-mocks config goes here. You can use an alternate test double + # library (such as bogus or mocha) by changing the `mock_with` option here. + config.mock_with :rspec do |mocks| + # Prevents you from mocking or stubbing a method that does not exist on + # a real object. This is generally recommended, and will default to + # `true` in RSpec 4. + mocks.verify_partial_doubles = true + end + + # This option will default to `:apply_to_host_groups` in RSpec 4 (and will + # have no way to turn it off -- the option exists only for backwards + # compatibility in RSpec 3). It causes shared context metadata to be + # inherited by the metadata hash of host groups and examples, rather than + # triggering implicit auto-inclusion in groups with matching metadata. + config.shared_context_metadata_behavior = :apply_to_host_groups + +# The settings below are suggested to provide a good initial experience +# with RSpec, but feel free to customize to your heart's content. +=begin + # This allows you to limit a spec run to individual examples or groups + # you care about by tagging them with `:focus` metadata. When nothing + # is tagged with `:focus`, all examples get run. RSpec also provides + # aliases for `it`, `describe`, and `context` that include `:focus` + # metadata: `fit`, `fdescribe` and `fcontext`, respectively. + config.filter_run_when_matching :focus + + # Allows RSpec to persist some state between runs in order to support + # the `--only-failures` and `--next-failure` CLI options. We recommend + # you configure your source control system to ignore this file. + config.example_status_persistence_file_path = "spec/examples.txt" + + # Limits the available syntax to the non-monkey patched syntax that is + # recommended. For more details, see: + # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/ + config.disable_monkey_patching! + + # This setting enables warnings. It's recommended, but in some cases may + # be too noisy due to issues in dependencies. + config.warnings = true + + # Many RSpec users commonly either run the entire suite or an individual + # file, and it's useful to allow more verbose output when running an + # individual spec file. + if config.files_to_run.one? + # Use the documentation formatter for detailed output, + # unless a formatter has already been configured + # (e.g. via a command-line flag). + config.default_formatter = "doc" + end + + # Print the 10 slowest examples and example groups at the + # end of the spec run, to help surface which specs are running + # particularly slow. + config.profile_examples = 10 + + # Run specs in random order to surface order dependencies. If you find an + # order dependency and want to debug it, you can fix the order by providing + # the seed, which is printed after each run. + # --seed 1234 + config.order = :random + + # Seed global randomization in this process using the `--seed` CLI option. + # Setting this allows you to use `--seed` to deterministically reproduce + # test failures related to randomization by passing the same `--seed` value + # as the one that triggered the failure. + Kernel.srand config.seed +=end +end From fc17485257be8e7b0ebd53246800deb59557bad9 Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Wed, 24 Jun 2026 21:54:37 +0200 Subject: [PATCH 02/15] Add nokogiri --- Gemfile | 1 + Gemfile.lock | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index 3975ddef..f94c154d 100644 --- a/Gemfile +++ b/Gemfile @@ -3,3 +3,4 @@ source "https://rubygems.org" gem "rspec", "~> 3.13" +gem "nokogiri", "~> 1.19" diff --git a/Gemfile.lock b/Gemfile.lock index ccc2faea..8c09d27c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -2,6 +2,9 @@ GEM remote: https://rubygems.org/ specs: diff-lcs (1.6.2) + nokogiri (1.19.4-arm64-darwin) + racc (~> 1.4) + racc (1.8.1) rspec (3.13.2) rspec-core (~> 3.13.0) rspec-expectations (~> 3.13.0) @@ -18,13 +21,15 @@ GEM PLATFORMS arm64-darwin-23 - ruby DEPENDENCIES + nokogiri (~> 1.19) rspec (~> 3.13) CHECKSUMS diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962 + nokogiri (1.19.4-arm64-darwin) sha256=a46db9853286e6597b36ebc6953817d15acf3a299583eb3f89fdc6f91dd63527 + racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587 rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836 From 1267ae40069c39a0cb271c11183a5544724cb7ef Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Wed, 24 Jun 2026 21:54:50 +0200 Subject: [PATCH 03/15] Add basic tests --- lib/file_scraper.rb | 16 +++++++++++++++ spec/file_scraper_spec.rb | 43 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 lib/file_scraper.rb create mode 100644 spec/file_scraper_spec.rb diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb new file mode 100644 index 00000000..95c9ebde --- /dev/null +++ b/lib/file_scraper.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +require "nokogiri" +require "json" + +class FileScraper + def self.get(file_path) + html = File.read(file_path) + + raise "The file has no content" if html.nil? + + document = Nokogiri::HTML(html) + + JSON.generate({}) + end +end diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb new file mode 100644 index 00000000..dba6594b --- /dev/null +++ b/spec/file_scraper_spec.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +require "file_scraper" + +RSpec.describe FileScraper do + before :all do + path = "./files/van-gogh-paintings.html" + json_response = FileScraper.get(path) + @response = JSON.parse(json_response) + end + + it "contains artworks array" do + expect(@response["artworks"]).to be_a(Array) + end + + it "artworks – name" do + expect(@response["artworks"].first["name"]).to be_a(String) + expect(@response["artworks"].first["name"]).not_to be_empty + end + + it "artworks – extensions" do + expect(@response["artworks"].first["extensions"]).to be_a(Array) + end + + it "artworks – link" do + expect(@response["artworks"].first["link"]).to be_a(String) + expect(@response["artworks"].first["link"]).not_to be_empty + end + + context "with thumbnail" do + it "artworks – image" do + expect(@response["artworks"].first["image"]).to be_a(String) + expect(@response["artworks"].first["image"]).not_to be_empty + end + end + + context "without thumbnail" do + it "artworks – image" do + expect(@response["artworks"].last["image"]).to be_a(String) + expect(@response["artworks"].last["image"]).not_to be_empty + end + end +end From 7a69e1cd6af155a168b12ff4d2e965997ef9baed Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Wed, 24 Jun 2026 22:16:19 +0200 Subject: [PATCH 04/15] Scrape all but the thumbnail image --- lib/file_scraper.rb | 22 ++++++++++++++++++++-- spec/file_scraper_spec.rb | 2 +- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb index 95c9ebde..cf20e514 100644 --- a/lib/file_scraper.rb +++ b/lib/file_scraper.rb @@ -4,13 +4,31 @@ require "json" class FileScraper - def self.get(file_path) + DOMAIN_NAME = "https://www.google.com" + + def self.run(file_path) html = File.read(file_path) raise "The file has no content" if html.nil? document = Nokogiri::HTML(html) - JSON.generate({}) + artworks = document.css(".iELo6") + + result = artworks.map do |artwork| + extensions = artwork.css(".KHK6lb > div").map { it&.text } + name = extensions.shift + relative_path = artwork.at("a")["href"] + image = artwork.at("img")["data-src"] + { + name:, + extensions:, + link: DOMAIN_NAME + relative_path, + image:, + } + end + + JSON.generate({artworks: result}) end end + diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb index dba6594b..29d2c13d 100644 --- a/spec/file_scraper_spec.rb +++ b/spec/file_scraper_spec.rb @@ -5,7 +5,7 @@ RSpec.describe FileScraper do before :all do path = "./files/van-gogh-paintings.html" - json_response = FileScraper.get(path) + json_response = FileScraper.run(path) @response = JSON.parse(json_response) end From 9106403d666885d9e3d0ddefcd90639ed2a80cc1 Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 09:00:45 +0200 Subject: [PATCH 05/15] Update image scraping and add expected json test --- lib/file_scraper.rb | 5 +++-- spec/file_scraper_spec.rb | 10 ++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb index cf20e514..27bc8e50 100644 --- a/lib/file_scraper.rb +++ b/lib/file_scraper.rb @@ -19,12 +19,13 @@ def self.run(file_path) extensions = artwork.css(".KHK6lb > div").map { it&.text } name = extensions.shift relative_path = artwork.at("a")["href"] - image = artwork.at("img")["data-src"] + data_image = artwork.at("img")["data-src"] + src_image = artwork.at("img")["src"] { name:, extensions:, link: DOMAIN_NAME + relative_path, - image:, + image: data_image || src_image, } end diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb index 29d2c13d..b3acf92b 100644 --- a/spec/file_scraper_spec.rb +++ b/spec/file_scraper_spec.rb @@ -5,10 +5,12 @@ RSpec.describe FileScraper do before :all do path = "./files/van-gogh-paintings.html" - json_response = FileScraper.run(path) - @response = JSON.parse(json_response) + @json_response = FileScraper.run(path) + @response = JSON.parse(@json_response) end + let(:expected_json) { File.read("./files/expected-array.json") } + it "contains artworks array" do expect(@response["artworks"]).to be_a(Array) end @@ -40,4 +42,8 @@ expect(@response["artworks"].last["image"]).not_to be_empty end end + + it "produces the expected JSON array" do + expect(@json_response).to eql(expected_json) + end end From 95431f661af4f635e792d2d56299a88448360b76 Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 10:25:06 +0200 Subject: [PATCH 06/15] Improve test for expected response --- spec/file_scraper_spec.rb | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb index b3acf92b..74f23d40 100644 --- a/spec/file_scraper_spec.rb +++ b/spec/file_scraper_spec.rb @@ -5,11 +5,11 @@ RSpec.describe FileScraper do before :all do path = "./files/van-gogh-paintings.html" - @json_response = FileScraper.run(path) - @response = JSON.parse(@json_response) + json_response = FileScraper.run(path) + @response = JSON.parse(json_response) end - let(:expected_json) { File.read("./files/expected-array.json") } + let(:expected_response) { JSON.parse(File.read("./files/expected-array.json")) } it "contains artworks array" do expect(@response["artworks"]).to be_a(Array) @@ -43,7 +43,13 @@ end end - it "produces the expected JSON array" do - expect(@json_response).to eql(expected_json) + it "produces the expected response" do + @response["artworks"].each.with_index do |artwork, index| + puts artwork["name"] + expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"]) + expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"]) + expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"]) + expect(artwork["image"]).to eq(expected_response["artworks"][index]["image"]) + end end end From 61c218ce37f40fcd9b33bdb5e74784b3c3de08e5 Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 10:25:18 +0200 Subject: [PATCH 07/15] Fix error with empty extensions in artwork --- lib/file_scraper.rb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb index 27bc8e50..b5804a49 100644 --- a/lib/file_scraper.rb +++ b/lib/file_scraper.rb @@ -16,20 +16,21 @@ def self.run(file_path) artworks = document.css(".iELo6") result = artworks.map do |artwork| - extensions = artwork.css(".KHK6lb > div").map { it&.text } + extensions = artwork.css(".KHK6lb > div").map do |extension| + extension&.text unless extension&.text.empty? + end name = extensions.shift relative_path = artwork.at("a")["href"] data_image = artwork.at("img")["data-src"] src_image = artwork.at("img")["src"] { name:, - extensions:, + extensions: (extensions unless extensions.compact.empty?), link: DOMAIN_NAME + relative_path, image: data_image || src_image, - } + }.compact end JSON.generate({artworks: result}) end end - From 82bc0be77a02f7300d38be24b19cf68c60c43d46 Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 10:25:26 +0200 Subject: [PATCH 08/15] Add ferrum gem --- Gemfile | 1 + Gemfile.lock | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/Gemfile b/Gemfile index f94c154d..09f329c7 100644 --- a/Gemfile +++ b/Gemfile @@ -4,3 +4,4 @@ source "https://rubygems.org" gem "rspec", "~> 3.13" gem "nokogiri", "~> 1.19" +gem "ferrum", "~> 0.17" diff --git a/Gemfile.lock b/Gemfile.lock index 8c09d27c..946c3376 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,9 +1,20 @@ GEM remote: https://rubygems.org/ specs: + addressable (2.9.0) + public_suffix (>= 2.0.2, < 8.0) + base64 (0.3.0) + concurrent-ruby (1.3.7) diff-lcs (1.6.2) + ferrum (0.17.2) + addressable (~> 2.5) + base64 (~> 0.2) + concurrent-ruby (~> 1.1) + webrick (~> 1.7) + websocket-driver (~> 0.7) nokogiri (1.19.4-arm64-darwin) racc (~> 1.4) + public_suffix (7.0.5) racc (1.8.1) rspec (3.13.2) rspec-core (~> 3.13.0) @@ -18,23 +29,37 @@ GEM diff-lcs (>= 1.2.0, < 2.0) rspec-support (~> 3.13.0) rspec-support (3.13.7) + webrick (1.9.2) + websocket-driver (0.8.2) + base64 + websocket-extensions (>= 0.1.0) + websocket-extensions (0.1.5) PLATFORMS arm64-darwin-23 DEPENDENCIES + ferrum (~> 0.17) nokogiri (~> 1.19) rspec (~> 3.13) CHECKSUMS + addressable (2.9.0) sha256=7fdf6ac3660f7f4e867a0838be3f6cf722ace541dd97767fa42bc6cfa980c7af + base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b + concurrent-ruby (1.3.7) sha256=4412caec3a5ea2e5fdc52076724c071a81f2c0593d83b2ac8cbb8ca63b3151b0 diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962 + ferrum (0.17.2) sha256=2c2540a850b211a46f4d81de21bfd62048f507e4c327d1807225c3823c17e6ee nokogiri (1.19.4-arm64-darwin) sha256=a46db9853286e6597b36ebc6953817d15acf3a299583eb3f89fdc6f91dd63527 + public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623 racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587 rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836 rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47 rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c + webrick (1.9.2) sha256=beb4a15fc474defed24a3bda4ffd88a490d517c9e4e6118c3edce59e45864131 + websocket-driver (0.8.2) sha256=97c556b019bf3410b4961002ac501621e9322d3f8a7bc02161a09301cc4c4146 + websocket-extensions (0.1.5) sha256=1c6ba63092cda343eb53fc657110c71c754c56484aad42578495227d717a8241 BUNDLED WITH 4.0.10 From 05247a7c6ba96606e9cd90a806a215dcde0fcb6f Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 11:00:11 +0200 Subject: [PATCH 09/15] Collect html document after JS execution --- lib/file_scraper.rb | 27 +++++++++++++++++++++------ spec/file_scraper_spec.rb | 1 - 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb index b5804a49..821f3b22 100644 --- a/lib/file_scraper.rb +++ b/lib/file_scraper.rb @@ -1,15 +1,14 @@ # frozen_string_literal: true -require "nokogiri" +require "ferrum" require "json" +require "nokogiri" class FileScraper DOMAIN_NAME = "https://www.google.com" def self.run(file_path) - html = File.read(file_path) - - raise "The file has no content" if html.nil? + html = extract_html(file_path) document = Nokogiri::HTML(html) @@ -28,9 +27,25 @@ def self.run(file_path) extensions: (extensions unless extensions.compact.empty?), link: DOMAIN_NAME + relative_path, image: data_image || src_image, - }.compact + } end - JSON.generate({artworks: result}) + JSON.generate(artworks: result) + end + + private + + def self.extract_html(file_path) + file_extension = file_path.split(".").last + + raise "Please use an HTML file" unless file_extension == "html" + + begin + browser = Ferrum::Browser.new + browser.go_to("file:///#{File.expand_path(file_path)}") + browser.body + ensure + browser.quit + end end end diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb index 74f23d40..e0390bbd 100644 --- a/spec/file_scraper_spec.rb +++ b/spec/file_scraper_spec.rb @@ -45,7 +45,6 @@ it "produces the expected response" do @response["artworks"].each.with_index do |artwork, index| - puts artwork["name"] expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"]) expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"]) expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"]) From 3f9dd5296c68231f7a58d578cad9f77b9c31b7c4 Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 11:24:18 +0200 Subject: [PATCH 10/15] Add 2 similar results pages --- files/gerhard-richter-paintings.html | 54 ++++++++++++++++++++++++++++ files/rene-magritte-paintings.html | 51 ++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 files/gerhard-richter-paintings.html create mode 100644 files/rene-magritte-paintings.html diff --git a/files/gerhard-richter-paintings.html b/files/gerhard-richter-paintings.html new file mode 100644 index 00000000..28e806b7 --- /dev/null +++ b/files/gerhard-richter-paintings.html @@ -0,0 +1,54 @@ +Gerhard Richter paintings - Google Search
Skip to main contentAccessibility help

Search Results

Gerhard Richter
German visual artist
Google apps
Google Account
Rob McCormick
robmc@hey.com
\ No newline at end of file diff --git a/files/rene-magritte-paintings.html b/files/rene-magritte-paintings.html new file mode 100644 index 00000000..4493b1f8 --- /dev/null +++ b/files/rene-magritte-paintings.html @@ -0,0 +1,51 @@ +René Magritte paintings - Google Search
Skip to main contentAccessibility help

Search Results

René Magritte
Belgian artist
Google apps
Google Account
Rob McCormick
robmc@hey.com
\ No newline at end of file From d3abc1c7624c2405f3b13a26e5479fafcba0df3c Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 11:24:45 +0200 Subject: [PATCH 11/15] Update tests to test all sample pages --- spec/file_scraper_spec.rb | 87 ++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb index e0390bbd..68878c96 100644 --- a/spec/file_scraper_spec.rb +++ b/spec/file_scraper_spec.rb @@ -3,52 +3,63 @@ require "file_scraper" RSpec.describe FileScraper do - before :all do - path = "./files/van-gogh-paintings.html" - json_response = FileScraper.run(path) - @response = JSON.parse(json_response) - end + FILE_PATHS = [ + "./files/van-gogh-paintings.html", + "./files/rene-magritte-paintings.html", + "./files/gerhard-richter-paintings.html", + ] - let(:expected_response) { JSON.parse(File.read("./files/expected-array.json")) } + FILE_PATHS.each do |file_path| + context "using #{file_path}" do + before :all do + json_response = FileScraper.run(file_path) + @response = JSON.parse(json_response) + end - it "contains artworks array" do - expect(@response["artworks"]).to be_a(Array) - end + let(:expected_response) { JSON.parse(File.read("./files/expected-array.json")) } - it "artworks – name" do - expect(@response["artworks"].first["name"]).to be_a(String) - expect(@response["artworks"].first["name"]).not_to be_empty - end + it "contains artworks array" do + expect(@response["artworks"]).to be_a(Array) + end - it "artworks – extensions" do - expect(@response["artworks"].first["extensions"]).to be_a(Array) - end + it "artworks – name" do + expect(@response["artworks"].first["name"]).to be_a(String) + expect(@response["artworks"].first["name"]).not_to be_empty + end - it "artworks – link" do - expect(@response["artworks"].first["link"]).to be_a(String) - expect(@response["artworks"].first["link"]).not_to be_empty - end + it "artworks – extensions" do + expect(@response["artworks"].first["extensions"]).to be_a(Array) + end - context "with thumbnail" do - it "artworks – image" do - expect(@response["artworks"].first["image"]).to be_a(String) - expect(@response["artworks"].first["image"]).not_to be_empty - end - end + it "artworks – link" do + expect(@response["artworks"].first["link"]).to be_a(String) + expect(@response["artworks"].first["link"]).not_to be_empty + end - context "without thumbnail" do - it "artworks – image" do - expect(@response["artworks"].last["image"]).to be_a(String) - expect(@response["artworks"].last["image"]).not_to be_empty - end - end + context "with thumbnail" do + it "artworks – image" do + expect(@response["artworks"].first["image"]).to be_a(String) + expect(@response["artworks"].first["image"]).not_to be_empty + end + end + + context "without thumbnail" do + it "artworks – image" do + expect(@response["artworks"].last["image"]).to be_a(String) + expect(@response["artworks"].last["image"]).not_to be_empty + end + end - it "produces the expected response" do - @response["artworks"].each.with_index do |artwork, index| - expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"]) - expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"]) - expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"]) - expect(artwork["image"]).to eq(expected_response["artworks"][index]["image"]) + if file_path == "./files/van-gogh-paintings.html" + it "produces the expected response" do + @response["artworks"].each.with_index do |artwork, index| + expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"]) + expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"]) + expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"]) + expect(artwork["image"]).to eq(expected_response["artworks"][index]["image"]) + end + end + end end end end From a9d76ec31dc03b5273e8ef688295a640f981509d Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 11:38:39 +0200 Subject: [PATCH 12/15] Update selectors to work in all example pages --- lib/file_scraper.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb index 821f3b22..08fe1c8f 100644 --- a/lib/file_scraper.rb +++ b/lib/file_scraper.rb @@ -12,10 +12,10 @@ def self.run(file_path) document = Nokogiri::HTML(html) - artworks = document.css(".iELo6") + artworks = document.css("g-loading-icon + div").children result = artworks.map do |artwork| - extensions = artwork.css(".KHK6lb > div").map do |extension| + extensions = artwork.css("img + div").children.map do |extension| extension&.text unless extension&.text.empty? end name = extensions.shift From b317e0519efcf99c9e0e51306f63a7637c4b70cb Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 12:10:30 +0200 Subject: [PATCH 13/15] Make conditional clearer --- lib/file_scraper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb index 08fe1c8f..b90863e2 100644 --- a/lib/file_scraper.rb +++ b/lib/file_scraper.rb @@ -16,7 +16,7 @@ def self.run(file_path) result = artworks.map do |artwork| extensions = artwork.css("img + div").children.map do |extension| - extension&.text unless extension&.text.empty? + extension.text if extension && !extension.text.empty? end name = extensions.shift relative_path = artwork.at("a")["href"] From f7c964d60cace4afe66f1b142b722813f1b5c701 Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 12:29:29 +0200 Subject: [PATCH 14/15] Add code comment --- lib/file_scraper.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb index b90863e2..6b817a5d 100644 --- a/lib/file_scraper.rb +++ b/lib/file_scraper.rb @@ -41,6 +41,8 @@ def self.extract_html(file_path) raise "Please use an HTML file" unless file_extension == "html" begin + # Note: Ferrum uses a Chrome or Chromium driver – you need to have one of these installed. + # Docs: https://docs.rubycdp.com/docs/ferrum/introduction/ browser = Ferrum::Browser.new browser.go_to("file:///#{File.expand_path(file_path)}") browser.body From 325e46f036bef436fc8b3786599ea21ea991c87b Mon Sep 17 00:00:00 2001 From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com> Date: Thu, 25 Jun 2026 12:53:38 +0200 Subject: [PATCH 15/15] Improve expected response test and fix bug --- lib/file_scraper.rb | 2 +- spec/file_scraper_spec.rb | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb index 6b817a5d..de64bfe1 100644 --- a/lib/file_scraper.rb +++ b/lib/file_scraper.rb @@ -27,7 +27,7 @@ def self.run(file_path) extensions: (extensions unless extensions.compact.empty?), link: DOMAIN_NAME + relative_path, image: data_image || src_image, - } + }.compact end JSON.generate(artworks: result) diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb index 68878c96..b8aaffd2 100644 --- a/spec/file_scraper_spec.rb +++ b/spec/file_scraper_spec.rb @@ -53,10 +53,7 @@ if file_path == "./files/van-gogh-paintings.html" it "produces the expected response" do @response["artworks"].each.with_index do |artwork, index| - expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"]) - expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"]) - expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"]) - expect(artwork["image"]).to eq(expected_response["artworks"][index]["image"]) + expect(artwork).to eq(expected_response["artworks"][index]) end end end