From ed3e3f2923682b64f97f4089881ad62989821d94 Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Wed, 24 Jun 2026 21:21:11 +0200
Subject: [PATCH 01/15] Add and initialize RSpec
---
.rspec | 1 +
Gemfile | 5 +++
Gemfile.lock | 35 ++++++++++++++++
spec/spec_helper.rb | 98 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 139 insertions(+)
create mode 100644 .rspec
create mode 100644 Gemfile
create mode 100644 Gemfile.lock
create mode 100644 spec/spec_helper.rb
diff --git a/.rspec b/.rspec
new file mode 100644
index 00000000..c99d2e73
--- /dev/null
+++ b/.rspec
@@ -0,0 +1 @@
+--require spec_helper
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 00000000..3975ddef
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+
+source "https://rubygems.org"
+
+gem "rspec", "~> 3.13"
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 00000000..ccc2faea
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,35 @@
+GEM
+ remote: https://rubygems.org/
+ specs:
+ diff-lcs (1.6.2)
+ rspec (3.13.2)
+ rspec-core (~> 3.13.0)
+ rspec-expectations (~> 3.13.0)
+ rspec-mocks (~> 3.13.0)
+ rspec-core (3.13.6)
+ rspec-support (~> 3.13.0)
+ rspec-expectations (3.13.5)
+ diff-lcs (>= 1.2.0, < 2.0)
+ rspec-support (~> 3.13.0)
+ rspec-mocks (3.13.8)
+ diff-lcs (>= 1.2.0, < 2.0)
+ rspec-support (~> 3.13.0)
+ rspec-support (3.13.7)
+
+PLATFORMS
+ arm64-darwin-23
+ ruby
+
+DEPENDENCIES
+ rspec (~> 3.13)
+
+CHECKSUMS
+ diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
+ rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
+ rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
+ rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
+ rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
+ rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
+
+BUNDLED WITH
+ 4.0.10
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
new file mode 100644
index 00000000..c80d44b9
--- /dev/null
+++ b/spec/spec_helper.rb
@@ -0,0 +1,98 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# The generated `.rspec` file contains `--require spec_helper` which will cause
+# this file to always be loaded, without a need to explicitly require it in any
+# files.
+#
+# Given that it is always loaded, you are encouraged to keep this file as
+# light-weight as possible. Requiring heavyweight dependencies from this file
+# will add to the boot time of your test suite on EVERY test run, even for an
+# individual file that may not need all of that loaded. Instead, consider making
+# a separate helper file that requires the additional dependencies and performs
+# the additional setup, and require it from the spec files that actually need
+# it.
+#
+# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+RSpec.configure do |config|
+ # rspec-expectations config goes here. You can use an alternate
+ # assertion/expectation library such as wrong or the stdlib/minitest
+ # assertions if you prefer.
+ config.expect_with :rspec do |expectations|
+ # This option will default to `true` in RSpec 4. It makes the `description`
+ # and `failure_message` of custom matchers include text for helper methods
+ # defined using `chain`, e.g.:
+ # be_bigger_than(2).and_smaller_than(4).description
+ # # => "be bigger than 2 and smaller than 4"
+ # ...rather than:
+ # # => "be bigger than 2"
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+ end
+
+ # rspec-mocks config goes here. You can use an alternate test double
+ # library (such as bogus or mocha) by changing the `mock_with` option here.
+ config.mock_with :rspec do |mocks|
+ # Prevents you from mocking or stubbing a method that does not exist on
+ # a real object. This is generally recommended, and will default to
+ # `true` in RSpec 4.
+ mocks.verify_partial_doubles = true
+ end
+
+ # This option will default to `:apply_to_host_groups` in RSpec 4 (and will
+ # have no way to turn it off -- the option exists only for backwards
+ # compatibility in RSpec 3). It causes shared context metadata to be
+ # inherited by the metadata hash of host groups and examples, rather than
+ # triggering implicit auto-inclusion in groups with matching metadata.
+ config.shared_context_metadata_behavior = :apply_to_host_groups
+
+# The settings below are suggested to provide a good initial experience
+# with RSpec, but feel free to customize to your heart's content.
+=begin
+ # This allows you to limit a spec run to individual examples or groups
+ # you care about by tagging them with `:focus` metadata. When nothing
+ # is tagged with `:focus`, all examples get run. RSpec also provides
+ # aliases for `it`, `describe`, and `context` that include `:focus`
+ # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
+ config.filter_run_when_matching :focus
+
+ # Allows RSpec to persist some state between runs in order to support
+ # the `--only-failures` and `--next-failure` CLI options. We recommend
+ # you configure your source control system to ignore this file.
+ config.example_status_persistence_file_path = "spec/examples.txt"
+
+ # Limits the available syntax to the non-monkey patched syntax that is
+ # recommended. For more details, see:
+ # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/
+ config.disable_monkey_patching!
+
+ # This setting enables warnings. It's recommended, but in some cases may
+ # be too noisy due to issues in dependencies.
+ config.warnings = true
+
+ # Many RSpec users commonly either run the entire suite or an individual
+ # file, and it's useful to allow more verbose output when running an
+ # individual spec file.
+ if config.files_to_run.one?
+ # Use the documentation formatter for detailed output,
+ # unless a formatter has already been configured
+ # (e.g. via a command-line flag).
+ config.default_formatter = "doc"
+ end
+
+ # Print the 10 slowest examples and example groups at the
+ # end of the spec run, to help surface which specs are running
+ # particularly slow.
+ config.profile_examples = 10
+
+ # Run specs in random order to surface order dependencies. If you find an
+ # order dependency and want to debug it, you can fix the order by providing
+ # the seed, which is printed after each run.
+ # --seed 1234
+ config.order = :random
+
+ # Seed global randomization in this process using the `--seed` CLI option.
+ # Setting this allows you to use `--seed` to deterministically reproduce
+ # test failures related to randomization by passing the same `--seed` value
+ # as the one that triggered the failure.
+ Kernel.srand config.seed
+=end
+end
From fc17485257be8e7b0ebd53246800deb59557bad9 Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Wed, 24 Jun 2026 21:54:37 +0200
Subject: [PATCH 02/15] Add nokogiri
---
Gemfile | 1 +
Gemfile.lock | 7 ++++++-
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/Gemfile b/Gemfile
index 3975ddef..f94c154d 100644
--- a/Gemfile
+++ b/Gemfile
@@ -3,3 +3,4 @@
source "https://rubygems.org"
gem "rspec", "~> 3.13"
+gem "nokogiri", "~> 1.19"
diff --git a/Gemfile.lock b/Gemfile.lock
index ccc2faea..8c09d27c 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -2,6 +2,9 @@ GEM
remote: https://rubygems.org/
specs:
diff-lcs (1.6.2)
+ nokogiri (1.19.4-arm64-darwin)
+ racc (~> 1.4)
+ racc (1.8.1)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
@@ -18,13 +21,15 @@ GEM
PLATFORMS
arm64-darwin-23
- ruby
DEPENDENCIES
+ nokogiri (~> 1.19)
rspec (~> 3.13)
CHECKSUMS
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
+ nokogiri (1.19.4-arm64-darwin) sha256=a46db9853286e6597b36ebc6953817d15acf3a299583eb3f89fdc6f91dd63527
+ racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
From 1267ae40069c39a0cb271c11183a5544724cb7ef Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Wed, 24 Jun 2026 21:54:50 +0200
Subject: [PATCH 03/15] Add basic tests
---
lib/file_scraper.rb | 16 +++++++++++++++
spec/file_scraper_spec.rb | 43 +++++++++++++++++++++++++++++++++++++++
2 files changed, 59 insertions(+)
create mode 100644 lib/file_scraper.rb
create mode 100644 spec/file_scraper_spec.rb
diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb
new file mode 100644
index 00000000..95c9ebde
--- /dev/null
+++ b/lib/file_scraper.rb
@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+
+require "nokogiri"
+require "json"
+
+class FileScraper
+ def self.get(file_path)
+ html = File.read(file_path)
+
+ raise "The file has no content" if html.nil?
+
+ document = Nokogiri::HTML(html)
+
+ JSON.generate({})
+ end
+end
diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb
new file mode 100644
index 00000000..dba6594b
--- /dev/null
+++ b/spec/file_scraper_spec.rb
@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+
+require "file_scraper"
+
+RSpec.describe FileScraper do
+ before :all do
+ path = "./files/van-gogh-paintings.html"
+ json_response = FileScraper.get(path)
+ @response = JSON.parse(json_response)
+ end
+
+ it "contains artworks array" do
+ expect(@response["artworks"]).to be_a(Array)
+ end
+
+ it "artworks – name" do
+ expect(@response["artworks"].first["name"]).to be_a(String)
+ expect(@response["artworks"].first["name"]).not_to be_empty
+ end
+
+ it "artworks – extensions" do
+ expect(@response["artworks"].first["extensions"]).to be_a(Array)
+ end
+
+ it "artworks – link" do
+ expect(@response["artworks"].first["link"]).to be_a(String)
+ expect(@response["artworks"].first["link"]).not_to be_empty
+ end
+
+ context "with thumbnail" do
+ it "artworks – image" do
+ expect(@response["artworks"].first["image"]).to be_a(String)
+ expect(@response["artworks"].first["image"]).not_to be_empty
+ end
+ end
+
+ context "without thumbnail" do
+ it "artworks – image" do
+ expect(@response["artworks"].last["image"]).to be_a(String)
+ expect(@response["artworks"].last["image"]).not_to be_empty
+ end
+ end
+end
From 7a69e1cd6af155a168b12ff4d2e965997ef9baed Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Wed, 24 Jun 2026 22:16:19 +0200
Subject: [PATCH 04/15] Scrape all but the thumbnail image
---
lib/file_scraper.rb | 22 ++++++++++++++++++++--
spec/file_scraper_spec.rb | 2 +-
2 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb
index 95c9ebde..cf20e514 100644
--- a/lib/file_scraper.rb
+++ b/lib/file_scraper.rb
@@ -4,13 +4,31 @@
require "json"
class FileScraper
- def self.get(file_path)
+ DOMAIN_NAME = "https://www.google.com"
+
+ def self.run(file_path)
html = File.read(file_path)
raise "The file has no content" if html.nil?
document = Nokogiri::HTML(html)
- JSON.generate({})
+ artworks = document.css(".iELo6")
+
+ result = artworks.map do |artwork|
+ extensions = artwork.css(".KHK6lb > div").map { it&.text }
+ name = extensions.shift
+ relative_path = artwork.at("a")["href"]
+ image = artwork.at("img")["data-src"]
+ {
+ name:,
+ extensions:,
+ link: DOMAIN_NAME + relative_path,
+ image:,
+ }
+ end
+
+ JSON.generate({artworks: result})
end
end
+
diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb
index dba6594b..29d2c13d 100644
--- a/spec/file_scraper_spec.rb
+++ b/spec/file_scraper_spec.rb
@@ -5,7 +5,7 @@
RSpec.describe FileScraper do
before :all do
path = "./files/van-gogh-paintings.html"
- json_response = FileScraper.get(path)
+ json_response = FileScraper.run(path)
@response = JSON.parse(json_response)
end
From 9106403d666885d9e3d0ddefcd90639ed2a80cc1 Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 09:00:45 +0200
Subject: [PATCH 05/15] Update image scraping and add expected json test
---
lib/file_scraper.rb | 5 +++--
spec/file_scraper_spec.rb | 10 ++++++++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb
index cf20e514..27bc8e50 100644
--- a/lib/file_scraper.rb
+++ b/lib/file_scraper.rb
@@ -19,12 +19,13 @@ def self.run(file_path)
extensions = artwork.css(".KHK6lb > div").map { it&.text }
name = extensions.shift
relative_path = artwork.at("a")["href"]
- image = artwork.at("img")["data-src"]
+ data_image = artwork.at("img")["data-src"]
+ src_image = artwork.at("img")["src"]
{
name:,
extensions:,
link: DOMAIN_NAME + relative_path,
- image:,
+ image: data_image || src_image,
}
end
diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb
index 29d2c13d..b3acf92b 100644
--- a/spec/file_scraper_spec.rb
+++ b/spec/file_scraper_spec.rb
@@ -5,10 +5,12 @@
RSpec.describe FileScraper do
before :all do
path = "./files/van-gogh-paintings.html"
- json_response = FileScraper.run(path)
- @response = JSON.parse(json_response)
+ @json_response = FileScraper.run(path)
+ @response = JSON.parse(@json_response)
end
+ let(:expected_json) { File.read("./files/expected-array.json") }
+
it "contains artworks array" do
expect(@response["artworks"]).to be_a(Array)
end
@@ -40,4 +42,8 @@
expect(@response["artworks"].last["image"]).not_to be_empty
end
end
+
+ it "produces the expected JSON array" do
+ expect(@json_response).to eql(expected_json)
+ end
end
From 95431f661af4f635e792d2d56299a88448360b76 Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 10:25:06 +0200
Subject: [PATCH 06/15] Improve test for expected response
---
spec/file_scraper_spec.rb | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb
index b3acf92b..74f23d40 100644
--- a/spec/file_scraper_spec.rb
+++ b/spec/file_scraper_spec.rb
@@ -5,11 +5,11 @@
RSpec.describe FileScraper do
before :all do
path = "./files/van-gogh-paintings.html"
- @json_response = FileScraper.run(path)
- @response = JSON.parse(@json_response)
+ json_response = FileScraper.run(path)
+ @response = JSON.parse(json_response)
end
- let(:expected_json) { File.read("./files/expected-array.json") }
+ let(:expected_response) { JSON.parse(File.read("./files/expected-array.json")) }
it "contains artworks array" do
expect(@response["artworks"]).to be_a(Array)
@@ -43,7 +43,13 @@
end
end
- it "produces the expected JSON array" do
- expect(@json_response).to eql(expected_json)
+ it "produces the expected response" do
+ @response["artworks"].each.with_index do |artwork, index|
+ puts artwork["name"]
+ expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"])
+ expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"])
+ expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"])
+ expect(artwork["image"]).to eq(expected_response["artworks"][index]["image"])
+ end
end
end
From 61c218ce37f40fcd9b33bdb5e74784b3c3de08e5 Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 10:25:18 +0200
Subject: [PATCH 07/15] Fix error with empty extensions in artwork
---
lib/file_scraper.rb | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb
index 27bc8e50..b5804a49 100644
--- a/lib/file_scraper.rb
+++ b/lib/file_scraper.rb
@@ -16,20 +16,21 @@ def self.run(file_path)
artworks = document.css(".iELo6")
result = artworks.map do |artwork|
- extensions = artwork.css(".KHK6lb > div").map { it&.text }
+ extensions = artwork.css(".KHK6lb > div").map do |extension|
+ extension&.text unless extension&.text.empty?
+ end
name = extensions.shift
relative_path = artwork.at("a")["href"]
data_image = artwork.at("img")["data-src"]
src_image = artwork.at("img")["src"]
{
name:,
- extensions:,
+ extensions: (extensions unless extensions.compact.empty?),
link: DOMAIN_NAME + relative_path,
image: data_image || src_image,
- }
+ }.compact
end
JSON.generate({artworks: result})
end
end
-
From 82bc0be77a02f7300d38be24b19cf68c60c43d46 Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 10:25:26 +0200
Subject: [PATCH 08/15] Add ferrum gem
---
Gemfile | 1 +
Gemfile.lock | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/Gemfile b/Gemfile
index f94c154d..09f329c7 100644
--- a/Gemfile
+++ b/Gemfile
@@ -4,3 +4,4 @@ source "https://rubygems.org"
gem "rspec", "~> 3.13"
gem "nokogiri", "~> 1.19"
+gem "ferrum", "~> 0.17"
diff --git a/Gemfile.lock b/Gemfile.lock
index 8c09d27c..946c3376 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,9 +1,20 @@
GEM
remote: https://rubygems.org/
specs:
+ addressable (2.9.0)
+ public_suffix (>= 2.0.2, < 8.0)
+ base64 (0.3.0)
+ concurrent-ruby (1.3.7)
diff-lcs (1.6.2)
+ ferrum (0.17.2)
+ addressable (~> 2.5)
+ base64 (~> 0.2)
+ concurrent-ruby (~> 1.1)
+ webrick (~> 1.7)
+ websocket-driver (~> 0.7)
nokogiri (1.19.4-arm64-darwin)
racc (~> 1.4)
+ public_suffix (7.0.5)
racc (1.8.1)
rspec (3.13.2)
rspec-core (~> 3.13.0)
@@ -18,23 +29,37 @@ GEM
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.7)
+ webrick (1.9.2)
+ websocket-driver (0.8.2)
+ base64
+ websocket-extensions (>= 0.1.0)
+ websocket-extensions (0.1.5)
PLATFORMS
arm64-darwin-23
DEPENDENCIES
+ ferrum (~> 0.17)
nokogiri (~> 1.19)
rspec (~> 3.13)
CHECKSUMS
+ addressable (2.9.0) sha256=7fdf6ac3660f7f4e867a0838be3f6cf722ace541dd97767fa42bc6cfa980c7af
+ base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
+ concurrent-ruby (1.3.7) sha256=4412caec3a5ea2e5fdc52076724c071a81f2c0593d83b2ac8cbb8ca63b3151b0
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
+ ferrum (0.17.2) sha256=2c2540a850b211a46f4d81de21bfd62048f507e4c327d1807225c3823c17e6ee
nokogiri (1.19.4-arm64-darwin) sha256=a46db9853286e6597b36ebc6953817d15acf3a299583eb3f89fdc6f91dd63527
+ public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
+ webrick (1.9.2) sha256=beb4a15fc474defed24a3bda4ffd88a490d517c9e4e6118c3edce59e45864131
+ websocket-driver (0.8.2) sha256=97c556b019bf3410b4961002ac501621e9322d3f8a7bc02161a09301cc4c4146
+ websocket-extensions (0.1.5) sha256=1c6ba63092cda343eb53fc657110c71c754c56484aad42578495227d717a8241
BUNDLED WITH
4.0.10
From 05247a7c6ba96606e9cd90a806a215dcde0fcb6f Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:00:11 +0200
Subject: [PATCH 09/15] Collect html document after JS execution
---
lib/file_scraper.rb | 27 +++++++++++++++++++++------
spec/file_scraper_spec.rb | 1 -
2 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb
index b5804a49..821f3b22 100644
--- a/lib/file_scraper.rb
+++ b/lib/file_scraper.rb
@@ -1,15 +1,14 @@
# frozen_string_literal: true
-require "nokogiri"
+require "ferrum"
require "json"
+require "nokogiri"
class FileScraper
DOMAIN_NAME = "https://www.google.com"
def self.run(file_path)
- html = File.read(file_path)
-
- raise "The file has no content" if html.nil?
+ html = extract_html(file_path)
document = Nokogiri::HTML(html)
@@ -28,9 +27,25 @@ def self.run(file_path)
extensions: (extensions unless extensions.compact.empty?),
link: DOMAIN_NAME + relative_path,
image: data_image || src_image,
- }.compact
+ }
end
- JSON.generate({artworks: result})
+ JSON.generate(artworks: result)
+ end
+
+ private
+
+ def self.extract_html(file_path)
+ file_extension = file_path.split(".").last
+
+ raise "Please use an HTML file" unless file_extension == "html"
+
+ begin
+ browser = Ferrum::Browser.new
+ browser.go_to("file:///#{File.expand_path(file_path)}")
+ browser.body
+ ensure
+ browser.quit
+ end
end
end
diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb
index 74f23d40..e0390bbd 100644
--- a/spec/file_scraper_spec.rb
+++ b/spec/file_scraper_spec.rb
@@ -45,7 +45,6 @@
it "produces the expected response" do
@response["artworks"].each.with_index do |artwork, index|
- puts artwork["name"]
expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"])
expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"])
expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"])
From 3f9dd5296c68231f7a58d578cad9f77b9c31b7c4 Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:24:18 +0200
Subject: [PATCH 10/15] Add 2 similar results pages
---
files/gerhard-richter-paintings.html | 54 ++++++++++++++++++++++++++++
files/rene-magritte-paintings.html | 51 ++++++++++++++++++++++++++
2 files changed, 105 insertions(+)
create mode 100644 files/gerhard-richter-paintings.html
create mode 100644 files/rene-magritte-paintings.html
diff --git a/files/gerhard-richter-paintings.html b/files/gerhard-richter-paintings.html
new file mode 100644
index 00000000..28e806b7
--- /dev/null
+++ b/files/gerhard-richter-paintings.html
@@ -0,0 +1,54 @@
+
Gerhard Richter paintings - Google Search Please click
here if you are not redirected within a few seconds.
Search Results
Complementary results Description Gerhard Richter is a German visual artist. Richter has produced abstract as well as photorealistic paintings, photographs and glass pieces. Wikipedia People also search for
Sabine Moritz-Richter artwork
Web results During this period, Richter created paintings that were based on photographs he collected , such as Party (1963) and Women Descending the Staircase (1965). His ... Read more
Gerhard Richter's diverse body of work includes influential paintings ranging from abstract to photorealistic . Born in 1932 in Dresden, Germany…
Gerhard Richter: Off the Scale | Contemporary Art | Sotheby's
Process as painting: Gerhard Richter - Unit
Gerhard Richter: Painting After All - The Brooklyn Rail
Web results His work is an exploration of the possibilities which painting still holds. Richter originally regarded his grey paintings , produced between 1968 and 1976, as ... Read more
Explore Gerhard Richter's biography, achievements, artworks, auction results, and shows on Artsy. One of the most famous artists to emerge from post-war ...
YouTube · Mary Lynn Buchanan
4 key moments 4 key moments in this video
Gerhard Richter, Grün Blau Rot (Green Blue Red) (1993)
Gerhard Richter Schilderij 'Zeegezicht
Gerhard Richter, Zaun (2010)
Gerhard Richter Schilderij 'Vogels
Gerhard Richter 'Offset of abstract painting'
Gerhard Richter Schilderij 'Abstract Schilderij
Gerhard Richter - Images Of An Era
Gerhard Richter – Manöver | Signed Unique Edition
Web results 8 Jan 2026 — From photorealism to abstraction, discover the iconic work of German Post-War artist Gerhard Richter by taking a look at his 10 most famous ...
Web results Gerhard Richter | Buy, Sell & Discover entire catalogue of authentic prints, originals, sculptures, collectibles and books online at the best price ...
As a long-term permanent loan, the Gerhard Richter Kunststiftung has given the Nationalgalerie 100 works that will be exhibited from March 2023 .
Stay up to date with Gerhard Richter (German, 1932), APT artist. Discover works for sale, auction results, market data, news and exhibitions on MutualArt.
Notices about Filtered Results Some results may have been removed under data protection law in Europe. Learn more
Copy link Copy link Link copied
Google apps
Google Account
Rob McCormick
robmc@hey.com
\ No newline at end of file
diff --git a/files/rene-magritte-paintings.html b/files/rene-magritte-paintings.html
new file mode 100644
index 00000000..4493b1f8
--- /dev/null
+++ b/files/rene-magritte-paintings.html
@@ -0,0 +1,51 @@
+René Magritte paintings - Google Search Please click
here if you are not redirected within a few seconds.
Search Results
Web results A Belgian surrealist painter, Rene Magritte's witty and thought-provoking paintings sought to have viewers question their perceptions of reality. Read more
Masterpieces of René Magritte . Young Girl Eating a Bird, 1927. The Lovers I, 1928. The Lovers II, 1928. Attempting the Impossible, 1928. The False Mirror, 1928.Read more
Rene Magritte: The Fifth Season · SFMOMA
The art of living, 1967, 54×65 cm by René Magritte: History ...
Classic Rene Magritte Surrealism Canvas Paintings Gallery Portrait Posters and Prints Pictures for Interior Wall Decor 40x55cm frameless : Amazon.nl: ...
Complementary results Description René François Ghislain Magritte was a Belgian surrealist artist known for his depictions of familiar objects in unfamiliar, unexpected contexts, which often provoked questions about the nature and boundaries of reality and representation. His imagery has influenced pop art, minimalist art, and conceptual art. Wikipedia
Web results Selected list of works · 1920 Landscape · 1922 The Station and L'Écuyère · 1923 Self-portrait, Sixth Nocturne, Georgette at the Piano and Donna · 1925 The Bather ... Read more
The Musée Magritte Museum not only holds the largest collection of works by the famous Belgian Surrealist but also the most important collection of works ... Read more
Featuring lithographs, paintings, and prints by René Magritte for sale , many of which depict his iconic umbrella, bowler hat, pipe, ...
YouTube · Great Art Explained
YouTube · Art Gallery of NSW
Web results 21 Nov 2025 — René Magritte in 10 Paintings · 1. The Lovers, 1928 · 2. The Treachery of Images, 1929 · 3. The False Mirror, 1929 · 4. Not to Be Reproduced, ... Read more
René Magritte , the Belgian painter and well-known Surrealist. He produced a body of work that rendered such commonplace things strange.Read more
Table of contents · Key moments in Magritte's life · 1. The Son of Man and the symbolic apple · 2. The Lovers II, a mournful kiss · 3. The Treachery of Images, an ... Read more
Notices about Filtered Results Some results may have been removed under data protection law in Europe. Learn more
Copy link Copy link Link copied
Google apps
Google Account
Rob McCormick
robmc@hey.com
\ No newline at end of file
From d3abc1c7624c2405f3b13a26e5479fafcba0df3c Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:24:45 +0200
Subject: [PATCH 11/15] Update tests to test all sample pages
---
spec/file_scraper_spec.rb | 87 ++++++++++++++++++++++-----------------
1 file changed, 49 insertions(+), 38 deletions(-)
diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb
index e0390bbd..68878c96 100644
--- a/spec/file_scraper_spec.rb
+++ b/spec/file_scraper_spec.rb
@@ -3,52 +3,63 @@
require "file_scraper"
RSpec.describe FileScraper do
- before :all do
- path = "./files/van-gogh-paintings.html"
- json_response = FileScraper.run(path)
- @response = JSON.parse(json_response)
- end
+  # Sample result pages that every example group below is run against.
+  # Frozen so the shared list cannot be mutated by an example.
+  FILE_PATHS = [
+    "./files/van-gogh-paintings.html",
+    "./files/rene-magritte-paintings.html",
+    "./files/gerhard-richter-paintings.html",
+  ].freeze
- let(:expected_response) { JSON.parse(File.read("./files/expected-array.json")) }
+ FILE_PATHS.each do |file_path|
+ context "using #{file_path}" do
+ before :all do
+ json_response = FileScraper.run(file_path)
+ @response = JSON.parse(json_response)
+ end
- it "contains artworks array" do
- expect(@response["artworks"]).to be_a(Array)
- end
+ let(:expected_response) { JSON.parse(File.read("./files/expected-array.json")) }
- it "artworks – name" do
- expect(@response["artworks"].first["name"]).to be_a(String)
- expect(@response["artworks"].first["name"]).not_to be_empty
- end
+ it "contains artworks array" do
+ expect(@response["artworks"]).to be_a(Array)
+ end
- it "artworks – extensions" do
- expect(@response["artworks"].first["extensions"]).to be_a(Array)
- end
+ it "artworks – name" do
+ expect(@response["artworks"].first["name"]).to be_a(String)
+ expect(@response["artworks"].first["name"]).not_to be_empty
+ end
- it "artworks – link" do
- expect(@response["artworks"].first["link"]).to be_a(String)
- expect(@response["artworks"].first["link"]).not_to be_empty
- end
+ it "artworks – extensions" do
+ expect(@response["artworks"].first["extensions"]).to be_a(Array)
+ end
- context "with thumbnail" do
- it "artworks – image" do
- expect(@response["artworks"].first["image"]).to be_a(String)
- expect(@response["artworks"].first["image"]).not_to be_empty
- end
- end
+ it "artworks – link" do
+ expect(@response["artworks"].first["link"]).to be_a(String)
+ expect(@response["artworks"].first["link"]).not_to be_empty
+ end
- context "without thumbnail" do
- it "artworks – image" do
- expect(@response["artworks"].last["image"]).to be_a(String)
- expect(@response["artworks"].last["image"]).not_to be_empty
- end
- end
+ context "with thumbnail" do
+ it "artworks – image" do
+ expect(@response["artworks"].first["image"]).to be_a(String)
+ expect(@response["artworks"].first["image"]).not_to be_empty
+ end
+ end
+
+ context "without thumbnail" do
+ it "artworks – image" do
+ expect(@response["artworks"].last["image"]).to be_a(String)
+ expect(@response["artworks"].last["image"]).not_to be_empty
+ end
+ end
- it "produces the expected response" do
- @response["artworks"].each.with_index do |artwork, index|
- expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"])
- expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"])
- expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"])
- expect(artwork["image"]).to eq(expected_response["artworks"][index]["image"])
+ if file_path == "./files/van-gogh-paintings.html"
+ it "produces the expected response" do
+ @response["artworks"].each.with_index do |artwork, index|
+ expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"])
+ expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"])
+ expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"])
+ expect(artwork["image"]).to eq(expected_response["artworks"][index]["image"])
+ end
+ end
+ end
end
end
end
From a9d76ec31dc03b5273e8ef688295a640f981509d Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 11:38:39 +0200
Subject: [PATCH 12/15] Update selectors to work in all example pages
---
lib/file_scraper.rb | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb
index 821f3b22..08fe1c8f 100644
--- a/lib/file_scraper.rb
+++ b/lib/file_scraper.rb
@@ -12,10 +12,10 @@ def self.run(file_path)
document = Nokogiri::HTML(html)
- artworks = document.css(".iELo6")
+ artworks = document.css("g-loading-icon + div").children
result = artworks.map do |artwork|
- extensions = artwork.css(".KHK6lb > div").map do |extension|
+ extensions = artwork.css("img + div").children.map do |extension|
extension&.text unless extension&.text.empty?
end
name = extensions.shift
From b317e0519efcf99c9e0e51306f63a7637c4b70cb Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 12:10:30 +0200
Subject: [PATCH 13/15] Make conditional clearer
---
lib/file_scraper.rb | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb
index 08fe1c8f..b90863e2 100644
--- a/lib/file_scraper.rb
+++ b/lib/file_scraper.rb
@@ -16,7 +16,7 @@ def self.run(file_path)
result = artworks.map do |artwork|
extensions = artwork.css("img + div").children.map do |extension|
- extension&.text unless extension&.text.empty?
+ extension.text if extension && !extension.text.empty?
end
name = extensions.shift
relative_path = artwork.at("a")["href"]
From f7c964d60cace4afe66f1b142b722813f1b5c701 Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 12:29:29 +0200
Subject: [PATCH 14/15] Add code comment
---
lib/file_scraper.rb | 2 ++
1 file changed, 2 insertions(+)
diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb
index b90863e2..6b817a5d 100644
--- a/lib/file_scraper.rb
+++ b/lib/file_scraper.rb
@@ -41,6 +41,8 @@ def self.extract_html(file_path)
raise "Please use an HTML file" unless file_extension == "html"
begin
+      # Note: Ferrum controls a Chrome or Chromium browser directly over the Chrome DevTools Protocol (no separate driver) – you need to have one of these browsers installed.
+ # Docs: https://docs.rubycdp.com/docs/ferrum/introduction/
browser = Ferrum::Browser.new
browser.go_to("file:///#{File.expand_path(file_path)}")
browser.body
From 325e46f036bef436fc8b3786599ea21ea991c87b Mon Sep 17 00:00:00 2001
From: Rob McCormick <51120728+rob-mccormick@users.noreply.github.com>
Date: Thu, 25 Jun 2026 12:53:38 +0200
Subject: [PATCH 15/15] Improve expected response test and fix bug
---
lib/file_scraper.rb | 2 +-
spec/file_scraper_spec.rb | 5 +----
2 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb
index 6b817a5d..de64bfe1 100644
--- a/lib/file_scraper.rb
+++ b/lib/file_scraper.rb
@@ -27,7 +27,7 @@ def self.run(file_path)
extensions: (extensions unless extensions.compact.empty?),
link: DOMAIN_NAME + relative_path,
image: data_image || src_image,
- }
+ }.compact
end
JSON.generate(artworks: result)
diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb
index 68878c96..b8aaffd2 100644
--- a/spec/file_scraper_spec.rb
+++ b/spec/file_scraper_spec.rb
@@ -53,10 +53,7 @@
if file_path == "./files/van-gogh-paintings.html"
it "produces the expected response" do
@response["artworks"].each.with_index do |artwork, index|
- expect(artwork["name"]).to eq(expected_response["artworks"][index]["name"])
- expect(artwork["extensions"]).to eq(expected_response["artworks"][index]["extensions"])
- expect(artwork["link"]).to eq(expected_response["artworks"][index]["link"])
- expect(artwork["image"]).to eq(expected_response["artworks"][index]["image"])
+ expect(artwork).to eq(expected_response["artworks"][index])
end
end
end