diff --git a/.rspec b/.rspec new file mode 100644 index 00000000..c99d2e73 --- /dev/null +++ b/.rspec @@ -0,0 +1 @@ +--require spec_helper diff --git a/Gemfile b/Gemfile new file mode 100644 index 00000000..09f329c7 --- /dev/null +++ b/Gemfile @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +gem "rspec", "~> 3.13" +gem "nokogiri", "~> 1.19" +gem "ferrum", "~> 0.17" diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 00000000..946c3376 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,65 @@ +GEM + remote: https://rubygems.org/ + specs: + addressable (2.9.0) + public_suffix (>= 2.0.2, < 8.0) + base64 (0.3.0) + concurrent-ruby (1.3.7) + diff-lcs (1.6.2) + ferrum (0.17.2) + addressable (~> 2.5) + base64 (~> 0.2) + concurrent-ruby (~> 1.1) + webrick (~> 1.7) + websocket-driver (~> 0.7) + nokogiri (1.19.4-arm64-darwin) + racc (~> 1.4) + public_suffix (7.0.5) + racc (1.8.1) + rspec (3.13.2) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.6) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.5) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-mocks (3.13.8) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-support (3.13.7) + webrick (1.9.2) + websocket-driver (0.8.2) + base64 + websocket-extensions (>= 0.1.0) + websocket-extensions (0.1.5) + +PLATFORMS + arm64-darwin-23 + +DEPENDENCIES + ferrum (~> 0.17) + nokogiri (~> 1.19) + rspec (~> 3.13) + +CHECKSUMS + addressable (2.9.0) sha256=7fdf6ac3660f7f4e867a0838be3f6cf722ace541dd97767fa42bc6cfa980c7af + base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b + concurrent-ruby (1.3.7) sha256=4412caec3a5ea2e5fdc52076724c071a81f2c0593d83b2ac8cbb8ca63b3151b0 + diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962 + ferrum (0.17.2) sha256=2c2540a850b211a46f4d81de21bfd62048f507e4c327d1807225c3823c17e6ee + nokogiri (1.19.4-arm64-darwin) 
sha256=a46db9853286e6597b36ebc6953817d15acf3a299583eb3f89fdc6f91dd63527 + public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623 + racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f + rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587 + rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d + rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836 + rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47 + rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c + webrick (1.9.2) sha256=beb4a15fc474defed24a3bda4ffd88a490d517c9e4e6118c3edce59e45864131 + websocket-driver (0.8.2) sha256=97c556b019bf3410b4961002ac501621e9322d3f8a7bc02161a09301cc4c4146 + websocket-extensions (0.1.5) sha256=1c6ba63092cda343eb53fc657110c71c754c56484aad42578495227d717a8241 + +BUNDLED WITH + 4.0.10 diff --git a/files/gerhard-richter-paintings.html b/files/gerhard-richter-paintings.html new file mode 100644 index 00000000..28e806b7 --- /dev/null +++ b/files/gerhard-richter-paintings.html @@ -0,0 +1,54 @@ +Gerhard Richter paintings - Google Search
Skip to main contentAccessibility help

Search Results

Gerhard Richter
German visual artist
Google apps
Google Account
Rob McCormick
robmc@hey.com
\ No newline at end of file diff --git a/files/rene-magritte-paintings.html b/files/rene-magritte-paintings.html new file mode 100644 index 00000000..4493b1f8 --- /dev/null +++ b/files/rene-magritte-paintings.html @@ -0,0 +1,51 @@ +René Magritte paintings - Google Search
Skip to main contentAccessibility help

Search Results

René Magritte
Belgian artist
Google apps
Google Account
Rob McCormick
robmc@hey.com
\ No newline at end of file diff --git a/lib/file_scraper.rb b/lib/file_scraper.rb new file mode 100644 index 00000000..de64bfe1 --- /dev/null +++ b/lib/file_scraper.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +require "ferrum" +require "json" +require "nokogiri" + +class FileScraper + DOMAIN_NAME = "https://www.google.com" + + def self.run(file_path) + html = extract_html(file_path) + + document = Nokogiri::HTML(html) + + artworks = document.css("g-loading-icon + div").children + + result = artworks.map do |artwork| + extensions = artwork.css("img + div").children.map do |extension| + extension.text if extension && !extension.text.empty? + end + name = extensions.shift + relative_path = artwork.at("a")["href"] + data_image = artwork.at("img")["data-src"] + src_image = artwork.at("img")["src"] + { + name:, + extensions: (extensions unless extensions.compact.empty?), + link: DOMAIN_NAME + relative_path, + image: data_image || src_image, + }.compact + end + + JSON.generate(artworks: result) + end + + private + + def self.extract_html(file_path) + file_extension = file_path.split(".").last + + raise "Please use an HTML file" unless file_extension == "html" + + begin + # Note: Ferrum uses a Chrome or Chromium driver – you need to have one of these installed. 
+ # Docs: https://docs.rubycdp.com/docs/ferrum/introduction/ + browser = Ferrum::Browser.new + browser.go_to("file:///#{File.expand_path(file_path)}") + browser.body + ensure + browser.quit + end + end +end diff --git a/spec/file_scraper_spec.rb b/spec/file_scraper_spec.rb new file mode 100644 index 00000000..b8aaffd2 --- /dev/null +++ b/spec/file_scraper_spec.rb @@ -0,0 +1,62 @@ +# frozen_string_literal: true + +require "file_scraper" + +RSpec.describe FileScraper do + FILE_PATHS = [ + "./files/van-gogh-paintings.html", + "./files/rene-magritte-paintings.html", + "./files/gerhard-richter-paintings.html", + ] + + FILE_PATHS.each do |file_path| + context "using #{file_path}" do + before :all do + json_response = FileScraper.run(file_path) + @response = JSON.parse(json_response) + end + + let(:expected_response) { JSON.parse(File.read("./files/expected-array.json")) } + + it "contains artworks array" do + expect(@response["artworks"]).to be_a(Array) + end + + it "artworks – name" do + expect(@response["artworks"].first["name"]).to be_a(String) + expect(@response["artworks"].first["name"]).not_to be_empty + end + + it "artworks – extensions" do + expect(@response["artworks"].first["extensions"]).to be_a(Array) + end + + it "artworks – link" do + expect(@response["artworks"].first["link"]).to be_a(String) + expect(@response["artworks"].first["link"]).not_to be_empty + end + + context "with thumbnail" do + it "artworks – image" do + expect(@response["artworks"].first["image"]).to be_a(String) + expect(@response["artworks"].first["image"]).not_to be_empty + end + end + + context "without thumbnail" do + it "artworks – image" do + expect(@response["artworks"].last["image"]).to be_a(String) + expect(@response["artworks"].last["image"]).not_to be_empty + end + end + + if file_path == "./files/van-gogh-paintings.html" + it "produces the expected response" do + @response["artworks"].each.with_index do |artwork, index| + expect(artwork).to 
# Shared RSpec configuration. `.rspec` passes `--require spec_helper`, so this
# file is loaded on every run: keep it light, and put heavier setup in separate
# helper files that the specs needing them require explicitly.
#
# See https://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |rspec|
  # rspec-expectations: include text from `chain` helpers in the `description`
  # and `failure_message` of custom matchers (will default to true in RSpec 4).
  rspec.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks: refuse to mock or stub a method that does not exist on the
  # real object (will default to true in RSpec 4).
  rspec.mock_with(:rspec) { |mocks| mocks.verify_partial_doubles = true }

  # Shared context metadata is inherited by host groups and examples instead of
  # triggering implicit auto-inclusion (the only behaviour in RSpec 4).
  rspec.shared_context_metadata_behavior = :apply_to_host_groups

  # Suggested settings, left disabled. Uncomment the ones you want:
  #
  #   # Run only examples tagged `:focus` (or `fit`/`fdescribe`/`fcontext`)
  #   # when any exist; otherwise run everything.
  #   rspec.filter_run_when_matching :focus
  #
  #   # Persist state between runs for `--only-failures` / `--next-failure`.
  #   # Ignore this file in source control.
  #   rspec.example_status_persistence_file_path = "spec/examples.txt"
  #
  #   # Limit the available syntax to the non-monkey-patched form. See:
  #   # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/
  #   rspec.disable_monkey_patching!
  #
  #   # Enable warnings; can be noisy because of dependencies.
  #   rspec.warnings = true
  #
  #   # Use the documentation formatter when running a single spec file,
  #   # unless a formatter was already configured (e.g. on the command line).
  #   rspec.default_formatter = "doc" if rspec.files_to_run.one?
  #
  #   # Print the 10 slowest examples and example groups after the run.
  #   rspec.profile_examples = 10
  #
  #   # Run specs in random order to surface order dependencies; reproduce an
  #   # ordering with `--seed 1234`.
  #   rspec.order = :random
  #
  #   # Seed global randomisation from `--seed` so failures caused by
  #   # randomisation can be reproduced.
  #   Kernel.srand rspec.seed
end