diff --git a/.env.test b/.env.test index 8e4a15ce..e39230dd 100644 --- a/.env.test +++ b/.env.test @@ -1,4 +1,6 @@ ALMA_OPENURL=https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl? +TURNSTILE_SITEKEY=test-sitekey +TURNSTILE_SECRET=test-secret FEATURE_TIMDEX_FULLTEXT=true FEATURE_GEODATA=false MIT_PRIMO_URL=https://mit.primo.exlibrisgroup.com diff --git a/Gemfile b/Gemfile index 5d604a28..2e0e339e 100644 --- a/Gemfile +++ b/Gemfile @@ -4,6 +4,7 @@ git_source(:github) { |repo| "https://github.com/#{repo}.git" } ruby '3.4.8' gem 'bootsnap', require: false +gem 'crawler_detect' gem 'graphql' gem 'graphql-client' gem 'http' @@ -14,6 +15,7 @@ gem 'openssl' gem 'puma' gem 'rack-attack' gem 'rack-timeout' +gem 'rails_cloudflare_turnstile' gem 'rails', '~> 7.2.0' gem 'redis' gem 'scout_apm' diff --git a/Gemfile.lock b/Gemfile.lock index 6629835e..4ed08521 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -120,6 +120,8 @@ GEM bigdecimal rexml crass (1.0.6) + crawler_detect (1.2.9) + qonfig (>= 0.24) date (3.5.1) debug (1.11.1) irb (~> 1.10) @@ -134,6 +136,12 @@ GEM drb (2.2.3) erb (5.1.3) erubi (1.13.1) + faraday (2.14.1) + faraday-net_http (>= 2.0, < 3.5) + json + logger + faraday-net_http (3.4.2) + net-http (~> 0.5) ffi (1.17.2-aarch64-linux-gnu) ffi (1.17.2-arm64-darwin) ffi (1.17.2-x86_64-darwin) @@ -206,6 +214,8 @@ GEM mocha (2.8.2) ruby2_keywords (>= 0.0.5) msgpack (1.8.0) + net-http (0.9.1) + uri (>= 0.11.1) net-imap (0.5.13) date net-protocol @@ -243,6 +253,8 @@ GEM public_suffix (6.0.2) puma (7.2.0) nio4r (~> 2.0) + qonfig (0.30.0) + base64 (>= 0.2) racc (1.8.1) rack (3.1.20) rack-attack (6.8.0) @@ -276,6 +288,9 @@ GEM rails-html-sanitizer (1.7.0) loofah (~> 2.25) nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0) + rails_cloudflare_turnstile (0.5.0) + faraday (>= 1.0, < 3.0) + rails (>= 6.0, < 8.2) railties (7.2.3) actionpack (= 7.2.3) activesupport (= 7.2.3) @@ -381,6 +396,7 
@@ GEM unicode-display_width (3.2.0) unicode-emoji (~> 4.1) unicode-emoji (4.2.0) + uri (1.1.1) useragent (0.16.11) vcr (6.4.0) web-console (4.2.1) @@ -421,6 +437,7 @@ DEPENDENCIES bootsnap capybara climate_control + crawler_detect debug dotenv-rails graphql @@ -437,6 +454,7 @@ DEPENDENCIES rack-attack rack-timeout rails (~> 7.2.0) + rails_cloudflare_turnstile redis rubocop rubocop-rails diff --git a/README.md b/README.md index 1e9ff60c..40ffd8e9 100644 --- a/README.md +++ b/README.md @@ -95,6 +95,7 @@ See `Optional Environment Variables` for more information. - `BOOLEAN_OPTIONS`: comma separated list of values to present to testers on instances where `BOOLEAN_PICKER` feature is enabled. - `FEATURE_BOOLEAN_PICKER`: feature to allow users to select their preferred boolean type. If set to `true`, feature is enabled. This feature is only intended for internal team testing and should never be enabled in production (mostly because the UI is a mess more than it would cause harm). +- `FEATURE_BOT_DETECTION`: When set to `true`, enables bot detection using crawler_detect and Cloudflare Turnstile challenges for suspected bots on search result pages. Requires `TURNSTILE_SITEKEY` and `TURNSTILE_SECRET` to be set. If disabled, bots may crawl search results freely. - `FEATURE_GEODATA`: Enables features related to geospatial data discovery. Setting this variable to `true` will trigger geodata mode. Note that this is currently intended _only_ for the geodata app and may have unexpected consequences if applied to other TIMDEX UI apps. @@ -146,6 +147,8 @@ instance is sending what search traffic. Defaults to "unset" if not defined. - `TIMDEX_INDEX`: Name of the index, or alias, to provide to the GraphQL endpoint. Defaults to `nil` which will let TIMDEX determine the best index to use. Wildcard values can be set, for example `rdi*` would search any indexes that begin with `rdi` in the underlying OpenSearch instance behind TIMDEX. 
- `TIMDEX_SOURCES`: Comma-separated list of sources to display in the advanced-search source selection element. This overrides the default which is set in ApplicationHelper. +- `TURNSTILE_SECRET`: The Cloudflare Turnstile secret key used to verify challenge responses. If not set, bot challenge protection is disabled. +- `TURNSTILE_SITEKEY`: The Cloudflare Turnstile site key used to render the challenge widget. If not set, bot challenge protection is disabled. #### Test Environment-only Variables diff --git a/app/controllers/search_controller.rb b/app/controllers/search_controller.rb index 1d1d3ba2..40d13bb0 100644 --- a/app/controllers/search_controller.rb +++ b/app/controllers/search_controller.rb @@ -2,6 +2,7 @@ class SearchController < ApplicationController before_action :validate_q!, only: %i[results] before_action :validate_format_token, only: %i[results] before_action :set_active_tab, only: %i[results] + before_action :challenge_bots!, only: %i[results] around_action :sleep_if_too_fast, only: %i[results] before_action :validate_geobox_presence!, only: %i[results] @@ -271,6 +272,15 @@ def validate_q! redirect_to root_url end + # Redirect suspected crawlers to Turnstile when the bot_detection feature is enabled. + def challenge_bots! + return unless Feature.enabled?(:bot_detection) + return if session[:passed_turnstile] + return unless BotDetector.should_challenge?(request) + + redirect_to turnstile_path(return_to: request.fullpath) + end + def validate_geodistance_presence! 
return unless Feature.enabled?(:geodata) diff --git a/app/controllers/turnstile_controller.rb b/app/controllers/turnstile_controller.rb new file mode 100644 index 00000000..dc594da3 --- /dev/null +++ b/app/controllers/turnstile_controller.rb @@ -0,0 +1,33 @@ +class TurnstileController < ApplicationController + before_action :validate_cloudflare_turnstile, only: :verify + + rescue_from RailsCloudflareTurnstile::Forbidden, with: :handle_forbidden + + def show + @return_to = params[:return_to].presence || root_path + end + + def verify + session[:passed_turnstile] = true + redirect_to safe_return_path + end + + private + + # Handles Turnstile rejecting token submission due to invalid token, network issue, etc. + def handle_forbidden + flash.now[:error] = "We couldn't complete the verification. Please try again." + render :show, status: :unprocessable_entity + end + + # Returns a safe path to redirect to after Turnstile verification. Valid paths should begin with + # a single slash. Falls back to root_path if the provided path is invalid. + def safe_return_path + return_to = params[:return_to].to_s + return root_path if return_to.blank? + return root_path if return_to.start_with?('//') + return return_to if return_to.start_with?('/') + + root_path + end +end diff --git a/app/models/bot_detector.rb b/app/models/bot_detector.rb new file mode 100644 index 00000000..5c5a6dcd --- /dev/null +++ b/app/models/bot_detector.rb @@ -0,0 +1,26 @@ +class BotDetector + # Returns true if the request appears to be a bot according to crawler_detect. + def self.bot?(request) + ua = request.user_agent.to_s + detector = CrawlerDetect.new(ua) + detector.is_crawler? + rescue StandardError => e + Rails.logger.debug("BotDetector: crawler_detect failed for UA '#{ua}': #{e.message}") + false + end + + # Returns true when the request appears to be performing crawling behavior that we + # want to challenge. 
For our initial approach, treat requests to the search results + # endpoint as subject to challenge if they're flagged as bots. + def self.should_challenge?(request) + return false unless bot?(request) + + # Basic rule: crawling search results or record pages triggers a challenge. + # /results is the search results page and /record is the full record view. + # This keeps the rule simple and conservative. + path = request.path.to_s + return true if path.start_with?('/results') || path.start_with?('/record') + + false + end +end diff --git a/app/models/feature.rb b/app/models/feature.rb index 8ee5c61c..391db847 100644 --- a/app/models/feature.rb +++ b/app/models/feature.rb @@ -33,7 +33,7 @@ # class Feature # List of all valid features in the application - VALID_FEATURES = %i[geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all + VALID_FEATURES = %i[bot_detection geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all tab_timdex_alma record_link timdex_fulltext].freeze # Check if a feature is enabled by name diff --git a/app/views/turnstile/show.html.erb b/app/views/turnstile/show.html.erb new file mode 100644 index 00000000..3d13cd9f --- /dev/null +++ b/app/views/turnstile/show.html.erb @@ -0,0 +1,22 @@ +<%= cloudflare_turnstile_script_tag %> + +
+<div class="turnstile-challenge">
+  <h2>Verify you're human</h2>
+
+  <p>
+    Please complete this verification to continue.
+  </p>
+
+  <%= form_with url: turnstile_verify_path, method: :post, local: true do %>
+    <%= hidden_field_tag :return_to, @return_to %>
+
+    <div>
+      <%= cloudflare_turnstile(action: 'search') %>
+    </div>
+
+    <div>
+      <%= submit_tag 'Submit', class: 'btn button-primary' %>
+    </div>
+  <% end %>
+
+</div>
diff --git a/config/initializers/cloudflare_turnstile.rb b/config/initializers/cloudflare_turnstile.rb new file mode 100644 index 00000000..42014a16 --- /dev/null +++ b/config/initializers/cloudflare_turnstile.rb @@ -0,0 +1,38 @@ +# Explicitly require Feature model to check if bot detection is enabled +require Rails.root.join('app/models/feature') + +module TurnstileConfig + module_function + + def apply + RailsCloudflareTurnstile.reset_configuration! + enabled = bot_detection_enabled? + enabled = false if Rails.env.test? + + RailsCloudflareTurnstile.configure do |config| + config.site_key = ENV['TURNSTILE_SITEKEY'] + config.secret_key = ENV['TURNSTILE_SECRET'] + config.enabled = enabled + config.fail_open = !enabled + config.mock_enabled = Rails.env.test? + end + end + + def bot_detection_enabled? + return false unless Feature.enabled?(:bot_detection) + + # Check that required env is present + sitekey = ENV.fetch('TURNSTILE_SITEKEY', nil) + secret = ENV.fetch('TURNSTILE_SECRET', nil) + + if sitekey.blank? || secret.blank? 
+ Rails.logger.error('Bot detection enabled but missing TURNSTILE_SITEKEY or TURNSTILE_SECRET') + Sentry.capture_message('Bot detection misconfigured: missing Turnstile credentials', level: :error) + return false + end + + true + end +end + +TurnstileConfig.apply diff --git a/config/routes.rb b/config/routes.rb index 440ea41e..04ab1f40 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -12,6 +12,8 @@ as: 'record', :constraints => { :id => /[0-z\.\-\_~\(\)]+/ } get 'results', to: 'search#results' + get 'turnstile', to: 'turnstile#show', as: 'turnstile' + post 'turnstile/verify', to: 'turnstile#verify', as: 'turnstile_verify' get 'style-guide', to: 'static#style_guide' get 'boolpref', to: 'static#boolpref' diff --git a/test/controllers/search_controller_test.rb b/test/controllers/search_controller_test.rb index 865ce0ba..31813ba3 100644 --- a/test/controllers/search_controller_test.rb +++ b/test/controllers/search_controller_test.rb @@ -1165,4 +1165,35 @@ def source_filter_count(controller) get '/results?q=test&format=foo' assert_response :not_acceptable end + + # Bot detection tests + test 'bots are redirected to Turnstile challenge' do + ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do + bot_ua = 'Mozilla/5.0 (compatible; Googlebot/2.1)' + + get '/results?q=test', headers: { 'HTTP_USER_AGENT' => bot_ua } + + assert_redirected_to turnstile_path(return_to: '/results?q=test') + end + end + + test 'human users bypass Turnstile challenge' do + ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do + human_ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/91.0.4472.124 Safari/537.36' + mock_primo_search_success + + get '/results?q=test&tab=primo', headers: { 'HTTP_USER_AGENT' => human_ua } + + assert_response :success + end + end + + test 'bots on non-search paths are not challenged' do + bot_ua = 'Googlebot/2.1' + + get '/', headers: { 'HTTP_USER_AGENT' => bot_ua } + + # Should not be redirected to Turnstile (doesn't hit SearchController) + 
assert_response :success + end end diff --git a/test/controllers/turnstile_controller_test.rb b/test/controllers/turnstile_controller_test.rb new file mode 100644 index 00000000..a12e5e9d --- /dev/null +++ b/test/controllers/turnstile_controller_test.rb @@ -0,0 +1,47 @@ +require 'test_helper' +require 'climate_control' + +class TurnstileControllerTest < ActionDispatch::IntegrationTest + def with_bot_detection_enabled + ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do + TurnstileConfig.apply + yield + ensure + TurnstileConfig.apply + end + end + + test 'show renders when bot detection is enabled' do + with_bot_detection_enabled do + get turnstile_path + assert_response :success + end + end + + test 'verify sets session and redirects back to search' do + with_bot_detection_enabled do + post turnstile_verify_path, params: { 'cf-turnstile-response' => 'mocked', return_to: '/results?q=ocean' } + + assert_redirected_to '/results?q=ocean' + assert session[:passed_turnstile] + end + end + + test 'verify re-renders on failed validation' do + with_bot_detection_enabled do + post turnstile_verify_path + + assert_response :unprocessable_entity + assert_match "We couldn't complete the verification", response.body + refute session[:passed_turnstile] + end + end + + test 'verify falls back to root_path for invalid return_to' do + with_bot_detection_enabled do + post turnstile_verify_path, params: { 'cf-turnstile-response' => 'mocked', return_to: 'foo' } + assert_redirected_to root_path + assert session[:passed_turnstile] + end + end +end diff --git a/test/models/bot_detector_test.rb b/test/models/bot_detector_test.rb new file mode 100644 index 00000000..a8ee1d0a --- /dev/null +++ b/test/models/bot_detector_test.rb @@ -0,0 +1,97 @@ +require 'test_helper' +require 'ostruct' + +class BotDetectorTest < ActiveSupport::TestCase + # Helper method to instantiate request objects. 
+ def request(user_agent, path) + Struct.new(:user_agent, :path).new(user_agent, path) + end + + test 'bot? detects bots when crawler_detect returns true' do + request = mock(user_agent: 'Googlebot/2.1') + + # Mock CrawlerDetect to return a detector that reports a bot + mock_detector = mock(is_crawler?: true) + CrawlerDetect.stubs(:new).returns(mock_detector) + + assert BotDetector.bot?(request) + end + + test 'bot? allows non-bots when crawler_detect returns false' do + request = mock(user_agent: 'Mozilla/5.0 (X11; Linux x86_64)') + + mock_detector = mock(is_crawler?: false) + CrawlerDetect.stubs(:new).returns(mock_detector) + + refute BotDetector.bot?(request) + end + + test 'bot? handles nil user agent gracefully' do + request = mock(user_agent: nil) + + mock_detector = mock(is_crawler?: false) + CrawlerDetect.stubs(:new).returns(mock_detector) + + refute BotDetector.bot?(request) + end + + test 'bot? logs and returns false on detector failure' do + request = mock(user_agent: 'Test UA') + + # Mock crawler_detect to raise an error + CrawlerDetect.stubs(:new).raises(StandardError.new('Detector failure')) + + Rails.logger.expects(:debug).with(includes('BotDetector: crawler_detect failed')) + + refute BotDetector.bot?(request) + end + + test 'should_challenge? returns false for non-bots' do + req = request('Mozilla/5.0 (X11; Linux)', '/search') + + mock_detector = mock(is_crawler?: false) + CrawlerDetect.stubs(:new).returns(mock_detector) + + refute BotDetector.should_challenge?(req) + end + + test 'should_challenge? returns false for bots not on search paths' do + bot_ua = 'Googlebot/2.1' + req = request(bot_ua, '/static/style-guide') + + mock_detector = mock(is_crawler?: true) + CrawlerDetect.stubs(:new).returns(mock_detector) + + refute BotDetector.should_challenge?(req) + end + + test 'should_challenge? 
returns false for bots on /search paths' do
+    bot_ua = 'Googlebot/2.1'
+    req = request(bot_ua, '/search')
+
+    mock_detector = mock(is_crawler?: true)
+    CrawlerDetect.stubs(:new).returns(mock_detector)
+
+    refute BotDetector.should_challenge?(req)
+  end
+
+  test 'should_challenge? returns true for bots on results endpoint' do
+    bot_ua = 'Mozilla/5.0 (compatible; bingbot/2.0)'
+    req = request(bot_ua, '/results?q=test')
+
+    mock_detector = mock(is_crawler?: true)
+    CrawlerDetect.stubs(:new).returns(mock_detector)
+
+    assert BotDetector.should_challenge?(req)
+  end
+
+  test 'should_challenge? handles nil path gracefully' do
+    bot_ua = 'Googlebot/2.1'
+    req = request(bot_ua, nil)
+
+    mock_detector = mock(is_crawler?: true)
+    CrawlerDetect.stubs(:new).returns(mock_detector)
+
+    refute BotDetector.should_challenge?(req)
+  end
+end