Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.test
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
ALMA_OPENURL=https://na06.alma.exlibrisgroup.com/view/uresolver/01MIT_INST/openurl?
TURNSTILE_SITEKEY=test-sitekey
TURNSTILE_SECRET=test-secret
FEATURE_TIMDEX_FULLTEXT=true
FEATURE_GEODATA=false
MIT_PRIMO_URL=https://mit.primo.exlibrisgroup.com
Expand Down
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ git_source(:github) { |repo| "https://github.com/#{repo}.git" }
ruby '3.4.8'

gem 'bootsnap', require: false
gem 'crawler_detect'
gem 'graphql'
gem 'graphql-client'
gem 'http'
Expand All @@ -14,6 +15,7 @@ gem 'openssl'
gem 'puma'
gem 'rack-attack'
gem 'rack-timeout'
gem 'rails_cloudflare_turnstile'
gem 'rails', '~> 7.2.0'
gem 'redis'
gem 'scout_apm'
Expand Down
18 changes: 18 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ GEM
bigdecimal
rexml
crass (1.0.6)
crawler_detect (1.2.9)
qonfig (>= 0.24)
date (3.5.1)
debug (1.11.1)
irb (~> 1.10)
Expand All @@ -134,6 +136,12 @@ GEM
drb (2.2.3)
erb (5.1.3)
erubi (1.13.1)
faraday (2.14.1)
faraday-net_http (>= 2.0, < 3.5)
json
logger
faraday-net_http (3.4.2)
net-http (~> 0.5)
ffi (1.17.2-aarch64-linux-gnu)
ffi (1.17.2-arm64-darwin)
ffi (1.17.2-x86_64-darwin)
Expand Down Expand Up @@ -206,6 +214,8 @@ GEM
mocha (2.8.2)
ruby2_keywords (>= 0.0.5)
msgpack (1.8.0)
net-http (0.9.1)
uri (>= 0.11.1)
net-imap (0.5.13)
date
net-protocol
Expand Down Expand Up @@ -243,6 +253,8 @@ GEM
public_suffix (6.0.2)
puma (7.2.0)
nio4r (~> 2.0)
qonfig (0.30.0)
base64 (>= 0.2)
racc (1.8.1)
rack (3.1.20)
rack-attack (6.8.0)
Expand Down Expand Up @@ -276,6 +288,9 @@ GEM
rails-html-sanitizer (1.7.0)
loofah (~> 2.25)
nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0)
rails_cloudflare_turnstile (0.5.0)
faraday (>= 1.0, < 3.0)
rails (>= 6.0, < 8.2)
railties (7.2.3)
actionpack (= 7.2.3)
activesupport (= 7.2.3)
Expand Down Expand Up @@ -381,6 +396,7 @@ GEM
unicode-display_width (3.2.0)
unicode-emoji (~> 4.1)
unicode-emoji (4.2.0)
uri (1.1.1)
useragent (0.16.11)
vcr (6.4.0)
web-console (4.2.1)
Expand Down Expand Up @@ -421,6 +437,7 @@ DEPENDENCIES
bootsnap
capybara
climate_control
crawler_detect
debug
dotenv-rails
graphql
Expand All @@ -437,6 +454,7 @@ DEPENDENCIES
rack-attack
rack-timeout
rails (~> 7.2.0)
rails_cloudflare_turnstile
redis
rubocop
rubocop-rails
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ See `Optional Environment Variables` for more information.
- `BOOLEAN_OPTIONS`: comma separated list of values to present to testers on instances where `BOOLEAN_PICKER` feature is enabled.
- `FEATURE_BOOLEAN_PICKER`: feature to allow users to select their preferred boolean type. If set to `true`, feature is enabled. This feature is only intended for internal team
testing and should never be enabled in production (mostly because the UI is a mess more than it would cause harm).
- `FEATURE_BOT_DETECTION`: When set to `true`, enables bot detection using crawler_detect and Cloudflare Turnstile challenges for suspected bots on search result pages. Requires `TURNSTILE_SITEKEY` and `TURNSTILE_SECRET` to be set. If disabled, bots may crawl search results freely.
- `FEATURE_GEODATA`: Enables features related to geospatial data discovery. Setting this variable to `true` will trigger geodata
mode. Note that this is currently intended _only_ for the geodata app and
may have unexpected consequences if applied to other TIMDEX UI apps.
Expand Down Expand Up @@ -146,6 +147,8 @@ instance is sending what search traffic. Defaults to "unset" if not defined.
- `TIMDEX_INDEX`: Name of the index, or alias, to provide to the GraphQL endpoint. Defaults to `nil` which will let TIMDEX determine the best index to use. Wildcard values can be set, for example `rdi*` would search any indexes that begin with `rdi` in the underlying OpenSearch instance behind TIMDEX.
- `TIMDEX_SOURCES`: Comma-separated list of sources to display in the advanced-search source selection element. This
overrides the default which is set in ApplicationHelper.
- `TURNSTILE_SECRET`: The Cloudflare Turnstile secret key used to verify challenge responses. If not set, bot challenge protection is disabled.
- `TURNSTILE_SITEKEY`: The Cloudflare Turnstile site key used to render the challenge widget. If not set, bot challenge protection is disabled.

#### Test Environment-only Variables

Expand Down
10 changes: 10 additions & 0 deletions app/controllers/search_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ class SearchController < ApplicationController
before_action :validate_q!, only: %i[results]
before_action :validate_format_token, only: %i[results]
before_action :set_active_tab, only: %i[results]
before_action :challenge_bots!, only: %i[results]
around_action :sleep_if_too_fast, only: %i[results]

before_action :validate_geobox_presence!, only: %i[results]
Expand Down Expand Up @@ -271,6 +272,15 @@ def validate_q!
redirect_to root_url
end

# Redirect suspected crawlers to Turnstile when the bot_detection feature is enabled.
def challenge_bots!
  # Evaluated left to right with short-circuiting: the feature flag is checked first, then the
  # session marker set after a successful Turnstile verification, and only then is the request
  # handed to BotDetector.
  challenge_required = Feature.enabled?(:bot_detection) &&
                       !session[:passed_turnstile] &&
                       BotDetector.should_challenge?(request)

  # Carry the full original path (including query string) so the visitor can be sent back to it.
  redirect_to turnstile_path(return_to: request.fullpath) if challenge_required
end

def validate_geodistance_presence!
return unless Feature.enabled?(:geodata)

Expand Down
33 changes: 33 additions & 0 deletions app/controllers/turnstile_controller.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Presents the Turnstile challenge form and, once the submitted token passes validation, marks the
# session as verified and sends the visitor back to where they came from.
class TurnstileController < ApplicationController
  # Rejects return paths that a browser could resolve to another host: a second character of `/`
  # or `\` (browsers treat `\` like `/` in http(s) URLs), or any control character (browsers strip
  # tabs and newlines before parsing, so "/\t/host" would collapse to "//host").
  UNSAFE_RETURN_PATH = %r{\A/[/\\]|[[:cntrl:]]}
  private_constant :UNSAFE_RETURN_PATH

  before_action :validate_cloudflare_turnstile, only: :verify

  rescue_from RailsCloudflareTurnstile::Forbidden, with: :handle_forbidden

  # Renders the challenge form. The return path is sanitised here so the hidden field never echoes
  # back a value that `verify` would refuse to redirect to anyway.
  def show
    @return_to = safe_return_path
  end

  # Reached only when the `validate_cloudflare_turnstile` callback did not raise. Records the
  # successful verification in the session and redirects to the sanitised return path.
  def verify
    session[:passed_turnstile] = true
    redirect_to safe_return_path
  end

  private

  # Handles Turnstile rejecting token submission due to invalid token, network issue, etc.
  # The `show` action has not run on this request, so the return path must be populated again
  # here; otherwise the re-rendered form would submit an empty `return_to` and the visitor would
  # lose their original destination on the next attempt.
  def handle_forbidden
    @return_to = safe_return_path
    flash.now[:error] = "We couldn't complete the verification. Please try again."
    render :show, status: :unprocessable_entity
  end

  # Returns a safe, same-site path to redirect to after Turnstile verification. A valid path begins
  # with exactly one forward slash, is not followed by a second slash or a backslash, and contains
  # no control characters. Anything else (including a missing value) falls back to root_path.
  def safe_return_path
    return_to = params[:return_to].to_s
    return root_path unless return_to.start_with?('/')
    return root_path if return_to.match?(UNSAFE_RETURN_PATH)

    return_to
  end
end
26 changes: 26 additions & 0 deletions app/models/bot_detector.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Decides whether an incoming request looks like crawler traffic that should be challenged.
class BotDetector
  # Path prefixes that are subject to a challenge: the search results page and the full record view.
  CHALLENGED_PATH_PREFIXES = %w[/results /record].freeze
  private_constant :CHALLENGED_PATH_PREFIXES

  # Asks crawler_detect whether the request's User-Agent belongs to a crawler. A missing
  # User-Agent is passed along as an empty string. Any error raised while checking is logged at
  # debug level and the request is treated as not being a bot.
  def self.bot?(request)
    user_agent = request.user_agent.to_s
    CrawlerDetect.new(user_agent).is_crawler?
  rescue StandardError => e
    Rails.logger.debug("BotDetector: crawler_detect failed for UA '#{user_agent}': #{e.message}")
    false
  end

  # Returns true only when the request is flagged as a bot AND its path starts with one of the
  # challenged prefixes. The bot check runs first; the path is inspected only for flagged requests.
  def self.should_challenge?(request)
    return false unless bot?(request)

    request.path.to_s.start_with?(*CHALLENGED_PATH_PREFIXES)
  end
end
2 changes: 1 addition & 1 deletion app/models/feature.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
#
class Feature
# List of all valid features in the application
VALID_FEATURES = %i[geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all
VALID_FEATURES = %i[bot_detection geodata boolean_picker oa_always simulate_search_latency tab_primo_all tab_timdex_all
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line is too long. [121/120] [rubocop:Layout/LineLength]

tab_timdex_alma record_link timdex_fulltext].freeze

# Check if a feature is enabled by name
Expand Down
22 changes: 22 additions & 0 deletions app/views/turnstile/show.html.erb
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<%# Challenge page shown to visitors who must pass verification before continuing. %>
<%# NOTE(review): cloudflare_turnstile_script_tag and cloudflare_turnstile are presumably view helpers from the rails_cloudflare_turnstile gem; confirm. %>
<%= cloudflare_turnstile_script_tag %>

<section class="turnstile-challenge">
<div class="turnstile-challenge__inner">
<h1>Verify you're human</h1>
<p>
Please complete this verification to continue.
</p>

<%# The form posts to turnstile_verify_path and carries @return_to along in a hidden field. %>
<%= form_with url: turnstile_verify_path, method: :post, local: true do %>
<%= hidden_field_tag :return_to, @return_to %>

<div class="turnstile-widget">
<%= cloudflare_turnstile(action: 'search') %>
</div>

<div class="turnstile-challenge__actions">
<%= submit_tag 'Submit', class: 'btn button-primary' %>
</div>
<% end %>
</div>
</section>
38 changes: 38 additions & 0 deletions config/initializers/cloudflare_turnstile.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Explicitly require Feature model to check if bot detection is enabled
require Rails.root.join('app/models/feature')

# Builds the RailsCloudflareTurnstile configuration from the bot_detection feature flag and the
# Turnstile credentials found in the environment.
module TurnstileConfig
  module_function

  # Resets the gem configuration and applies a fresh one. Verification is switched on only when
  # bot detection is enabled with credentials present and the app is not running in the test
  # environment; fail_open is always the inverse of that, and mock mode is on only in test.
  def apply
    RailsCloudflareTurnstile.reset_configuration!

    # bot_detection_enabled? is evaluated first so that its misconfiguration reporting still
    # happens in every environment, including test.
    active = bot_detection_enabled? && !Rails.env.test?

    RailsCloudflareTurnstile.configure do |settings|
      settings.site_key = ENV.fetch('TURNSTILE_SITEKEY', nil)
      settings.secret_key = ENV.fetch('TURNSTILE_SECRET', nil)
      settings.enabled = active
      settings.fail_open = !active
      settings.mock_enabled = Rails.env.test?
    end
  end

  # True when the bot_detection feature is on and both Turnstile credentials are present.
  # When the feature is on but a credential is blank, the problem is reported to the Rails log and
  # to Sentry, and false is returned.
  def bot_detection_enabled?
    return false unless Feature.enabled?(:bot_detection)

    credentials = [ENV['TURNSTILE_SITEKEY'], ENV['TURNSTILE_SECRET']]
    return true if credentials.none?(&:blank?)

    Rails.logger.error('Bot detection enabled but missing TURNSTILE_SITEKEY or TURNSTILE_SECRET')
    Sentry.capture_message('Bot detection misconfigured: missing Turnstile credentials', level: :error)
    false
  end
end

TurnstileConfig.apply
2 changes: 2 additions & 0 deletions config/routes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
as: 'record',
:constraints => { :id => /[0-z\.\-\_~\(\)]+/ }
get 'results', to: 'search#results'
get 'turnstile', to: 'turnstile#show', as: 'turnstile'
post 'turnstile/verify', to: 'turnstile#verify', as: 'turnstile_verify'
get 'style-guide', to: 'static#style_guide'

get 'boolpref', to: 'static#boolpref'
Expand Down
31 changes: 31 additions & 0 deletions test/controllers/search_controller_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1165,4 +1165,35 @@ def source_filter_count(controller)
get '/results?q=test&format=foo'
assert_response :not_acceptable
end

# Bot detection tests
test 'bots are redirected to Turnstile challenge' do
  # A crawler-style User-Agent requesting search results with the feature switched on.
  crawler_headers = { 'HTTP_USER_AGENT' => 'Mozilla/5.0 (compatible; Googlebot/2.1)' }

  ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do
    get '/results?q=test', headers: crawler_headers

    # The original path and query must be carried along as return_to.
    assert_redirected_to turnstile_path(return_to: '/results?q=test')
  end
end

test 'human users bypass Turnstile challenge' do
  # A regular browser User-Agent should reach the results page even with the feature switched on.
  browser_headers = {
    'HTTP_USER_AGENT' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/91.0.4472.124 Safari/537.36'
  }

  ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do
    mock_primo_search_success

    get '/results?q=test&tab=primo', headers: browser_headers

    assert_response :success
  end
end

test 'bots on non-search paths are not challenged' do
  # The feature must be switched on here; with it off, no request is ever challenged and this
  # test would pass without checking anything about non-search paths.
  ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do
    bot_ua = 'Googlebot/2.1'

    get '/', headers: { 'HTTP_USER_AGENT' => bot_ua }

    # Should not be redirected to Turnstile (doesn't hit SearchController)
    assert_response :success
  end
end
end
47 changes: 47 additions & 0 deletions test/controllers/turnstile_controller_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
require 'test_helper'
require 'climate_control'

# Integration tests for the Turnstile challenge page and its verification endpoint.
class TurnstileControllerTest < ActionDispatch::IntegrationTest
  # Runs the given block with FEATURE_BOT_DETECTION set to 'true' and the Turnstile configuration
  # rebuilt for that environment.
  def with_bot_detection_enabled
    ClimateControl.modify(FEATURE_BOT_DETECTION: 'true') do
      TurnstileConfig.apply
      yield
    end
  ensure
    # Re-apply only after ClimateControl.modify has returned and put the environment back, so the
    # configuration is rebuilt from the original settings rather than from the overridden ones.
    TurnstileConfig.apply
  end

  test 'show renders when bot detection is enabled' do
    with_bot_detection_enabled do
      get turnstile_path
      assert_response :success
    end
  end

  test 'verify sets session and redirects back to search' do
    with_bot_detection_enabled do
      post turnstile_verify_path, params: { 'cf-turnstile-response' => 'mocked', return_to: '/results?q=ocean' }

      assert_redirected_to '/results?q=ocean'
      assert session[:passed_turnstile]
    end
  end

  test 'verify re-renders on failed validation' do
    with_bot_detection_enabled do
      # No cf-turnstile-response parameter is submitted.
      post turnstile_verify_path

      assert_response :unprocessable_entity
      assert_match "We couldn't complete the verification", response.body
      refute session[:passed_turnstile]
    end
  end

  test 'verify falls back to root_path for invalid return_to' do
    with_bot_detection_enabled do
      # 'foo' does not begin with a slash, so it is not accepted as a return path.
      post turnstile_verify_path, params: { 'cf-turnstile-response' => 'mocked', return_to: 'foo' }
      assert_redirected_to root_path
      assert session[:passed_turnstile]
    end
  end
end
Loading