Diffstat (limited to 'modules')
-rw-r--r-- | modules/apache/templates/vhost_fcgid.conf         | 35
-rw-r--r-- | modules/apache/templates/vhost_fcgid_norobot.conf | 45
-rw-r--r-- | modules/viewvc/manifests/init.pp                  |  4
3 files changed, 47 insertions, 37 deletions
diff --git a/modules/apache/templates/vhost_fcgid.conf b/modules/apache/templates/vhost_fcgid.conf
index f137c866..fefa4a49 100644
--- a/modules/apache/templates/vhost_fcgid.conf
+++ b/modules/apache/templates/vhost_fcgid.conf
@@ -4,38 +4,3 @@ AddHandler fcgid-script .pl
 <%- end -%>
 FcgidMinProcessesPerClass <%= @process %>
 FcgidIdleTimeout 30
-
-# These robots were scraping the whole of svnweb in 2024-04, causing severe
-# load, so they are banned. It's not clear whether they obey robots.txt or
-# not (we didn't give them enough of a chance to find out), so we could
-# consider giving them a chance to redeem themselves at some point in the
-# future.
-RewriteEngine on
-RewriteCond %{HTTP_USER_AGENT} ClaudeBot|Amazonbot
-RewriteRule . - [R=403,L]
-
-# Block expensive SVN operations on all common robots ("spider" covers a
-# bunch). "Expensive" is considered to be most operations other than showing a
-# directory or downloading a specific version of a file.
-# Note: eliminating view=log and annotate= doesn't make much difference to the
-# CPU load when robots are hitting the server in real world operation.
-RewriteCond %{QUERY_STRING} pathrev=|r1=
-RewriteCond %{HTTP_USER_AGENT} "Googlebot|GoogleOther|bingbot|Yahoo! Slurp|ClaudeBot|Amazonbot|YandexBot|SemrushBot|Barkrowler|DataForSeoBot|PetalBot|facebookexternalhit|GPTBot|ImagesiftBot|spider|Spider|iPod|Trident|Presto"
-RewriteRule . - [R=403,L]
-
-# Only let expensive operations through when a cookie is set. If no cookie is
-# set, redirect to a page where it will be set using JavaScript and redirect
-# back. This will block requests from user agents that do not support
-# JavaScript, which includes many robots.
-RewriteMap urlescape prg:/usr/local/bin/urlescape
-RewriteCond %{QUERY_STRING} pathrev=|r1=
-RewriteCond %{REQUEST_URI} !/_check
-RewriteCond %{HTTP_COOKIE} !session=([^;]+) [novary]
-RewriteRule . %{REQUEST_SCHEME}://%{SERVER_NAME}:%{SERVER_PORT}/_check?to=%{REQUEST_URI}?${urlescape:%{QUERY_STRING}} [R=302,L]
-
-# Block abusive spiders by IP address who don't identify themselves in the
-# User-Agent: string
-RewriteCond expr "-R '47.76.0.0/14' || -R '47.80.0.0/14' || -R '47.208.0.0/16' || -R '47.238.0.0/16' || -R '8.210.0.0/16' || -R '8.218.0.0/16' || -R '188.239.0.0/18' || -R '166.108.192.0/18' || -R '124.243.160.0/19' || -R '101.46.0.0/20'"
-RewriteRule . - [R=403,L]
-
-ErrorDocument 403 "<html><body>Impolite robots are not allowed</body></html>"
diff --git a/modules/apache/templates/vhost_fcgid_norobot.conf b/modules/apache/templates/vhost_fcgid_norobot.conf
new file mode 100644
index 00000000..0643cac9
--- /dev/null
+++ b/modules/apache/templates/vhost_fcgid_norobot.conf
@@ -0,0 +1,45 @@
+AddHandler fcgid-script .pl
+<%- @script_aliases.keys.sort {|a,b| a.size <=> b.size }.reverse.each do |key| -%>
+    ScriptAlias <%= key %> <%= @script_aliases[key] %>
+<%- end -%>
+FcgidMinProcessesPerClass <%= @process %>
+FcgidIdleTimeout 30
+
+# These robots were scraping the whole of svnweb in 2024-04, causing severe
+# load, so they are banned. It's not clear whether they obey robots.txt or
+# not (we didn't give them enough of a chance to find out), so we could
+# consider giving them a chance to redeem themselves at some point in the
+# future.
+RewriteEngine on
+RewriteCond %{HTTP_USER_AGENT} ClaudeBot|Amazonbot
+RewriteRule . - [R=403,L]
+
+# Block expensive SVN operations on all common robots ("spider" covers a
+# bunch). "Expensive" is considered to be most operations other than showing a
+# directory or downloading a specific version of a file.
+# Note: eliminating view=log and annotate= doesn't make much difference to the
+# CPU load when robots are hitting the server in real world operation.
+#RewriteCond %{QUERY_STRING} pathrev=|r1=
+# Treat anything other than a plain path as "expensive"
+RewriteCond %{QUERY_STRING} .
+RewriteCond %{HTTP_USER_AGENT} "Googlebot|GoogleOther|bingbot|Yahoo! Slurp|ClaudeBot|Amazonbot|YandexBot|SemrushBot|Barkrowler|DataForSeoBot|PetalBot|facebookexternalhit|GPTBot|ImagesiftBot|spider|Spider|iPod|Trident|Presto"
+RewriteRule . - [R=403,L]
+
+# Only let expensive operations through when a cookie is set. If no cookie is
+# set, redirect to a page where it will be set using JavaScript and redirect
+# back. This will block requests from user agents that do not support
+# JavaScript, which includes many robots.
+RewriteMap urlescape prg:/usr/local/bin/urlescape
+#RewriteCond %{QUERY_STRING} pathrev=|r1=
+# Treat anything other than a plain path as "expensive"
+RewriteCond %{QUERY_STRING} .
+RewriteCond %{REQUEST_URI} !/_check
+RewriteCond %{HTTP_COOKIE} !session=([^;]+) [novary]
+RewriteRule . %{REQUEST_SCHEME}://%{SERVER_NAME}:%{SERVER_PORT}/_check?to=%{REQUEST_URI}?${urlescape:%{QUERY_STRING}} [R=302,L]
+
+# Block abusive spiders by IP address who don't identify themselves in the
+# User-Agent: string
+RewriteCond expr "-R '47.76.0.0/14' || -R '47.80.0.0/14' || -R '47.208.0.0/16' || -R '47.238.0.0/16' || -R '8.210.0.0/16' || -R '8.218.0.0/16' || -R '188.239.0.0/18' || -R '166.108.192.0/18' || -R '124.243.160.0/19' || -R '101.46.0.0/20'"
+RewriteRule . - [R=403,L]
+
+ErrorDocument 403 "<html><body>Impolite robots are not allowed</body></html>"
diff --git a/modules/viewvc/manifests/init.pp b/modules/viewvc/manifests/init.pp
index e1d336c9..bd676f29 100644
--- a/modules/viewvc/manifests/init.pp
+++ b/modules/viewvc/manifests/init.pp
@@ -62,13 +62,13 @@ class viewvc {
 
     apache::vhost::base { $viewvc::var::hostname:
         aliases => $vhost_aliases,
-        content => template('apache/vhost_fcgid.conf'),
+        content => template('apache/vhost_fcgid_norobot.conf'),
     }
 
     apache::vhost::base { "ssl_${viewvc::var::hostname}":
         vhost   => $viewvc::var::hostname,
         use_ssl => true,
         aliases => $vhost_aliases,
-        content => template('apache/vhost_fcgid.conf'),
+        content => template('apache/vhost_fcgid_norobot.conf'),
     }
 }