Dynamically generate bad bots to block by fetching remote list
This commit is contained in:
parent
31631b9fe6
commit
4d88d0033b
|
@ -14,37 +14,5 @@ Header set Cache-Control "no-cache, public"
|
|||
RewriteEngine on
|
||||
|
||||
# Block bad bots
|
||||
RewriteCond %{HTTP_USER_AGENT} (Amazonbot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Applebot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Applebot-Extended) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Bytespider) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (CCBot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (ChatGPT-User) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Claude-Web) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (ClaudeBot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Diffbot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (FacebookBot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (FriendlyCrawler) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (GPTBot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Google-Extended) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (GoogleOther) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (GoogleOther-Image) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (GoogleOther-Video) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (ICC-Crawler) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (ImagesiftBot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Meta-ExternalAgent) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Meta-ExternalFetcher) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (OAI-SearchBot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (PerplexityBot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (PetalBot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Scrapy) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (Timpibot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (VelenPublicWebCrawler) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (YouBot) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (anthropic-ai) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (cohere-ai) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (facebookexternalhit) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (img2dataset) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (omgili) [NC,OR]
|
||||
RewriteCond %{HTTP_USER_AGENT} (omgilibot) [NC]
|
||||
RewriteCond %{HTTP_USER_AGENT} {{ robots.htaccess }} [NC]
|
||||
RewriteRule .* https://nocommercialuse.org/ [L]
|
|
@ -0,0 +1,32 @@
|
|||
// by Robb Knight: https://rknight.me/blog/blocking-bots-with-nginx/
|
||||
|
||||
import EleventyFetch from "@11ty/eleventy-fetch";
|
||||
|
||||
/**
 * Eleventy global data: fetches the community-maintained AI crawler
 * robots.txt and derives two values from it.
 *
 * `Applebot` is deliberately removed from the upstream list, so it is neither
 * disallowed in robots.txt nor blocked by user agent. `Applebot-Extended`
 * stays listed.
 *
 * @returns {Promise<{txt: string, htaccess: string}>}
 *   `txt` is the upstream robots.txt without the `User-agent: Applebot` line.
 *   `htaccess` is a `|`-separated regex alternation of the remaining user
 *   agent names, intended for interpolation into a `RewriteCond` pattern.
 *   Each name is escaped so it matches literally and contains no raw
 *   whitespace.
 */
export default async function fetchRobots() {
  const url =
    "https://raw.githubusercontent.com/ai-robots-txt/ai.robots.txt/main/robots.txt";
  const upstream = await EleventyFetch(url, {
    duration: "1w",
    type: "text",
  });

  const agentPrefix = "User-agent:";
  const allowedLine = "User-agent: Applebot";

  // Split on LF or CRLF so a stray "\r" cannot defeat the exact-match filter.
  const lines = upstream
    .split(/\r?\n/)
    .filter((line) => line !== allowedLine);

  const bots = lines
    .filter((line) => line.startsWith(agentPrefix))
    // Slice after the prefix rather than split(":") so a name that itself
    // contains a colon is not truncated.
    .map((line) => line.slice(agentPrefix.length).trim())
    .filter((name) => name !== "")
    .map(escapeForRewriteCond);

  return {
    txt: lines.join("\n"),
    htaccess: bots.join("|"),
  };
}

/**
 * Escapes a user agent name for use inside an unquoted Apache `RewriteCond`
 * regex. Regex metacharacters are backslash-escaped so the name matches
 * literally. Whitespace becomes `\s`, because a raw space would end the
 * pattern argument in the directive.
 *
 * @param {string} name - Raw user agent name from robots.txt.
 * @returns {string} Regex-safe, whitespace-free pattern fragment.
 */
function escapeForRewriteCond(name) {
  return name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\s/g, "\\s");
}
|
|
@ -2,42 +2,9 @@
|
|||
permalink: robots.txt
|
||||
eleventyExcludeFromCollections: true
|
||||
---
|
||||
Sitemap: {{ sitemeta.siteUrl }}/sitemap/
|
||||
|
||||
User-agent: *
|
||||
Disallow:
|
||||
|
||||
User-agent: Amazonbot
|
||||
User-agent: Applebot
|
||||
User-agent: Applebot-Extended
|
||||
User-agent: Bytespider
|
||||
User-agent: CCBot
|
||||
User-agent: ChatGPT-User
|
||||
User-agent: Claude-Web
|
||||
User-agent: ClaudeBot
|
||||
User-agent: Diffbot
|
||||
User-agent: FacebookBot
|
||||
User-agent: FriendlyCrawler
|
||||
User-agent: GPTBot
|
||||
User-agent: Google-Extended
|
||||
User-agent: GoogleOther
|
||||
User-agent: GoogleOther-Image
|
||||
User-agent: GoogleOther-Video
|
||||
User-agent: ICC-Crawler
|
||||
User-agent: ImagesiftBot
|
||||
User-agent: Meta-ExternalAgent
|
||||
User-agent: Meta-ExternalFetcher
|
||||
User-agent: OAI-SearchBot
|
||||
User-agent: PerplexityBot
|
||||
User-agent: PetalBot
|
||||
User-agent: Scrapy
|
||||
User-agent: Timpibot
|
||||
User-agent: VelenPublicWebCrawler
|
||||
User-agent: YouBot
|
||||
User-agent: anthropic-ai
|
||||
User-agent: cohere-ai
|
||||
User-agent: facebookexternalhit
|
||||
User-agent: img2dataset
|
||||
User-agent: omgili
|
||||
User-agent: omgilibot
|
||||
Disallow: /
|
||||
{{ robots.txt }}
|
||||
|
||||
Sitemap: {{ sitemeta.siteUrl }}/sitemap/
|
Loading…
Reference in New Issue