From 4d88d0033bf148653fb68281aa1187ffa1f5b30e Mon Sep 17 00:00:00 2001
From: Helen Chong <119173961+helenclx@users.noreply.github.com>
Date: Tue, 20 Aug 2024 09:53:37 +0800
Subject: [PATCH] Dynamically generate bad bots to block by fetching remote list

---
Review notes (not part of the commit message):
- robots.htaccess is now a parenthesised alternation in which every bot name
  is regex-escaped, including whitespace, which would otherwise end the
  unquoted RewriteCond pattern.
- The build now fails when the fetched list names no user agents, instead of
  emitting a RewriteCond with an empty pattern.
- Line ends are trimmed before matching, so a CRLF response still parses.

 src/.htaccess.njk   | 34 +---------------------------------
 src/_data/robots.js | 45 +++++++++++++++++++++++++++++++++++++++++++++
 src/robots.txt.njk  | 39 +++------------------------------------
 3 files changed, 49 insertions(+), 69 deletions(-)
 create mode 100644 src/_data/robots.js

diff --git a/src/.htaccess.njk b/src/.htaccess.njk
index 06bd00ae..a79f318d 100644
--- a/src/.htaccess.njk
+++ b/src/.htaccess.njk
@@ -14,37 +14,5 @@ Header set Cache-Control "no-cache, public"
 RewriteEngine on
 
 # Block bad bots
-RewriteCond %{HTTP_USER_AGENT} (Amazonbot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Applebot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Applebot-Extended) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Bytespider) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (CCBot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (ChatGPT-User) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Claude-Web) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (ClaudeBot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Diffbot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (FacebookBot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (FriendlyCrawler) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (GPTBot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Google-Extended) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (GoogleOther) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (GoogleOther-Image) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (GoogleOther-Video) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (ICC-Crawler) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (ImagesiftBot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Meta-ExternalAgent) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Meta-ExternalFetcher) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (OAI-SearchBot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (PerplexityBot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (PetalBot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Scrapy) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (Timpibot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (VelenPublicWebCrawler) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (YouBot) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (anthropic-ai) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (cohere-ai) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (facebookexternalhit) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (img2dataset) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (omgili) [NC,OR]
-RewriteCond %{HTTP_USER_AGENT} (omgilibot) [NC]
+RewriteCond %{HTTP_USER_AGENT} {{ robots.htaccess }} [NC]
 RewriteRule .* https://nocommercialuse.org/ [L]
\ No newline at end of file
diff --git a/src/_data/robots.js b/src/_data/robots.js
new file mode 100644
index 00000000..bfa1b677
--- /dev/null
+++ b/src/_data/robots.js
@@ -0,0 +1,45 @@
+// by Robb Knight: https://rknight.me/blog/blocking-bots-with-nginx/
+
+import EleventyFetch from "@11ty/eleventy-fetch";
+
+const LIST_URL =
+  "https://raw.githubusercontent.com/ai-robots-txt/ai.robots.txt/main/robots.txt";
+const AGENT_PREFIX = "User-agent:";
+
+// RewriteCond patterns are unquoted regexes: escape metacharacters so names
+// match literally, and escape whitespace so it cannot end the pattern.
+const toPattern = (name) =>
+  name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&").replace(/\s/g, "\\ ");
+
+/**
+ * Fetch the ai.robots.txt block list, cached for a week, minus Applebot.
+ * @returns {Promise<{txt: string, htaccess: string}>} robots.txt rules and a
+ *   parenthesised regex alternation of the same bots for a RewriteCond.
+ * @throws {Error} when no user agents are listed (the pattern would be empty).
+ */
+export default async function () {
+  const raw = await EleventyFetch(LIST_URL, {
+    duration: "1w",
+    type: "text",
+  });
+
+  // Trim line ends so CRLF or trailing spaces cannot defeat the comparisons.
+  const lines = raw
+    .split("\n")
+    .map((line) => line.trimEnd())
+    .filter((line) => line !== "User-agent: Applebot");
+
+  const bots = lines
+    .filter((line) => line.startsWith(AGENT_PREFIX))
+    .map((line) => line.slice(AGENT_PREFIX.length).trim())
+    .filter((name) => name !== "");
+
+  if (bots.length === 0) {
+    throw new Error(`No user agents found in ${LIST_URL}`);
+  }
+
+  return {
+    txt: lines.join("\n"),
+    htaccess: `(${bots.map(toPattern).join("|")})`,
+  };
+}
\ No newline at end of file
diff --git a/src/robots.txt.njk b/src/robots.txt.njk
index 587a4416..52f203e5 100644
--- a/src/robots.txt.njk
+++ b/src/robots.txt.njk
@@ -2,42 +2,9 @@
 permalink: robots.txt
 eleventyExcludeFromCollections: true
 ---
-Sitemap: {{ sitemeta.siteUrl }}/sitemap/
-
 User-agent: *
 Disallow:
 
-User-agent: Amazonbot
-User-agent: Applebot
-User-agent: Applebot-Extended
-User-agent: Bytespider
-User-agent: CCBot
-User-agent: ChatGPT-User
-User-agent: Claude-Web
-User-agent: ClaudeBot
-User-agent: Diffbot
-User-agent: FacebookBot
-User-agent: FriendlyCrawler
-User-agent: GPTBot
-User-agent: Google-Extended
-User-agent: GoogleOther
-User-agent: GoogleOther-Image
-User-agent: GoogleOther-Video
-User-agent: ICC-Crawler
-User-agent: ImagesiftBot
-User-agent: Meta-ExternalAgent
-User-agent: Meta-ExternalFetcher
-User-agent: OAI-SearchBot
-User-agent: PerplexityBot
-User-agent: PetalBot
-User-agent: Scrapy
-User-agent: Timpibot
-User-agent: VelenPublicWebCrawler
-User-agent: YouBot
-User-agent: anthropic-ai
-User-agent: cohere-ai
-User-agent: facebookexternalhit
-User-agent: img2dataset
-User-agent: omgili
-User-agent: omgilibot
-Disallow: /
\ No newline at end of file
+{{ robots.txt }}
+
+Sitemap: {{ sitemeta.siteUrl }}/sitemap/
\ No newline at end of file