From 43e510ba1d234b8f3cd01ce547ad40aa54883af0 Mon Sep 17 00:00:00 2001 From: Matthias Kretschmann Date: Fri, 6 Oct 2023 11:59:26 +0100 Subject: [PATCH] block all the ai bots --- ...feed-ai-models-by-scraping-your-website.md | 14 ++++++++++++++ public/robots.txt | 19 ++++++++++++++++++- src/layouts/Base/Head.astro | 1 + 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 content/links/2023-10-06-block-the-bots-that-feed-ai-models-by-scraping-your-website.md diff --git a/content/links/2023-10-06-block-the-bots-that-feed-ai-models-by-scraping-your-website.md b/content/links/2023-10-06-block-the-bots-that-feed-ai-models-by-scraping-your-website.md new file mode 100644 index 00000000..eee22eac --- /dev/null +++ b/content/links/2023-10-06-block-the-bots-that-feed-ai-models-by-scraping-your-website.md @@ -0,0 +1,14 @@ +--- +date: 2023-10-06T10:22:16.581Z + +title: Block the Bots that Feed “AI” Models by Scraping Your Website +linkurl: https://neil-clarke.com/block-the-bots-that-feed-ai-models-by-scraping-your-website/ + +tags: + - development + - ai +--- + +Neil Clarke with an [excellent overview](https://neil-clarke.com/block-the-bots-that-feed-ai-models-by-scraping-your-website/) of current techniques to block all the "AI" web scraping bots from your content, e.g. via `robots.txt`. The reasons for doing so are numerous: + +> “AI” companies think that we should have to opt-out of data-scraping bots that take our work to train their products. [...] These companies should be prevented from using data that they haven’t been given explicit consent for. Opt-out is problematic as it counts on concerned parties hearing about new or modified bots BEFORE their sites are targeted by them. That is simply not practical. [...] The online community is under no responsibility to help them create their products. 
diff --git a/public/robots.txt b/public/robots.txt index 62352cb6..9b217a6d 100644 --- a/public/robots.txt +++ b/public/robots.txt @@ -1,7 +1,24 @@ -# https://platform.openai.com/docs/gptbot +User-agent: CCBot +Disallow: / + +User-agent: ChatGPT-User +Disallow: / + User-agent: GPTBot Disallow: / +User-agent: Google-Extended +Disallow: / + +User-agent: Omgilibot +Disallow: / + +User-agent: Omgili +Disallow: / + +User-agent: FacebookBot +Disallow: / + User-agent: * Disallow: /search/ Disallow: /page/ diff --git a/src/layouts/Base/Head.astro b/src/layouts/Base/Head.astro index 1c0eb165..92a9b2f7 100644 --- a/src/layouts/Base/Head.astro +++ b/src/layouts/Base/Head.astro @@ -78,6 +78,7 @@ const faviconSvg = await getImage({ src: faviconSvgSrc, format: 'svg' }) +