# Robots.txt for DigiScalability - Digital Agency
# Last Updated: January 2025
# Domain: digiscalability.com
# Purpose: AI/AEO/GEO-optimized crawler management for enhanced visibility in AI search engines
#
# How this file is evaluated (Robots Exclusion Protocol, RFC 9309):
#   * One directive per line. Some parsers end a group at a blank line, so no
#     group below contains one; comment lines inside a group are fine.
#   * A crawler obeys only the group that names its user-agent. The
#     "User-agent: *" group is a fallback for crawlers named nowhere in this
#     file; named groups do NOT inherit it. Every named group therefore
#     repeats the private-area rules (/admin/, /dashboard/, /api/).
#   * User-agent values are literal, case-insensitive product tokens.
#     Wildcards and regular expressions are not supported ("*" alone is the
#     only special value).
#   * Inside a group the longest matching path wins, so "Allow: /" never
#     overrides a more specific Disallow.
#   * Crawlers that share identical rules are stacked in a single group
#     (several User-agent lines followed by one rule set).
#   * Crawl-delay, Request-rate and Host are non-standard extensions; crawlers
#     that do not implement them ignore them.
#   * robots.txt is advisory only. It is not access control; protect private
#     paths on the server as well.

# ===============================================
# STANDARD SEARCH ENGINE CRAWLERS
# ===============================================
# NOTE(review): the duplicate-content / query-parameter rules further down
# live in the "User-agent: *" group and so do not apply to the crawlers named
# here. Copy them into these groups if search engines should honour them.

# Google Images, Google News (no crawl delay)
User-agent: Googlebot-Image
User-agent: Googlebot-News
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /

# Google Search
User-agent: Googlebot
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 1

# Bing Search, Yandex, DuckDuckGo
# The former duplicate "BingBot" group (Microsoft Copilot & Bing AI) is folded
# in here: user-agent matching is case-insensitive, so it was the same token.
User-agent: Bingbot
User-agent: YandexBot
User-agent: DuckDuckBot
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 2

# Yahoo Search
User-agent: Slurp
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 3

# Baidu
User-agent: Baiduspider
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 5

# ===============================================
# AI CHATBOTS & LLM CRAWLERS (PREMIUM ACCESS)
# ===============================================
# Full access to public content, including /blog/, /services/,
# /case-studies/, the legal pages and /llm.txt (all covered by "Allow: /").

# Crawl-delay 2
# OpenAI (ChatGPT, SearchBot)
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
# Claude (Anthropic)
User-agent: ClaudeBot
User-agent: Claude-SearchBot
User-agent: Claude-User
User-agent: Claude-Web
# Google Bard/Gemini (Extended AI features)
User-agent: Google-Extended
User-agent: GoogleOther
User-agent: Gemini-Deep-Research
# Perplexity AI
User-agent: Perplexity-User
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 2

# Crawl-delay 3
# OpenAI GPTBot for training
User-agent: GPTBot
# Claude (Anthropic)
User-agent: anthropic-ai
# Google
User-agent: GoogleOther-Image
User-agent: Google-CloudVertexBot
# Perplexity AI
User-agent: PerplexityBot
# Mistral AI
User-agent: MistralAI-User
User-agent: MistralAI-User/1.0
# DuckDuckGo AI
User-agent: DuckAssistBot
# Apple Intelligence
User-agent: Applebot
User-agent: Applebot-Extended
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 3

# Crawl-delay 4
# Meta AI
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
User-agent: FacebookBot
# Cohere AI
User-agent: cohere-ai
# You.com Bot
User-agent: YouBot
# Phind AI
User-agent: PhindBot
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 4

# Crawl-delay 5
User-agent: GoogleOther-Video
User-agent: cohere-training-data-crawler
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 5

# ===============================================
# AI CRAWLERS (RESTRICTED ACCESS)
# ===============================================
# NOTE(review): apart from img2dataset, none of these crawlers was ever
# limited to a whitelist. Their old "Allow: /blog/" style lines had no
# effect because nothing else was disallowed. The effective policy (everything
# except the private areas, at a slower crawl rate) is kept here. If a real
# whitelist is wanted, add "Disallow: /" plus the specific Allow lines, as
# done for img2dataset.

# Crawl-delay 5
# Amazon Alexa
User-agent: Amazonbot
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 5

# Crawl-delay 8
# Common Crawl (data collection)
User-agent: CCBot
# Friendly AI crawlers / additional modern AI bots
User-agent: FriendlyCrawler
User-agent: SummalyBot
User-agent: Operator
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 8

# Crawl-delay 10
# AI2 Bot (Allen Institute for AI)
User-agent: AI2Bot
User-agent: Ai2Bot-Dolma
# Bytedance/TikTok AI
User-agent: Bytespider
User-agent: TikTokSpider
# Diffbot AI
User-agent: Diffbot
# Additional modern AI bots
User-agent: Kangaroo Bot
User-agent: Thinkbot
User-agent: Devin
User-agent: QualifiedBot
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 10

# Crawl-delay 12
User-agent: WARDBot
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 12

# Crawl-delay 15
# Research and Academic Crawlers
User-agent: ImagesiftBot
# PetalBot (Huawei)
User-agent: PetalBot
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 15

# Crawl-delay 20
# Scrapy-based crawlers (was "Scrapy*"; wildcards are not valid in User-agent)
User-agent: Scrapy
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /
Crawl-delay: 20

# img2dataset: whitelist only - public images and /llm.txt, nothing else
User-agent: img2dataset
Allow: /public/images/
Allow: /llm.txt
Disallow: /
Crawl-delay: 20

# ===============================================
# AI CRAWLERS (BLOCKED - AGGRESSIVE/UNWANTED)
# ===============================================

# Aggressive AI scrapers and data harvesters
User-agent: DataForSeoBot
User-agent: MJ12bot
User-agent: DotBot
User-agent: AhrefsBot
User-agent: SemrushBot
User-agent: SemrushBot-OCOB
User-agent: SemrushBot-SWA
User-agent: MegaIndex.ru
User-agent: BLEXBot
Disallow: /

# Unwanted AI training crawlers
User-agent: aiHitBot
User-agent: Andibot
User-agent: Awario
User-agent: bedrockbot
User-agent: Brightbot 1.0
User-agent: Cotoyogi
User-agent: Crawlspace
User-agent: Datenbank Crawler
User-agent: Echobot Bot
User-agent: EchoboxBot
User-agent: Factset_spyderbot
User-agent: FirecrawlAgent
User-agent: iaskspider/2.0
User-agent: ICC-Crawler
User-agent: ISSCyberRiskCrawler
User-agent: MyCentralAIScraperBot
User-agent: netEstate Imprint Crawler
User-agent: NovaAct
User-agent: omgili
User-agent: omgilibot
User-agent: PanguBot
User-agent: Panscient
User-agent: panscient.com
User-agent: Poseidon Research Crawler
User-agent: QuillBot
User-agent: quillbot.com
User-agent: SBIntuitionsBot
User-agent: Sidetrade indexer bot
User-agent: Timpibot
User-agent: VelenPublicWebCrawler
User-agent: Webzio-Extended
User-agent: wpbot
User-agent: YandexAdditional
User-agent: YandexAdditionalBot
Disallow: /

# Removed: the pattern / placeholder entries "*Bot*Scraper*", "*Harvest*",
# "*Extractor*", "crawler.with.dots", "star***crawler", "Is this a crawler?",
# "a[mazing]{42}(robot)", "2^32$" and "curl|sudo bash". User-agent values are
# matched literally, so these never matched a real crawler. Block scrapers by
# pattern at the web server / WAF instead.

# ===============================================
# SOCIAL MEDIA CRAWLERS
# ===============================================

# Facebook/Meta, LinkedIn, Twitter/X, WhatsApp, Telegram (link previews)
User-agent: facebookexternalhit
User-agent: LinkedInBot
User-agent: Twitterbot
User-agent: WhatsApp
User-agent: TelegramBot
Disallow: /admin/
Disallow: /dashboard/
Disallow: /api/
Allow: /

# ===============================================
# DEFAULT RULES FOR ALL CRAWLERS
# ===============================================
# Applies ONLY to crawlers that are not named in any group above.
# Anything not disallowed here is allowed by default, so no "Allow: /" line
# is needed. Allow exceptions come first so that first-match parsers also
# honour them.

User-agent: *
# --- Specific file permissions (exceptions to the Disallow rules below) ---
# Allow important legal pages
Allow: /privacy-policy.html
Allow: /terms-of-service.html
Allow: /cookie-policy.html
# Allow important business pages
Allow: /about
Allow: /services/
Allow: /pricing
Allow: /contact
Allow: /blog/
Allow: /case-studies/
Allow: /portfolio/
# Allow assets and media (overrides "Disallow: /_next/")
Allow: /images/
Allow: /public/images/
Allow: /assets/
Allow: /media/
Allow: /_next/static/
Allow: /_next/image/
# Allow sitemaps and feeds (overrides "Disallow: /*.txt$")
Allow: /sitemap*.xml
Allow: /sitemap*.txt
Allow: /robots.txt
Allow: /llm.txt
Allow: /rss.xml
Allow: /feed.xml
Allow: /atom.xml
# --- Disallowed directories & files ---
# Admin and dashboard areas
Disallow: /admin/
Disallow: /dashboard/
Disallow: /wp-admin/
Disallow: /wp-content/
Disallow: /wp-includes/
# API endpoints
Disallow: /api/
Disallow: /_next/
Disallow: /_vercel/
# Private files and configurations
Disallow: /.env
Disallow: /.env.*
Disallow: /config/
Disallow: /logs/
Disallow: /backup/
Disallow: /temp/
Disallow: /tmp/
# Development and test files
Disallow: /test/
Disallow: /tests/
Disallow: /testing/
Disallow: /dev/
Disallow: /development/
Disallow: /staging/
# Database and sensitive files (patterns must start with "/")
Disallow: /*.sql
Disallow: /*.sql.gz
Disallow: /*.db
Disallow: /*.log
Disallow: /*.txt$
# Search and form processing
Disallow: /search?*
Disallow: /*?q=*
Disallow: /*?search=*
Disallow: /thank-you*
Disallow: /form-success*
# User accounts and personal data
Disallow: /login/
Disallow: /register/
Disallow: /user/
Disallow: /account/
Disallow: /profile/
Disallow: /my-account/
# Duplicate content parameters
# NOTE(review): these match only when the parameter is the first one in the
# query string (directly after "?").
Disallow: /*?utm_*
Disallow: /*?ref=*
Disallow: /*?source=*
Disallow: /*?campaign=*
Disallow: /*?fbclid=*
Disallow: /*?gclid=*

# ===============================================
# SITEMAPS
# ===============================================

# Main sitemap
Sitemap: https://digiscalability.com/sitemap.xml

# Additional sitemaps (if applicable)
Sitemap: https://digiscalability.com/blog-sitemap.xml
Sitemap: https://digiscalability.com/services-sitemap.xml
Sitemap: https://digiscalability.com/case-studies-sitemap.xml

# ===============================================
# CRAWL RATE LIMITING (NON-STANDARD)
# ===============================================
# NOTE(review): Request-rate and Host are not part of RFC 9309 and sit outside
# any User-agent group, so they do not act as a global crawl delay. Enforce
# rate limits on the server or CDN, and set the canonical host with redirects.

Request-rate: 1/10s

# Host directive for primary domain
Host: digiscalability.com