#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

"""
This is unit test code for robotexclusionrulesparser.py. For more info, see:
http://NikitaTheSpider.com/python/rerp/

Each section parses (or fetches) a robots.txt and asserts what is_allowed()
reports for a set of user agents and URLs. A section prints
"<name> test passed." on success; on failure it prints
"<name> test failed." (where the section has a handler) and re-raises the
original exception so the traceback identifies the failing check.
"""

import robotexclusionrulesparser
import time
import calendar
try:
    import urllib2
except ImportError:
    # No urllib2 module available (Python 3). Only URLError is used below,
    # and urllib.error provides a class of that name.
    import urllib.error as urllib2

# These are disabled by default.
RUN_FETCH_TESTS = False

rerp = robotexclusionrulesparser.RobotExclusionRulesParser()

# mk1994 = the 1994 robots.txt draft spec (http://www.robotstxt.org/orig.html)
# mk1996 = the 1996 robots.txt draft spec (http://www.robotstxt.org/norobots-rfc.txt)

# -----------------------------------------------------------
# This is the example from mk1994
# -----------------------------------------------------------
TestName = "MK1994 example"

s = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""
rerp.parse(s)

#print(rerp)

try:
    assert rerp.is_allowed("CrunchyFrogBot", "/") == True
    assert rerp.is_allowed("CrunchyFrogBot", "/foo.html") == False
    assert rerp.is_allowed("CrunchyFrogBot", "/foo.htm") == True
    assert rerp.is_allowed("CrunchyFrogBot", "/foo.shtml") == True
    assert rerp.is_allowed("CrunchyFrogBot", "/foo.htmlx") == False
    assert rerp.is_allowed("CrunchyFrogBot", "/cyberworld/index.html") == True
    assert rerp.is_allowed("CrunchyFrogBot", "/tmp/foo.html") == False
    # Since it is the caller's responsibility to make sure the host name matches,
    # the parser disallows foo.html regardless of what I pass for host name and protocol.
    assert rerp.is_allowed("CrunchyFrogBot", "http://example.com/foo.html") == False
    assert rerp.is_allowed("CrunchyFrogBot", "http://www.example.com/foo.html") == False
    assert rerp.is_allowed("CrunchyFrogBot", "http://www.example.org/foo.html") == False
    assert rerp.is_allowed("CrunchyFrogBot", "https://www.example.org/foo.html") == False
    assert rerp.is_allowed("CrunchyFrogBot", "ftp://NikitaTheSpider.com/foo.html") == False
except AssertionError:
    print("%s test failed." % TestName)
    # Bare raise keeps the original traceback, which points at the failing assert.
    raise

print("%s test passed." % TestName)

# -----------------------------------------------------------
# This is the example A from MK1996
# -----------------------------------------------------------
TestName = "Allows based on MK1996 example A"

s = """
# robots.txt for http://www.example.com/

User-agent: 1bot
Allow: /tmp
Disallow: /

User-agent: 2bot
Allow: /tmp/
Disallow: /

User-agent: 3bot
Allow: /a%3cd.html
Disallow: /

User-agent: 4bot
Allow: /a%3Cd.html
Disallow: /

User-agent: 5bot
Allow: /a%2fb.html
Disallow: /

User-agent: 6bot
Allow: /a/b.html
Disallow: /

User-agent: 7bot
Allow: /%7ejoe/index.html
Disallow: /

User-agent: 8bot
Allow: /~joe/index.html
Disallow: /
"""
rerp.parse(s)

#print(rerp)

try:
    assert rerp.is_allowed("1bot", "/tmp") == True
    assert rerp.is_allowed("1bot", "/tmp.html") == True
    assert rerp.is_allowed("1bot", "/tmp/a.html") == True
    assert rerp.is_allowed("2bot", "/tmp") == False
    assert rerp.is_allowed("2bot", "/tmp/") == True
    assert rerp.is_allowed("2bot", "/tmp/a.html") == True
    assert rerp.is_allowed("3bot", "/a%3cd.html") == True
    assert rerp.is_allowed("3bot", "/a%3Cd.html") == True
    assert rerp.is_allowed("4bot", "/a%3cd.html") == True
    assert rerp.is_allowed("4bot", "/a%3Cd.html") == True
    assert rerp.is_allowed("5bot", "/a%2fb.html") == True
    assert rerp.is_allowed("5bot", "/a/b.html") == False
    assert rerp.is_allowed("6bot", "/a%2fb.html") == False
    assert rerp.is_allowed("6bot", "/a/b.html") == True
    assert rerp.is_allowed("7bot", "/~joe/index.html") == True
    assert rerp.is_allowed("8bot", "/%7Ejoe/index.html") == True
except AssertionError:
    print("%s test failed." % TestName)
    raise

print("%s test passed." % TestName)

# -----------------------------------------------------------
# This is the example B from MK1996
# -----------------------------------------------------------
TestName = "MK1996 example B"

s = """
# /robots.txt for http://www.fict.org/
# comments to webmaster@fict.org

User-agent: unhipbot
Disallow: /

User-agent: webcrawler
User-agent: excite
Disallow:

User-agent: *
Disallow: /org/plans.html
Allow: /org/
Allow: /serv
Allow: /~mak
Disallow: /
"""
rerp.parse(s)

#print(rerp)

try:
    assert rerp.is_allowed("unhipbot", "http://www.fict.org/") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/") == False

    assert rerp.is_allowed("unhipbot", "http://www.fict.org/index.html") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/index.html") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/index.html") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/index.html") == False

    # Test for robots.txt dropped -- I presume that no one will fetch robots.txt
    # to see if they're allowed to fetch robots.txt. Sheesh...
    # assert(rerp.is_allowed("unhipbot", "http://www.fict.org/robots.txt") == True)
    # assert(rerp.is_allowed("webcrawler", "http://www.fict.org/robots.txt") == True)
    # assert(rerp.is_allowed("excite", "http://www.fict.org/robots.txt") == True)
    # assert(rerp.is_allowed("OtherBot", "http://www.fict.org/robots.txt") == True)

    assert rerp.is_allowed("unhipbot", "http://www.fict.org/server.html") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/server.html") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/server.html") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/server.html") == True

    assert rerp.is_allowed("unhipbot", "http://www.fict.org/services/fast.html") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/services/fast.html") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/services/fast.html") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/services/fast.html") == True

    assert rerp.is_allowed("unhipbot", "http://www.fict.org/services/slow.html") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/services/slow.html") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/services/slow.html") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/services/slow.html") == True

    assert rerp.is_allowed("unhipbot", "http://www.fict.org/orgo.gif") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/orgo.gif") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/orgo.gif") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/orgo.gif") == False

    assert rerp.is_allowed("unhipbot", "http://www.fict.org/org/about.html") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/org/about.html") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/org/about.html") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/org/about.html") == True

    assert rerp.is_allowed("unhipbot", "http://www.fict.org/org/plans.html") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/org/plans.html") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/org/plans.html") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/org/plans.html") == False

    assert rerp.is_allowed("unhipbot", "http://www.fict.org/%7Ejim/jim.html") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/%7Ejim/jim.html") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/%7Ejim/jim.html") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/%7Ejim/jim.html") == False

    assert rerp.is_allowed("unhipbot", "http://www.fict.org/%7Emak/mak.html") == False
    assert rerp.is_allowed("webcrawler", "http://www.fict.org/%7Emak/mak.html") == True
    assert rerp.is_allowed("excite", "http://www.fict.org/%7Emak/mak.html") == True
    assert rerp.is_allowed("OtherBot", "http://www.fict.org/%7Emak/mak.html") == True
except AssertionError:
    print("%s test failed." % TestName)
    raise

print("%s test passed." % TestName)

# -----------------------------------------------------------
# Test a blank (or non-existent) robots.txt
# -----------------------------------------------------------
TestName = "Blank"

s = ""
rerp.parse(s)

#print(rerp)

try:
    assert rerp.is_allowed("foobot", "/") == True
    assert rerp.is_allowed("anybot", "/foo.html") == True
    assert rerp.is_allowed("anybot", "/TheGoldenAgeOfBallooning/") == True
    assert rerp.is_allowed("anybot", "/TheGoldenAgeOfBallooning/claret.html") == True
except AssertionError:
    print("%s test failed." % TestName)
    raise

print("%s test passed." % TestName)

# -----------------------------------------------------------
# Test the parser's generosity
# -----------------------------------------------------------
TestName = "Generosity"

Utf8ByteOrderMark = chr(0xef) + chr(0xbb) + chr(0xbf)

# The input starts with a byte order mark, uses odd capitalization and field
# names, and pads the second user-agent line with tabs and a chr(0xb).
s = """%sUSERAGENT: FOOBOT
%suser-agent:%s%s%sbarbot%s
disallow: /foo/
""" % (Utf8ByteOrderMark, '\t', '\t', '\t', '\t', chr(0xb))
rerp.parse(s)

#print(rerp)

try:
    assert rerp.is_allowed("foobot", "/") == True
    assert rerp.is_allowed("foobot", "/foo/bar.html") == False
    assert rerp.is_allowed("AnotherBot", "/foo/bar.html") == True
    assert rerp.is_allowed("Foobot Version 1.0", "/foo/bar.html") == False
    assert rerp.is_allowed("Mozilla/5.0 (compatible; Foobot/2.1)", "/foo/bar.html") == False
    assert rerp.is_allowed("barbot", "/foo/bar.html") == False
    assert rerp.is_allowed("barbot", "/tmp/") == True
except AssertionError:
    print("%s test failed." % TestName)
    raise

print("%s test passed." % TestName)

# -----------------------------------------------------------
# Test the parser's ability to handle non-ASCII
# -----------------------------------------------------------
TestName = "Non-ASCII"

# "\xe4" is LATIN SMALL LETTER A WITH DIAERESIS. It is written as an escape so
# that the value of these unicode literals does not depend on how this file
# is encoded on disk.
s = u"""# robots.txt for http://www.example.com/

UserAgent: J\xe4vla-Foobot
Disallow: /

UserAgent: \u041b\u044c\u0432\u0456\u0432-bot
Disallow: /totalitarianism/

"""
rerp.parse(s)

try:
    assert rerp.is_allowed("foobot", "/") == True
    assert rerp.is_allowed(u"j\xe4vla fanbot", "/foo/bar.html") == True
    assert rerp.is_allowed(u"j\xe4vla-foobot", "/foo/bar.html") == False
    assert rerp.is_allowed(u"Mozilla/5.0 (compatible; \u041b\u044c\u0432\u0456\u0432-bot/1.1)", "/") == True
    assert rerp.is_allowed(u"Mozilla/5.0 (compatible; \u041b\u044c\u0432\u0456\u0432-bot/1.1)", "/totalitarianism/foo.htm") == False
except AssertionError:
    print("%s test failed." % TestName)
    raise

print("%s test passed." % TestName)

# -----------------------------------------------------------
# Test the implicit allow rule
# -----------------------------------------------------------
TestName = "Implicit allow"

s = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /

User-agent: foobot
Disallow:
"""
rerp.parse(s)

try:
    assert rerp.is_allowed("foobot", "/") == True
    assert rerp.is_allowed("foobot", "/bar.html") == True
    assert rerp.is_allowed("SomeOtherBot", "/") == False
    assert rerp.is_allowed("SomeOtherBot", "/blahblahblah") == False
except AssertionError:
    print("%s test failed." % TestName)
    raise

print("%s test passed." % TestName)

if RUN_FETCH_TESTS:
    # -----------------------------------------------------------
    # Test the parser's ability to fetch and decode files from the Net
    # -----------------------------------------------------------
    TestName = "Fetch and Decode"

    print("Testing network fetching. This may take a moment...")

    try:
        rerp.fetch("http://example.com/robots.txt")
    except urllib2.URLError:
        # Expected
        pass
    except Exception:
        print("%s test failed." % TestName)
        # Re-raise the unexpected exception itself rather than a blank one.
        raise

    # This might fail if Google changes their robots.txt.
    rerp.fetch("http://www.google.com/robots.txt")
    assert rerp.is_allowed("foobot", "/search") == False
    assert rerp.is_allowed("foobot", "http://www.google.com/search") == False

    # This file exists but it uses a non-ASCII encoding (iso-8859-1)
    rerp.fetch("http://semanchuk.com/philip/boneyard/rerp/robots.txt.iso8859-1")
    assert rerp.is_allowed("foobot", "/stuff") == True
    assert rerp.is_allowed(u"j\xe4vla-foobot", "/stuff") == False
    assert rerp.is_allowed(u"j\xe4vla-osvenskan", "/stuff") == True

    # This file exists but it uses a non-ASCII encoding (utf-8)
    rerp.fetch("http://semanchuk.com/philip/boneyard/rerp/robots.txt.utf8")
    assert rerp.is_allowed("foobot", "/stuff") == True
    assert rerp.is_allowed(u"j\xe4vla-foobot", "/stuff") == False
    assert rerp.is_allowed(u"j\xe4vla-osvenskan", "/stuff") == True

    # Test 404 handling
    rerp.fetch("http://NikitaTheSpider.com/ThisDirectoryDoesNotExist/robots.txt")
    assert rerp.is_allowed("foobot", "/") == True
    assert rerp.is_allowed(u"j\xe4vla-foobot", "/stuff") == True
    assert rerp.is_allowed("anybot", "/TotallySecretStuff") == True

    print("%s test passed." % TestName)

# -----------------------------------------------------------
# Test handling of bad syntax
# -----------------------------------------------------------
TestName = "Bad Syntax"

s = """
# robots.txt for http://www.example.com/

# This is nonsense; UA most come first.
Disallow: /
User-agent: *

# With apologies to Dr. Seuss, this syntax won't act as the author expects. It will only
# match UA strings that contain "onebot twobot greenbot bluebot". To match multiple
# UAs to a single rule, use multiple "User-agent:" lines.
User-agent: onebot twobot greenbot bluebot
Disallow: /

# Disallow: * will match only a literal '*' as a filename. It isn't a wildcard.
User-agent:
Disallow: *

# Wildcards aren't allowed in the disallow line; correct syntax would be this:
# Disallow: /private/
# The '*' below will be treated as a literal.
User-agent: threebot
Disallow: /private/*

# Blank lines indicate an end-of-record so the first UA listed here is ignored.
User-agent: OneTwoFiveThreeSirBot

# Note from Webmaster: add new user-agents below:
User-agent: WotBehindTheRabbitBot
User-agent: ItIsTheRabbitBot
Disallow: /HolyHandGrenade/
"""
rerp.parse(s)

try:
    assert rerp.is_allowed("onebot", "/") == True
    assert rerp.is_allowed("onebot", "/foo/bar.html") == True
    assert rerp.is_allowed("bluebot", "/") == True
    assert rerp.is_allowed("bluebot", "/foo/bar.html") == True
    assert rerp.is_allowed("threebot", "/private/BrotherMaynard.html") == True
    assert rerp.is_allowed("threebot", "/private/*") == False
    assert rerp.is_allowed("OneTwoFiveThreeSirBot", "/HolyHandGrenade/Antioch.html") == True
    assert rerp.is_allowed("WotBehindTheRabbitBot", "/HolyHandGrenade/Antioch.html") == False
except AssertionError:
    print("%s test failed." % TestName)
    raise

print("%s test passed." % TestName)

if RUN_FETCH_TESTS:
    # -----------------------------------------------------------
    # Test the parser's ability to handle non-200 response codes
    # -----------------------------------------------------------
    TestName = "Fetch Failures"

    print("Testing network fetching. This may take a moment...")

    rerp.fetch("http://semanchuk.com/philip/boneyard/rerp/MagicResponseCode.php?code=401")
    try:
        assert rerp.is_allowed("NigelBot", "/") == False
        assert rerp.is_allowed("StigBot", "/foo/bar.html") == False
        assert rerp.is_allowed("BruceBruceBruceBot", "/") == False
    except AssertionError:
        print("%s test failed." % TestName)
        raise

    rerp.fetch("http://semanchuk.com/philip/boneyard/rerp/MagicResponseCode.php?code=403")
    try:
        assert rerp.is_allowed("NigelBot", "/") == False
        assert rerp.is_allowed("StigBot", "/foo/bar.html") == False
        assert rerp.is_allowed("BruceBruceBruceBot", "/") == False
    except AssertionError:
        print("%s test failed." % TestName)
        raise

    rerp.fetch("http://semanchuk.com/philip/boneyard/rerp/MagicResponseCode.php?code=404")
    try:
        assert rerp.is_allowed("NigelBot", "/") == True
        assert rerp.is_allowed("StigBot", "/foo/bar.html") == True
        assert rerp.is_allowed("BruceBruceBruceBot", "/") == True
    except AssertionError:
        print("%s test failed." % TestName)
        raise

    try:
        rerp.fetch("http://semanchuk.com/philip/boneyard/rerp/MagicResponseCode.php?code=500")
    except urllib2.URLError:
        # This is exactly what's supposed to happen.
        pass
    except Exception:
        print("%s test failed." % TestName)
        raise

    print("%s test passed." % TestName)

if RUN_FETCH_TESTS:
    # -----------------------------------------------------------
    # Test the parser's expiration features
    # -----------------------------------------------------------
    TestName = "Expiration"

    # Create a fresh parser to (re)set the expiration date. I test to see if the dates are
    # accurate to +/-1 minute. If your local clock is off by more than that, these tests
    # will fail.

    # Test local time
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    localtime = time.mktime(time.localtime())
    expected = localtime + robotexclusionrulesparser.SEVEN_DAYS
    assert expected - 60 < rerp.expiration_date < expected + 60

    # Test UTC
    rerp = robotexclusionrulesparser.RobotExclusionRulesParser()
    rerp.use_local_time = False
    utc = calendar.timegm(time.gmtime())
    expected = utc + robotexclusionrulesparser.SEVEN_DAYS
    assert expected - 60 < rerp.expiration_date < expected + 60

    # This is UTC
    rerp.fetch("http://semanchuk.com/philip/boneyard/rerp/MagicResponseCode.php?code=200")
    utc = calendar.timegm(time.gmtime())
    assert utc - 60 < rerp.expiration_date < utc + 60

    # This is the time in Philly
    rerp.fetch("http://semanchuk.com/philip/boneyard/rerp/MagicResponseCode.php?code=200&city=philly")
    utc = calendar.timegm(time.gmtime())
    assert utc - 60 < rerp.expiration_date < utc + 60

    # This is the time in Stockholm
    rerp.fetch("http://semanchuk.com/philip/boneyard/rerp/MagicResponseCode.php?code=200&city=stockholm")
    utc = calendar.timegm(time.gmtime())
    assert utc - 60 < rerp.expiration_date < utc + 60

    print("%s test passed." % TestName)