Python robotparser.RobotFileParser() Examples
The following are 25 code examples of robotparser.RobotFileParser().
You may also want to check out all available functions/classes of the module robotparser, or try the search function.
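Before the project-specific examples, here is a minimal, self-contained sketch of typical RobotFileParser usage. The example.com URLs and the /private/ rule are placeholders for illustration only and are not taken from any of the projects below.

import robotparser   # Python 2 module; in Python 3 this became urllib.robotparser

# Point the parser at a site's robots.txt, download it, and query it.
rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")   # placeholder URL
rp.read()                                          # fetches and parses robots.txt
allowed = rp.can_fetch("*", "http://www.example.com/some/page.html")

# robots.txt text obtained some other way can be handed to parse() directly.
rp2 = robotparser.RobotFileParser()
rp2.parse(["User-agent: *", "Disallow: /private/"])
allowed_private = rp2.can_fetch("*", "/private/index.html")   # False

can_fetch() returns True when robots.txt permits the given user agent to retrieve the URL; read() downloads the file itself, while parse() accepts lines you have already fetched. Most of the examples below follow one of these two patterns.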
Example #1
Source File: test_robotparser.py From BinderFilter with MIT License | 6 votes |
def testPasswordProtectedSite(self):
    test_support.requires('network')
    with test_support.transient_internet('mueblesmoraleda.com'):
        url = 'http://mueblesmoraleda.com'
        robots_url = url + "/robots.txt"
        # First check the URL is usable for our purposes, since the
        # test site is a bit flaky.
        try:
            urlopen(robots_url)
        except HTTPError as e:
            if e.code not in {401, 403}:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not %r"
                    % (robots_url, e.code))
        else:
            self.skipTest(
                "%r should return a 401 or 403 HTTP error, not succeed"
                % (robots_url))
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        try:
            parser.read()
        except IOError:
            self.skipTest('%s is unavailable' % url)
        self.assertEqual(parser.can_fetch("*", robots_url), False)
Example #2
Source File: test_robotparser.py From oss-ftp with MIT License | 6 votes |
def testPasswordProtectedSite(self):
    test_support.requires('network')
    with test_support.transient_internet('mueblesmoraleda.com'):
        url = 'http://mueblesmoraleda.com'
        robots_url = url + "/robots.txt"
        # First check the URL is usable for our purposes, since the
        # test site is a bit flaky.
        try:
            urlopen(robots_url)
        except HTTPError as e:
            if e.code not in {401, 403}:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not %r"
                    % (robots_url, e.code))
        else:
            self.skipTest(
                "%r should return a 401 or 403 HTTP error, not succeed"
                % (robots_url))
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        try:
            parser.read()
        except IOError:
            self.skipTest('%s is unavailable' % url)
        self.assertEqual(parser.can_fetch("*", robots_url), False)
Example #3
Source File: test_robotparser.py From gcblue with BSD 3-Clause "New" or "Revised" License | 6 votes |
def testPasswordProtectedSite(self):
    test_support.requires('network')
    with test_support.transient_internet('mueblesmoraleda.com'):
        url = 'http://mueblesmoraleda.com'
        robots_url = url + "/robots.txt"
        # First check the URL is usable for our purposes, since the
        # test site is a bit flaky.
        try:
            urlopen(robots_url)
        except HTTPError as e:
            if e.code not in {401, 403}:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not %r"
                    % (robots_url, e.code))
        else:
            self.skipTest(
                "%r should return a 401 or 403 HTTP error, not succeed"
                % (robots_url))
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        try:
            parser.read()
        except IOError:
            self.skipTest('%s is unavailable' % url)
        self.assertEqual(parser.can_fetch("*", robots_url), False)
Example #4
Source File: test_robotparser.py From CTFCrackTools with GNU General Public License v3.0 | 5 votes |
def testPythonOrg(self):
    support.requires('network')
    with support.transient_internet('www.python.org'):
        parser = robotparser.RobotFileParser(
            "http://www.python.org/robots.txt")
        parser.read()
        self.assertTrue(
            parser.can_fetch("*", "http://www.python.org/robots.txt"))
Example #5
Source File: test_robotparser.py From ironpython2 with Apache License 2.0 | 5 votes |
def setUp(self):
    lines = StringIO.StringIO(self.robots_txt).readlines()
    self.parser = robotparser.RobotFileParser()
    self.parser.parse(lines)
Example #6
Source File: test_robotparser.py From CTFCrackTools with GNU General Public License v3.0 | 5 votes |
def testPasswordProtectedSite(self):
    addr = self.server.server_address
    url = 'http://' + support.HOST + ':' + str(addr[1])
    robots_url = url + "/robots.txt"
    parser = robotparser.RobotFileParser()
    parser.set_url(url)
    parser.read()
    self.assertFalse(parser.can_fetch("*", robots_url))
Example #7
Source File: test_robotparser.py From CTFCrackTools with GNU General Public License v3.0 | 5 votes |
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
Example #8
Source File: test_robotparser.py From CTFCrackTools-V2 with GNU General Public License v3.0 | 5 votes |
def testPythonOrg(self):
    support.requires('network')
    with support.transient_internet('www.python.org'):
        parser = robotparser.RobotFileParser(
            "http://www.python.org/robots.txt")
        parser.read()
        self.assertTrue(
            parser.can_fetch("*", "http://www.python.org/robots.txt"))
Example #9
Source File: test_robotparser.py From CTFCrackTools-V2 with GNU General Public License v3.0 | 5 votes |
def testPasswordProtectedSite(self):
    addr = self.server.server_address
    url = 'http://' + support.HOST + ':' + str(addr[1])
    robots_url = url + "/robots.txt"
    parser = robotparser.RobotFileParser()
    parser.set_url(url)
    parser.read()
    self.assertFalse(parser.can_fetch("*", robots_url))
Example #10
Source File: test_robotparser.py From CTFCrackTools-V2 with GNU General Public License v3.0 | 5 votes |
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
Example #11
Source File: _http.py From pelisalacarta-ce with GNU General Public License v3.0 | 5 votes |
def __init__(self, url='', opener=None):
    robotparser.RobotFileParser.__init__(self, url)
    self._opener = opener
    self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT
Example #12
Source File: _http.py From BruteXSS with GNU General Public License v3.0 | 5 votes |
def __init__(self, url='', opener=None):
    robotparser.RobotFileParser.__init__(self, url)
    self._opener = opener
    self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT
Example #13
Source File: test_robotparser.py From medicare-demo with Apache License 2.0 | 5 votes |
def runTest(self):
    test_support.requires('network')
    # whole site is password-protected.
    url = 'http://mueblesmoraleda.com'
    parser = robotparser.RobotFileParser()
    parser.set_url(url)
    parser.read()
    self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
Example #14
Source File: test_robotparser.py From medicare-demo with Apache License 2.0 | 5 votes |
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
Example #15
Source File: webchecker.py From datafari with Apache License 2.0 | 5 votes |
def addrobot(self, root):
    root = urlparse.urljoin(root, "/")
    if self.robots.has_key(root): return
    url = urlparse.urljoin(root, "/robots.txt")
    self.robots[root] = rp = robotparser.RobotFileParser()
    self.note(2, "Parsing %s", url)
    rp.debug = self.verbose > 3
    rp.set_url(url)
    try:
        rp.read()
    except (OSError, IOError), msg:
        self.note(1, "I/O error parsing %s: %s", url, msg)
Example #16
Source File: test_robotparser.py From gcblue with BSD 3-Clause "New" or "Revised" License | 5 votes |
def testPythonOrg(self):
    test_support.requires('network')
    with test_support.transient_internet('www.python.org'):
        parser = robotparser.RobotFileParser(
            "http://www.python.org/robots.txt")
        parser.read()
        self.assertTrue(
            parser.can_fetch("*", "http://www.python.org/robots.txt"))
Example #17
Source File: test_robotparser.py From gcblue with BSD 3-Clause "New" or "Revised" License | 5 votes |
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
Example #18
Source File: test_robotparser.py From oss-ftp with MIT License | 5 votes |
def testPythonOrg(self):
    test_support.requires('network')
    with test_support.transient_internet('www.python.org'):
        parser = robotparser.RobotFileParser(
            "https://www.python.org/robots.txt")
        parser.read()
        self.assertTrue(
            parser.can_fetch("*", "https://www.python.org/robots.txt"))
Example #19
Source File: test_robotparser.py From oss-ftp with MIT License | 5 votes |
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
Example #20
Source File: webchecker.py From oss-ftp with MIT License | 5 votes |
def addrobot(self, root):
    root = urlparse.urljoin(root, "/")
    if self.robots.has_key(root): return
    url = urlparse.urljoin(root, "/robots.txt")
    self.robots[root] = rp = robotparser.RobotFileParser()
    self.note(2, "Parsing %s", url)
    rp.debug = self.verbose > 3
    rp.set_url(url)
    try:
        rp.read()
    except (OSError, IOError), msg:
        self.note(1, "I/O error parsing %s: %s", url, msg)
Example #21
Source File: test_robotparser.py From BinderFilter with MIT License | 5 votes |
def testPythonOrg(self):
    test_support.requires('network')
    with test_support.transient_internet('www.python.org'):
        parser = robotparser.RobotFileParser(
            "http://www.python.org/robots.txt")
        parser.read()
        self.assertTrue(
            parser.can_fetch("*", "http://www.python.org/robots.txt"))
Example #22
Source File: test_robotparser.py From BinderFilter with MIT License | 5 votes |
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
Example #23
Source File: test_robotparser.py From ironpython2 with Apache License 2.0 | 5 votes |
def test_read_404(self):
    parser = robotparser.RobotFileParser(self.url('i-robot.txt'))
    parser.read()
    self.assertTrue(parser.allow_all)
    self.assertFalse(parser.disallow_all)
    self.assertEqual(parser.mtime(), 0)
Example #24
Source File: test_robotparser.py From ironpython2 with Apache License 2.0 | 5 votes |
def setUpClass(cls):
    support.requires('network')
    with support.transient_internet(cls.base_url):
        cls.parser = robotparser.RobotFileParser(cls.robots_txt)
        cls.parser.read()
Example #25
Source File: test_robotparser.py From ironpython2 with Apache License 2.0 | 5 votes |
def testPasswordProtectedSite(self):
    addr = self.server.server_address
    url = 'http://' + support.HOST + ':' + str(addr[1])
    robots_url = url + "/robots.txt"
    parser = robotparser.RobotFileParser()
    parser.set_url(url)
    parser.read()
    self.assertFalse(parser.can_fetch("*", robots_url))