Python robotparser.RobotFileParser() Examples

The following are 25 code examples of robotparser.RobotFileParser(), drawn from open-source projects. The source file, originating project, and license are noted above each example. You may also want to check out all available functions and classes of the robotparser module.
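As a quick orientation before the project examples, here is a minimal usage sketch of the class itself; the robots.txt URL and page URL below are purely illustrative:

import robotparser

# Point the parser at a site's robots.txt, download it, and query the rules.
rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")   # illustrative URL
rp.read()                                          # fetches and parses robots.txt

# can_fetch(useragent, url) reports whether the rules allow the given fetch.
print rp.can_fetch("*", "http://www.example.com/some/page.html")

Several of the examples below instead feed robots.txt text directly to parser.parse(lines), which avoids any network access.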
Example #1
Source File: test_robotparser.py    From BinderFilter with MIT License
def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False) 
Example #2
Source File: test_robotparser.py    From oss-ftp with MIT License
def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False) 
Example #3
Source File: test_robotparser.py    From gcblue with BSD 3-Clause "New" or "Revised" License
def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False) 
Example #4
Source File: test_robotparser.py    From CTFCrackTools with GNU General Public License v3.0
def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt")) 
Example #5
Source File: test_robotparser.py    From ironpython2 with Apache License 2.0
def setUp(self):
        lines = StringIO.StringIO(self.robots_txt).readlines()
        self.parser = robotparser.RobotFileParser()
        self.parser.parse(lines) 
Example #6
Source File: test_robotparser.py    From CTFCrackTools with GNU General Public License v3.0
def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url)) 
Example #7
Source File: test_robotparser.py    From CTFCrackTools with GNU General Public License v3.0
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
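The excerpt above is cut off at the "# 1." marker. A hypothetical invocation of the RobotTest helper could look like the sketch below; the robots.txt text and URL lists are illustrative, and the tests suite plus RobotTestCase are assumed to be defined elsewhere in the test module:

# Hypothetical call; robots.txt content and URLs are illustrative only.
doc = """
User-agent: *
Disallow: /cyberworld/map/   # an infinite virtual URL space
Disallow: /tmp/
"""
good_urls = ['/', '/test.html']
bad_urls = ['/cyberworld/map/index.html', '/tmp/xxx']

RobotTest(1, doc, good_urls, bad_urls)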
Example #8
Source File: test_robotparser.py    From CTFCrackTools-V2 with GNU General Public License v3.0
def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt")) 
Example #9
Source File: test_robotparser.py    From CTFCrackTools-V2 with GNU General Public License v3.0
def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url)) 
Example #10
Source File: test_robotparser.py    From CTFCrackTools-V2 with GNU General Public License v3.0
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #11
Source File: _http.py    From pelisalacarta-ce with GNU General Public License v3.0
def __init__(self, url='', opener=None):
        robotparser.RobotFileParser.__init__(self, url)
        self._opener = opener
        self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT 
Example #12
Source File: _http.py    From BruteXSS with GNU General Public License v3.0
def __init__(self, url='', opener=None):
        robotparser.RobotFileParser.__init__(self, url)
        self._opener = opener
        self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT 
Example #13
Source File: test_robotparser.py    From medicare-demo with Apache License 2.0
def runTest(self):
        test_support.requires('network')
        # whole site is password-protected.
        url = 'http://mueblesmoraleda.com'
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False) 
Example #14
Source File: test_robotparser.py    From medicare-demo with Apache License 2.0
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #15
Source File: webchecker.py    From datafari with Apache License 2.0
def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg) 
Example #16
Source File: test_robotparser.py    From gcblue with BSD 3-Clause "New" or "Revised" License
def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt")) 
Example #17
Source File: test_robotparser.py    From gcblue with BSD 3-Clause "New" or "Revised" License
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #18
Source File: test_robotparser.py    From oss-ftp with MIT License
def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "https://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "https://www.python.org/robots.txt")) 
Example #19
Source File: test_robotparser.py    From oss-ftp with MIT License
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #20
Source File: webchecker.py    From oss-ftp with MIT License
def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg) 
Example #21
Source File: test_robotparser.py    From BinderFilter with MIT License
def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt")) 
Example #22
Source File: test_robotparser.py    From BinderFilter with MIT License
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #23
Source File: test_robotparser.py    From ironpython2 with Apache License 2.0
def test_read_404(self):
        parser = robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0) 
Example #24
Source File: test_robotparser.py    From ironpython2 with Apache License 2.0
def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read() 
Example #25
Source File: test_robotparser.py    From ironpython2 with Apache License 2.0
def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))