Python robotparser.RobotFileParser() Examples

The following are 25 code examples of robotparser.RobotFileParser(), drawn from open-source projects. The source file, originating project, and license are noted above each example. You may also want to check out all available functions and classes of the robotparser module.
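As a quick orientation before the project examples, here is a minimal usage sketch of the class itself; the robots.txt URL and page URL below are purely illustrative:

import robotparser

# Point the parser at a site's robots.txt, download it, and query the rules.
rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")   # illustrative URL
rp.read()                                          # fetches and parses robots.txt

# can_fetch(useragent, url) reports whether the rules allow the given fetch.
print rp.can_fetch("*", "http://www.example.com/some/page.html")

Several of the examples below instead feed robots.txt text directly to parser.parse(lines), which avoids any network access.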
Example #1
Source File: test_robotparser.py    From BinderFilter with MIT License
def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False) 
Example #2
Source File: test_robotparser.py    From oss-ftp with MIT License
def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False) 
Example #3
Source File: test_robotparser.py    From gcblue with BSD 3-Clause "New" or "Revised" License
def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            robots_url = url + "/robots.txt"
            # First check the URL is usable for our purposes, since the
            # test site is a bit flaky.
            try:
                urlopen(robots_url)
            except HTTPError as e:
                if e.code not in {401, 403}:
                    self.skipTest(
                        "%r should return a 401 or 403 HTTP error, not %r"
                        % (robots_url, e.code))
            else:
                self.skipTest(
                    "%r should return a 401 or 403 HTTP error, not succeed"
                    % (robots_url))
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", robots_url), False) 
Example #4
Source File: test_robotparser.py    From CTFCrackTools with GNU General Public License v3.0
def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt")) 
Example #5
Source File: test_robotparser.py    From ironpython2 with Apache License 2.0
def setUp(self):
        lines = StringIO.StringIO(self.robots_txt).readlines()
        self.parser = robotparser.RobotFileParser()
        self.parser.parse(lines) 
Example #6
Source File: test_robotparser.py    From CTFCrackTools with GNU General Public License v3.0
def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url)) 
Example #7
Source File: test_robotparser.py    From CTFCrackTools with GNU General Public License v3.0
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
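The excerpt above is cut off at the "# 1." marker. A hypothetical invocation of the RobotTest helper could look like the sketch below; the robots.txt text and URL lists are illustrative, and the tests suite plus RobotTestCase are assumed to be defined elsewhere in the test module:

# Hypothetical call; robots.txt content and URLs are illustrative only.
doc = """
User-agent: *
Disallow: /cyberworld/map/   # an infinite virtual URL space
Disallow: /tmp/
"""
good_urls = ['/', '/test.html']
bad_urls = ['/cyberworld/map/index.html', '/tmp/xxx']

RobotTest(1, doc, good_urls, bad_urls)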
Example #8
Source File: test_robotparser.py    From CTFCrackTools-V2 with GNU General Public License v3.0
def testPythonOrg(self):
        support.requires('network')
        with support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt")) 
Example #9
Source File: test_robotparser.py    From CTFCrackTools-V2 with GNU General Public License v3.0
def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url)) 
Example #10
Source File: test_robotparser.py    From CTFCrackTools-V2 with GNU General Public License v3.0
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #11
Source File: _http.py    From pelisalacarta-ce with GNU General Public License v3.0
def __init__(self, url='', opener=None):
        robotparser.RobotFileParser.__init__(self, url)
        self._opener = opener
        self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT 
Example #12
Source File: _http.py    From BruteXSS with GNU General Public License v3.0
def __init__(self, url='', opener=None):
        robotparser.RobotFileParser.__init__(self, url)
        self._opener = opener
        self._timeout = _sockettimeout._GLOBAL_DEFAULT_TIMEOUT 
Example #13
Source File: test_robotparser.py    From medicare-demo with Apache License 2.0
def runTest(self):
        test_support.requires('network')
        # whole site is password-protected.
        url = 'http://mueblesmoraleda.com'
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False) 
Example #14
Source File: test_robotparser.py    From medicare-demo with Apache License 2.0
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #15
Source File: webchecker.py    From datafari with Apache License 2.0
def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg) 
Example #16
Source File: test_robotparser.py    From gcblue with BSD 3-Clause "New" or "Revised" License
def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt")) 
Example #17
Source File: test_robotparser.py    From gcblue with BSD 3-Clause "New" or "Revised" License
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #18
Source File: test_robotparser.py    From oss-ftp with MIT License
def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "https://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "https://www.python.org/robots.txt")) 
Example #19
Source File: test_robotparser.py    From oss-ftp with MIT License
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #20
Source File: webchecker.py    From oss-ftp with MIT License
def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg) 
Example #21
Source File: test_robotparser.py    From BinderFilter with MIT License
def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt")) 
Example #22
Source File: test_robotparser.py    From BinderFilter with MIT License
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):

    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. 
Example #23
Source File: test_robotparser.py    From ironpython2 with Apache License 2.0
def test_read_404(self):
        parser = robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0) 
Example #24
Source File: test_robotparser.py    From ironpython2 with Apache License 2.0
def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read() 
Example #25
Source File: test_robotparser.py    From ironpython2 with Apache License 2.0
def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))