Python lxml.html.document_fromstring() Examples

The following are 15 code examples of lxml.html.document_fromstring(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.html.
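document_fromstring() parses a complete HTML document from a string or byte string and returns the root element, adding the missing html and body tags when the input is only a fragment. A minimal usage sketch (the sample markup is illustrative):

from lxml import html

root = html.document_fromstring("<p>Hello <b>world</b></p>")
print(root.tag)               # 'html' -- a full document tree was built
print(root.findtext('.//p'))  # 'Hello '
print(html.tostring(root))    # serializes with the added html/body tags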
Example #1
Source File: parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            #  Instapaper's export does not close <li> tags, which caused infinite
            #  recursion when parsing with BeautifulSoup directly. Round-tripping
            #  through lxml below repairs the markup so the <li> tags get closed.
            self.html = html.document_fromstring(self.opened_file.read())
            self.html = html.tostring(self.html)
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = dict()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bmark in self.check_duplicates_query:
            self.check_duplicates[bmark.main_url] = bmark
        self.tags_dict = dict()
        self.tags_set = set()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE) 
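The lxml round trip above is a common way to repair malformed markup before handing it to another parser. A minimal sketch of the idea, using illustrative input with unclosed tags:

from lxml import html

broken = "<ol><li>first<li>second"
fixed = html.tostring(html.document_fromstring(broken))
print(fixed)  # the <li> tags are now closed, wrapped in html/body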
Example #2
Source File: importer.py    From python-ooxml with GNU Affero General Public License v3.0
def parse_html_string(s):
    from lxml import html

    utf8_parser = html.HTMLParser(encoding='utf-8')
    html_tree = html.document_fromstring(s, parser=utf8_parser)

    return html_tree 
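Passing an explicit HTMLParser is the usual way to pin the character encoding when the input is a byte string; without it, lxml has to guess. A short usage sketch under that assumption:

tree = parse_html_string("<p>café</p>".encode("utf-8"))
print(tree.findtext('.//p'))  # 'café'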
Example #3
Source File: server.py    From autologin with Apache License 2.0
def download_page(url, cookie_jar):
    """
    Request page using authenticated cookies (cookiejar).
    Download html source and save in browser directory, to
    be used by in show_in_browser().
    """
    browser_dir = os.path.join(server_path, 'static/browser')
    delete_directory_files(browser_dir)
    filename = '{}.html'.format(uuid.uuid4())
    filepath = os.path.join(browser_dir, filename)
    try:
        response = cookie_request(url, cookie_jar)
    except requests.RequestException as e:
        return e, None
    doc = html.document_fromstring(response.text)
    with open(filepath, 'wb') as f:
        f.write(html.tostring(doc))
    return None, filename 
Example #4
Source File: scraping.py    From text-mining-class with MIT License
def __init__(self, html_content, encoding="utf-8"):
        if isinstance(html_content, bytes):
            html_content = html_content.decode(encoding)
        self.document = document_fromstring(html_content)
        strip_elements(self.document, "style") 
Example #5
Source File: googlerbot.py    From SnapchatBot with MIT License
def reverse_image_search(url):
    value = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
    headers = {'User-Agent': value}
    search_url = 'https://www.google.com/searchbyimage?image_url=%s' % url
    resp = requests.get(search_url, headers=headers)
    root = document_fromstring(resp.content)
    href = root.cssselect(".bia")[0].attrib['href']
    print(search_url)
    new_url = "https://www.google.com" + href
    resp = requests.get(new_url, headers=headers)
    return re.search("imgurl=([^&]*)", resp.text).group(1)
Example #6
Source File: cnae.py    From socios-brasil with GNU Lesser General Public License v3.0
def parse_items(self, response, root_name=None):
        """Recursively get data/make requests for all parser hierarchical levels"""

        data = response.request.meta.get("data", {})
        root_name = root_name or response.request.meta["root_name"]
        metadata = self.parsers[root_name]
        xpath_id = metadata["xpath_id"]
        xpath_description = metadata["xpath_description"]
        xpath_url = metadata["xpath_url"]
        item_name = metadata["next"]
        for item in response.xpath(metadata["xpath_items"]):
            tree = document_fromstring(item.extract())
            url = urljoin("https://cnae.ibge.gov.br/", tree.xpath(xpath_url)[0])
            item_id = get_text(tree.xpath(xpath_id))
            item_description = get_text(tree.xpath(xpath_description))
            item_data = {}
            if item_name == "subclasse" or len(item_id) == self.parsers[item_name]["id_length"]:
                next_root_name = item_name
            else:
                descricao = response.xpath("//span[@class = 'destaque']//text()").extract()[0]
                item_data[f"id_{item_name}"] = descricao.split()[0]
                item_data[f"descricao_{item_name}"] = descricao
                next_root_name = self.parsers[item_name]["next"]
            item_data.update({
                f"id_{next_root_name}": item_id.strip(),
                f"descricao_{next_root_name}": item_description.strip(),
            })
            item_data.update(data)

            callback = self.parse_items if next_root_name != "subclasse" else self.parse_subclasse
            yield scrapy.Request(
                url=url,
                meta={"data": item_data, "root_name": next_root_name},
                callback=callback,
            ) 
Example #7
Source File: cnae.py    From socios-brasil with GNU Lesser General Public License v3.0
def parse_subclasse(self, response):
        """Yield the subclass item (last mile of the recursive strategy)"""
        data = response.request.meta["data"]
        tree = document_fromstring(response.body)
        data["notas_explicativas"] = "\n".join(
            [
                line.strip()
                for line in tree.xpath('//div[@id = "notas-explicativas"]//text()')
                if line.strip()
            ]
        )
        data["url"] = response.request.url
        data["id"] = int(data["id_subclasse"].replace("/", "").replace("-", ""))
        data["versao"] = self.versao
        yield data 
Example #8
Source File: cleaner.py    From wanish with MIT License
def initial_output(html_partial=False):
        """
        Creates the initial HTML output element according to the given flag.
        :param html_partial: determines whether to build a full HTML page or only a fragment
        :return: html output element
        """
        return fragment_fromstring('<div/>') if html_partial else document_fromstring('<div/>') 
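The distinction matters here: fragment_fromstring('<div/>') returns the bare <div> element, while document_fromstring('<div/>') wraps it in a full html/body tree. A minimal sketch:

from lxml.html import document_fromstring, fragment_fromstring

print(fragment_fromstring('<div/>').tag)  # 'div' -- just the element itself
print(document_fromstring('<div/>').tag)  # 'html' -- the div ends up under <body>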
Example #9
Source File: column.py    From zhihu2ebook with MIT License
def replace_img_url(self, content):
        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(str(content), parser=utf8_parser)

        for _pic_link in tree.xpath("//img"):
            href = str(_pic_link.get('src'))
            pic_id, pic_type = href.split('.')  # assumes src is of the form "<id>.<extension>"
            _pic_link.set('src', "https://pic4.zhimg.com/" + pic_id + "_b." + pic_type)
        replaced_content = etree.tostring(tree, encoding=str)
        return replaced_content 
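A short usage sketch of the rewrite performed above, assuming a column instance col (hypothetical name) and an img whose src is a bare "<id>.<extension>" value:

out = col.replace_img_url('<div><img src="v2-abc123.jpg"/></div>')
# out now contains src="https://pic4.zhimg.com/v2-abc123_b.jpg",
# wrapped in the html/body tags that document_fromstring adds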
Example #10
Source File: utils.py    From jorvik with GNU General Public License v3.0
def get_drive_file(file):
    req = urllib.request.Request("https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=html" % (file,))
    content = urllib.request.urlopen(req).read().decode('UTF-8')
    doc = html.document_fromstring(content)
    # Re-tag <head> and <body> as <div> so the exported document can be
    # embedded inline in another page.
    head = doc.xpath('//head')[0]
    head.tag = 'div'
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    # html.tostring() returns bytes by default, so the two fragments are
    # concatenated and returned as bytes.
    return html.tostring(head) + html.tostring(body)
Example #11
Source File: models.py    From jorvik with GNU General Public License v3.0
def corpo_body(self):
        """
        Tries to extract the body of the page (body element).
        :return:
        """
        if not self.corpo:
            return ""
        doc = html.document_fromstring(self.corpo)
        body = doc.xpath('//body')[0]
        body.tag = 'div'
        return html.tostring(body)
Example #12
Source File: models.py    From jorvik with GNU General Public License v3.0
def processa_link(self):
        """
        Checks the relative links in the e-mail and makes them absolute.
        """
        doc = html.document_fromstring(self.corpo)
        links = doc.xpath('//a')
        for el in links:
            try:
                url = el.attrib['href']
                if '://' not in url:
                    el.attrib['href'] = "https://gaia.cri.it%s" % (url,)
            except KeyError:
                continue
        self.corpo = html.tostring(doc, pretty_print=True).decode('UTF-8') 
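A minimal standalone sketch of the same relative-to-absolute rewrite (the sample body is illustrative):

from lxml import html

corpo = '<p><a href="/login">Accedi</a> <a href="https://example.org/x">ok</a></p>'
doc = html.document_fromstring(corpo)
for el in doc.xpath('//a'):
    url = el.get('href')
    if url and '://' not in url:
        el.set('href', "https://gaia.cri.it%s" % (url,))
print(html.tostring(doc, pretty_print=True).decode('UTF-8'))
# the first link becomes https://gaia.cri.it/login; the absolute one is untouched

For the general case, lxml also provides doc.make_links_absolute(base_url), which resolves every link in the document against a base URL.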
Example #13
Source File: test_parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License
def setUp(self):
        query_user = User.query.filter_by(email='instapaper@example.com').first()
        if query_user:
            query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
            for bmark in query_bookmarks:
                db.session.delete(bmark)
            db.session.commit()
            db.session.delete(query_user)
            db.session.commit()
        create_user = User()
        create_user.first_name = 'Instapaper'
        create_user.last_name = 'Test'
        create_user.email = 'instapaper@example.com'
        create_user.password = 'instapaper_pass'
        create_user.active = True
        create_user.confirmed_at = datetime.datetime.utcnow()
        db.session.add(create_user)
        db.session.commit()
        self.user = create_user
        with open('Instapaper.html') as export_file:
            create_file = open(os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'], 'test_instapaper.html'), 'w+')
            self.data = html.document_fromstring(export_file.read())
            self.data = html.tostring(self.data)
            self.html_data = BeautifulSoup4(self.data)
            self.bookmarks = {}
            for tag in self.html_data.find_all('h1'):
                parent_elem = tag.find_next_sibling('ol')
                links = parent_elem.find_all('a')
                for link in links:
                    title = link.text
                    url = link['href']
                    tags = [tag.text]
                    tags.append('Imported')
                    #  Thanks Instapaper for not adding timestamps
                    self.bookmarks[url] = {
                        'href': url,
                        'title': title,
                        'tags': tags
                    }
            create_file.write(self.data)
            self.file_path = create_file.name
            create_file.close()
        init_parser = InstapaperParser(self.file_path, self.user.id)
        init_parser.process()
        init_parser.add_to_database()
        self.query = Bookmark.query.filter_by(user=self.user.id).all()
        self.html_parser = HTMLParser() 
Example #14
Source File: parse-bbc-html-data.py    From XSum with MIT License
def __init__(self, story, corpus):
    self.story = story
    self.corpus = corpus
    self.parser = html.HTMLParser(encoding=chardet.detect(self.story.html)['encoding'])
    self.tree = html.document_fromstring(self.story.html, parser=self.parser)
    
    # Elements to delete.
    self.delete_selectors = {
      'bbc': [
        '//blockquote[contains(@class, "twitter-tweet")]',
        '//blockquote[contains(@class, "instagram-media")]'
      ]
    }
    
    # Title Selector
    self.title_selectors = {
      'bbc': [
        '//h1[contains(@class, "story-headline")]',
        '//h1[contains(@class, "story-body__h1")]'
      ]
    }
    
    # Introduction Selector
    self.introduction_selectors = {
      'bbc': [
        '//p[contains(@class, "story-body__introduction")]'
      ]
    }
    
    # Rest Content exclusions: ads, links, bylines, comments, headline and story introduction
    self.bbc_exclude = (
      'not(contains(@class, "story-headline"))'
      ' and not(contains(@class, "story-body__h1"))'
      ' and not(contains(@class, "story-body__introduction"))'
      ' and not(contains(@class, "with-extracted-share-icons"))'
    )

    # Rest Content Selector
    self.restcontent_selectors = {
      'bbc': [
        '//div[contains(@class, "story-body")]//p[%s]' % self.bbc_exclude    # story-body__inner
      ]
    } 
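A hedged sketch of how selector tables like the ones above are typically applied to the parsed tree (remove_nodes is a hypothetical helper, not from the original file):

def remove_nodes(tree, delete_selectors, corpus='bbc'):
    # Drop the embedded tweet/Instagram blocks matched by the XPath selectors.
    for xpath in delete_selectors[corpus]:
        for node in tree.xpath(xpath):
            node.getparent().remove(node)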
Example #15
Source File: tests.py    From jorvik with GNU General Public License v3.0
def test_us_attivazione_credenziali(self):

        EMAIL_UTENZA = email_fittizzia()

        presidente = crea_persona()
        persona, sede, appartenenza = crea_persona_sede_appartenenza(presidente=presidente)

        sessione_presidente = self.sessione_utente(persona=presidente)

        sessione_presidente.visit("%s%s" % (self.live_server_url, persona.url_profilo_credenziali))
        sessione_presidente.fill('email', EMAIL_UTENZA)
        sessione_presidente.find_by_xpath("//button[@type='submit']").first.click()

        self.assertTrue(
            Utenza.objects.filter(persona=persona).exists(),
            msg="L'utenza e' stata creata correttamente"
        )

        self.assertTrue(
            Utenza.objects.get(persona=persona).email == EMAIL_UTENZA,
            msg="L'email e' stata correttamente creata"
        )

        # Retrieve the e-mail that was sent
        msg = Messaggio.objects.filter(oggetto__icontains="credenziali",
                                       oggetti_destinatario__persona=persona)

        self.assertTrue(
            msg.exists(),
            msg="Email delle credenziali spedita"
        )

        corpo_msg = msg.first().corpo

        self.assertTrue(
            EMAIL_UTENZA in corpo_msg,
            msg="L'email contiene il nuovo indirizzo e-mail"
        )

        doc = html.document_fromstring(corpo_msg)
        nuova_pwd = doc.xpath("//*[@id='nuova-password']")[0].text.strip()

        utenza = persona.utenza
        utenza.password_testing = nuova_pwd  # Password for login

        # Try logging in with the new user account.
        sessione_persona = self.sessione_utente(utente=utenza)