Python lxml.html.document_fromstring() Examples

The following are 15 code examples of lxml.html.document_fromstring(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.html.
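document_fromstring() parses a complete HTML document from a string or byte string and returns the root element, adding the missing html and body tags when the input is only a fragment. A minimal usage sketch (the sample markup is illustrative):

from lxml import html

root = html.document_fromstring("<p>Hello <b>world</b></p>")
print(root.tag)               # 'html' -- a full document tree was built
print(root.findtext('.//p'))  # 'Hello '
print(html.tostring(root))    # serializes with the added html/body tags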
Example #1
Source File: parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            #  Instapaper's export does not close <li> tags, which caused infinite
            #  recursion when parsing with BeautifulSoup directly. Round-tripping
            #  through lxml below repairs the markup so the <li> tags get closed.
            self.html = html.document_fromstring(self.opened_file.read())
            self.html = html.tostring(self.html)
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = dict()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bmark in self.check_duplicates_query:
            self.check_duplicates[bmark.main_url] = bmark
        self.tags_dict = dict()
        self.tags_set = set()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE) 
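The lxml round trip above is a common way to repair malformed markup before handing it to another parser. A minimal sketch of the idea, using illustrative input with unclosed tags:

from lxml import html

broken = "<ol><li>first<li>second"
fixed = html.tostring(html.document_fromstring(broken))
print(fixed)  # the <li> tags are now closed, wrapped in html/body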
Example #2
Source File: importer.py    From python-ooxml with GNU Affero General Public License v3.0
def parse_html_string(s):
    from lxml import html

    utf8_parser = html.HTMLParser(encoding='utf-8')
    html_tree = html.document_fromstring(s, parser=utf8_parser)

    return html_tree 
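Passing an explicit HTMLParser is the usual way to pin the character encoding when the input is a byte string; without it, lxml has to guess. A short usage sketch under that assumption:

tree = parse_html_string("<p>café</p>".encode("utf-8"))
print(tree.findtext('.//p'))  # 'café'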
Example #3
Source File: server.py    From autologin with Apache License 2.0
def download_page(url, cookie_jar):
    """
    Request page using authenticated cookies (cookiejar).
    Download html source and save in browser directory, to
    be used by in show_in_browser().
    """
    browser_dir = os.path.join(server_path, 'static/browser')
    delete_directory_files(browser_dir)
    filename = '{}.html'.format(uuid.uuid4())
    filepath = os.path.join(browser_dir, filename)
    try:
        response = cookie_request(url, cookie_jar)
    except requests.RequestException as e:
        return e, None
    doc = html.document_fromstring(response.text)
    with open(filepath, 'wb') as f:
        f.write(html.tostring(doc))
    return None, filename 
Example #4
Source File: scraping.py    From text-mining-class with MIT License
def __init__(self, html_content, encoding="utf-8"):
        if isinstance(html_content, bytes):
            html_content = html_content.decode(encoding)
        self.document = document_fromstring(html_content)
        strip_elements(self.document, "style") 
Example #5
Source File: googlerbot.py    From SnapchatBot with MIT License
def reverse_image_search(url):
    value = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
    headers = {'User-Agent': value}
    search_url = 'https://www.google.com/searchbyimage?image_url=%s' % url
    resp = requests.get(search_url, headers=headers)
    root = document_fromstring(resp.content)
    href = root.cssselect(".bia")[0].attrib['href']
    print(search_url)
    new_url = "https://www.google.com" + href
    resp = requests.get(new_url, headers=headers)
    return re.search("imgurl=([^&]*)", resp.text).group(1)
Example #6
Source File: cnae.py    From socios-brasil with GNU Lesser General Public License v3.0
def parse_items(self, response, root_name=None):
        """Recursively get data/make requests for all parser hierarchical levels"""

        data = response.request.meta.get("data", {})
        root_name = root_name or response.request.meta["root_name"]
        metadata = self.parsers[root_name]
        xpath_id = metadata["xpath_id"]
        xpath_description = metadata["xpath_description"]
        xpath_url = metadata["xpath_url"]
        item_name = metadata["next"]
        for item in response.xpath(metadata["xpath_items"]):
            tree = document_fromstring(item.extract())
            url = urljoin("https://cnae.ibge.gov.br/", tree.xpath(xpath_url)[0])
            item_id = get_text(tree.xpath(xpath_id))
            item_description = get_text(tree.xpath(xpath_description))
            item_data = {}
            if item_name == "subclasse" or len(item_id) == self.parsers[item_name]["id_length"]:
                next_root_name = item_name
            else:
                descricao = response.xpath("//span[@class = 'destaque']//text()").extract()[0]
                item_data[f"id_{item_name}"] = descricao.split()[0]
                item_data[f"descricao_{item_name}"] = descricao
                next_root_name = self.parsers[item_name]["next"]
            item_data.update({
                f"id_{next_root_name}": item_id.strip(),
                f"descricao_{next_root_name}": item_description.strip(),
            })
            item_data.update(data)

            callback = self.parse_items if next_root_name != "subclasse" else self.parse_subclasse
            yield scrapy.Request(
                url=url,
                meta={"data": item_data, "root_name": next_root_name},
                callback=callback,
            ) 
Example #7
Source File: cnae.py    From socios-brasil with GNU Lesser General Public License v3.0
def parse_subclasse(self, response):
        """Yield the subclass item (last mile of the recursive strategy)"""
        data = response.request.meta["data"]
        tree = document_fromstring(response.body)
        data["notas_explicativas"] = "\n".join(
            [
                line.strip()
                for line in tree.xpath('//div[@id = "notas-explicativas"]//text()')
                if line.strip()
            ]
        )
        data["url"] = response.request.url
        data["id"] = int(data["id_subclasse"].replace("/", "").replace("-", ""))
        data["versao"] = self.versao
        yield data 
Example #8
Source File: cleaner.py    From wanish with MIT License
def initial_output(html_partial=False):
        """
        Creates the initial HTML output element according to the given flag.
        :param html_partial: determines whether to build a full HTML page or only a fragment
        :return: html output element
        """
        return fragment_fromstring('<div/>') if html_partial else document_fromstring('<div/>') 
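The distinction matters here: fragment_fromstring('<div/>') returns the bare <div> element, while document_fromstring('<div/>') wraps it in a full html/body tree. A minimal sketch:

from lxml.html import document_fromstring, fragment_fromstring

print(fragment_fromstring('<div/>').tag)  # 'div' -- just the element itself
print(document_fromstring('<div/>').tag)  # 'html' -- the div ends up under <body>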
Example #9
Source File: column.py    From zhihu2ebook with MIT License
def replace_img_url(self, content):
        utf8_parser = html.HTMLParser(encoding='utf-8')
        tree = html.document_fromstring(str(content), parser=utf8_parser)

        for _pic_link in tree.xpath("//img"):
            href = str(_pic_link.get('src'))
            pic_id, pic_type = href.split('.')  # assumes src is of the form "<id>.<extension>"
            _pic_link.set('src', "https://pic4.zhimg.com/" + pic_id + "_b." + pic_type)
        replaced_content = etree.tostring(tree, encoding=str)
        return replaced_content 
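A short usage sketch of the rewrite performed above, assuming a column instance col (hypothetical name) and an img whose src is a bare "<id>.<extension>" value:

out = col.replace_img_url('<div><img src="v2-abc123.jpg"/></div>')
# out now contains src="https://pic4.zhimg.com/v2-abc123_b.jpg",
# wrapped in the html/body tags that document_fromstring adds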
Example #10
Source File: utils.py    From jorvik with GNU General Public License v3.0
def get_drive_file(file):
    req = urllib.request.Request("https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=html" % (file,))
    content = urllib.request.urlopen(req).read().decode('UTF-8')
    doc = html.document_fromstring(content)
    # Re-tag <head> and <body> as <div> so the exported document can be
    # embedded inline in another page.
    head = doc.xpath('//head')[0]
    head.tag = 'div'
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    # html.tostring() returns bytes by default, so the two fragments are
    # concatenated and returned as bytes.
    return html.tostring(head) + html.tostring(body)
Example #11
Source File: models.py    From jorvik with GNU General Public License v3.0
def corpo_body(self):
        """
        Tries to extract the body of the page (body element).
        :return:
        """
        if not self.corpo:
            return ""
        doc = html.document_fromstring(self.corpo)
        body = doc.xpath('//body')[0]
        body.tag = 'div'
        return html.tostring(body)
Example #12
Source File: models.py    From jorvik with GNU General Public License v3.0
def processa_link(self):
        """
        Checks the relative links in the e-mail and makes them absolute.
        """
        doc = html.document_fromstring(self.corpo)
        links = doc.xpath('//a')
        for el in links:
            try:
                url = el.attrib['href']
                if '://' not in url:
                    el.attrib['href'] = "https://gaia.cri.it%s" % (url,)
            except KeyError:
                continue
        self.corpo = html.tostring(doc, pretty_print=True).decode('UTF-8') 
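A minimal standalone sketch of the same relative-to-absolute rewrite (the sample body is illustrative):

from lxml import html

corpo = '<p><a href="/login">Accedi</a> <a href="https://example.org/x">ok</a></p>'
doc = html.document_fromstring(corpo)
for el in doc.xpath('//a'):
    url = el.get('href')
    if url and '://' not in url:
        el.set('href', "https://gaia.cri.it%s" % (url,))
print(html.tostring(doc, pretty_print=True).decode('UTF-8'))
# the first link becomes https://gaia.cri.it/login; the absolute one is untouched

For the general case, lxml also provides doc.make_links_absolute(base_url), which resolves every link in the document against a base URL.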
Example #13
Source File: test_parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License
def setUp(self):
        query_user = User.query.filter_by(email='instapaper@example.com').first()
        if query_user:
            query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
            for bmark in query_bookmarks:
                db.session.delete(bmark)
            db.session.commit()
            db.session.delete(query_user)
            db.session.commit()
        create_user = User()
        create_user.first_name = 'Instapaper'
        create_user.last_name = 'Test'
        create_user.email = 'instapaper@example.com'
        create_user.password = 'instapaper_pass'
        create_user.active = True
        create_user.confirmed_at = datetime.datetime.utcnow()
        db.session.add(create_user)
        db.session.commit()
        self.user = create_user
        with open('Instapaper.html') as export_file:
            create_file = open(os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'], 'test_instapaper.html'), 'w+')
            self.data = html.document_fromstring(export_file.read())
            self.data = html.tostring(self.data)
            self.html_data = BeautifulSoup4(self.data)
            self.bookmarks = {}
            for tag in self.html_data.find_all('h1'):
                parent_elem = tag.find_next_sibling('ol')
                links = parent_elem.find_all('a')
                for link in links:
                    title = link.text
                    url = link['href']
                    tags = [tag.text]
                    tags.append('Imported')
                    #  Thanks Instapaper for not adding timestamps
                    self.bookmarks[url] = {
                        'href': url,
                        'title': title,
                        'tags': tags
                    }
            create_file.write(self.data)
            self.file_path = create_file.name
            create_file.close()
        init_parser = InstapaperParser(self.file_path, self.user.id)
        init_parser.process()
        init_parser.add_to_database()
        self.query = Bookmark.query.filter_by(user=self.user.id).all()
        self.html_parser = HTMLParser() 
Example #14
Source File: parse-bbc-html-data.py    From XSum with MIT License
def __init__(self, story, corpus):
    self.story = story
    self.corpus = corpus
    self.parser = html.HTMLParser(encoding=chardet.detect(self.story.html)['encoding'])
    self.tree = html.document_fromstring(self.story.html, parser=self.parser)
    
    # Elements to delete.
    self.delete_selectors = {
      'bbc': [
        '//blockquote[contains(@class, "twitter-tweet")]',
        '//blockquote[contains(@class, "instagram-media")]'
      ]
    }
    
    # Title Selector
    self.title_selectors = {
      'bbc': [
        '//h1[contains(@class, "story-headline")]',
        '//h1[contains(@class, "story-body__h1")]'
      ]
    }
    
    # Introduction Selector
    self.introduction_selectors = {
      'bbc': [
        '//p[contains(@class, "story-body__introduction")]'
      ]
    }
    
    # Rest Content exclusions: ads, links, bylines, comments, headline and story introduction
    self.bbc_exclude = (
      'not(contains(@class, "story-headline"))'
      ' and not(contains(@class, "story-body__h1"))'
      ' and not(contains(@class, "story-body__introduction"))'
      ' and not(contains(@class, "with-extracted-share-icons"))'
    )

    # Rest Content Selector
    self.restcontent_selectors = {
      'bbc': [
        '//div[contains(@class, "story-body")]//p[%s]' % self.bbc_exclude    # story-body__inner
      ]
    } 
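A hedged sketch of how selector tables like the ones above are typically applied to the parsed tree (remove_nodes is a hypothetical helper, not from the original file):

def remove_nodes(tree, delete_selectors, corpus='bbc'):
    # Drop the embedded tweet/Instagram blocks matched by the XPath selectors.
    for xpath in delete_selectors[corpus]:
        for node in tree.xpath(xpath):
            node.getparent().remove(node)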
Example #15
Source File: tests.py    From jorvik with GNU General Public License v3.0
def test_us_attivazione_credenziali(self):

        EMAIL_UTENZA = email_fittizzia()

        presidente = crea_persona()
        persona, sede, appartenenza = crea_persona_sede_appartenenza(presidente=presidente)

        sessione_presidente = self.sessione_utente(persona=presidente)

        sessione_presidente.visit("%s%s" % (self.live_server_url, persona.url_profilo_credenziali))
        sessione_presidente.fill('email', EMAIL_UTENZA)
        sessione_presidente.find_by_xpath("//button[@type='submit']").first.click()

        self.assertTrue(
            Utenza.objects.filter(persona=persona).exists(),
            msg="L'utenza e' stata creata correttamente"
        )

        self.assertTrue(
            Utenza.objects.get(persona=persona).email == EMAIL_UTENZA,
            msg="L'email e' stata correttamente creata"
        )

        # Retrieve the e-mail that was sent
        msg = Messaggio.objects.filter(oggetto__icontains="credenziali",
                                       oggetti_destinatario__persona=persona)

        self.assertTrue(
            msg.exists(),
            msg="Email delle credenziali spedita"
        )

        corpo_msg = msg.first().corpo

        self.assertTrue(
            EMAIL_UTENZA in corpo_msg,
            msg="L'email contiene il nuovo indirizzo e-mail"
        )

        doc = html.document_fromstring(corpo_msg)
        nuova_pwd = doc.xpath("//*[@id='nuova-password']")[0].text.strip()

        utenza = persona.utenza
        utenza.password_testing = nuova_pwd  # Password for login

        # Try logging in with the new user account.
        sessione_persona = self.sessione_utente(utente=utenza)