Python lxml.html.tostring() Examples

The following are 30 code examples of lxml.html.tostring(), drawn from open-source projects; the source file and originating project are listed above each example. You may also want to look at the other functions and classes available in the lxml.html module.
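As a quick orientation before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the basic behaviour of lxml.html.tostring(): it serializes a parsed element back to markup, returning bytes by default and a str when encoding='unicode' is requested.

from lxml import html

# Parse a fragment into an element tree
doc = html.fromstring('<p>Hello <b>world</b></p>')

# Serialize back to markup: bytes by default...
print(html.tostring(doc))                      # b'<p>Hello <b>world</b></p>'

# ...or a unicode string when asked for one
print(html.tostring(doc, encoding='unicode'))  # '<p>Hello <b>world</b></p>'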
Example #1
Source File: parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            #  Instapaper exports don't close their <li> tags,
            #  which caused infinite recursion when parsing with BeautifulSoup directly.
            #  Round-tripping through lxml first ensures the <li> tags get closed.
            self.html = html.document_fromstring(self.opened_file.read())
            self.html = html.tostring(self.html)
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = dict()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bmark in self.check_duplicates_query:
            self.check_duplicates[bmark.main_url] = bmark
        self.tags_dict = dict()
        self.tags_set = set()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE) 
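The round-trip through document_fromstring() and tostring() above works because lxml's HTML parser closes the stray <li> tags before the markup is handed to BeautifulSoup. A standalone sketch of that effect (not part of crestify):

from lxml import html

broken = '<ul><li>one<li>two</ul>'
fixed = html.tostring(html.document_fromstring(broken))
print(fixed)  # b'<html><body><ul><li>one</li><li>two</li></ul></body></html>'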
Example #2
Source File: messaging.py    From okcupyd with MIT License
def content(self):
        """
        :returns: The text body of the message.
        """
        # The code that follows is obviously pretty disgusting.
        # It seems like it might be impossible to completely replicate
        # the text of the original message if it has trailing whitespace
        message = self._content_xpb.one_(self._message_element)
        first_line = message.text
        if message.text[:2] == '  ':
            first_line = message.text[2:]
        else:
            log.debug("message did not have expected leading whitespace")
        subsequent_lines = ''.join([
            html.tostring(child, encoding='unicode').replace('<br>', '\n')
            for child in message.iterchildren()
        ])
        message_text = first_line + subsequent_lines
        if len(message_text) > 0 and message_text[-1] == ' ':
            message_text = message_text[:-1]
        else:
            log.debug("message did not have expected trailing whitespace")

        return message_text 
Example #3
Source File: views.py    From openprescribing with MIT License
def gdoc_view(request, doc_id):
    try:
        gdoc_id = settings.GDOC_DOCS[doc_id]
    except KeyError:
        raise Http404("No doc named %s" % doc_id)
    url = "https://docs.google.com/document/d/%s/pub?embedded=true" % gdoc_id
    page = requests.get(url)
    tree = html.fromstring(page.text)

    content = (
        "<style>"
        + "".join(
            [
                html.tostring(child).decode("utf8")
                for child in tree.head.xpath("//style")
            ]
        )
        + "</style>"
    )
    content += "".join([html.tostring(child).decode("utf8") for child in tree.body])
    context = {"content": content}
    return render(request, "gdoc.html", context) 
Example #4
Source File: server.py    From autologin with Apache License 2.0
def download_page(url, cookie_jar):
    """
    Request page using authenticated cookies (cookiejar).
    Download html source and save in browser directory, to
    be used in show_in_browser().
    """
    browser_dir = os.path.join(server_path, 'static/browser')
    delete_directory_files(browser_dir)
    filename = '{}.html'.format(uuid.uuid4())
    filepath = os.path.join(browser_dir, filename)
    try:
        response = cookie_request(url, cookie_jar)
    except requests.RequestException as e:
        return e, None
    doc = html.document_fromstring(response.text)
    with open(filepath, 'wb') as f:
        f.write(html.tostring(doc))
    return None, filename 
Example #5
Source File: clean.py    From memorious with MIT License
def clean_html(context, data):
    """Clean an HTML DOM and store the changed version."""
    doc = _get_html_document(context, data)
    if doc is None:
        context.emit(data=data)
        return

    remove_paths = context.params.get('remove_paths')
    for path in ensure_list(remove_paths):
        for el in doc.xpath(path):
            el.drop_tree()

    html_text = html.tostring(doc, pretty_print=True)
    content_hash = context.store_data(html_text)
    data['content_hash'] = content_hash
    context.emit(data=data) 
Example #6
Source File: html.py    From mailur with GNU General Public License v3.0
def from_text(txt):
    def replace(match):
        txt = match.group()
        if '\n' in txt:
            return '<br>' * txt.count('\n')
        else:
            return '&nbsp;' * txt.count(' ')

    tpl = '<p>%s</p>'
    htm = escape(txt)
    htm = fromstring(tpl % htm)
    fix_links(htm)
    htm = tostring(htm, encoding='unicode')
    htm = htm[3:-4]
    htm = re.sub('(?m)((\r?\n)+| [ ]+|^ )', replace, htm)
    htm = tpl % htm
    return htm 
Example #7
Source File: xml.py    From ingestors with MIT License
def ingest(self, file_path):
        """Ingestor implementation."""
        file_size = self.result.size or os.path.getsize(file_path)
        if file_size > self.MAX_SIZE:
            raise ProcessingException("XML file is too large.")

        try:
            doc = etree.parse(file_path)
        except (ParserError, ParseError):
            raise ProcessingException("XML could not be parsed.")

        text = self.extract_html_text(doc.getroot())
        transform = etree.XSLT(self.XSLT)
        html_doc = transform(doc)
        html_body = html.tostring(html_doc, encoding=str, pretty_print=True)
        self.result.flag(self.result.FLAG_HTML)
        self.result.emit_html_body(html_body, text) 
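For readers unfamiliar with the XSLT step above: etree.XSLT compiles a stylesheet into a callable transform, and the resulting document can then be serialized with html.tostring(). A minimal sketch with a toy stylesheet (the ingestors project uses its own XSLT, not this one):

from lxml import etree, html

xslt = etree.XML("""\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    <html><body><p><xsl:value-of select="/note/text()"/></p></body></html>
  </xsl:template>
</xsl:stylesheet>""")

transform = etree.XSLT(xslt)
doc = etree.fromstring('<note>hello</note>')
result = transform(doc)
print(html.tostring(result, encoding=str, pretty_print=True))
# <html><body><p>hello</p></body></html>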
Example #8
Source File: external.py    From trafilatura with GNU General Public License v3.0
def try_justext(tree, url, target_language):
    '''Second safety net: try with the generic algorithm justext'''
    result_body = etree.Element('body')
    justtextstring = html.tostring(tree, pretty_print=False, encoding='utf-8')
    # determine language
    if target_language is not None and target_language in JUSTEXT_LANGUAGES:
        langsetting = JUSTEXT_LANGUAGES[target_language]
        justext_stoplist = justext.get_stoplist(langsetting)
    else:
        #justext_stoplist = justext.get_stoplist(JUSTEXT_DEFAULT)
        justext_stoplist = JT_STOPLIST
    # extract
    try:
        paragraphs = justext.justext(justtextstring, justext_stoplist, 50, 200, 0.1, 0.2, 0.2, 200, True)
    except ValueError as err:  # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        result_body = None
    else:
        for paragraph in paragraphs:
            if not paragraph.is_boilerplate:
                #if duplicate_test(paragraph) is not True:
                elem = etree.Element('p')
                elem.text = paragraph.text
                result_body.append(elem)
    return result_body 
Example #9
Source File: requests_html.py    From requests-html with MIT License
def xpath(self, selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) -> _XPath:
        """Given an XPath selector, returns a list of
        :class:`Element <Element>` objects or a single one.

        :param selector: XPath Selector to use.
        :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
        :param first: Whether or not to return just the first result.
        :param _encoding: The encoding format.

        If a sub-selector is specified (e.g. ``//a/@href``), a simple
        list of results is returned.

        See W3School's `XPath Examples
        <https://www.w3schools.com/xml/xpath_examples.asp>`_
        for more details.

        If ``first`` is ``True``, only returns the first
        :class:`Element <Element>` found.
        """
        selected = self.lxml.xpath(selector)

        elements = [
            Element(element=selection, url=self.url, default_encoding=_encoding or self.encoding)
            if not isinstance(selection, etree._ElementUnicodeResult) else str(selection)
            for selection in selected
        ]

        # Sanitize the found HTML.
        if clean:
            elements_copy = elements.copy()
            elements = []

            for element in elements_copy:
                element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
                elements.append(element)

        return _get_first_or_list(elements, first) 
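In practice this method is reached through the html attribute of a requests-html response; a brief usage sketch (the URL is a placeholder) of the behaviours the docstring describes:

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com')

links = r.html.xpath('//a/@href')                # sub-selector: a plain list of strings
first_div = r.html.xpath('//div', first=True)    # single Element (or None if nothing matches)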
Example #10
Source File: models.py    From jorvik with GNU General Public License v3.0
def processa_link(self):
        """
        Checks relative links in the e-mail and makes them absolute.
        """
        doc = html.document_fromstring(self.corpo)
        links = doc.xpath('//a')
        for el in links:
            try:
                url = el.attrib['href']
                if '://' not in url:
                    el.attrib['href'] = "https://gaia.cri.it%s" % (url,)
            except KeyError:
                continue
        self.corpo = html.tostring(doc, pretty_print=True).decode('UTF-8') 
Example #11
Source File: models.py    From jorvik with GNU General Public License v3.0
def corpo_body(self):
        """
        Tries to extract the body of the page (the body element).
        :return:
        """
        if not self.corpo:
            return ""
        doc = html.document_fromstring(self.corpo)
        body = doc.xpath('//body')[0]
        body.tag = 'div'
        #try:
        return html.tostring(body)
        #except:
        #    return self.corpo
        #print html.parse('http://someurl.at.domain').xpath('//body')[0].text_content() 
Example #12
Source File: utils.py    From jorvik with GNU General Public License v3.0
def get_drive_file(file):
    req = urllib.request.Request("https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=html" %(file,))
    str = urllib.request.urlopen(req).read().decode('UTF-8')
    doc = html.document_fromstring(str)
    head = doc.xpath('//head')[0]
    head.tag = 'div'
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    str = html.tostring(head)+html.tostring(body)
    return str 
Example #13
Source File: EntityLinking.py    From ClusType with GNU General Public License v3.0
def run(self):
        print "Start DBpediaSpotlight"
        g = open('tmp/temp' + str(self.offset) + '.txt', 'w')
        index = 0
        while 1:
            did = str(index + self.offset)
            if did in self.docList:
                try:
                    doc = self.docList[did]
                    url = "http://spotlight.sztaki.hu:2222/rest/annotate"
                    #url = "http://localhost:2222/rest/annotate"
                    data = {"confidence":self.confidence}
                    data["support"] = "20"
                    data["text"] = doc;
                    data = urllib.urlencode(data)
                    req = urllib2.Request(url)
                    req.add_header('Accept', 'application/json') #text/xml')
                    # print did
                    page = html.fromstring(urllib2.urlopen(req, data, timeout=100).read())
                    docJson = html.tostring(page)[3:-4]
                    #print docJson
                    validEntities = extractAnnotations(docJson)
                    for entity in validEntities:
                        linkToFreebase(entity)
                        if (entity['@URI'] != None):
                            g.write(str(index + self.offset) + '\t' + entity['@surfaceForm'] + '\t' + entity['@URI'] + '\t'
                             + entity['@similarityScore'] + '\t' + entity['@percentageOfSecondRank']+ '\n')
                    index += threadNum
                except:
                    index += threadNum
                    print 'noresult'
            else:
                break
        g.close() 
Example #14
Source File: lxml_toolkit_object.py    From enaml-web with MIT License
def render(self, method='html', encoding='unicode', **kwargs):
        """ Render the widget tree into a string """
        return tostring(self.widget, method=method, encoding=encoding, **kwargs) 
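The method and encoding keyword arguments that render() forwards to tostring() change how the tree is serialized; a small independent illustration (not part of enaml-web):

from lxml import html

el = html.fromstring('<p>hi<br>there</p>')

print(html.tostring(el))                                    # b'<p>hi<br>there</p>'
print(html.tostring(el, encoding='unicode'))                # '<p>hi<br>there</p>'
print(html.tostring(el, method='xml', encoding='unicode'))  # '<p>hi<br/>there</p>'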
Example #15
Source File: metadata.py    From trafilatura with GNU General Public License v3.0
def extract_url(tree, default_url=None):
    '''Extract the URL from the canonical link'''
    # https://www.tutorialrepublic.com/html-reference/html-base-tag.php
    # default url as fallback
    url = default_url
    # try canonical link first
    element = tree.find('.//head//link[@rel="canonical"]')
    if element is not None:
        url = element.attrib['href']
    # try default language link
    else:
        for element in tree.xpath('//head//link[@rel="alternate"]'):
            if 'hreflang' in element.attrib and element.attrib['hreflang'] is not None and element.attrib['hreflang'] == 'x-default':
                LOGGER.debug(html.tostring(element, pretty_print=False, encoding='unicode').strip())
                url = element.attrib['href']
    # add domain name if it's missing
    if url is not None and url.startswith('/'):
        for element in tree.xpath('//head//meta[@content]'):
            if 'name' in element.attrib:
                attrtype = element.attrib['name']
            elif 'property' in element.attrib:
                attrtype = element.attrib['property']
            else:
                continue
            if attrtype.startswith('og:') or attrtype.startswith('twitter:'):
                domain_match = re.match(r'https?://[^/]+', element.attrib['content'])
                if domain_match:
                    # prepend URL
                    url = domain_match.group(0) + url
                    break
    return url 
Example #16
Source File: core.py    From trafilatura with GNU General Public License v3.0
def determine_returnstring(docmeta, postbody, commentsbody, output_format, tei_validation, record_id):
    '''Convert XML tree to chosen format, clean the result and output it as a string'''
    # XML (TEI) steps
    if 'xml' in output_format:
        # last cleaning
        for element in postbody.iter():
            if len(element) == 0 and not element.text and not element.tail:
                parent = element.getparent()
                if parent is not None:
                    parent.remove(element)
        # build output trees
        if output_format == 'xml':
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif output_format == 'xmltei':
            output = build_tei_output(postbody, commentsbody, docmeta)
        # can be improved
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if output_format == 'xmltei' and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
        # output as string
        returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
    # CSV + TXT output
    else:
        if output_format == 'csv':
            posttext = xmltotxt(postbody)
            if commentsbody is not None:
                commentstext = xmltotxt(commentsbody)
            else:
                commentstext = ''
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)
    return returnstring 
Example #17
Source File: course_info_model.py    From ANALYSE with GNU Affero General Public License v3.0
def _course_info_content(html_parsed):
    """
    Constructs the HTML for the course info update, not including the header.
    """
    if len(html_parsed) == 1:
        # could enforce that update[0].tag == 'h2'
        content = html_parsed[0].tail
    else:
        content = html_parsed[0].tail if html_parsed[0].tail is not None else ""
        content += "\n".join([html.tostring(ele) for ele in html_parsed[1:]])
    return content 
Example #18
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License
def convert_html_to_telegraph_format(html_string, clean_html=True, output_format="json_string"):
    if clean_html:
        html_string = clean_article_html(html_string)

        body = preprocess_fragments(
            _fragments_from_string(html_string)
        )
        if body is not None:
            desc = [x for x in body.iterdescendants()]
            for tag in desc:
                preprocess_media_tags(tag)
            move_to_top(body)
            post_process(body)
    else:
        fragments = _fragments_from_string(html_string)
        body = fragments[0].getparent() if len(fragments) else None

    content = []
    if body is not None:
        content = [_recursive_convert(x) for x in body.iterchildren()]

    if output_format == 'json_string':
        return json.dumps(content, ensure_ascii=False)
    elif output_format == 'python_list':
        return content
    elif output_format == 'html_string':
        return html.tostring(body, encoding='unicode') 
Example #19
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License
def convert_json_to_html(elements):
    content = html.fragment_fromstring('<div></div>')
    for element in elements:
        content.append(_recursive_convert_json(element))
    content.make_links_absolute(base_url=base_url)
    for x in content.xpath('.//span'):
        x.drop_tag()
    html_string = html.tostring(content, encoding='unicode')
    html_string = replace_line_breaks_except_pre(html_string, '<br/>')
    html_string = html_string[5:-6]
    return html_string 
Example #20
Source File: hearth.py    From hearthstats with GNU General Public License v2.0
def get_deck_list(deckid):
    """
    For a given HearthPwn deck ID, return a list of Cards that belong to that
    deck.

    Parameters:

    - 'deckid' - a HearthPwn deck ID
    """
    # http://www.hearthpwn.com/decks/listing/ + deckid + /neutral or /class
    url = 'http://www.hearthpwn.com/decks/listing/'
    css = '#cards > tbody > tr > td.col-name'

    deck = []

    # Class Cards
    htmlelement = get_htmlelement_from_url(url + str(deckid) + '/class')
    cardelements = htmlelement.cssselect(css)
    # Neutral Cards
    htmlelement = get_htmlelement_from_url(url + str(deckid) + '/neutral')
    cardelements += htmlelement.cssselect(css)

    regex = re.compile(r'&#215;\s+(\d+)')
    for element in cardelements:
        # cssselect always returns an array, but in our case the result is
        # always just one element.
        cardname = element.cssselect('a')[0].text.strip()
        elementtext = html.tostring(element).decode('UTF-8')
        # There's probably a better way to get the amount, but we currently
        # look for the "x #" in the raw text of the element
        match = re.search(regex, elementtext)
        if match:
            amount = int(match.group(1))
        else:
            print('ERROR: Unable to get amount for card ' + cardname)
            # This shouldn't happen, but when it does, just continue on after
            # logging an error.
            amount = 0
        deck.append(Card(cardname, amount))

    return deck 
Example #21
Source File: html.py    From mailur with GNU General Public License v3.0
def fix_privacy(htm, only_proxy=False):
    if not htm.strip():
        return htm

    use_proxy = conf['USE_PROXY']
    if only_proxy and not use_proxy:
        return htm

    htm = fromstring(htm)
    for img in htm.xpath('//img[@src]'):
        src = img.attrib['src']
        if re.match('^(https?://|//).*', src):
            if src.startswith('//'):
                src = 'https:' + src
            if use_proxy:
                src = '/proxy?url=' + src
            if only_proxy:
                img.attrib['src'] = src
            else:
                img.attrib['data-src'] = src
                del img.attrib['src']

    if not only_proxy:
        # style could contain "background-image", etc.
        for el in htm.xpath('//*[@style]'):
            el.attrib['data-style'] = el.attrib['style']
            del el.attrib['style']

    htm = tostring(htm, encoding='unicode').strip()
    htm = re.sub('(^<div>|</div>$)', '', htm)
    return htm 
Example #22
Source File: cleanhtml.py    From zing with GNU General Public License v3.0
def url_trim(html):
    """Trims anchor texts that are longer than 70 chars."""
    fragment = fromstring(html)
    for el, attrib_, link_, pos_ in fragment.iterlinks():
        new_link_text = trim_url(el.text_content())
        el.text = new_link_text

    return mark_safe(tostring(fragment, encoding="unicode")) 
Example #23
Source File: __init__.py    From online-judge with GNU Affero General Public License v3.0
def fragment_tree_to_str(tree):
    return html.tostring(tree, encoding='unicode')[len('<div>'):-len('</div>')] 
Example #24
Source File: lxml_tree.py    From online-judge with GNU Affero General Public License v3.0
def __str__(self):
        return mark_safe(html.tostring(self._tree, encoding='unicode')) 
Example #25
Source File: test_items.py    From ant_nest with GNU Lesser General Public License v3.0
def test_extract_item():
    with open("./tests/test.html", "rb") as f:
        response = httpx.Response(
            200, request=httpx.Request("Get", "https://test.com"), content=f.read()
        )

    class Item:
        pass

    # extract item with xpath and regex
    item_extractor = Extractor(Item)
    item_extractor.add_extractor(
        "paragraph",
        lambda x: html.fromstring(x.text).xpath("/html/body/div/p/text()")[0],
    )
    item_extractor.add_extractor(
        "title", lambda x: re.findall(r"<title>([A-Z a-z]+)</title>", x.text)[0]
    )
    item = item_extractor.extract(response)
    assert item.paragraph == "test"
    assert item.title == "Test html"
    # extract with jpath
    response = httpx.Response(
        200,
        request=httpx.Request("Get", "https://test.com"),
        content=b'{"a": {"b": {"c": 1}}, "d": null}',
    )
    item_extractor = Extractor(Item)
    item_extractor.add_extractor(
        "author", lambda x: jpath.get_all("a.b.c", x.json())[0]
    )
    item_extractor.add_extractor("freedom", lambda x: jpath.get_all("d", x.json())[0])
    item = item_extractor.extract(response)
    assert item.author == 1
    assert item.freedom is None
    # ItemNestExtractor tests
    with open("./tests/test.html", "rb") as f:
        response = httpx.Response(
            200, request=httpx.Request("Get", "https://test.com"), content=f.read()
        )
    item_nest_extractor = NestExtractor(
        Item, lambda x: html.fromstring(x.text).xpath('//div[@id="nest"]/div')
    )
    item_nest_extractor.add_extractor("xpath_key", lambda x: x.xpath("./p/text()")[0])
    item_nest_extractor.add_extractor(
        "regex_key",
        lambda x: re.findall(r"regex(\d+)</", html.tostring(x, encoding="unicode"))[0],
    )
    temp = 1
    for item in item_nest_extractor.extract_items(response):
        assert item.xpath_key == str(temp)
        assert item.regex_key == str(temp)
        temp += 1 
Example #26
Source File: core.py    From trafilatura with GNU General Public License v3.0
def extract_content(tree, include_tables=False):
    '''Find the main content of a page using a set of XPath expressions,
       then extract relevant elements, strip them of unwanted subparts and
       convert them'''
    sure_thing = False
    result_body = etree.Element('body')
    # iterate
    for expr in BODY_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if not subtree:
            continue
        subtree = subtree[0]
        # prune
        subtree = discard_unwanted(subtree)
        # remove elements by link density
        for elem in subtree.iter('list'):
            if link_density_test(elem) is True:
                elem.getparent().remove(elem)
                continue
            elem.attrib.clear()
            #for subelem in elem.iter('item'):
            #    subelem.attrib.clear()
        etree.strip_tags(subtree, 'a', 'link', 'span')
        # define iteration strategy
        potential_tags = set(TAG_CATALOG)  # + 'span'?
        if include_tables is True:
            potential_tags.add('table')
        # no paragraphs containing text
        if not subtree.xpath('//p//text()'):
            potential_tags.add('div')
        LOGGER.debug(sorted(potential_tags))
        # etree.strip_tags(subtree, 'lb') # BoingBoing-Bug
        # print(html.tostring(subtree, pretty_print=True, encoding='unicode'))
        # extract content
        processed_elems = [handle_textelem(e, potential_tags) for e in subtree.xpath('.//*')]
        # list(filter(None.__ne__, processed_elems))
        result_body.extend([e for e in processed_elems if e is not None])
        # exit the loop if the result has children
        if len(result_body) > 0:
            sure_thing = True
            LOGGER.debug(expr)
            break
    # try parsing wild <p> elements if nothing found or text too short
    temp_text = trim(' '.join(result_body.itertext()))
    len_text = len(temp_text)
    if len(result_body) == 0 or len_text < MIN_EXTRACTED_SIZE:
        result_body = recover_wild_paragraphs(tree, result_body)
        #search_tree = discard_unwanted(tree)
        #search_tree = prune_html(search_tree)
        #result_body, _, _ = baseline(search_tree)
        temp_text = trim(' '.join(result_body.itertext()))
        len_text = len(temp_text)
    # filter output
    etree.strip_elements(result_body, 'done')
    etree.strip_tags(result_body, 'div')
    # return
    return result_body, temp_text, len_text, sure_thing 
Example #27
Source File: html.py    From mailur with GNU General Public License v3.0
def clean(htm, embeds=None):
    htm = re.sub(r'^\s*<\?xml.*?\?>', '', htm).strip()
    if not htm:
        return '', {}

    htm = htm.replace('\r\n', '\n')
    cleaner = Cleaner(
        links=False,
        style=True,
        inline_style=False,
        kill_tags=['head'],
        remove_tags=['html', 'base'],
        safe_attrs=list(set(Cleaner.safe_attrs) - {'class'}) + ['style'],
    )
    htm = fromstring(htm)
    htm = cleaner.clean_html(htm)

    ext_images = 0
    embeds = embeds or {}
    for img in htm.xpath('//img[@src]'):
        src = img.attrib.get('src')
        cid = re.match('^cid:(.*)', src)
        url = cid and embeds.get('<%s>' % cid.group(1))
        if url:
            img.attrib['src'] = url
        elif re.match('^data:image/.*', src):
            pass
        elif re.match('^(https?://|//).*', src):
            ext_images += 1
        else:
            del img.attrib['src']

    styles = False
    for el in htm.xpath('//*[@style]'):
        styles = True
        break

    fix_links(htm)

    richer = (('styles', styles), ('ext_images', ext_images))
    richer = {k: v for k, v in richer if v}

    htm = tostring(htm, encoding='unicode').strip()
    htm = re.sub('(^<div>|</div>$)', '', htm)
    return htm, richer 
Example #28
Source File: weibo.py    From news_spider with MIT License
def parse_article_detail_js(self, response):
        """
        Parse article detail (JS-rendered version).
        :param response:
        :return:
        """
        article_detail_body = response.body_as_unicode()
        article_detail_rule = r'<script>FM.view\({"ns":.*?"html":"(.*?)"}\)</script>'
        article_detail_re_parse = re.compile(article_detail_rule, re.S).findall(article_detail_body)
        if not article_detail_re_parse:
            return
        article_detail_html = ''.join(article_detail_re_parse)

        # Handle escaped characters
        article_detail_html = article_detail_html.replace('\\r', '')
        article_detail_html = article_detail_html.replace('\\t', '')
        article_detail_html = article_detail_html.replace('\\n', '')
        article_detail_html = article_detail_html.replace('\\"', '"')
        article_detail_html = article_detail_html.replace('\\/', '/')

        article_detail_doc = fromstring(article_detail_html)

        article_title_parse = article_detail_doc.xpath('//h1[@class="title"]/text()')
        article_title = article_title_parse[0].strip() if article_title_parse else ''

        article_pub_time_parse = article_detail_doc.xpath('//span[@class="time"]/text()')
        article_pub_time = self.trans_time(article_pub_time_parse[0].strip()) if article_pub_time_parse else time.strftime('%Y-%m-%d %H:%M:%S')

        article_content_parse = article_detail_doc.xpath('//div[@class="WBA_content"]')
        article_content = tostring(article_content_parse[0], encoding='unicode').strip() if article_content_parse else ''

        fetch_result_item = FetchResultItem()
        fetch_result_item['task_id'] = response.meta['task_id']
        fetch_result_item['platform_id'] = response.meta['platform_id']
        fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '')
        fetch_result_item['channel_id'] = response.meta['channel_id']
        fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '')
        fetch_result_item['article_id'] = response.meta['article_id']
        fetch_result_item['article_title'] = article_title
        fetch_result_item['article_author_id'] = response.meta['follow_id']
        fetch_result_item['article_author_name'] = response.meta['follow_name']
        fetch_result_item['article_pub_time'] = time_local_to_utc(article_pub_time).strftime('%Y-%m-%d %H:%M:%S')
        fetch_result_item['article_url'] = response.url
        fetch_result_item['article_tags'] = ''
        fetch_result_item['article_abstract'] = response.meta['article_abstract']
        fetch_result_item['article_content'] = article_content
        yield fetch_result_item 
Example #29
Source File: utils.py    From scrape with MIT License
def write_part_file(args, url, raw_html, html=None, part_num=None):
    """Write PART.html file(s) to disk, images in PART_files directory.

    Keyword arguments:
    args -- program arguments (dict)
    raw_html -- unparsed HTML file content (list)
    html -- parsed HTML file content (lxml.html.HtmlElement) (default: None)
    part_num -- PART(#).html file number (int) (default: None)
    """
    if part_num is None:
        part_num = get_num_part_files() + 1
    filename = "PART{0}.html".format(part_num)

    # Decode bytes to string in Python 3 versions
    if not PY2 and isinstance(raw_html, bytes):
        raw_html = raw_html.decode("ascii", "ignore")

    # Convert html to an lh.HtmlElement object for parsing/saving images
    if html is None:
        html = lh.fromstring(raw_html)

    # Parse HTML if XPath entered
    if args["xpath"]:
        raw_html = parse_html(html, args["xpath"])
        if isinstance(raw_html, list):
            if not isinstance(raw_html[0], lh.HtmlElement):
                raise ValueError("XPath should return an HtmlElement object.")
        else:
            if not isinstance(raw_html, lh.HtmlElement):
                raise ValueError("XPath should return an HtmlElement object.")

    # Write HTML and possibly images to disk
    if raw_html:
        if not args["no_images"] and (args["pdf"] or args["html"]):
            raw_html = write_part_images(url, raw_html, html, filename)
        with open(filename, "w") as part:
            if not isinstance(raw_html, list):
                raw_html = [raw_html]
            if isinstance(raw_html[0], lh.HtmlElement):
                for elem in raw_html:
                    part.write(lh.tostring(elem, encoding="unicode"))
            else:
                for line in raw_html:
                    part.write(line)
Example #30
Source File: test_parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License
def setUp(self):
        query_user = User.query.filter_by(email='instapaper@example.com').first()
        if query_user:
            query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
            for bmark in query_bookmarks:
                db.session.delete(bmark)
            db.session.commit()
            db.session.delete(query_user)
            db.session.commit()
        create_user = User()
        create_user.first_name = 'Instapaper'
        create_user.last_name = 'Test'
        create_user.email = 'instapaper@example.com'
        create_user.password = 'instapaper_pass'
        create_user.active = True
        create_user.confirmed_at = datetime.datetime.utcnow()
        db.session.add(create_user)
        db.session.commit()
        self.user = create_user
        with open('Instapaper.html') as json_file:
            create_file = open(os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'], 'test_instapaper.html'), 'w+')
            self.data = html.document_fromstring(json_file.read())
            self.data = html.tostring(self.data)
            self.html_data = BeautifulSoup4(self.data)
            self.bookmarks = {}
            for tag in self.html_data.find_all('h1'):
                parent_elem = tag.find_next_sibling('ol')
                links = parent_elem.find_all('a')
                for link in links:
                    title = link.text
                    url = link['href']
                    tags = [tag.text]
                    tags.append('Imported')
                    #  Thanks Instapaper for not adding timestamps
                    self.bookmarks[url] = {
                        'href': url,
                        'title': title,
                        'tags': tags
                    }
            create_file.write(self.data)
            self.file_path = create_file.name
            create_file.close()
        init_parser = InstapaperParser(self.file_path, self.user.id)
        init_parser.process()
        init_parser.add_to_database()
        self.query = Bookmark.query.filter_by(user=self.user.id).all()
        self.html_parser = HTMLParser()