Python lxml.html.tostring() Examples

The following are 30 code examples of lxml.html.tostring(), drawn from open-source projects; the source file and originating project are listed above each example. You may also want to look at the other functions and classes available in the lxml.html module.
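As a quick orientation before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the basic behaviour of lxml.html.tostring(): it serializes a parsed element back to markup, returning bytes by default and a str when encoding='unicode' is requested.

from lxml import html

# Parse a fragment into an element tree
doc = html.fromstring('<p>Hello <b>world</b></p>')

# Serialize back to markup: bytes by default...
print(html.tostring(doc))                      # b'<p>Hello <b>world</b></p>'

# ...or a unicode string when asked for one
print(html.tostring(doc, encoding='unicode'))  # '<p>Hello <b>world</b></p>'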
Example #1
Source File: parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License
def __init__(self, file_name, user_id):
        with open(file_name, 'r') as self.opened_file:
            #  Instapaper exports don't close their <li> tags,
            #  which caused infinite recursion when parsing with BeautifulSoup directly.
            #  Round-tripping through lxml first ensures the <li> tags get closed.
            self.html = html.document_fromstring(self.opened_file.read())
            self.html = html.tostring(self.html)
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = dict()
        self.check_duplicates = dict()
        self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                            Bookmark.deleted == False).all()
        for bmark in self.check_duplicates_query:
            self.check_duplicates[bmark.main_url] = bmark
        self.tags_dict = dict()
        self.tags_set = set()
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE) 
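The round-trip through document_fromstring() and tostring() above works because lxml's HTML parser closes the stray <li> tags before the markup is handed to BeautifulSoup. A standalone sketch of that effect (not part of crestify):

from lxml import html

broken = '<ul><li>one<li>two</ul>'
fixed = html.tostring(html.document_fromstring(broken))
print(fixed)  # b'<html><body><ul><li>one</li><li>two</li></ul></body></html>'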
Example #2
Source File: messaging.py    From okcupyd with MIT License
def content(self):
        """
        :returns: The text body of the message.
        """
        # The code that follows is obviously pretty disgusting.
        # It seems like it might be impossible to completely replicate
        # the text of the original message if it has trailing whitespace
        message = self._content_xpb.one_(self._message_element)
        first_line = message.text
        if message.text[:2] == '  ':
            first_line = message.text[2:]
        else:
            log.debug("message did not have expected leading whitespace")
        subsequent_lines = ''.join([
            html.tostring(child, encoding='unicode').replace('<br>', '\n')
            for child in message.iterchildren()
        ])
        message_text = first_line + subsequent_lines
        if len(message_text) > 0 and message_text[-1] == ' ':
            message_text = message_text[:-1]
        else:
            log.debug("message did not have expected trailing whitespace")

        return message_text 
Example #3
Source File: views.py    From openprescribing with MIT License
def gdoc_view(request, doc_id):
    try:
        gdoc_id = settings.GDOC_DOCS[doc_id]
    except KeyError:
        raise Http404("No doc named %s" % doc_id)
    url = "https://docs.google.com/document/d/%s/pub?embedded=true" % gdoc_id
    page = requests.get(url)
    tree = html.fromstring(page.text)

    content = (
        "<style>"
        + "".join(
            [
                html.tostring(child).decode("utf8")
                for child in tree.head.xpath("//style")
            ]
        )
        + "</style>"
    )
    content += "".join([html.tostring(child).decode("utf8") for child in tree.body])
    context = {"content": content}
    return render(request, "gdoc.html", context) 
Example #4
Source File: server.py    From autologin with Apache License 2.0
def download_page(url, cookie_jar):
    """
    Request page using authenticated cookies (cookiejar).
    Download html source and save in browser directory, to
    be used in show_in_browser().
    """
    browser_dir = os.path.join(server_path, 'static/browser')
    delete_directory_files(browser_dir)
    filename = '{}.html'.format(uuid.uuid4())
    filepath = os.path.join(browser_dir, filename)
    try:
        response = cookie_request(url, cookie_jar)
    except requests.RequestException as e:
        return e, None
    doc = html.document_fromstring(response.text)
    with open(filepath, 'wb') as f:
        f.write(html.tostring(doc))
    return None, filename 
Example #5
Source File: clean.py    From memorious with MIT License
def clean_html(context, data):
    """Clean an HTML DOM and store the changed version."""
    doc = _get_html_document(context, data)
    if doc is None:
        context.emit(data=data)
        return

    remove_paths = context.params.get('remove_paths')
    for path in ensure_list(remove_paths):
        for el in doc.xpath(path):
            el.drop_tree()

    html_text = html.tostring(doc, pretty_print=True)
    content_hash = context.store_data(html_text)
    data['content_hash'] = content_hash
    context.emit(data=data) 
Example #6
Source File: html.py    From mailur with GNU General Public License v3.0
def from_text(txt):
    def replace(match):
        txt = match.group()
        if '\n' in txt:
            return '<br>' * txt.count('\n')
        else:
            return '&nbsp;' * txt.count(' ')

    tpl = '<p>%s</p>'
    htm = escape(txt)
    htm = fromstring(tpl % htm)
    fix_links(htm)
    htm = tostring(htm, encoding='unicode')
    htm = htm[3:-4]
    htm = re.sub('(?m)((\r?\n)+| [ ]+|^ )', replace, htm)
    htm = tpl % htm
    return htm 
Example #7
Source File: xml.py    From ingestors with MIT License
def ingest(self, file_path):
        """Ingestor implementation."""
        file_size = self.result.size or os.path.getsize(file_path)
        if file_size > self.MAX_SIZE:
            raise ProcessingException("XML file is too large.")

        try:
            doc = etree.parse(file_path)
        except (ParserError, ParseError):
            raise ProcessingException("XML could not be parsed.")

        text = self.extract_html_text(doc.getroot())
        transform = etree.XSLT(self.XSLT)
        html_doc = transform(doc)
        html_body = html.tostring(html_doc, encoding=str, pretty_print=True)
        self.result.flag(self.result.FLAG_HTML)
        self.result.emit_html_body(html_body, text) 
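For readers unfamiliar with the XSLT step above: etree.XSLT compiles a stylesheet into a callable transform, and the resulting document can then be serialized with html.tostring(). A minimal sketch with a toy stylesheet (the ingestors project uses its own XSLT, not this one):

from lxml import etree, html

xslt = etree.XML("""\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    <html><body><p><xsl:value-of select="/note/text()"/></p></body></html>
  </xsl:template>
</xsl:stylesheet>""")

transform = etree.XSLT(xslt)
doc = etree.fromstring('<note>hello</note>')
result = transform(doc)
print(html.tostring(result, encoding=str, pretty_print=True))
# <html><body><p>hello</p></body></html>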
Example #8
Source File: external.py    From trafilatura with GNU General Public License v3.0
def try_justext(tree, url, target_language):
    '''Second safety net: try with the generic algorithm justext'''
    result_body = etree.Element('body')
    justtextstring = html.tostring(tree, pretty_print=False, encoding='utf-8')
    # determine language
    if target_language is not None and target_language in JUSTEXT_LANGUAGES:
        langsetting = JUSTEXT_LANGUAGES[target_language]
        justext_stoplist = justext.get_stoplist(langsetting)
    else:
        #justext_stoplist = justext.get_stoplist(JUSTEXT_DEFAULT)
        justext_stoplist = JT_STOPLIST
    # extract
    try:
        paragraphs = justext.justext(justtextstring, justext_stoplist, 50, 200, 0.1, 0.2, 0.2, 200, True)
    except ValueError as err:  # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        result_body = None
    else:
        for paragraph in paragraphs:
            if not paragraph.is_boilerplate:
                #if duplicate_test(paragraph) is not True:
                elem = etree.Element('p')
                elem.text = paragraph.text
                result_body.append(elem)
    return result_body 
Example #9
Source File: requests_html.py    From requests-html with MIT License
def xpath(self, selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) -> _XPath:
        """Given an XPath selector, returns a list of
        :class:`Element <Element>` objects or a single one.

        :param selector: XPath Selector to use.
        :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
        :param first: Whether or not to return just the first result.
        :param _encoding: The encoding format.

        If a sub-selector is specified (e.g. ``//a/@href``), a simple
        list of results is returned.

        See W3School's `XPath Examples
        <https://www.w3schools.com/xml/xpath_examples.asp>`_
        for more details.

        If ``first`` is ``True``, only returns the first
        :class:`Element <Element>` found.
        """
        selected = self.lxml.xpath(selector)

        elements = [
            Element(element=selection, url=self.url, default_encoding=_encoding or self.encoding)
            if not isinstance(selection, etree._ElementUnicodeResult) else str(selection)
            for selection in selected
        ]

        # Sanitize the found HTML.
        if clean:
            elements_copy = elements.copy()
            elements = []

            for element in elements_copy:
                element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
                elements.append(element)

        return _get_first_or_list(elements, first) 
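In practice this method is reached through the html attribute of a requests-html response; a brief usage sketch (the URL is a placeholder) of the behaviours the docstring describes:

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com')

links = r.html.xpath('//a/@href')                # sub-selector: a plain list of strings
first_div = r.html.xpath('//div', first=True)    # single Element (or None if nothing matches)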
Example #10
Source File: models.py    From jorvik with GNU General Public License v3.0
def processa_link(self):
        """
        Checks relative links in the e-mail and makes them absolute.
        """
        doc = html.document_fromstring(self.corpo)
        links = doc.xpath('//a')
        for el in links:
            try:
                url = el.attrib['href']
                if '://' not in url:
                    el.attrib['href'] = "https://gaia.cri.it%s" % (url,)
            except KeyError:
                continue
        self.corpo = html.tostring(doc, pretty_print=True).decode('UTF-8') 
Example #11
Source File: models.py    From jorvik with GNU General Public License v3.0
def corpo_body(self):
        """
        Tries to extract the body of the page (the body element).
        :return:
        """
        if not self.corpo:
            return ""
        doc = html.document_fromstring(self.corpo)
        body = doc.xpath('//body')[0]
        body.tag = 'div'
        #try:
        return html.tostring(body)
        #except:
        #    return self.corpo
        #print html.parse('http://someurl.at.domain').xpath('//body')[0].text_content() 
Example #12
Source File: utils.py    From jorvik with GNU General Public License v3.0
def get_drive_file(file):
    req = urllib.request.Request("https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=html" %(file,))
    str = urllib.request.urlopen(req).read().decode('UTF-8')
    doc = html.document_fromstring(str)
    head = doc.xpath('//head')[0]
    head.tag = 'div'
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    str = html.tostring(head)+html.tostring(body)
    return str 
Example #13
Source File: EntityLinking.py    From ClusType with GNU General Public License v3.0
def run(self):
        print "Start DBpediaSpotlight"
        g = open('tmp/temp' + str(self.offset) + '.txt', 'w')
        index = 0
        while 1:
            did = str(index + self.offset)
            if did in self.docList:
                try:
                    doc = self.docList[did]
                    url = "http://spotlight.sztaki.hu:2222/rest/annotate"
                    #url = "http://localhost:2222/rest/annotate"
                    data = {"confidence":self.confidence}
                    data["support"] = "20"
                    data["text"] = doc;
                    data = urllib.urlencode(data)
                    req = urllib2.Request(url)
                    req.add_header('Accept', 'application/json') #text/xml')
                    # print did
                    page = html.fromstring(urllib2.urlopen(req, data, timeout=100).read())
                    docJson = html.tostring(page)[3:-4]
                    #print docJson
                    validEntities = extractAnnotations(docJson)
                    for entity in validEntities:
                        linkToFreebase(entity)
                        if (entity['@URI'] != None):
                            g.write(str(index + self.offset) + '\t' + entity['@surfaceForm'] + '\t' + entity['@URI'] + '\t'
                             + entity['@similarityScore'] + '\t' + entity['@percentageOfSecondRank']+ '\n')
                    index += threadNum
                except:
                    index += threadNum
                    print 'noresult'
            else:
                break
        g.close() 
Example #14
Source File: lxml_toolkit_object.py    From enaml-web with MIT License
def render(self, method='html', encoding='unicode', **kwargs):
        """ Render the widget tree into a string """
        return tostring(self.widget, method=method, encoding=encoding, **kwargs) 
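The method and encoding keyword arguments that render() forwards to tostring() change how the tree is serialized; a small independent illustration (not part of enaml-web):

from lxml import html

el = html.fromstring('<p>hi<br>there</p>')

print(html.tostring(el))                                    # b'<p>hi<br>there</p>'
print(html.tostring(el, encoding='unicode'))                # '<p>hi<br>there</p>'
print(html.tostring(el, method='xml', encoding='unicode'))  # '<p>hi<br/>there</p>'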
Example #15
Source File: metadata.py    From trafilatura with GNU General Public License v3.0
def extract_url(tree, default_url=None):
    '''Extract the URL from the canonical link'''
    # https://www.tutorialrepublic.com/html-reference/html-base-tag.php
    # default url as fallback
    url = default_url
    # try canonical link first
    element = tree.find('.//head//link[@rel="canonical"]')
    if element is not None:
        url = element.attrib['href']
    # try default language link
    else:
        for element in tree.xpath('//head//link[@rel="alternate"]'):
            if 'hreflang' in element.attrib and element.attrib['hreflang'] is not None and element.attrib['hreflang'] == 'x-default':
                LOGGER.debug(html.tostring(element, pretty_print=False, encoding='unicode').strip())
                url = element.attrib['href']
    # add domain name if it's missing
    if url is not None and url.startswith('/'):
        for element in tree.xpath('//head//meta[@content]'):
            if 'name' in element.attrib:
                attrtype = element.attrib['name']
            elif 'property' in element.attrib:
                attrtype = element.attrib['property']
            else:
                continue
            if attrtype.startswith('og:') or attrtype.startswith('twitter:'):
                domain_match = re.match(r'https?://[^/]+', element.attrib['content'])
                if domain_match:
                    # prepend URL
                    url = domain_match.group(0) + url
                    break
    return url 
Example #16
Source File: core.py    From trafilatura with GNU General Public License v3.0
def determine_returnstring(docmeta, postbody, commentsbody, output_format, tei_validation, record_id):
    '''Convert XML tree to chosen format, clean the result and output it as a string'''
    # XML (TEI) steps
    if 'xml' in output_format:
        # last cleaning
        for element in postbody.iter():
            if len(element) == 0 and not element.text and not element.tail:
                parent = element.getparent()
                if parent is not None:
                    parent.remove(element)
        # build output trees
        if output_format == 'xml':
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif output_format == 'xmltei':
            output = build_tei_output(postbody, commentsbody, docmeta)
        # can be improved
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if output_format == 'xmltei' and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
        # output as string
        returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
    # CSV + TXT output
    else:
        if output_format == 'csv':
            posttext = xmltotxt(postbody)
            if commentsbody is not None:
                commentstext = xmltotxt(commentsbody)
            else:
                commentstext = ''
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)
    return returnstring 
Example #17
Source File: course_info_model.py    From ANALYSE with GNU Affero General Public License v3.0
def _course_info_content(html_parsed):
    """
    Constructs the HTML for the course info update, not including the header.
    """
    if len(html_parsed) == 1:
        # could enforce that update[0].tag == 'h2'
        content = html_parsed[0].tail
    else:
        content = html_parsed[0].tail if html_parsed[0].tail is not None else ""
        content += "\n".join([html.tostring(ele) for ele in html_parsed[1:]])
    return content 
Example #18
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License
def convert_html_to_telegraph_format(html_string, clean_html=True, output_format="json_string"):
    if clean_html:
        html_string = clean_article_html(html_string)

        body = preprocess_fragments(
            _fragments_from_string(html_string)
        )
        if body is not None:
            desc = [x for x in body.iterdescendants()]
            for tag in desc:
                preprocess_media_tags(tag)
            move_to_top(body)
            post_process(body)
    else:
        fragments = _fragments_from_string(html_string)
        body = fragments[0].getparent() if len(fragments) else None

    content = []
    if body is not None:
        content = [_recursive_convert(x) for x in body.iterchildren()]

    if output_format == 'json_string':
        return json.dumps(content, ensure_ascii=False)
    elif output_format == 'python_list':
        return content
    elif output_format == 'html_string':
        return html.tostring(body, encoding='unicode') 
Example #19
Source File: html_to_telegraph.py    From html-telegraph-poster with MIT License
def convert_json_to_html(elements):
    content = html.fragment_fromstring('<div></div>')
    for element in elements:
        content.append(_recursive_convert_json(element))
    content.make_links_absolute(base_url=base_url)
    for x in content.xpath('.//span'):
        x.drop_tag()
    html_string = html.tostring(content, encoding='unicode')
    html_string = replace_line_breaks_except_pre(html_string, '<br/>')
    html_string = html_string[5:-6]
    return html_string 
Example #20
Source File: hearth.py    From hearthstats with GNU General Public License v2.0
def get_deck_list(deckid):
    """
    For a given HearthPwn deck ID, return a list of Cards that belong to that
    deck.

    Parameters:

    - 'deckid' - a HearthPwn deck ID
    """
    # http://www.hearthpwn.com/decks/listing/ + deckid + /neutral or /class
    url = 'http://www.hearthpwn.com/decks/listing/'
    css = '#cards > tbody > tr > td.col-name'

    deck = []

    # Class Cards
    htmlelement = get_htmlelement_from_url(url + str(deckid) + '/class')
    cardelements = htmlelement.cssselect(css)
    # Neutral Cards
    htmlelement = get_htmlelement_from_url(url + str(deckid) + '/neutral')
    cardelements += htmlelement.cssselect(css)

    regex = re.compile(r'&#215;\s+(\d+)')
    for element in cardelements:
        # cssselect always returns an array, but in our case the result is
        # always just one element.
        cardname = element.cssselect('a')[0].text.strip()
        elementtext = html.tostring(element).decode('UTF-8')
        # There's probably a better way to get the amount, but we currently
        # look for the "x #" in the raw text of the element
        match = re.search(regex, elementtext)
        if match:
            amount = int(match.group(1))
        else:
            print('ERROR: Unable to get amount for card ' + cardname)
            # This shouldn't happen, but when it does, just continue on after
            # logging an error.
            amount = 0
        deck.append(Card(cardname, amount))

    return deck 
Example #21
Source File: html.py    From mailur with GNU General Public License v3.0
def fix_privacy(htm, only_proxy=False):
    if not htm.strip():
        return htm

    use_proxy = conf['USE_PROXY']
    if only_proxy and not use_proxy:
        return htm

    htm = fromstring(htm)
    for img in htm.xpath('//img[@src]'):
        src = img.attrib['src']
        if re.match('^(https?://|//).*', src):
            if src.startswith('//'):
                src = 'https:' + src
            if use_proxy:
                src = '/proxy?url=' + src
            if only_proxy:
                img.attrib['src'] = src
            else:
                img.attrib['data-src'] = src
                del img.attrib['src']

    if not only_proxy:
        # style could contain "background-image", etc.
        for el in htm.xpath('//*[@style]'):
            el.attrib['data-style'] = el.attrib['style']
            del el.attrib['style']

    htm = tostring(htm, encoding='unicode').strip()
    htm = re.sub('(^<div>|</div>$)', '', htm)
    return htm 
Example #22
Source File: cleanhtml.py    From zing with GNU General Public License v3.0
def url_trim(html):
    """Trims anchor texts that are longer than 70 chars."""
    fragment = fromstring(html)
    for el, attrib_, link_, pos_ in fragment.iterlinks():
        new_link_text = trim_url(el.text_content())
        el.text = new_link_text

    return mark_safe(tostring(fragment, encoding="unicode")) 
Example #23
Source File: __init__.py    From online-judge with GNU Affero General Public License v3.0
def fragment_tree_to_str(tree):
    return html.tostring(tree, encoding='unicode')[len('<div>'):-len('</div>')] 
Example #24
Source File: lxml_tree.py    From online-judge with GNU Affero General Public License v3.0
def __str__(self):
        return mark_safe(html.tostring(self._tree, encoding='unicode')) 
Example #25
Source File: test_items.py    From ant_nest with GNU Lesser General Public License v3.0
def test_extract_item():
    with open("./tests/test.html", "rb") as f:
        response = httpx.Response(
            200, request=httpx.Request("Get", "https://test.com"), content=f.read()
        )

    class Item:
        pass

    # extract item with xpath and regex
    item_extractor = Extractor(Item)
    item_extractor.add_extractor(
        "paragraph",
        lambda x: html.fromstring(x.text).xpath("/html/body/div/p/text()")[0],
    )
    item_extractor.add_extractor(
        "title", lambda x: re.findall(r"<title>([A-Z a-z]+)</title>", x.text)[0]
    )
    item = item_extractor.extract(response)
    assert item.paragraph == "test"
    assert item.title == "Test html"
    # extract with jpath
    response = httpx.Response(
        200,
        request=httpx.Request("Get", "https://test.com"),
        content=b'{"a": {"b": {"c": 1}}, "d": null}',
    )
    item_extractor = Extractor(Item)
    item_extractor.add_extractor(
        "author", lambda x: jpath.get_all("a.b.c", x.json())[0]
    )
    item_extractor.add_extractor("freedom", lambda x: jpath.get_all("d", x.json())[0])
    item = item_extractor.extract(response)
    assert item.author == 1
    assert item.freedom is None
    # ItemNestExtractor tests
    with open("./tests/test.html", "rb") as f:
        response = httpx.Response(
            200, request=httpx.Request("Get", "https://test.com"), content=f.read()
        )
    item_nest_extractor = NestExtractor(
        Item, lambda x: html.fromstring(x.text).xpath('//div[@id="nest"]/div')
    )
    item_nest_extractor.add_extractor("xpath_key", lambda x: x.xpath("./p/text()")[0])
    item_nest_extractor.add_extractor(
        "regex_key",
        lambda x: re.findall(r"regex(\d+)</", html.tostring(x, encoding="unicode"))[0],
    )
    temp = 1
    for item in item_nest_extractor.extract_items(response):
        assert item.xpath_key == str(temp)
        assert item.regex_key == str(temp)
        temp += 1 
Example #26
Source File: core.py    From trafilatura with GNU General Public License v3.0
def extract_content(tree, include_tables=False):
    '''Find the main content of a page using a set of XPath expressions,
       then extract relevant elements, strip them of unwanted subparts and
       convert them'''
    sure_thing = False
    result_body = etree.Element('body')
    # iterate
    for expr in BODY_XPATH:
        # select tree if the expression has been found
        subtree = tree.xpath(expr)
        if not subtree:
            continue
        subtree = subtree[0]
        # prune
        subtree = discard_unwanted(subtree)
        # remove elements by link density
        for elem in subtree.iter('list'):
            if link_density_test(elem) is True:
                elem.getparent().remove(elem)
                continue
            elem.attrib.clear()
            #for subelem in elem.iter('item'):
            #    subelem.attrib.clear()
        etree.strip_tags(subtree, 'a', 'link', 'span')
        # define iteration strategy
        potential_tags = set(TAG_CATALOG)  # + 'span'?
        if include_tables is True:
            potential_tags.add('table')
        # no paragraphs containing text
        if not subtree.xpath('//p//text()'):
            potential_tags.add('div')
        LOGGER.debug(sorted(potential_tags))
        # etree.strip_tags(subtree, 'lb') # BoingBoing-Bug
        # print(html.tostring(subtree, pretty_print=True, encoding='unicode'))
        # extract content
        processed_elems = [handle_textelem(e, potential_tags) for e in subtree.xpath('.//*')]
        # list(filter(None.__ne__, processed_elems))
        result_body.extend([e for e in processed_elems if e is not None])
        # exit the loop if the result has children
        if len(result_body) > 0:
            sure_thing = True
            LOGGER.debug(expr)
            break
    # try parsing wild <p> elements if nothing found or text too short
    temp_text = trim(' '.join(result_body.itertext()))
    len_text = len(temp_text)
    if len(result_body) == 0 or len_text < MIN_EXTRACTED_SIZE:
        result_body = recover_wild_paragraphs(tree, result_body)
        #search_tree = discard_unwanted(tree)
        #search_tree = prune_html(search_tree)
        #result_body, _, _ = baseline(search_tree)
        temp_text = trim(' '.join(result_body.itertext()))
        len_text = len(temp_text)
    # filter output
    etree.strip_elements(result_body, 'done')
    etree.strip_tags(result_body, 'div')
    # return
    return result_body, temp_text, len_text, sure_thing 
Example #27
Source File: html.py    From mailur with GNU General Public License v3.0
def clean(htm, embeds=None):
    htm = re.sub(r'^\s*<\?xml.*?\?>', '', htm).strip()
    if not htm:
        return '', {}

    htm = htm.replace('\r\n', '\n')
    cleaner = Cleaner(
        links=False,
        style=True,
        inline_style=False,
        kill_tags=['head'],
        remove_tags=['html', 'base'],
        safe_attrs=list(set(Cleaner.safe_attrs) - {'class'}) + ['style'],
    )
    htm = fromstring(htm)
    htm = cleaner.clean_html(htm)

    ext_images = 0
    embeds = embeds or {}
    for img in htm.xpath('//img[@src]'):
        src = img.attrib.get('src')
        cid = re.match('^cid:(.*)', src)
        url = cid and embeds.get('<%s>' % cid.group(1))
        if url:
            img.attrib['src'] = url
        elif re.match('^data:image/.*', src):
            pass
        elif re.match('^(https?://|//).*', src):
            ext_images += 1
        else:
            del img.attrib['src']

    styles = False
    for el in htm.xpath('//*[@style]'):
        styles = True
        break

    fix_links(htm)

    richer = (('styles', styles), ('ext_images', ext_images))
    richer = {k: v for k, v in richer if v}

    htm = tostring(htm, encoding='unicode').strip()
    htm = re.sub('(^<div>|</div>$)', '', htm)
    return htm, richer 
Example #28
Source File: weibo.py    From news_spider with MIT License
def parse_article_detail_js(self, response):
        """
        Parse article detail (JS-rendered version).
        :param response:
        :return:
        """
        article_detail_body = response.body_as_unicode()
        article_detail_rule = r'<script>FM.view\({"ns":.*?"html":"(.*?)"}\)</script>'
        article_detail_re_parse = re.compile(article_detail_rule, re.S).findall(article_detail_body)
        if not article_detail_re_parse:
            return
        article_detail_html = ''.join(article_detail_re_parse)

        # Handle escaped characters
        article_detail_html = article_detail_html.replace('\\r', '')
        article_detail_html = article_detail_html.replace('\\t', '')
        article_detail_html = article_detail_html.replace('\\n', '')
        article_detail_html = article_detail_html.replace('\\"', '"')
        article_detail_html = article_detail_html.replace('\\/', '/')

        article_detail_doc = fromstring(article_detail_html)

        article_title_parse = article_detail_doc.xpath('//h1[@class="title"]/text()')
        article_title = article_title_parse[0].strip() if article_title_parse else ''

        article_pub_time_parse = article_detail_doc.xpath('//span[@class="time"]/text()')
        article_pub_time = self.trans_time(article_pub_time_parse[0].strip()) if article_pub_time_parse else time.strftime('%Y-%m-%d %H:%M:%S')

        article_content_parse = article_detail_doc.xpath('//div[@class="WBA_content"]')
        article_content = tostring(article_content_parse[0], encoding='unicode').strip() if article_content_parse else ''

        fetch_result_item = FetchResultItem()
        fetch_result_item['task_id'] = response.meta['task_id']
        fetch_result_item['platform_id'] = response.meta['platform_id']
        fetch_result_item['platform_name'] = platform_name_map.get(response.meta['platform_id'], '')
        fetch_result_item['channel_id'] = response.meta['channel_id']
        fetch_result_item['channel_name'] = channel_name_map.get(response.meta['channel_id'], '')
        fetch_result_item['article_id'] = response.meta['article_id']
        fetch_result_item['article_title'] = article_title
        fetch_result_item['article_author_id'] = response.meta['follow_id']
        fetch_result_item['article_author_name'] = response.meta['follow_name']
        fetch_result_item['article_pub_time'] = time_local_to_utc(article_pub_time).strftime('%Y-%m-%d %H:%M:%S')
        fetch_result_item['article_url'] = response.url
        fetch_result_item['article_tags'] = ''
        fetch_result_item['article_abstract'] = response.meta['article_abstract']
        fetch_result_item['article_content'] = article_content
        yield fetch_result_item 
Example #29
Source File: utils.py    From scrape with MIT License
def write_part_file(args, url, raw_html, html=None, part_num=None):
    """Write PART.html file(s) to disk, images in PART_files directory.

    Keyword arguments:
    args -- program arguments (dict)
    raw_html -- unparsed HTML file content (list)
    html -- parsed HTML file content (lxml.html.HtmlElement) (default: None)
    part_num -- PART(#).html file number (int) (default: None)
    """
    if part_num is None:
        part_num = get_num_part_files() + 1
    filename = "PART{0}.html".format(part_num)

    # Decode bytes to string in Python 3 versions
    if not PY2 and isinstance(raw_html, bytes):
        raw_html = raw_html.decode("ascii", "ignore")

    # Convert html to an lh.HtmlElement object for parsing/saving images
    if html is None:
        html = lh.fromstring(raw_html)

    # Parse HTML if XPath entered
    if args["xpath"]:
        raw_html = parse_html(html, args["xpath"])
        if isinstance(raw_html, list):
            if not isinstance(raw_html[0], lh.HtmlElement):
                raise ValueError("XPath should return an HtmlElement object.")
        else:
            if not isinstance(raw_html, lh.HtmlElement):
                raise ValueError("XPath should return an HtmlElement object.")

    # Write HTML and possibly images to disk
    if raw_html:
        if not args["no_images"] and (args["pdf"] or args["html"]):
            raw_html = write_part_images(url, raw_html, html, filename)
        with open(filename, "w") as part:
            if not isinstance(raw_html, list):
                raw_html = [raw_html]
            if isinstance(raw_html[0], lh.HtmlElement):
                for elem in raw_html:
                    part.write(lh.tostring(elem, encoding="unicode"))
            else:
                for line in raw_html:
                    part.write(line)
Example #30
Source File: test_parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License
def setUp(self):
        query_user = User.query.filter_by(email='instapaper@example.com').first()
        if query_user:
            query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
            for bmark in query_bookmarks:
                db.session.delete(bmark)
            db.session.commit()
            db.session.delete(query_user)
            db.session.commit()
        create_user = User()
        create_user.first_name = 'Instapaper'
        create_user.last_name = 'Test'
        create_user.email = 'instapaper@example.com'
        create_user.password = 'instapaper_pass'
        create_user.active = True
        create_user.confirmed_at = datetime.datetime.utcnow()
        db.session.add(create_user)
        db.session.commit()
        self.user = create_user
        with open('Instapaper.html') as json_file:
            create_file = open(os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'], 'test_instapaper.html'), 'w+')
            self.data = html.document_fromstring(json_file.read())
            self.data = html.tostring(self.data)
            self.html_data = BeautifulSoup4(self.data)
            self.bookmarks = {}
            for tag in self.html_data.find_all('h1'):
                parent_elem = tag.find_next_sibling('ol')
                links = parent_elem.find_all('a')
                for link in links:
                    title = link.text
                    url = link['href']
                    tags = [tag.text]
                    tags.append('Imported')
                    #  Thanks Instapaper for not adding timestamps
                    self.bookmarks[url] = {
                        'href': url,
                        'title': title,
                        'tags': tags
                    }
            create_file.write(self.data)
            self.file_path = create_file.name
            create_file.close()
        init_parser = InstapaperParser(self.file_path, self.user.id)
        init_parser.process()
        init_parser.add_to_database()
        self.query = Bookmark.query.filter_by(user=self.user.id).all()
        self.html_parser = HTMLParser()