Python bs4.UnicodeDammit() Examples

The following are 18 code examples of bs4.UnicodeDammit(), drawn from open source projects. The source file and project for each example appear in the reference line above it. You may also want to check out the other available functions and classes of the bs4 module.
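Briefly, UnicodeDammit takes a byte string, guesses its encoding (optionally biased by a list of override encodings), and exposes the decoded text as unicode_markup and the detected encoding as original_encoding. A minimal sketch before the examples, using a made-up Latin-1 byte string:

from bs4 import UnicodeDammit

raw = b'caf\xe9'  # hypothetical sample: "café" encoded as Latin-1
dammit = UnicodeDammit(raw, ["utf-8", "latin-1"])
print(dammit.unicode_markup)     # the decoded text, e.g. 'café'
print(dammit.original_encoding)  # the encoding UnicodeDammit settled on, e.g. 'latin-1'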
Example #1
Source File: LinkedinSpider.py    From openslack-crawler with Apache License 2.0
def parse(self, response):
        """
        Default parse method; the crawl rule is not used for now.
        """
        # import pdb; pdb.set_trace()
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).markup
                yield personProfile 
Example #2
Source File: helper.py    From bazarr with GNU General Public License v3.0
def force_unicode(s):
    """
    Ensure a string is unicode, not encoded; used for enforcing file paths to be unicode upon saving a subtitle,
    to prevent encoding issues when saving a subtitle to a non-ascii path.
    :param s: string
    :return: unicode string
    """
    if not isinstance(s, str):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
            t = chardet.detect(s)
            try:
                s = s.decode(t["encoding"])
            except UnicodeDecodeError:
                s = UnicodeDammit(s).unicode_markup
    return s 
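A quick usage sketch for the helper above (the byte path is a made-up example); a value that is not valid UTF-8 falls through to chardet, and only then to UnicodeDammit:

force_unicode(b'/subs/na\xefve.srt')  # -> '/subs/naïve.srt' on a successful detection
force_unicode('already unicode')      # str input is returned unchanged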
Example #3
Source File: scan.py    From Python-Automation-Cookbook with MIT License
def search_txt(filename, word):
    '''
    Search the word in a text file
    '''
    # Detect the encoding
    with open(filename, 'rb') as file:
        content = file.read(1024)

    suggestion = UnicodeDammit(content)
    encoding = suggestion.original_encoding

    # Open and read
    with open(filename, encoding=encoding) as file:
        for line in file:
            if word in line.lower():
                return True

    return False 
Example #4
Source File: __init__.py    From ChemDataExtractor with MIT License
def get_encoding(input_string, guesses=None, is_html=False):
    """Return the encoding of a byte string. Uses bs4 UnicodeDammit.

    :param string input_string: Encoded byte string.
    :param list[string] guesses: (Optional) List of encoding guesses to prioritize.
    :param bool is_html: Whether the input is HTML.
    """
    # guesses is already a list, so pass it through rather than nesting it in another list
    converted = UnicodeDammit(input_string, override_encodings=guesses if guesses else [], is_html=is_html)
    return converted.original_encoding 
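A hedged usage sketch (hypothetical input); prioritizing a guess narrows what UnicodeDammit tries first:

get_encoding(b'<p>caf\xe9</p>', guesses=['latin-1'], is_html=True)  # -> 'latin-1', or whatever UnicodeDammit detects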
Example #5
Source File: subtitriid.py    From bazarr with GNU General Public License v3.0
def guess_encoding(self):
        # Override the default subtitle guess_encoding method to skip language-specific encoding guessing;
        # chardet encoding detection seems to yield better results
        """Guess encoding using chardet.

        :return: the guessed encoding.
        :rtype: str

        """
        if self._guessed_encoding:
            return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        # guess/detect encoding using chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            a = UnicodeDammit(self.content)

            logger.info("bs4 detected encoding: %s", a.original_encoding)

            if a.original_encoding:
                self._guessed_encoding = a.original_encoding
                return a.original_encoding
            raise ValueError(u"Couldn't guess the proper encoding for %s", self)

        self._guessed_encoding = encoding
        return encoding 
Example #6
Source File: nekur.py    From bazarr with GNU General Public License v3.0
def guess_encoding(self):
        # Override the default subtitle guess_encoding method to skip language-specific encoding guessing;
        # chardet encoding detection seems to yield better results
        """Guess encoding using chardet.

        :return: the guessed encoding.
        :rtype: str

        """
        if self._guessed_encoding:
            return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        # guess/detect encoding using chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            a = UnicodeDammit(self.content)

            logger.info("bs4 detected encoding: %s", a.original_encoding)

            if a.original_encoding:
                self._guessed_encoding = a.original_encoding
                return a.original_encoding
            raise ValueError(u"Couldn't guess the proper encoding for %s", self)

        self._guessed_encoding = encoding
        return encoding 
Example #7
Source File: ubuntu.py    From apt-smart with MIT License
def discover_mirror_selection():
    """Discover "geographically suitable" Ubuntu mirrors."""
    timer = Timer()
    logger.info("Identifying fast Ubuntu mirrors using %s ..", MIRROR_SELECTION_URL)
    data = fetch_url(MIRROR_SELECTION_URL, timeout=3, retry=True, max_attempts=5)
    # shorter timeout with more retries is good for unstable connections to MIRROR_SELECTION_URL
    dammit = UnicodeDammit(data)
    mirrors = set(
        CandidateMirror(mirror_url=mirror_url.strip())
        for mirror_url in dammit.unicode_markup.splitlines()
        if mirror_url and not mirror_url.isspace() and mirror_url.startswith(('http://', 'https://'))
    )
    logger.debug("Found %s in %s.", pluralize(len(mirrors), "fast Ubuntu mirror"), timer)
    return mirrors 
Example #8
Source File: tnef.py    From stoq-plugins-public with Apache License 2.0
def scan(self, payload: Payload, request: Request) -> WorkerResponse:
        extracted: List[ExtractedPayload] = []
        tnef_results = TNEF(payload.content)
        if tnef_results.attachments:
            for tnef_attachment in tnef_results.attachments:
                try:
                    filename = UnicodeDammit(tnef_attachment.name).unicode_markup
                except Exception:
                    filename = "None"
                tnef_meta = PayloadMeta(extra_data={'filename': filename})
                extracted.append(ExtractedPayload(tnef_attachment.data, tnef_meta))
        return WorkerResponse(extracted=extracted) 
Example #9
Source File: iocextract.py    From stoq-plugins-public with Apache License 2.0
def scan(self, payload: Payload, request: Request) -> WorkerResponse:

        normalize: bool = True
        ioctype: str = 'all'
        results: Dict = {}

        if ioctype == 'all':
            for ioc in self.compiled_re:
                if self.compiled_re[ioc]:
                    matches = self.compiled_re[ioc].findall(UnicodeDammit(payload.content).unicode_markup)
                    if matches:
                        results[ioc] = list(set(matches))
        elif self.compiled_re[ioctype]:
            matches = self.compiled_re[ioctype].findall(UnicodeDammit(payload.content).unicode_markup)
            if matches:
                results[ioctype] = list(set(matches))

        if 'ipv6' in results:
            results['ipv6'] = [
                address for address in results['ipv6'] if self._validate_ipv6(address)
            ]
            if not results['ipv6']:
                results.pop('ipv6')

        if normalize:
            results = self._normalize(results)

        return WorkerResponse(results) 
Example #10
Source File: connector.py    From dataiku-contrib with Apache License 2.0
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                            partition_id=None, records_limit = -1):
        """
        The main reading method.
        """

        url_book = self.mirror
        lid = len(str(self.book_id))
        fullbid = str(self.book_id)
        rootbid = fullbid  # sometimes the id used to access a file varies, e.g. fullbid=14285-8 for book 14285

        stopit = 0
        for i in range(lid-1):
            if (fullbid[i+1] != "-") and (stopit==0):
                url_book += '/'+fullbid[i]
            else:
                stopit=1
                rootbid=fullbid[0:i]
        url_book += '/'+ rootbid  + '/'+ fullbid + '.txt'

        response = urlopen(url_book)
        raw = response.read()   #.decode('utf8')
        converted = UnicodeDammit(raw)
        raw = converted.unicode_markup
        start_book = raw.find("START OF")
        end_book = raw.rfind('END OF')
        preamb = raw[:start_book]

        author = [ i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Author') != -1][0]
        title = [ i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Title') != -1][0]
        date = [ i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Release Date') != -1][0]
        book_paraph =  raw[start_book:end_book].split("\r\n\r\n")

        logger.info("Book length %s" % len(raw))
        logger.info("N paragraphs:", len(book_paraph))

        for id_p, p in enumerate(book_paraph):
            yield {'id':id_p, 'author': author, 'title': title, 'text': p} 
Example #11
Source File: selector.py    From ChemDataExtractor with MIT License
def _get_encoding(cls, input_string, encoding):
        converted = UnicodeDammit(input_string, [encoding] if encoding else [])
        # Not worth raising exception? lxml will raise if parse fails.
        # if not converted.unicode_markup:
        #     raise UnicodeDecodeError('Failed to detect encoding')
        return converted.original_encoding 
Example #12
Source File: one.py    From falsy with MIT License
def pycurl_get_resp(data_buf, headers, payload, resp):
    charset = None
    if 'content-type' in headers:
        content_type = headers['content-type'].lower()
        match = re.search(r'charset=(\S+)', content_type)
        if match:
            charset = match.group(1)
            print('Decoding using %s' % charset)
    body = data_buf.getvalue()
    if len(body) == 0:
        data = ''
        charset = 'utf-8'
    else:
        if charset is None:
            dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            data = dammit.unicode_markup
            charset = dammit.original_encoding
        else:
            data = body.decode(charset, 'ignore')
    # headers.remove({})
    headers['content'] = [h for h in headers['content'] if len(h) > 0]
    soup_lxml = BeautifulSoup(data, 'lxml')
    soup_html = BeautifulSoup(data, 'html.parser')
    resp.update({
        'url': payload.get('url'),
        # 'soup': soup,
        'title': get_title(soup_lxml),
        'links': get_links(soup_lxml),
        'links2': get_links2(soup_lxml),
        'metas': get_metas(soup_lxml),
        'images': get_images(soup_lxml),
        'scripts': get_scripts(soup_lxml),
        'text': get_text(soup_html),
        'data': data,
        'headers': headers,
        'charset': charset,
        'spider': 'pycurl',
        'payload': payload,
    }) 
Example #13
Source File: chromeboy.py    From falsy with MIT License
def beautify(self, data, charset):
        dammit = UnicodeDammit(data, [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
        data = dammit.unicode_markup
        return data 
Example #14
Source File: gDocParse.py    From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License
def getItem(self, itemUrl, addlHeaders=None):

		content, handle = self.wg.getpage(itemUrl, returnMultiple=True, addlHeaders={'Referer': self.refererUrl})
		if not content or not handle:
			raise ValueError("Failed to retreive file from page '%s'!" % itemUrl)



		info = handle.info()
		if 'Content-Disposition' not in info:
			info['Content-Disposition'] = ''

		fileN = jsLiteralParse.parseContentDispositon(info['Content-Disposition'], itemUrl)
		fileN = bs4.UnicodeDammit(fileN).unicode_markup

		mType = handle.info()['Content-Type']

		# If there is an encoding in the content-type (or any other info), strip it out.
		# We don't care about the encoding, since WebRequest will already have handled that,
		# and returned a decoded unicode object.

		if mType and ";" in mType:
			mType = mType.split(";")[0].strip()


		self.log.info("Retreived file of type '%s', name of '%s' with a size of %0.3f K", mType, fileN, len(content)/1000.0)
		return content, fileN, mType 
Example #15
Source File: rsc.py    From ChemDataExtractor with MIT License
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        raise ValueError('Failed to detect encoding')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root 
Example #16
Source File: request.py    From falsy with MIT License
async def post_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_post(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            # encoding = None
            # if 'content-type' in headers:
            #     content_type = headers['content-type'].lower()
            #     match = re.search('charset=(\S+)', content_type)
            #     if match:
            #         encoding = match.group(1)
            #         print('Decoding using %s' % encoding)
            body = data_buf.getvalue()
            encoding = 'utf-8'
            data = body.decode(encoding, 'ignore') if len(body) > 0 else ''

            # if encoding is None:
            #     dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            #     data = dammit.unicode_markup
            #     encoding = dammit.original_encoding
            # else:
            #     data = body.decode(encoding, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]

            resp.update({
                # 'url': payload.get('url'),
                'data': data,
                'headers': headers,
                'encoding': encoding,
            })
            post_func = payload.get('post_func')
            if type(post_func) == str:
                post_func = load(post_func)
            if post_func:
                resp = post_func(payload, resp)
            # post_func = payload.get('post_func')
            # if post_func:
            #     post_func = load(post_func)
            #     resp = post_func(payload, resp)
            return resp
    finally:
        c.close() 
Example #17
Source File: request.py    From falsy with MIT License
async def get_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            charset = None
            if 'content-type' in headers:
                content_type = headers['content-type'].lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    charset = match.group(1)
                    print('Decoding using %s' % charset)
            body = data_buf.getvalue()
            if len(body) == 0:
                data = ''
                charset = 'utf-8'
            else:
                if charset is None:
                    dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
                    data = dammit.unicode_markup
                    charset = dammit.original_encoding
                else:
                    data = body.decode(charset, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]
            soup_lxml = BeautifulSoup(data, 'lxml')
            soup_html = BeautifulSoup(data, 'html.parser')
            resp.update({
                'url': payload.get('url'),
                # 'soup': soup,
                'title': get_title(soup_lxml),
                'links': get_links(soup_lxml),
                'links2': get_links2(soup_lxml),
                'metas': get_metas(soup_lxml),
                'images': get_images(soup_lxml),
                'scripts': get_scripts(soup_lxml),
                'text': get_text(soup_html),
                'data': data,
                'headers': headers,
                'charset': charset,
                'spider': 'pycurl',
                'payload': payload,
            })
            post_func = payload.get('post_func')
            if post_func:
                post_func = load(post_func)
                resp = post_func(payload, resp)
            return resp
    finally:
        c.close() 
Example #18
Source File: gDocParse.py    From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License
def extract(self):
		try:
			arch, fName = self.wg.getFileAndName(self.url, addlHeaders={'Referer': self.refererUrl})
		except IndexError:
			print("ERROR: Failure retrieving page!")
			return None, []

		baseName = fName.split(".")[0]

		if not isinstance(arch, bytes):
			if 'You need permission' in arch or 'Sign in to continue to Docs' in arch:
				self.log.critical("Retrieving zip archive failed?")
				self.log.critical("Retreived content type: '%s'", type(arch))
				raise TypeError("Cannot access document? Is it protected?")
			else:
				with open("tmp_page.html", "w") as fp:
					fp.write(arch)
				raise ValueError("Doc not valid?")

		zp = io.BytesIO(arch)
		zfp = zipfile.ZipFile(zp)

		resources = []
		baseFile = None

		for item in zfp.infolist():
			if not "/" in item.filename and not baseFile:
				contents = zfp.open(item).read()
				contents = bs4.UnicodeDammit(contents).unicode_markup

				baseFile = (item.filename, contents)

			elif baseName in item.filename and baseName:
				raise ValueError("Multiple base file items?")

			else:
				resources.append((item.filename, mimetypes.guess_type(item.filename)[0], zfp.open(item).read()))

		if not baseFile:
			raise ValueError("No base file found!")

		return baseFile, resources