Python bs4.UnicodeDammit() Examples

The following are 18 code examples of bs4.UnicodeDammit(), drawn from open source projects. The source file and project for each example appear in the reference line above it. You may also want to check out the other available functions and classes of the bs4 module.
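Briefly, UnicodeDammit takes a byte string, guesses its encoding (optionally biased by a list of override encodings), and exposes the decoded text as unicode_markup and the detected encoding as original_encoding. A minimal sketch before the examples, using a made-up Latin-1 byte string:

from bs4 import UnicodeDammit

raw = b'caf\xe9'  # hypothetical sample: "café" encoded as Latin-1
dammit = UnicodeDammit(raw, ["utf-8", "latin-1"])
print(dammit.unicode_markup)     # the decoded text, e.g. 'café'
print(dammit.original_encoding)  # the encoding UnicodeDammit settled on, e.g. 'latin-1'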
Example #1
Source File: LinkedinSpider.py    From openslack-crawler with Apache License 2.0
def parse(self, response):
        """
        Default parse method; the crawl rule is not used for now.
        """
        # import pdb; pdb.set_trace()
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).markup
                yield personProfile 
Example #2
Source File: helper.py    From bazarr with GNU General Public License v3.0
def force_unicode(s):
    """
    Ensure a string is unicode, not encoded; used for enforcing file paths to be unicode upon saving a subtitle,
    to prevent encoding issues when saving a subtitle to a non-ascii path.
    :param s: string
    :return: unicode string
    """
    if not isinstance(s, str):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
            t = chardet.detect(s)
            try:
                s = s.decode(t["encoding"])
            except UnicodeDecodeError:
                s = UnicodeDammit(s).unicode_markup
    return s 
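A quick usage sketch for the helper above (the byte path is a made-up example); a value that is not valid UTF-8 falls through to chardet, and only then to UnicodeDammit:

force_unicode(b'/subs/na\xefve.srt')  # -> '/subs/naïve.srt' on a successful detection
force_unicode('already unicode')      # str input is returned unchanged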
Example #3
Source File: scan.py    From Python-Automation-Cookbook with MIT License
def search_txt(filename, word):
    '''
    Search the word in a text file
    '''
    # Detect the encoding
    with open(filename, 'rb') as file:
        content = file.read(1024)

    suggestion = UnicodeDammit(content)
    encoding = suggestion.original_encoding

    # Open and read
    with open(filename, encoding=encoding) as file:
        for line in file:
            if word in line.lower():
                return True

    return False 
Example #4
Source File: __init__.py    From ChemDataExtractor with MIT License
def get_encoding(input_string, guesses=None, is_html=False):
    """Return the encoding of a byte string. Uses bs4 UnicodeDammit.

    :param string input_string: Encoded byte string.
    :param list[string] guesses: (Optional) List of encoding guesses to prioritize.
    :param bool is_html: Whether the input is HTML.
    """
    # guesses is already a list, so pass it through rather than nesting it in another list
    converted = UnicodeDammit(input_string, override_encodings=guesses if guesses else [], is_html=is_html)
    return converted.original_encoding 
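A hedged usage sketch (hypothetical input); prioritizing a guess narrows what UnicodeDammit tries first:

get_encoding(b'<p>caf\xe9</p>', guesses=['latin-1'], is_html=True)  # -> 'latin-1', or whatever UnicodeDammit detects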
Example #5
Source File: subtitriid.py    From bazarr with GNU General Public License v3.0
def guess_encoding(self):
        # Override the default subtitle guess_encoding method to skip language-specific encoding guessing;
        # chardet encoding detection seems to yield better results
        """Guess encoding using chardet.

        :return: the guessed encoding.
        :rtype: str

        """
        if self._guessed_encoding:
            return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        # guess/detect encoding using chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            a = UnicodeDammit(self.content)

            logger.info("bs4 detected encoding: %s", a.original_encoding)

            if a.original_encoding:
                self._guessed_encoding = a.original_encoding
                return a.original_encoding
            raise ValueError(u"Couldn't guess the proper encoding for %s", self)

        self._guessed_encoding = encoding
        return encoding 
Example #6
Source File: nekur.py    From bazarr with GNU General Public License v3.0
def guess_encoding(self):
        # Override the default subtitle guess_encoding method to skip language-specific encoding guessing;
        # chardet encoding detection seems to yield better results
        """Guess encoding using chardet.

        :return: the guessed encoding.
        :rtype: str

        """
        if self._guessed_encoding:
            return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        # guess/detect encoding using chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            a = UnicodeDammit(self.content)

            logger.info("bs4 detected encoding: %s", a.original_encoding)

            if a.original_encoding:
                self._guessed_encoding = a.original_encoding
                return a.original_encoding
            raise ValueError(u"Couldn't guess the proper encoding for %s", self)

        self._guessed_encoding = encoding
        return encoding 
Example #7
Source File: ubuntu.py    From apt-smart with MIT License
def discover_mirror_selection():
    """Discover "geographically suitable" Ubuntu mirrors."""
    timer = Timer()
    logger.info("Identifying fast Ubuntu mirrors using %s ..", MIRROR_SELECTION_URL)
    data = fetch_url(MIRROR_SELECTION_URL, timeout=3, retry=True, max_attempts=5)
    # shorter timeout with more retries is good for unstable connections to MIRROR_SELECTION_URL
    dammit = UnicodeDammit(data)
    mirrors = set(
        CandidateMirror(mirror_url=mirror_url.strip())
        for mirror_url in dammit.unicode_markup.splitlines()
        if mirror_url and not mirror_url.isspace() and mirror_url.startswith(('http://', 'https://'))
    )
    logger.debug("Found %s in %s.", pluralize(len(mirrors), "fast Ubuntu mirror"), timer)
    return mirrors 
Example #8
Source File: tnef.py    From stoq-plugins-public with Apache License 2.0
def scan(self, payload: Payload, request: Request) -> WorkerResponse:
        extracted: List[ExtractedPayload] = []
        tnef_results = TNEF(payload.content)
        if tnef_results.attachments:
            for tnef_attachment in tnef_results.attachments:
                try:
                    filename = UnicodeDammit(tnef_attachment.name).unicode_markup
                except Exception:
                    filename = "None"
                tnef_meta = PayloadMeta(extra_data={'filename': filename})
                extracted.append(ExtractedPayload(tnef_attachment.data, tnef_meta))
        return WorkerResponse(extracted=extracted) 
Example #9
Source File: iocextract.py    From stoq-plugins-public with Apache License 2.0
def scan(self, payload: Payload, request: Request) -> WorkerResponse:

        normalize: bool = True
        ioctype: str = 'all'
        results: Dict = {}

        if ioctype == 'all':
            for ioc in self.compiled_re:
                if self.compiled_re[ioc]:
                    matches = self.compiled_re[ioc].findall(UnicodeDammit(payload.content).unicode_markup)
                    if matches:
                        results[ioc] = list(set(matches))
        elif self.compiled_re[ioctype]:
            matches = self.compiled_re[ioctype].findall(UnicodeDammit(payload.content).unicode_markup)
            if matches:
                results[ioctype] = list(set(matches))

        if 'ipv6' in results:
            results['ipv6'] = [
                address for address in results['ipv6'] if self._validate_ipv6(address)
            ]
            if not results['ipv6']:
                results.pop('ipv6')

        if normalize:
            results = self._normalize(results)

        return WorkerResponse(results) 
Example #10
Source File: connector.py    From dataiku-contrib with Apache License 2.0
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                            partition_id=None, records_limit = -1):
        """
        The main reading method.
        """

        url_book = self.mirror
        lid = len(str(self.book_id))
        fullbid = str(self.book_id)
        rootbid = fullbid  # sometimes the id used to access a file varies, e.g. fullbid=14285-8 for book 14285

        stopit = 0
        for i in range(lid-1):
            if (fullbid[i+1] != "-") and (stopit==0):
                url_book += '/'+fullbid[i]
            else:
                stopit=1
                rootbid=fullbid[0:i]
        url_book += '/'+ rootbid  + '/'+ fullbid + '.txt'

        response = urlopen(url_book)
        raw = response.read()   #.decode('utf8')
        converted = UnicodeDammit(raw)
        raw = converted.unicode_markup
        start_book = raw.find("START OF")
        end_book = raw.rfind('END OF')
        preamb = raw[:start_book]

        author = [ i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Author') != -1][0]
        title = [ i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Title') != -1][0]
        date = [ i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Release Date') != -1][0]
        book_paraph =  raw[start_book:end_book].split("\r\n\r\n")

        logger.info("Book length %s" % len(raw))
        logger.info("N paragraphs:", len(book_paraph))

        for id_p, p in enumerate(book_paraph):
            yield {'id':id_p, 'author': author, 'title': title, 'text': p} 
Example #11
Source File: selector.py    From ChemDataExtractor with MIT License
def _get_encoding(cls, input_string, encoding):
        converted = UnicodeDammit(input_string, [encoding] if encoding else [])
        # Not worth raising exception? lxml will raise if parse fails.
        # if not converted.unicode_markup:
        #     raise UnicodeDecodeError('Failed to detect encoding')
        return converted.original_encoding 
Example #12
Source File: one.py    From falsy with MIT License
def pycurl_get_resp(data_buf, headers, payload, resp):
    charset = None
    if 'content-type' in headers:
        content_type = headers['content-type'].lower()
        match = re.search(r'charset=(\S+)', content_type)
        if match:
            charset = match.group(1)
            print('Decoding using %s' % charset)
    body = data_buf.getvalue()
    if len(body) == 0:
        data = ''
        charset = 'utf-8'
    else:
        if charset is None:
            dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            data = dammit.unicode_markup
            charset = dammit.original_encoding
        else:
            data = body.decode(charset, 'ignore')
    # headers.remove({})
    headers['content'] = [h for h in headers['content'] if len(h) > 0]
    soup_lxml = BeautifulSoup(data, 'lxml')
    soup_html = BeautifulSoup(data, 'html.parser')
    resp.update({
        'url': payload.get('url'),
        # 'soup': soup,
        'title': get_title(soup_lxml),
        'links': get_links(soup_lxml),
        'links2': get_links2(soup_lxml),
        'metas': get_metas(soup_lxml),
        'images': get_images(soup_lxml),
        'scripts': get_scripts(soup_lxml),
        'text': get_text(soup_html),
        'data': data,
        'headers': headers,
        'charset': charset,
        'spider': 'pycurl',
        'payload': payload,
    }) 
Example #13
Source File: chromeboy.py    From falsy with MIT License
def beautify(self, data, charset):
        dammit = UnicodeDammit(data, [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
        data = dammit.unicode_markup
        return data 
Example #14
Source File: gDocParse.py    From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License
def getItem(self, itemUrl, addlHeaders=None):

		content, handle = self.wg.getpage(itemUrl, returnMultiple=True, addlHeaders={'Referer': self.refererUrl})
		if not content or not handle:
			raise ValueError("Failed to retreive file from page '%s'!" % itemUrl)



		info = handle.info()
		if 'Content-Disposition' not in info:
			info['Content-Disposition'] = ''

		fileN = jsLiteralParse.parseContentDispositon(info['Content-Disposition'], itemUrl)
		fileN = bs4.UnicodeDammit(fileN).unicode_markup

		mType = handle.info()['Content-Type']

		# If there is an encoding in the content-type (or any other info), strip it out.
		# We don't care about the encoding, since WebRequest will already have handled that,
		# and returned a decoded unicode object.

		if mType and ";" in mType:
			mType = mType.split(";")[0].strip()


		self.log.info("Retreived file of type '%s', name of '%s' with a size of %0.3f K", mType, fileN, len(content)/1000.0)
		return content, fileN, mType 
Example #15
Source File: rsc.py    From ChemDataExtractor with MIT License
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        raise ValueError('Failed to detect encoding')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root 
Example #16
Source File: request.py    From falsy with MIT License
async def post_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_post(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            # encoding = None
            # if 'content-type' in headers:
            #     content_type = headers['content-type'].lower()
            #     match = re.search('charset=(\S+)', content_type)
            #     if match:
            #         encoding = match.group(1)
            #         print('Decoding using %s' % encoding)
            body = data_buf.getvalue()
            encoding = 'utf-8'
            data = body.decode(encoding, 'ignore') if len(body) > 0 else ''

            # if encoding is None:
            #     dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            #     data = dammit.unicode_markup
            #     encoding = dammit.original_encoding
            # else:
            #     data = body.decode(encoding, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]

            resp.update({
                # 'url': payload.get('url'),
                'data': data,
                'headers': headers,
                'encoding': encoding,
            })
            post_func = payload.get('post_func')
            if type(post_func) == str:
                post_func = load(post_func)
            if post_func:
                resp = post_func(payload, resp)
            # post_func = payload.get('post_func')
            # if post_func:
            #     post_func = load(post_func)
            #     resp = post_func(payload, resp)
            return resp
    finally:
        c.close() 
Example #17
Source File: request.py    From falsy with MIT License
async def get_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
            charset = None
            if 'content-type' in headers:
                content_type = headers['content-type'].lower()
                match = re.search(r'charset=(\S+)', content_type)
                if match:
                    charset = match.group(1)
                    print('Decoding using %s' % charset)
            body = data_buf.getvalue()
            if len(body) == 0:
                data = ''
                charset = 'utf-8'
            else:
                if charset is None:
                    dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
                    data = dammit.unicode_markup
                    charset = dammit.original_encoding
                else:
                    data = body.decode(charset, 'ignore')
            # headers.remove({})
            headers['content'] = [h for h in headers['content'] if len(h) > 0]
            soup_lxml = BeautifulSoup(data, 'lxml')
            soup_html = BeautifulSoup(data, 'html.parser')
            resp.update({
                'url': payload.get('url'),
                # 'soup': soup,
                'title': get_title(soup_lxml),
                'links': get_links(soup_lxml),
                'links2': get_links2(soup_lxml),
                'metas': get_metas(soup_lxml),
                'images': get_images(soup_lxml),
                'scripts': get_scripts(soup_lxml),
                'text': get_text(soup_html),
                'data': data,
                'headers': headers,
                'charset': charset,
                'spider': 'pycurl',
                'payload': payload,
            })
            post_func = payload.get('post_func')
            if post_func:
                post_func = load(post_func)
                resp = post_func(payload, resp)
            return resp
    finally:
        c.close() 
Example #18
Source File: gDocParse.py    From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License
def extract(self):
		try:
			arch, fName = self.wg.getFileAndName(self.url, addlHeaders={'Referer': self.refererUrl})
		except IndexError:
			print("ERROR: Failure retrieving page!")
			return None, []

		baseName = fName.split(".")[0]

		if not isinstance(arch, bytes):
			if 'You need permission' in arch or 'Sign in to continue to Docs' in arch:
				self.log.critical("Retrieving zip archive failed?")
				self.log.critical("Retreived content type: '%s'", type(arch))
				raise TypeError("Cannot access document? Is it protected?")
			else:
				with open("tmp_page.html", "w") as fp:
					fp.write(arch)
				raise ValueError("Doc not valid?")

		zp = io.BytesIO(arch)
		zfp = zipfile.ZipFile(zp)

		resources = []
		baseFile = None

		for item in zfp.infolist():
			if not "/" in item.filename and not baseFile:
				contents = zfp.open(item).read()
				contents = bs4.UnicodeDammit(contents).unicode_markup

				baseFile = (item.filename, contents)

			elif baseName in item.filename and baseName:
				raise ValueError("Multiple base file items?")

			else:
				resources.append((item.filename, mimetypes.guess_type(item.filename)[0], zfp.open(item).read()))

		if not baseFile:
			raise ValueError("No base file found!")

		return baseFile, resources