Python bs4.UnicodeDammit() Examples
The following are 18 code examples of bs4.UnicodeDammit(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module bs4, or try the search function.
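Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: pass raw bytes to UnicodeDammit, read the decoded text from unicode_markup, and check original_encoding to see what encoding was detected. The sample bytes and the encoding hints are illustrative only, and the detected encoding can vary depending on whether chardet or cchardet is installed.

    from bs4 import UnicodeDammit

    # Raw bytes whose encoding is unknown (Latin-1 here, purely for illustration).
    raw_bytes = "Caf\xe9 cr\xe8me".encode("latin-1")

    # Let UnicodeDammit guess the encoding and decode the bytes.
    dammit = UnicodeDammit(raw_bytes)
    print(dammit.unicode_markup)     # decoded text, e.g. "Café crème"
    print(dammit.original_encoding)  # whichever encoding UnicodeDammit settled on

    # Encoding guesses can be passed to steer detection,
    # as several of the examples below do for CJK content.
    hinted = UnicodeDammit(raw_bytes, ["utf-8", "latin-1"])
    print(hinted.original_encoding)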
Example #1
Source File: LinkedinSpider.py From openslack-crawler with Apache License 2.0 | 6 votes |
def parse(self, response):
    """
    default parse method, rule is not useful now
    """
    # import pdb; pdb.set_trace()
    response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    log.msg("Parse: index level:" + str(index_level))
    if index_level in [1, 2, 3, 4]:
        self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                log.msg('yield process, url:' + url)
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        personProfile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
        if linkedin_id:
            personProfile['_id'] = linkedin_id
            personProfile['url'] = UnicodeDammit(response.url).markup
            yield personProfile
Example #2
Source File: helper.py From bazarr with GNU General Public License v3.0 | 6 votes |
def force_unicode(s):
    """
    Ensure a string is unicode, not encoded; used for enforcing file paths to be
    unicode upon saving a subtitle, to prevent encoding issues when saving a
    subtitle to a non-ascii path.
    :param s: string
    :return: unicode string
    """
    if not isinstance(s, str):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
            t = chardet.detect(s)
            try:
                s = s.decode(t["encoding"])
            except UnicodeDecodeError:
                s = UnicodeDammit(s).unicode_markup
    return s
Example #3
Source File: scan.py From Python-Automation-Cookbook with MIT License | 6 votes |
def search_txt(filename, word):
    '''
    Search the word in a text file
    '''
    # Detect the encoding
    with open(filename, 'rb') as file:
        content = file.read(1024)
    suggestion = UnicodeDammit(content)
    encoding = suggestion.original_encoding

    # Open and read
    with open(filename, encoding=encoding) as file:
        for line in file:
            if word in line.lower():
                return True
    return False
Example #4
Source File: __init__.py From ChemDataExtractor with MIT License | 5 votes |
def get_encoding(input_string, guesses=None, is_html=False):
    """Return the encoding of a byte string. Uses bs4 UnicodeDammit.

    :param string input_string: Encoded byte string.
    :param list[string] guesses: (Optional) List of encoding guesses to prioritize.
    :param bool is_html: Whether the input is HTML.
    """
    converted = UnicodeDammit(input_string, override_encodings=[guesses] if guesses else [], is_html=is_html)
    return converted.original_encoding
Example #5
Source File: subtitriid.py From bazarr with GNU General Public License v3.0 | 5 votes |
def guess_encoding(self):
    # override default subtitle guess_encoding method to not include
    # language-specific encodings guessing
    # chardet encoding detection seem to yield better results
    """Guess encoding using chardet.

    :return: the guessed encoding.
    :rtype: str
    """
    if self._guessed_encoding:
        return self._guessed_encoding

    logger.info('Guessing encoding for language %s', self.language)

    # guess/detect encoding using chardet
    encoding = chardet.detect(self.content)['encoding']
    logger.info('Chardet found encoding %s', encoding)

    if not encoding:
        # fallback on bs4
        logger.info('Falling back to bs4 detection')
        a = UnicodeDammit(self.content)

        logger.info("bs4 detected encoding: %s", a.original_encoding)

        if a.original_encoding:
            self._guessed_encoding = a.original_encoding
            return a.original_encoding
        raise ValueError(u"Couldn't guess the proper encoding for %s", self)

    self._guessed_encoding = encoding
    return encoding
Example #6
Source File: nekur.py From bazarr with GNU General Public License v3.0 | 5 votes |
def guess_encoding(self):
    # override default subtitle guess_encoding method to not include
    # language-specific encodings guessing
    # chardet encoding detection seem to yield better results
    """Guess encoding using chardet.

    :return: the guessed encoding.
    :rtype: str
    """
    if self._guessed_encoding:
        return self._guessed_encoding

    logger.info('Guessing encoding for language %s', self.language)

    # guess/detect encoding using chardet
    encoding = chardet.detect(self.content)['encoding']
    logger.info('Chardet found encoding %s', encoding)

    if not encoding:
        # fallback on bs4
        logger.info('Falling back to bs4 detection')
        a = UnicodeDammit(self.content)

        logger.info("bs4 detected encoding: %s", a.original_encoding)

        if a.original_encoding:
            self._guessed_encoding = a.original_encoding
            return a.original_encoding
        raise ValueError(u"Couldn't guess the proper encoding for %s", self)

    self._guessed_encoding = encoding
    return encoding
Example #7
Source File: ubuntu.py From apt-smart with MIT License | 5 votes |
def discover_mirror_selection():
    """Discover "geographically suitable" Ubuntu mirrors."""
    timer = Timer()
    logger.info("Identifying fast Ubuntu mirrors using %s ..", MIRROR_SELECTION_URL)
    data = fetch_url(MIRROR_SELECTION_URL, timeout=3, retry=True, max_attempts=5)
    # shorter timeout with more retries is good for unstable connections to MIRROR_SELECTION_URL
    dammit = UnicodeDammit(data)
    mirrors = set(
        CandidateMirror(mirror_url=mirror_url.strip())
        for mirror_url in dammit.unicode_markup.splitlines()
        if mirror_url and not mirror_url.isspace() and mirror_url.startswith(('http://', 'https://'))
    )
    logger.debug("Found %s in %s.", pluralize(len(mirrors), "fast Ubuntu mirror"), timer)
    return mirrors
Example #8
Source File: tnef.py From stoq-plugins-public with Apache License 2.0 | 5 votes |
def scan(self, payload: Payload, request: Request) -> WorkerResponse:
    extracted: List[ExtractedPayload] = []
    tnef_results = TNEF(payload.content)
    if tnef_results.attachments:
        for tnef_attachment in tnef_results.attachments:
            try:
                filename = UnicodeDammit(tnef_attachment.name).unicode_markup
            except:
                filename = "None"
            tnef_meta = PayloadMeta(extra_data={'filename': filename})
            extracted.append(ExtractedPayload(tnef_attachment.data, tnef_meta))
    return WorkerResponse(extracted=extracted)
Example #9
Source File: iocextract.py From stoq-plugins-public with Apache License 2.0 | 5 votes |
def scan(self, payload: Payload, request: Request) -> WorkerResponse:
    normalize: bool = True
    ioctype: str = 'all'
    results: Dict = {}
    if ioctype == 'all':
        for ioc in self.compiled_re:
            if self.compiled_re[ioc]:
                matches = self.compiled_re[ioc].findall(UnicodeDammit(payload.content).unicode_markup)
                if matches:
                    results[ioc] = list(set(matches))
    elif self.compiled_re[ioctype]:
        matches = self.compiled_re[ioctype].findall(UnicodeDammit(payload.content).unicode_markup)
        if matches:
            results[ioctype] = list(set(matches))
    if 'ipv6' in results:
        results['ipv6'] = [
            address for address in results['ipv6'] if self._validate_ipv6(address)
        ]
        if not results['ipv6']:
            results.pop('ipv6')
    if normalize:
        results = self._normalize(results)
    return WorkerResponse(results)
Example #10
Source File: connector.py From dataiku-contrib with Apache License 2.0 | 5 votes |
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit=-1):
    """
    The main reading method.
    """
    url_book = self.mirror
    lid = len(str(self.book_id))
    fullbid = str(self.book_id)
    rootbid = fullbid
    # sometimes the id to access a file has a variation, ex fullbid=14285-8 for the book 14285
    stopit = 0
    for i in range(lid - 1):
        if (fullbid[i + 1] != "-") and (stopit == 0):
            url_book += '/' + fullbid[i]
        else:
            stopit = 1
            rootbid = fullbid[0:i]

    url_book += '/' + rootbid + '/' + fullbid + '.txt'

    response = urlopen(url_book)
    raw = response.read()  # .decode('utf8')
    converted = UnicodeDammit(raw)
    raw = converted.unicode_markup

    start_book = raw.find("START OF")
    end_book = raw.rfind('END OF')
    preamb = raw[:start_book]

    author = [i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Author') != -1][0]
    title = [i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Title') != -1][0]
    date = [i.split(':')[1].strip() for i in preamb.split("\r\n\r\n") if i.find('Release Date') != -1][0]

    book_paraph = raw[start_book:end_book].split("\r\n\r\n")
    logger.info("Book length %s" % len(raw))
    logger.info("N paragraphs:", len(book_paraph))

    for id_p, p in enumerate(book_paraph):
        yield {'id': id_p, 'author': author, 'title': title, 'text': p}
Example #11
Source File: selector.py From ChemDataExtractor with MIT License | 5 votes |
def _get_encoding(cls, input_string, encoding):
    converted = UnicodeDammit(input_string, [encoding] if encoding else [])
    # Not worth raising exception? lxml will raise if parse fails.
    # if not converted.unicode_markup:
    #     raise UnicodeDecodeError('Failed to detect encoding')
    return converted.original_encoding
Example #12
Source File: one.py From falsy with MIT License | 5 votes |
def pycurl_get_resp(data_buf, headers, payload, resp):
    charset = None
    if 'content-type' in headers:
        content_type = headers['content-type'].lower()
        match = re.search('charset=(\S+)', content_type)
        if match:
            charset = match.group(1)
            print('Decoding using %s' % charset)
    body = data_buf.getvalue()
    if len(body) == 0:
        data = ''
        charset = 'utf-8'
    else:
        if charset is None:
            dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"],
                                   smart_quotes_to="html")
            data = dammit.unicode_markup
            charset = dammit.original_encoding
        else:
            data = body.decode(charset, 'ignore')
    # headers.remove({})
    headers['content'] = [h for h in headers['content'] if len(h) > 0]

    soup_lxml = BeautifulSoup(data, 'lxml')
    soup_html = BeautifulSoup(data, 'html.parser')
    resp.update({
        'url': payload.get('url'),
        # 'soup': soup,
        'title': get_title(soup_lxml),
        'links': get_links(soup_lxml),
        'links2': get_links2(soup_lxml),
        'metas': get_metas(soup_lxml),
        'images': get_images(soup_lxml),
        'scripts': get_scripts(soup_lxml),
        'text': get_text(soup_html),
        'data': data,
        'headers': headers,
        'charset': charset,
        'spider': 'pycurl',
        'payload': payload,
    })
Example #13
Source File: chromeboy.py From falsy with MIT License | 5 votes |
def beautify(self, data, charset):
    dammit = UnicodeDammit(data, [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"],
                           smart_quotes_to="html")
    data = dammit.unicode_markup
    return data
Example #14
Source File: gDocParse.py From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License | 5 votes |
def getItem(self, itemUrl, addlHeaders=None):
    content, handle = self.wg.getpage(itemUrl, returnMultiple=True,
                                      addlHeaders={'Referer': self.refererUrl})
    if not content or not handle:
        raise ValueError("Failed to retreive file from page '%s'!" % itemUrl)

    info = handle.info()
    if not 'Content-Disposition' in info:
        info['Content-Disposition'] = ''

    fileN = jsLiteralParse.parseContentDispositon(info['Content-Disposition'], itemUrl)
    fileN = bs4.UnicodeDammit(fileN).unicode_markup
    mType = handle.info()['Content-Type']

    # If there is an encoding in the content-type (or any other info), strip it out.
    # We don't care about the encoding, since WebRequest will already have handled that,
    # and returned a decoded unicode object.
    if mType and ";" in mType:
        mType = mType.split(";")[0].strip()

    self.log.info("Retreived file of type '%s', name of '%s' with a size of %0.3f K",
                  mType, fileN, len(content) / 1000.0)
    return content, fileN, mType
Example #15
Source File: rsc.py From ChemDataExtractor with MIT License | 4 votes |
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        raise UnicodeDecodeError('Failed to detect encoding, tried [%s]')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
Example #16
Source File: request.py From falsy with MIT License | 4 votes |
async def post_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_post(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
        # encoding = None
        # if 'content-type' in headers:
        #     content_type = headers['content-type'].lower()
        #     match = re.search('charset=(\S+)', content_type)
        #     if match:
        #         encoding = match.group(1)
        #         print('Decoding using %s' % encoding)
        body = data_buf.getvalue()
        encoding = 'utf-8'
        data = body.decode(encoding, 'ignore') if len(body) > 0 else ''
        # if encoding is None:
        #     dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
        #     data = dammit.unicode_markup
        #     encoding = dammit.original_encoding
        # else:
        #     data = body.decode(encoding, 'ignore')
        # headers.remove({})
        headers['content'] = [h for h in headers['content'] if len(h) > 0]
        resp.update({
            # 'url': payload.get('url'),
            'data': data,
            'headers': headers,
            'encoding': encoding,
        })
        post_func = payload.get('post_func')
        if type(post_func) == str:
            post_func = load(post_func)
        if post_func:
            resp = post_func(payload, resp)
        # post_func = payload.get('post_func')
        # if post_func:
        #     post_func = load(post_func)
        #     resp = post_func(payload, resp)
        return resp
    finally:
        c.close()
Example #17
Source File: request.py From falsy with MIT License | 4 votes |
async def get_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers, share)  # header_buf)

        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
        charset = None
        if 'content-type' in headers:
            content_type = headers['content-type'].lower()
            match = re.search('charset=(\S+)', content_type)
            if match:
                charset = match.group(1)
                print('Decoding using %s' % charset)
        body = data_buf.getvalue()
        if len(body) == 0:
            data = ''
            charset = 'utf-8'
        else:
            if charset is None:
                dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"],
                                       smart_quotes_to="html")
                data = dammit.unicode_markup
                charset = dammit.original_encoding
            else:
                data = body.decode(charset, 'ignore')
        # headers.remove({})
        headers['content'] = [h for h in headers['content'] if len(h) > 0]

        soup_lxml = BeautifulSoup(data, 'lxml')
        soup_html = BeautifulSoup(data, 'html.parser')
        resp.update({
            'url': payload.get('url'),
            # 'soup': soup,
            'title': get_title(soup_lxml),
            'links': get_links(soup_lxml),
            'links2': get_links2(soup_lxml),
            'metas': get_metas(soup_lxml),
            'images': get_images(soup_lxml),
            'scripts': get_scripts(soup_lxml),
            'text': get_text(soup_html),
            'data': data,
            'headers': headers,
            'charset': charset,
            'spider': 'pycurl',
            'payload': payload,
        })
        post_func = payload.get('post_func')
        if post_func:
            post_func = load(post_func)
            resp = post_func(payload, resp)
        return resp
    finally:
        c.close()
Example #18
Source File: gDocParse.py From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License | 4 votes |
def extract(self):
    try:
        arch, fName = self.wg.getFileAndName(self.url, addlHeaders={'Referer': self.refererUrl})
    except IndexError:
        print("ERROR: Failure retrieving page!")
        return None, []

    baseName = fName.split(".")[0]

    if not isinstance(arch, bytes):
        if 'You need permission' in arch or 'Sign in to continue to Docs':
            self.log.critical("Retrieving zip archive failed?")
            self.log.critical("Retreived content type: '%s'", type(arch))
            raise TypeError("Cannot access document? Is it protected?")
        else:
            with open("tmp_page.html", "w") as fp:
                fp.write(arch)
            raise ValueError("Doc not valid?")

    zp = io.BytesIO(arch)
    zfp = zipfile.ZipFile(zp)

    resources = []
    baseFile = None
    for item in zfp.infolist():
        if not "/" in item.filename and not baseFile:
            contents = zfp.open(item).read()
            contents = bs4.UnicodeDammit(contents).unicode_markup
            baseFile = (item.filename, contents)
        elif baseName in item.filename and baseName:
            raise ValueError("Multiple base file items?")
        else:
            resources.append((item.filename, mimetypes.guess_type(item.filename)[0], zfp.open(item).read()))

    if not baseFile:
        raise ValueError("No base file found!")

    return baseFile, resources