Python bs4.NavigableString() Examples
The following are 30 code examples of bs4.NavigableString(), collected from open-source projects. Each example is listed with its source file, originating project, and license. You may also want to check out the other available functions and classes of the bs4 module.
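As background for the examples below: NavigableString is the bs4 class that represents a text node in the parse tree, and most of the snippets on this page either construct one or use an isinstance check to tell text nodes apart from tags. A minimal standalone sketch (not taken from any of the listed projects; the input HTML is made up):

from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup("<p>Hello</p>", "html.parser")
# Append a new text node to the <p> tag.
soup.p.append(NavigableString(" world"))
# Text nodes are distinguished from tags with isinstance checks.
for node in soup.p.children:
    print(isinstance(node, NavigableString), repr(node))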
Example #1
Source File: Answer.py From zhihu-terminal with MIT License
def print_content(contents):
    for content in contents:
        name = content.name
        #if not isinstance(content, Tag):
        if isinstance(content, NavigableString):
            s = str(content)
            s = s.replace("\n", "")
            print s.strip()
        else:
            if name == "img":
                '''
                img = content.find("img")
                if img:
                    print img.get("src")
                '''
                print "[图片]"
            elif name == "br":
                print ""
            elif name == "noscript":
                continue
            elif name == "li":
                print "•",
                print_content(content.contents)
Example #2
Source File: html_clear.py From ns4_chatbot with Apache License 2.0
def __clear(parent_node, config):
    # return bs.prettify()
    content = ""
    # print parent_node
    if isinstance(parent_node, NavigableString):
        return parent_node.string

    if parent_node.name in line_elements:
        content += "\n"

    children = parent_node.contents
    for child in children:
        if child.name == "table":
            content += parse_table(child, config)
        else:
            content += __clear(child, config)
    return content
Example #3
Source File: file_docx.py From docassemble with MIT License
def get_children(descendants, parsed):
    subelement = False
    descendants_buff = deque()
    if descendants is None:
        return descendants_buff
    if (isinstance(descendants, NavigableString)):
        parsed.append(descendants)
    else:
        for child in descendants.children:
            if (child.name == None):
                if (subelement == False):
                    parsed.append(child)
                else:
                    descendants_buff.append(child)
            else:
                if (subelement == False):
                    subelement = True
                    descendants_buff.append(child)
                else:
                    descendants_buff.append(child)
        descendants_buff.reverse()
    return descendants_buff
Example #4
Source File: __init__.py From uoft-scrapers with MIT License
def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass
        elif content.name == 'li':
            text = content.text
        else:
            text = content.text
        text = text.strip()
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    paragraph = paragraph.strip()
    return paragraph
Example #5
Source File: __init__.py From uoft-scrapers with MIT License
def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass
        elif content.name == 'li':
            text = content.text
        else:
            text = content.text
        text = text.strip()
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    paragraph = paragraph.replace('  ', ' ')
    paragraph = paragraph.strip()
    return paragraph
Example #6
Source File: process_tag.py From DashTable with MIT License
def process_tag(node):
    """
    Recursively go through a tag's children, converting them, then
    convert the tag itself.
    """
    text = ''
    exceptions = ['table']
    for element in node.children:
        if isinstance(element, NavigableString):
            text += element
        elif not node.name in exceptions:
            text += process_tag(element)
    try:
        convert_fn = globals()["convert_%s" % node.name.lower()]
        text = convert_fn(node, text)
    except KeyError:
        pass
    return text
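A usage sketch for process_tag (hypothetical input; since no convert_* helpers are defined here, the KeyError branch fires and the function simply concatenates the text nodes):

from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup("<p>Hello <b>world</b></p>", "html.parser")
print(process_tag(soup.p))  # prints "Hello world"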
Example #7
Source File: fetcher.py From Greynir with GNU General Public License v3.0
def extract_text(soup, result):
    """ Append the human-readable text found in an HTML soup
        to the result TextList """
    if soup is None:
        return
    for t in soup.children:
        if type(t) == NavigableString:
            # Text content node
            result.append(t)
        elif isinstance(t, NavigableString):
            # Comment, CDATA or other text data: ignore
            pass
        elif t.name in Fetcher._BREAK_TAGS:
            result.insert_break()
            # html.parser (erroneously) nests content inside
            # <br> and <hr> tags
            Fetcher.extract_text(t, result)
        elif t.name in Fetcher._WHITESPACE_TAGS:
            # Tags that we interpret as whitespace, such as <img>
            result.append_whitespace()
            # html.parser nests content inside <img> tags if
            # they are not explicitly closed
            Fetcher.extract_text(t, result)
        elif t.name in Fetcher._BLOCK_TAGS:
            # Nested block tag
            result.begin()  # Begin block
            Fetcher.extract_text(t, result)
            result.end()  # End block
        elif t.name in Fetcher._INLINE_BLOCK_TAGS:
            # Put whitespace around the inline block
            # so that words don't run together
            result.append_whitespace()
            Fetcher.extract_text(t, result)
            result.append_whitespace()
        elif t.name not in Fetcher._EXCLUDE_TAGS:
            # Non-block tag
            Fetcher.extract_text(t, result)
Example #8
Source File: doc_utils.py From axcell with Apache License 2.0
def _insert_anchor(el, anchor_id, prefix="xxanchor"):
    el.insert(0, NavigableString(f' {prefix}-{anchor_id} '))
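A usage sketch with a made-up table cell, assuming _insert_anchor and the bs4 imports are in scope:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<td>92.4</td>", "html.parser")
_insert_anchor(soup.td, 17)
print(soup.td)  # <td> xxanchor-17 92.4</td>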
Example #9
Source File: css_match.py From plugin.git.browser with GNU General Public License v3.0
def is_navigable_string(obj):
    """Is navigable string."""
    import bs4
    return isinstance(obj, bs4.NavigableString)
Example #10
Source File: trac_export.py From allura with Apache License 2.0
def parse_ticket_attachments(self, id):
    SIZE_PATTERN = r'(\d+) bytes'
    TIMESTAMP_PATTERN = r'(.+) in Timeline'
    # Scrape HTML to get ticket attachments
    url = self.full_url(self.ATTACHMENT_LIST_URL % id)
    self.log_url(url)
    f = urlopen(url)
    soup = BeautifulSoup(f)
    attach = soup.find('div', id='attachments')
    list = []
    while attach:
        attach = attach.findNext('dt')
        if not attach:
            break
        d = {}
        d['filename'] = attach.a['href'].rsplit('/', 1)[1]
        d['url'] = self.full_url(self.ATTACHMENT_URL % (id, d['filename']))
        size_s = attach.span['title']
        d['size'] = int(self.match_pattern(SIZE_PATTERN, size_s))
        timestamp_s = attach.find('a', {'class': 'timeline'})['title']
        d['date'] = self.trac2z_date(
            self.match_pattern(TIMESTAMP_PATTERN, timestamp_s))
        d['by'] = attach.find(
            text=re.compile('added by')).nextSibling.renderContents()
        d['description'] = ''
        # Skip whitespace
        while attach.nextSibling and isinstance(attach.nextSibling, NavigableString):
            attach = attach.nextSibling
        # if there's a description, there will be a <dd> element, otherwise
        # immediately the next <dt>
        if attach.nextSibling and attach.nextSibling.name == 'dd':
            desc_el = attach.nextSibling
            if desc_el:
                # TODO: Convert to Allura link syntax as needed
                d['description'] = ''.join(
                    desc_el.findAll(text=True)).strip()
        list.append(d)
    return list
Example #11
Source File: subscene.py From bazarr with GNU General Public License v3.0
def get_first_film(soup, section, year=None, session=None):
    tag_part = SectionsParts[section]
    tag = None

    headers = soup.find("div", "search-result").find_all("h2")
    for header in headers:
        if tag_part in header.text:
            tag = header
            break

    if not tag:
        return

    url = None

    if not year:
        url = SITE_DOMAIN + tag.findNext("ul").find("li").div.a.get("href")
    else:
        for t in tag.findNext("ul").findAll("li"):
            if isinstance(t, NavigableString) or not t.div:
                continue
            if str(year) in t.div.a.string:
                url = SITE_DOMAIN + t.div.a.get("href")
                break
        if not url:
            # fallback to non-year results
            logger.info("Falling back to non-year results as year wasn't found (%s)", year)
            url = SITE_DOMAIN + tag.findNext("ul").find("li").div.a.get("href")

    return Film.from_url(url, session=session)
Example #12
Source File: css_match.py From bazarr with GNU General Public License v3.0
def is_navigable_string(obj):
    """Is navigable string."""
    import bs4
    return isinstance(obj, bs4.NavigableString)
Example #13
Source File: soupy.py From soupy with MIT License
def __new__(cls, value, *args, **kwargs):
    if isinstance(value, NavigableString):
        return object.__new__(NavigableStringNode)
    return object.__new__(cls)
Example #14
Source File: html.py From quay with Apache License 2.0
def _html2text(elem):
    for child in elem.children:
        if isinstance(child, Tag):
            _html2text(child)
        elif isinstance(child, NavigableString):
            # No changes necessary
            continue

    if elem.parent:
        if elem.name in _ELEMENT_REPLACER:
            _ELEMENT_REPLACER[elem.name](elem)
Example #15
Source File: default.py From Greynir with GNU General Public License v3.0
def _get_content(self, soup_body):
    """ Find the article content (main text) in the soup """
    content = ScrapeHelper.div_class(soup_body, "article-body")

    # Some sports event pages don't have an article__body
    if not content:
        return BeautifulSoup("", _HTML_PARSER)  # Return empty soup.

    # Get rid of stuff we don't want
    ScrapeHelper.del_tag(content, "h3")
    ScrapeHelper.del_tag(content, "figure")
    ScrapeHelper.del_div_class(content, "embed")

    # First char in first paragraph is wrapped in its own span tag
    # for styling purposes, which separates it from the rest of the word.
    # We extract the character and insert it into the first p tag
    firstchar = ""
    span = content.find("span", {"class": "article-dropcap"})
    if span:
        firstchar = span.get_text()
        span.decompose()

    for div in content.find_all("div", {"class": "read-more-block"}):
        div.decompose()
    for div in content.find_all("div", {"class": "sja-einnig"}):
        div.decompose()
    for div in content.find_all("div", {"class": "img-block"}):
        div.decompose()

    # Insert it in the first paragraph
    ptag = content.find("p")
    if ptag and firstchar:
        ptag.insert(0, NavigableString(firstchar))

    return content
Example #16
Source File: css_match.py From Tautulli with GNU General Public License v3.0
def is_navigable_string(obj):
    """Is navigable string."""
    import bs4
    return isinstance(obj, bs4.NavigableString)
Example #17
Source File: css_match.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def is_navigable_string(obj):
    """Is navigable string."""
    import bs4
    return isinstance(obj, bs4.NavigableString)
Example #18
Source File: extract_tables.py From axcell with Apache License 2.0
def wrap_elem_content(elem, begin, end):
    elem.insert(0, NavigableString(begin))
    elem.append(NavigableString(end))
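A usage sketch with hypothetical wrapper markers, assuming wrap_elem_content and NavigableString are in scope:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<b>bold</b>", "html.parser")
wrap_elem_content(soup.b, "**", "**")
print(soup.b)  # <b>**bold**</b>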
Example #19
Source File: get_references_web_single_group.py From fine-lm with MIT License
def soup_strings(soup):
  paragraph_tags = set(["caption", "details", "h1", "h2", "h3", "h4", "h5",
                        "h6", "li", "p", "td", "div", "span"])

  skip_children = None
  for descendant in soup.descendants:
    # If we've treated a tag as a contiguous paragraph, don't re-emit the
    # children (see below).
    if skip_children is not None:
      try:
        in_skip = descendant in skip_children
      except RecursionError:
        # Possible for this check to hit a nasty infinite recursion because of
        # BeautifulSoup __eq__ checks.
        in_skip = True
      if in_skip:
        continue
      else:
        skip_children = None

    # Treat some tags as contiguous paragraphs, regardless of other tags nested
    # inside (like <a> or <b>).
    if isinstance(descendant, bs4.Tag):
      if descendant.name in paragraph_tags:
        if descendant.find_all(paragraph_tags):
          # If there are nested paragraph tags, don't treat it as a single
          # contiguous tag.
          continue
        skip_children = list(descendant.descendants)
        text = " ".join(descendant.get_text(" ", strip=True).split())
        if text:
          yield text
      continue

    if (isinstance(descendant, bs4.Comment) or
        not isinstance(descendant, bs4.NavigableString)):
      continue

    text = " ".join(descendant.strip().split())
    if text:
      yield text
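A usage sketch on a made-up document: the outer <div> contains nested paragraph tags and is therefore skipped, while the inline <b> is folded into its enclosing <p>:

import bs4

soup = bs4.BeautifulSoup(
    "<div><p>First para</p><p>Second <b>para</b></p></div>", "html.parser")
print(list(soup_strings(soup)))  # ['First para', 'Second para']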
Example #20
Source File: css_match.py From soupsieve with MIT License
def is_navigable_string(obj):
    """Is navigable string."""
    return isinstance(obj, bs4.NavigableString)
Example #21
Source File: make_catena_input.py From NeuralDater with Apache License 2.0
def make_catena_input(src, dest):
    text = open(src).read()
    soup = BeautifulSoup(text, 'xml')

    soup.find('DCT').insert_after(soup.new_tag('TITLE'))
    soup.find('DCT').append(soup.new_tag('TIMEX3', functionInDocument="CREATION_TIME",
                                         temporalFunction="false", tid="t0",
                                         type="DATE", value=""))

    for e in soup.find_all('event'):
        new_e = soup.new_tag('EVENT', **e.attrs)
        new_e.insert(0, NavigableString(e.get_text()))
        e.replaceWith(new_e)

    [s.extract() for s in soup('TLINK')]

    with open(args.dest + src.split('/')[-1] + '.tml', 'w') as f:
        f.write(str(soup))
Example #22
Source File: whitelist.py From wagtail with BSD 3-Clause "New" or "Revised" License
def clean_node(self, doc, node):
    """Clean a BeautifulSoup document in-place"""
    if isinstance(node, NavigableString):
        self.clean_string_node(doc, node)
    elif isinstance(node, Tag):
        self.clean_tag_node(doc, node)
    # This branch is here in case node is a BeautifulSoup object that does
    # not inherit from NavigableString or Tag. I can't find any examples
    # of such a thing at the moment, so this branch is untested.
    else:  # pragma: no cover
        self.clean_unknown_node(doc, node)
Example #23
Source File: amazon_invoice.py From beancount-import with GNU General Public License v2.0
def get_text_lines(parent_node):
    text_lines = ['']
    for node in parent_node.children:
        if isinstance(node, bs4.NavigableString):
            text_lines[-1] += str(node)
        elif node.name == 'br':
            text_lines.append('')
        else:
            text_lines[-1] += node.text
    return text_lines
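A usage sketch (made-up input) showing how <br> tags split the accumulated text into separate lines:

import bs4

soup = bs4.BeautifulSoup("<p>Line one<br/>Line two</p>", "html.parser")
print(get_text_lines(soup.p))  # ['Line one', 'Line two']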
Example #24
Source File: FontRemapProcessors.py From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License
def apply_correction_map(soup, tag, cor_map):
    for item in list(tag.descendants):
        if isinstance(item, bs4.NavigableString):
            origstr = str(item)
            itemstr = origstr
            for fontset in cor_map:
                for badc, goodc in fontset.items():
                    if badc in itemstr:
                        itemstr = itemstr.replace(badc, goodc)
            if origstr != itemstr:
                news = soup.new_string(itemstr)
                item.replace_with(news)
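A usage sketch with a hypothetical one-entry correction map that replaces a remapped glyph with its intended character:

import bs4

soup = bs4.BeautifulSoup("<p>h\u00e9llo</p>", "html.parser")
apply_correction_map(soup, soup.p, [{"\u00e9": "e"}])
print(soup.p)  # <p>hello</p>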
Example #25
Source File: fetchhaodoo.py From nlputils with MIT License
def process_audio(self, abookid, soup):
    table = soup.find('table', border='0', cellspacing='0',
                      cellpadding='0', width="530")
    h4 = table.h4
    font = h4.find('font', color="CC0000")
    a = h4.a
    match = re_audiobooktitle.match(h4.get_text().strip())
    if font:
        author = font.string.strip()
    else:
        author = match.group(1)
    if a:
        title = a.string.strip().strip('《》【】')
        bookid = parse_page(a['href'])
    else:
        title = match.group(2)
        bookid = None
    bookrecorder = match.group(3)
    contenttable = h4.parent.table
    audiofiles = []
    filename = chapter = cnum = cauthor = recorder = add_date = None
    for td in contenttable.find_all('td'):
        if td.audio:
            filename = td.audio.source['src']
            audiofiles.append((filename, abookid, cnum, chapter,
                               cauthor, recorder, add_date))
            filename = chapter = cnum = cauthor = recorder = add_date = None
        else:
            a = td.a
            if a:
                cnum = int(parse_page(a['href']).split(':')[-1])
                chapter = a.string.strip()
                match = re_recorder.match(str(a.next_sibling).strip())
                if match:
                    recorder = match.group(1)
                aleftstr = a.previous_sibling
                if isinstance(aleftstr, bs4.NavigableString):
                    cauthor = str(aleftstr).strip().rstrip('《【')
            else:
                cnum = chapter = recorder = None
            match = re_audiobookchapter.match(td.get_text().strip())
            if not match:
                match = re_audiobookchapter2.match(td.get_text().strip())
            if match:
                # check real no author
                if cauthor is None:
                    cauthor = match.group(1)
                cauthor = cauthor or None
                chapter = chapter or match.group(2)
                recorder = recorder or match.group(3)
                add_date = date_fmt(match.group(4))
    return bookid, title, author, bookrecorder, audiofiles
Example #26
Source File: nekur.py From bazarr with GNU General Public License v3.0
def query(self, title):
    subtitles = []

    data = {
        'ajax': '1',
        'sSearch': title,
    }

    r = self.session.post(self.search_url, data=data, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

    # loop over subtitle cells
    rows = soup.select('tbody > tr')
    for row in rows:
        # title
        title_anchor_el = row.select_one('.title > a')
        title_inner_text = [element for element in title_anchor_el
                            if isinstance(element, NavigableString)]
        title = title_inner_text[0].strip()

        # year
        year = row.select_one('.year').text.strip('()')

        # download link
        href = title_anchor_el.get('href')
        download_link = self.server_url + href

        # imdb id
        imdb_td = row.select_one('td:nth-of-type(4)')
        imdb_link = imdb_td.select_one('a').get('href')
        imdb_id = imdb_link.split('/')[-2]

        # fps
        fps = row.select_one('.fps').text.strip()

        # additional notes
        notes = row.select_one('.notes').text.strip()

        # page link = download link (there is no separate subtitle page link)
        page_link = download_link

        # create/add the subtitle
        subtitle = self.subtitle_class(Language.fromalpha2('lv'), page_link, download_link,
                                       title, year, imdb_id, fps, notes)
        logger.debug('nekur: Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
Example #27
Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License
def GenerateHTML(self, controller, minify=False, prettify=False):
  soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Doctype):
      x.extract()

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Declaration):
      x.extract()

  # Remove all imports.
  imports = soup.findAll('link', rel='import')
  for imp in imports:
    imp.extract()

  # Remove all script links.
  scripts_external = soup.findAll('script', src=True)
  for script in scripts_external:
    script.extract()

  # Remove all in-line scripts.
  scripts_external = soup.findAll('script', src=None)
  for script in scripts_external:
    script.extract()

  # Process all in-line styles.
  inline_styles = soup.findAll('style')
  for style in inline_styles:
    html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
    if html:
      ns = soup.new_tag('style')
      ns.append(bs4.NavigableString(html))
      style.replaceWith(ns)
    else:
      style.extract()

  # Rewrite all external stylesheet hrefs or remove, as needed.
  stylesheet_links = soup.findAll('link', rel='stylesheet')
  for stylesheet_link in stylesheet_links:
    html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
    if html:
      tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
      assert len(tmp) == 1
      stylesheet_link.replaceWith(tmp[0])
    else:
      stylesheet_link.extract()

  # Remove comments if minifying.
  if minify:
    comments = soup.findAll(
        text=lambda text: isinstance(text, bs4.Comment))
    for comment in comments:
      comment.extract()

  if prettify:
    return soup.prettify('utf-8').strip()

  # We are done.
  return unicode(soup).strip()
Example #28
Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License
def GenerateHTML(self, controller, minify=False, prettify=False):
  soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Doctype):
      x.extract()

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Declaration):
      x.extract()

  # Remove all imports.
  imports = soup.findAll('link', rel='import')
  for imp in imports:
    imp.extract()

  # Remove all script links.
  scripts_external = soup.findAll('script', src=True)
  for script in scripts_external:
    script.extract()

  # Remove all in-line scripts.
  scripts_external = soup.findAll('script', src=None)
  for script in scripts_external:
    script.extract()

  # Process all in-line styles.
  inline_styles = soup.findAll('style')
  for style in inline_styles:
    html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
    if html:
      ns = soup.new_tag('style')
      ns.append(bs4.NavigableString(html))
      style.replaceWith(ns)
    else:
      style.extract()

  # Rewrite all external stylesheet hrefs or remove, as needed.
  stylesheet_links = soup.findAll('link', rel='stylesheet')
  for stylesheet_link in stylesheet_links:
    html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
    if html:
      tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
      assert len(tmp) == 1
      stylesheet_link.replaceWith(tmp[0])
    else:
      stylesheet_link.extract()

  # Remove comments if minifying.
  if minify:
    comments = soup.findAll(
        text=lambda text: isinstance(text, bs4.Comment))
    for comment in comments:
      comment.extract()

  if prettify:
    return soup.prettify('utf-8').strip()

  # We are done.
  return unicode(soup).strip()
Example #29
Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License
def GenerateHTML(self, controller, minify=False, prettify=False):
  soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Doctype):
      x.extract()

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Declaration):
      x.extract()

  # Remove all imports.
  imports = soup.findAll('link', rel='import')
  for imp in imports:
    imp.extract()

  # Remove all script links.
  scripts_external = soup.findAll('script', src=True)
  for script in scripts_external:
    script.extract()

  # Remove all in-line scripts.
  scripts_external = soup.findAll('script', src=None)
  for script in scripts_external:
    script.extract()

  # Process all in-line styles.
  inline_styles = soup.findAll('style')
  for style in inline_styles:
    html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
    if html:
      ns = soup.new_tag('style')
      ns.append(bs4.NavigableString(html))
      style.replaceWith(ns)
    else:
      style.extract()

  # Rewrite all external stylesheet hrefs or remove, as needed.
  stylesheet_links = soup.findAll('link', rel='stylesheet')
  for stylesheet_link in stylesheet_links:
    html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
    if html:
      tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
      assert len(tmp) == 1
      stylesheet_link.replaceWith(tmp[0])
    else:
      stylesheet_link.extract()

  # Remove comments if minifying.
  if minify:
    comments = soup.findAll(
        text=lambda text: isinstance(text, bs4.Comment))
    for comment in comments:
      comment.extract()

  if prettify:
    return soup.prettify('utf-8').strip()

  # We are done.
  return unicode(soup).strip()
Example #30
Source File: html.py From training_results_v0.5 with Apache License 2.0
def _soup_strings(soup):
  """Return text strings in soup."""
  paragraph_tags = set([
      "caption", "details", "h1", "h2", "h3", "h4", "h5", "h6", "li", "p",
      "td", "div", "span"
  ])

  skip_children = None
  for descendant in soup.descendants:
    # If we've treated a tag as a contiguous paragraph, don't re-emit the
    # children (see below).
    if skip_children is not None:
      try:
        in_skip = descendant in skip_children  # pylint: disable=unsupported-membership-test
      except RecursionError:  # pylint: disable=undefined-variable
        # Possible for this check to hit a nasty infinite recursion because of
        # BeautifulSoup __eq__ checks.
        in_skip = True
      if in_skip:
        continue
      else:
        skip_children = None

    # Treat some tags as contiguous paragraphs, regardless of other tags nested
    # inside (like <a> or <b>).
    if isinstance(descendant, bs4.Tag):
      if descendant.name in paragraph_tags:
        if descendant.find_all(paragraph_tags):
          # If there are nested paragraph tags, don't treat it as a single
          # contiguous tag.
          continue
        skip_children = list(descendant.descendants)
        text = " ".join(descendant.get_text(" ", strip=True).split())
        if text:
          yield text
      continue

    if (isinstance(descendant, bs4.Comment) or
        not isinstance(descendant, bs4.NavigableString)):
      continue

    text = " ".join(descendant.strip().split())
    if text:
      yield text