Python bs4.NavigableString() Examples

The following are 28 code examples of bs4.NavigableString(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the link above each example. You may also want to check out all available functions/classes of the module bs4, or try the search function.
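Before diving into the examples, here is a minimal sketch of what a NavigableString is: bs4 exposes the text nodes of a parsed document as NavigableString objects (a str subclass), and the constructor can also be called directly to build new text nodes for insertion.

from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup("<p>Hello <b>world</b></p>", "html.parser")
p = soup.p

# Text nodes inside a tag are NavigableString instances
for child in p.children:
    print(type(child).__name__, repr(child))
# NavigableString 'Hello '
# Tag <b>world</b>

# The constructor builds a new text node that can be inserted like a tag
p.append(NavigableString("!"))
print(p)  # <p>Hello <b>world</b>!</p>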
Example #1
Source File: Answer.py    From zhihu-terminal with MIT License (7 votes)
def print_content(contents):
    for content in contents:
        name = content.name
        if isinstance(content, NavigableString):
            s = str(content).replace("\n", "")
            print(s.strip())
        else:
            if name == "img":
                print("[图片]")  # "[图片]" means "[image]"
            elif name == "br":
                print("")
            elif name == "noscript":
                continue
            elif name == "li":
                print("•", end=" ")
            print_content(content.contents)
Example #2
Source File: html_clear.py    From ns4_chatbot with Apache License 2.0 (6 votes)
def __clear(parent_node, config):
	content = ""

	if isinstance(parent_node, NavigableString):
		return parent_node.string

	if parent_node.name in line_elements:
		content += "\n"

	for child in parent_node.contents:
		if child.name == "table":
			content += parse_table(child, config)
		else:
			content += __clear(child, config)

	return content
Example #3
Source File: file_docx.py    From docassemble with MIT License (6 votes)
def get_children(descendants, parsed):
    subelement = False
    descendants_buff = deque()
    if descendants is None:
        return descendants_buff
    if isinstance(descendants, NavigableString):
        parsed.append(descendants)
    else:
        for child in descendants.children:
            if child.name is None:
                # Plain strings go straight to the output until the
                # first tag is seen; after that they are buffered
                if not subelement:
                    parsed.append(child)
                else:
                    descendants_buff.append(child)
            else:
                # From the first tag onward, buffer everything
                subelement = True
                descendants_buff.append(child)
    descendants_buff.reverse()
    return descendants_buff
Example #4
Source File: __init__.py    From uoft-scrapers with MIT License (6 votes)
def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        # Exact type check: Comment subclasses NavigableString and is
        # handled (skipped) by the next branch
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass
        else:
            # Tags, 'li' included: use their text content
            text = content.text
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    return paragraph.strip()
Example #5
Source File: __init__.py    From uoft-scrapers with MIT License (6 votes)
def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        # Exact type check: Comment subclasses NavigableString and is
        # handled (skipped) by the next branch
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass
        else:
            # Tags, 'li' included: use their text content
            text = content.text
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    paragraph = paragraph.replace('  ', ' ')
    return paragraph.strip()
Example #6
Source File: process_tag.py    From DashTable with MIT License (6 votes)
def process_tag(node):
    """
    Recursively go through a tag's children, converting them, then
    convert the tag itself.

    """
    text = ''

    exceptions = ['table']

    for element in node.children:
        if isinstance(element, NavigableString):
            text += element
        elif node.name not in exceptions:
            text += process_tag(element)

    try:
        convert_fn = globals()["convert_%s" % node.name.lower()]
        text = convert_fn(node, text)

    except KeyError:
        pass

    return text 
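The globals() lookup above dispatches each tag to a converter function named convert_<tag>. A minimal sketch of that pattern under the same assumptions (the convert_b converter below is illustrative, not part of DashTable):

from bs4 import BeautifulSoup, NavigableString

def convert_b(node, text):
    # Illustrative converter: render bold tags as **text**
    return "**%s**" % text

def process_tag(node):
    text = ''
    for element in node.children:
        if isinstance(element, NavigableString):
            text += element
        else:
            text += process_tag(element)
    # Dispatch to convert_<tagname> when such a converter is defined
    convert_fn = globals().get("convert_%s" % node.name.lower())
    return convert_fn(node, text) if convert_fn else text

soup = BeautifulSoup("<p>some <b>bold</b> text</p>", "html.parser")
print(process_tag(soup.p))  # some **bold** text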
Example #7
Source File: fetcher.py    From Greynir with GNU General Public License v3.0 (5 votes)
def extract_text(soup, result):
        """ Append the human-readable text found in an HTML soup
            to the result TextList """
        if soup is None:
            return
        for t in soup.children:
            if type(t) == NavigableString:
                # Plain text content node (exact type check: subclasses
                # such as Comment and CData are filtered out below)
                result.append(t)
            elif isinstance(t, NavigableString):
                # Comment, CDATA or other text data: ignore
                pass
            elif t.name in Fetcher._BREAK_TAGS:
                result.insert_break()
                # html.parser (erroneously) nests content inside
                # <br> and <hr> tags
                Fetcher.extract_text(t, result)
            elif t.name in Fetcher._WHITESPACE_TAGS:
                # Tags that we interpret as whitespace, such as <img>
                result.append_whitespace()
                # html.parser nests content inside <img> tags if
                # they are not explicitly closed
                Fetcher.extract_text(t, result)
            elif t.name in Fetcher._BLOCK_TAGS:
                # Nested block tag
                result.begin()  # Begin block
                Fetcher.extract_text(t, result)
                result.end()  # End block
            elif t.name in Fetcher._INLINE_BLOCK_TAGS:
                # Put whitespace around the inline block
                # so that words don't run together
                result.append_whitespace()
                Fetcher.extract_text(t, result)
                result.append_whitespace()
            elif t.name not in Fetcher._EXCLUDE_TAGS:
                # Non-block tag
                Fetcher.extract_text(t, result) 
Example #8
Source File: doc_utils.py    From axcell with Apache License 2.0 (5 votes)
def _insert_anchor(el, anchor_id, prefix="xxanchor"):
    el.insert(0, NavigableString(f' {prefix}-{anchor_id} ')) 
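A quick illustration of the effect, reusing _insert_anchor from above on a made-up fragment:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>Results are in Table 1.</p>", "html.parser")
_insert_anchor(soup.p, 42)
print(soup.p)  # <p> xxanchor-42 Results are in Table 1.</p>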
Example #9
Source File: css_match.py    From plugin.git.browser with GNU General Public License v3.0 (5 votes)
def is_navigable_string(obj):
        """Is navigable string."""

        import bs4
        return isinstance(obj, bs4.NavigableString) 
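Note that isinstance(obj, bs4.NavigableString) is also True for NavigableString subclasses such as Comment and CData, which is why other examples on this page (e.g. #7 and #19) filter comments separately. A small demonstration:

import bs4

soup = bs4.BeautifulSoup("<p><!-- a comment -->text</p>", "html.parser")
comment, text = soup.p.contents
print(isinstance(comment, bs4.Comment))          # True
print(isinstance(comment, bs4.NavigableString))  # True: Comment subclasses NavigableString
print(isinstance(text, bs4.Comment))             # False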
Example #10
Source File: trac_export.py    From allura with Apache License 2.0 (5 votes)
def parse_ticket_attachments(self, id):
        SIZE_PATTERN = r'(\d+) bytes'
        TIMESTAMP_PATTERN = r'(.+) in Timeline'
        # Scrape HTML to get ticket attachments
        url = self.full_url(self.ATTACHMENT_LIST_URL % id)
        self.log_url(url)
        f = urlopen(url)
        soup = BeautifulSoup(f)
        attach = soup.find('div', id='attachments')
        attachments = []
        while attach:
            attach = attach.findNext('dt')
            if not attach:
                break
            d = {}
            d['filename'] = attach.a['href'].rsplit('/', 1)[1]
            d['url'] = self.full_url(self.ATTACHMENT_URL % (id, d['filename']))
            size_s = attach.span['title']
            d['size'] = int(self.match_pattern(SIZE_PATTERN, size_s))
            timestamp_s = attach.find('a', {'class': 'timeline'})['title']
            d['date'] = self.trac2z_date(
                self.match_pattern(TIMESTAMP_PATTERN, timestamp_s))
            d['by'] = attach.find(
                text=re.compile('added by')).nextSibling.renderContents()
            d['description'] = ''
            # Skip whitespace
            while attach.nextSibling and isinstance(attach.nextSibling, NavigableString):
                attach = attach.nextSibling
            # if there's a description, there will be a <dd> element;
            # otherwise the next <dt> follows immediately
            if attach.nextSibling and attach.nextSibling.name == 'dd':
                desc_el = attach.nextSibling
                if desc_el:
                    # TODO: Convert to Allura link syntax as needed
                    d['description'] = ''.join(
                        desc_el.findAll(text=True)).strip()
            attachments.append(d)
        return attachments
Example #11
Source File: subscene.py    From bazarr with GNU General Public License v3.0 (5 votes)
def get_first_film(soup, section, year=None, session=None):
    tag_part = SectionsParts[section]
    tag = None

    headers = soup.find("div", "search-result").find_all("h2")
    for header in headers:
        if tag_part in header.text:
            tag = header
            break

    if not tag:
        return

    url = None

    if not year:
        url = SITE_DOMAIN + tag.findNext("ul").find("li").div.a.get("href")
    else:
        for t in tag.findNext("ul").findAll("li"):
            if isinstance(t, NavigableString) or not t.div:
                continue

            if str(year) in t.div.a.string:
                url = SITE_DOMAIN + t.div.a.get("href")
                break
        if not url:
            # fallback to non-year results
            logger.info("Falling back to non-year results as year wasn't found (%s)", year)
            url = SITE_DOMAIN + tag.findNext("ul").find("li").div.a.get("href")

    return Film.from_url(url, session=session) 
Example #12
Source File: css_match.py    From bazarr with GNU General Public License v3.0 (5 votes)
def is_navigable_string(obj):
        """Is navigable string."""

        import bs4
        return isinstance(obj, bs4.NavigableString) 
Example #13
Source File: soupy.py    From soupy with MIT License (5 votes)
def __new__(cls, value, *args, **kwargs):
        if isinstance(value, NavigableString):
            return object.__new__(NavigableStringNode)

        return object.__new__(cls) 
Example #14
Source File: html.py    From quay with Apache License 2.0 (5 votes)
def _html2text(elem):
    for child in elem.children:
        if isinstance(child, Tag):
            _html2text(child)
        elif isinstance(child, NavigableString):
            # No changes necessary
            continue

    if elem.parent:
        if elem.name in _ELEMENT_REPLACER:
            _ELEMENT_REPLACER[elem.name](elem) 
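_ELEMENT_REPLACER (not shown in the excerpt) maps tag names to callbacks that rewrite an element in place. A hypothetical sketch of how it might look, reusing _html2text from above with an invented replacer map:

from bs4 import BeautifulSoup, NavigableString, Tag

# Hypothetical replacer map: each callback rewrites one element in place
_ELEMENT_REPLACER = {
    'br': lambda elem: elem.replace_with('\n'),
    'b': lambda elem: elem.replace_with(elem.get_text()),
}

soup = BeautifulSoup("<div>one<br/><b>two</b></div>", "html.parser")
_html2text(soup.div)
print(repr(soup.div.get_text()))  # 'one\ntwo'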
Example #15
Source File: default.py    From Greynir with GNU General Public License v3.0 (5 votes)
def _get_content(self, soup_body):
        """ Find the article content (main text) in the soup """
        content = ScrapeHelper.div_class(soup_body, "article-body")
        # Some sports event pages don't have an article-body div
        if not content:
            return BeautifulSoup("", _HTML_PARSER)  # Return empty soup.

        # Get rid of stuff we don't want
        ScrapeHelper.del_tag(content, "h3")
        ScrapeHelper.del_tag(content, "figure")
        ScrapeHelper.del_div_class(content, "embed")

        # First char in first paragraph is wrapped in its own span tag
        # for styling purposes, which separates it from the rest of the word.
        # We extract the character and insert it into the first p tag
        firstchar = ""
        span = content.find("span", {"class": "article-dropcap"})
        if span:
            firstchar = span.get_text()
            span.decompose()

        for div in content.find_all("div", {"class": "read-more-block"}):
            div.decompose()

        for div in content.find_all("div", {"class": "sja-einnig"}):
            div.decompose()

        for div in content.find_all("div", {"class": "img-block"}):
            div.decompose()

        # Insert the dropcap character into the first paragraph
        ptag = content.find("p")
        if ptag and firstchar:
            ptag.insert(0, NavigableString(firstchar))

        return content 
Example #16
Source File: css_match.py    From Tautulli with GNU General Public License v3.0 (5 votes)
def is_navigable_string(obj):
        """Is navigable string."""

        import bs4
        return isinstance(obj, bs4.NavigableString) 
Example #17
Source File: css_match.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def is_navigable_string(obj):
        """Is navigable string."""

        import bs4
        return isinstance(obj, bs4.NavigableString) 
Example #18
Source File: extract_tables.py    From axcell with Apache License 2.0 (5 votes)
def wrap_elem_content(elem, begin, end):
    elem.insert(0, NavigableString(begin))
    elem.append(NavigableString(end)) 
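For instance, wrapping a table cell in begin/end markers (reusing wrap_elem_content from above):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<td>3.14</td>", "html.parser")
wrap_elem_content(soup.td, "[", "]")
print(soup.td)  # <td>[3.14]</td>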
Example #19
Source File: get_references_web_single_group.py    From fine-lm with MIT License (5 votes)
def soup_strings(soup):
  paragraph_tags = set(["caption", "details", "h1", "h2", "h3", "h4", "h5",
                        "h6", "li", "p", "td", "div", "span"])

  skip_children = None
  for descendant in soup.descendants:
    # If we've treated a tag as a contiguous paragraph, don't re-emit the
    # children (see below).
    if skip_children is not None:
      try:
        in_skip = descendant in skip_children
      except RecursionError:
        # Possible for this check to hit a nasty infinite recursion because of
        # BeautifulSoup __eq__ checks.
        in_skip = True
      if in_skip:
        continue
      else:
        skip_children = None

    # Treat some tags as contiguous paragraphs, regardless of other tags nested
    # inside (like <a> or <b>).
    if isinstance(descendant, bs4.Tag):
      if descendant.name in paragraph_tags:
        if descendant.find_all(paragraph_tags):
          # If there are nested paragraph tags, don't treat it as a single
          # contiguous tag.
          continue
        skip_children = list(descendant.descendants)
        text = " ".join(descendant.get_text(" ", strip=True).split())
        if text:
          yield text
        continue

    if (isinstance(descendant, bs4.Comment) or
        not isinstance(descendant, bs4.NavigableString)):
      continue

    text = " ".join(descendant.strip().split())
    if text:
      yield text 
Example #20
Source File: css_match.py    From soupsieve with MIT License (5 votes)
def is_navigable_string(obj):
        """Is navigable string."""
        return isinstance(obj, bs4.NavigableString) 
Example #21
Source File: make_catena_input.py    From NeuralDater with Apache License 2.0 (5 votes)
def make_catena_input(src, dest):
	text = open(src).read()
	soup = BeautifulSoup(text, 'xml')
	soup.find('DCT').insert_after(soup.new_tag('TITLE'))
	soup.find('DCT').append(soup.new_tag('TIMEX3', functionInDocument="CREATION_TIME", temporalFunction="false", tid="t0", type="DATE", value=""))

	for e in soup.find_all('event'):
		new_e = soup.new_tag('EVENT', **e.attrs)
		new_e.insert(0, NavigableString(e.get_text()))
		e.replaceWith(new_e)

	# Drop all TLINK elements
	for s in soup('TLINK'):
		s.extract()

	with open(dest + src.split('/')[-1] + '.tml', 'w') as f:
		f.write(str(soup)) 
Example #22
Source File: whitelist.py    From wagtail with BSD 3-Clause "New" or "Revised" License (5 votes)
def clean_node(self, doc, node):
        """Clean a BeautifulSoup document in-place"""
        if isinstance(node, NavigableString):
            self.clean_string_node(doc, node)
        elif isinstance(node, Tag):
            self.clean_tag_node(doc, node)
        # This branch is here in case node is a BeautifulSoup object that does
        # not inherit from NavigableString or Tag. I can't find any examples
        # of such a thing at the moment, so this branch is untested.
        else:  # pragma: no cover
            self.clean_unknown_node(doc, node) 
Example #23
Source File: amazon_invoice.py    From beancount-import with GNU General Public License v2.0 (5 votes)
def get_text_lines(parent_node):
    text_lines = ['']
    for node in parent_node.children:
        if isinstance(node, bs4.NavigableString):
            text_lines[-1] += str(node)
        elif node.name == 'br':
            text_lines.append('')
        else:
            text_lines[-1] += node.text
    return text_lines 
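Given a fragment with <br> separators, the function returns one string per visual line (the invoice snippet here is made up):

import bs4

soup = bs4.BeautifulSoup(
    "<div>Order total: $12.50<br>Shipped to: <b>Jane Doe</b></div>",
    "html.parser")
print(get_text_lines(soup.div))
# ['Order total: $12.50', 'Shipped to: Jane Doe']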
Example #24
Source File: FontRemapProcessors.py    From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License (5 votes)
def apply_correction_map(soup, tag, cor_map):
	for item in list(tag.descendants):
		if isinstance(item, bs4.NavigableString):
			origstr = str(item)
			itemstr = origstr
			for fontset in cor_map:
				for badc, goodc in fontset.items():
					if badc in itemstr:
						itemstr = itemstr.replace(badc, goodc)
			if origstr != itemstr:
				news = soup.new_string(itemstr)
				item.replace_with(news) 
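The new_string/replace_with pair is the usual way to swap out a text node, since NavigableString objects are immutable. A stripped-down run of the function above, with a made-up correction map:

import bs4

soup = bs4.BeautifulSoup("<p>h\u00e9llo</p>", "html.parser")
cor_map = [{"\u00e9": "e"}]
apply_correction_map(soup, soup.p, cor_map)
print(soup.p)  # <p>hello</p>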
Example #25
Source File: fetchhaodoo.py    From nlputils with MIT License (4 votes)
def process_audio(self, abookid, soup):
        table = soup.find('table', border='0', cellspacing='0', cellpadding='0', width="530")
        h4 = table.h4
        font = h4.find('font', color="CC0000")
        a = h4.a
        match = re_audiobooktitle.match(h4.get_text().strip())
        if font:
            author = font.string.strip()
        else:
            author = match.group(1)
        if a:
            title = a.string.strip().strip('《》【】')
            bookid = parse_page(a['href'])
        else:
            title = match.group(2)
            bookid = None
        bookrecorder = match.group(3)
        contenttable = h4.parent.table
        audiofiles = []
        filename = chapter = cnum = cauthor = recorder = add_date = None
        for td in contenttable.find_all('td'):
            if td.audio:
                filename = td.audio.source['src']
                audiofiles.append((filename, abookid, cnum, chapter, cauthor, recorder, add_date))
                filename = chapter = cnum = cauthor = recorder = add_date = None
            else:
                a = td.a
                if a:
                    cnum = int(parse_page(a['href']).split(':')[-1])
                    chapter = a.string.strip()
                    match = re_recorder.match(str(a.next_sibling).strip())
                    if match:
                        recorder = match.group(1)
                    aleftstr = a.previous_sibling
                    if isinstance(aleftstr, bs4.NavigableString):
                        cauthor = str(aleftstr).strip().rstrip('《【')
                else:
                    cnum = chapter = recorder = None
                match = re_audiobookchapter.match(td.get_text().strip())
                if not match:
                    match = re_audiobookchapter2.match(td.get_text().strip())
                if match:
                    # fall back to the author captured by the regex if none was found above
                    if cauthor is None:
                        cauthor = match.group(1)
                    cauthor = cauthor or None
                    chapter = chapter or match.group(2)
                    recorder = recorder or match.group(3)
                    add_date = date_fmt(match.group(4))
        return bookid, title, author, bookrecorder, audiofiles 
Example #26
Source File: nekur.py    From bazarr with GNU General Public License v3.0 (4 votes)
def query(self, title):
        subtitles = []

        data = {
            'ajax': '1',
            'sSearch': title,
        }

        r = self.session.post(self.search_url, data=data, timeout=10)
        r.raise_for_status()

        if not r.content:
            logger.debug('No data returned from provider')
            return []

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

        # loop over subtitle rows
        rows = soup.select('tbody > tr')
        for row in rows:
            # title
            title_anchor_el = row.select_one('.title > a')
            title_inner_text = [element for element in title_anchor_el if isinstance(element, NavigableString)]
            title = title_inner_text[0].strip()
            
            # year
            year = row.select_one('.year').text.strip('()')
            
            # download link
            href = title_anchor_el.get('href')
            download_link = self.server_url + href

            # imdb id
            imdb_td = row.select_one('td:nth-of-type(4)')
            imdb_link = imdb_td.select_one('a').get('href')
            imdb_id = imdb_link.split('/')[-2]

            # fps
            fps = row.select_one('.fps').text.strip()

            # additional notes
            notes = row.select_one('.notes').text.strip()

            # page link = download link (there is no separate subtitle page link)
            page_link = download_link
            
            # create/add the subtitle
            subtitle = self.subtitle_class(Language.fromalpha2('lv'), page_link, download_link, title, year, imdb_id, fps, notes)
            logger.debug('nekur: Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        return subtitles 
Example #27
Source File: parse_html_deps.py    From Jandroid with BSD 3-Clause "New" or "Revised" License (4 votes)
def GenerateHTML(self, controller, minify=False, prettify=False):
    soup = _CreateSoupWithoutHeadOrBody(str(self._soup))

    # Remove the doctype declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Doctype):
        x.extract()

    # Remove any XML declaration.
    for x in soup.contents:
      if isinstance(x, bs4.Declaration):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(str(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()
    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return str(soup).strip()
Example #28
Source File: html.py    From training_results_v0.5 with Apache License 2.0 (4 votes)
def _soup_strings(soup):
  """Return text strings in soup."""
  paragraph_tags = set([
      "caption", "details", "h1", "h2", "h3", "h4", "h5", "h6", "li", "p", "td",
      "div", "span"
  ])

  skip_children = None
  for descendant in soup.descendants:
    # If we've treated a tag as a contiguous paragraph, don't re-emit the
    # children (see below).
    if skip_children is not None:
      try:
        in_skip = descendant in skip_children  # pylint: disable=unsupported-membership-test
      except RecursionError:  # pylint: disable=undefined-variable
        # Possible for this check to hit a nasty infinite recursion because of
        # BeautifulSoup __eq__ checks.
        in_skip = True
      if in_skip:
        continue
      else:
        skip_children = None

    # Treat some tags as contiguous paragraphs, regardless of other tags nested
    # inside (like <a> or <b>).
    if isinstance(descendant, bs4.Tag):
      if descendant.name in paragraph_tags:
        if descendant.find_all(paragraph_tags):
          # If there are nested paragraph tags, don't treat it as a single
          # contiguous tag.
          continue
        skip_children = list(descendant.descendants)
        text = " ".join(descendant.get_text(" ", strip=True).split())
        if text:
          yield text
        continue

    if (isinstance(descendant, bs4.Comment) or
        not isinstance(descendant, bs4.NavigableString)):
      continue

    text = " ".join(descendant.strip().split())
    if text:
      yield text