Python bs4.NavigableString() Examples
The following are 30 code examples of bs4.NavigableString(), collected from open-source projects. Each example is listed with its source file, originating project, and license. You may also want to check out the other available functions and classes of the bs4 module.
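As background for the examples below: NavigableString is the bs4 class that represents a text node in the parse tree, and most of the snippets on this page either construct one or use an isinstance check to tell text nodes apart from tags. A minimal standalone sketch (not taken from any of the listed projects; the input HTML is made up):

from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup("<p>Hello</p>", "html.parser")
# Append a new text node to the <p> tag.
soup.p.append(NavigableString(" world"))
# Text nodes are distinguished from tags with isinstance checks.
for node in soup.p.children:
    print(isinstance(node, NavigableString), repr(node))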
Example #1
Source File: Answer.py From zhihu-terminal with MIT License
def print_content(contents):
    for content in contents:
        name = content.name
        #if not isinstance(content, Tag):
        if isinstance(content, NavigableString):
            s = str(content)
            s = s.replace("\n", "")
            print s.strip()
        else:
            if name == "img":
                '''
                img = content.find("img")
                if img:
                    print img.get("src")
                '''
                print "[图片]"
            elif name == "br":
                print ""
            elif name == "noscript":
                continue
            elif name == "li":
                print "•",
                print_content(content.contents)
Example #2
Source File: html_clear.py From ns4_chatbot with Apache License 2.0
def __clear(parent_node, config):
    # return bs.prettify()
    content = ""
    # print parent_node
    if isinstance(parent_node, NavigableString):
        return parent_node.string

    if parent_node.name in line_elements:
        content += "\n"

    children = parent_node.contents
    for child in children:
        if child.name == "table":
            content += parse_table(child, config)
        else:
            content += __clear(child, config)
    return content
Example #3
Source File: file_docx.py From docassemble with MIT License
def get_children(descendants, parsed):
    subelement = False
    descendants_buff = deque()
    if descendants is None:
        return descendants_buff
    if (isinstance(descendants, NavigableString)):
        parsed.append(descendants)
    else:
        for child in descendants.children:
            if (child.name == None):
                if (subelement == False):
                    parsed.append(child)
                else:
                    descendants_buff.append(child)
            else:
                if (subelement == False):
                    subelement = True
                    descendants_buff.append(child)
                else:
                    descendants_buff.append(child)
        descendants_buff.reverse()
    return descendants_buff
Example #4
Source File: __init__.py From uoft-scrapers with MIT License
def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass
        elif content.name == 'li':
            text = content.text
        else:
            text = content.text
        text = text.strip()
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    paragraph = paragraph.strip()
    return paragraph
Example #5
Source File: __init__.py From uoft-scrapers with MIT License
def normalize_text_sections(div):
    paragraph = ''
    for content in div.contents:
        text = ''
        if type(content) == NavigableString:
            text = content
        elif type(content) == Comment:
            pass
        elif content.name == 'li':
            text = content.text
        else:
            text = content.text
        text = text.strip()
        paragraph += text.strip() + ' '
    paragraph = paragraph.strip()
    paragraph = paragraph.replace('\r', '')
    paragraph = paragraph.replace('\n', ', ')
    paragraph = paragraph.replace('  ', ' ')
    paragraph = paragraph.strip()
    return paragraph
Example #6
Source File: process_tag.py From DashTable with MIT License
def process_tag(node):
    """
    Recursively go through a tag's children, converting them, then
    convert the tag itself.
    """
    text = ''
    exceptions = ['table']
    for element in node.children:
        if isinstance(element, NavigableString):
            text += element
        elif not node.name in exceptions:
            text += process_tag(element)
    try:
        convert_fn = globals()["convert_%s" % node.name.lower()]
        text = convert_fn(node, text)
    except KeyError:
        pass
    return text
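A usage sketch for process_tag (hypothetical input; since no convert_* helpers are defined here, the KeyError branch fires and the function simply concatenates the text nodes):

from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup("<p>Hello <b>world</b></p>", "html.parser")
print(process_tag(soup.p))  # prints "Hello world"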
Example #7
Source File: fetcher.py From Greynir with GNU General Public License v3.0
def extract_text(soup, result):
    """ Append the human-readable text found in an HTML soup
        to the result TextList """
    if soup is None:
        return
    for t in soup.children:
        if type(t) == NavigableString:
            # Text content node
            result.append(t)
        elif isinstance(t, NavigableString):
            # Comment, CDATA or other text data: ignore
            pass
        elif t.name in Fetcher._BREAK_TAGS:
            result.insert_break()
            # html.parser (erroneously) nests content inside
            # <br> and <hr> tags
            Fetcher.extract_text(t, result)
        elif t.name in Fetcher._WHITESPACE_TAGS:
            # Tags that we interpret as whitespace, such as <img>
            result.append_whitespace()
            # html.parser nests content inside <img> tags if
            # they are not explicitly closed
            Fetcher.extract_text(t, result)
        elif t.name in Fetcher._BLOCK_TAGS:
            # Nested block tag
            result.begin()  # Begin block
            Fetcher.extract_text(t, result)
            result.end()  # End block
        elif t.name in Fetcher._INLINE_BLOCK_TAGS:
            # Put whitespace around the inline block
            # so that words don't run together
            result.append_whitespace()
            Fetcher.extract_text(t, result)
            result.append_whitespace()
        elif t.name not in Fetcher._EXCLUDE_TAGS:
            # Non-block tag
            Fetcher.extract_text(t, result)
Example #8
Source File: doc_utils.py From axcell with Apache License 2.0
def _insert_anchor(el, anchor_id, prefix="xxanchor"):
    el.insert(0, NavigableString(f' {prefix}-{anchor_id} '))
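A usage sketch with a made-up table cell, assuming _insert_anchor and the bs4 imports are in scope:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<td>92.4</td>", "html.parser")
_insert_anchor(soup.td, 17)
print(soup.td)  # <td> xxanchor-17 92.4</td>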
Example #9
Source File: css_match.py From plugin.git.browser with GNU General Public License v3.0
def is_navigable_string(obj):
    """Is navigable string."""
    import bs4
    return isinstance(obj, bs4.NavigableString)
Example #10
Source File: trac_export.py From allura with Apache License 2.0
def parse_ticket_attachments(self, id):
    SIZE_PATTERN = r'(\d+) bytes'
    TIMESTAMP_PATTERN = r'(.+) in Timeline'
    # Scrape HTML to get ticket attachments
    url = self.full_url(self.ATTACHMENT_LIST_URL % id)
    self.log_url(url)
    f = urlopen(url)
    soup = BeautifulSoup(f)
    attach = soup.find('div', id='attachments')
    list = []
    while attach:
        attach = attach.findNext('dt')
        if not attach:
            break
        d = {}
        d['filename'] = attach.a['href'].rsplit('/', 1)[1]
        d['url'] = self.full_url(self.ATTACHMENT_URL % (id, d['filename']))
        size_s = attach.span['title']
        d['size'] = int(self.match_pattern(SIZE_PATTERN, size_s))
        timestamp_s = attach.find('a', {'class': 'timeline'})['title']
        d['date'] = self.trac2z_date(
            self.match_pattern(TIMESTAMP_PATTERN, timestamp_s))
        d['by'] = attach.find(
            text=re.compile('added by')).nextSibling.renderContents()
        d['description'] = ''
        # Skip whitespace
        while attach.nextSibling and isinstance(attach.nextSibling, NavigableString):
            attach = attach.nextSibling
        # if there's a description, there will be a <dd> element, otherwise
        # immediately the next <dt>
        if attach.nextSibling and attach.nextSibling.name == 'dd':
            desc_el = attach.nextSibling
            if desc_el:
                # TODO: Convert to Allura link syntax as needed
                d['description'] = ''.join(
                    desc_el.findAll(text=True)).strip()
        list.append(d)
    return list
Example #11
Source File: subscene.py From bazarr with GNU General Public License v3.0
def get_first_film(soup, section, year=None, session=None):
    tag_part = SectionsParts[section]
    tag = None

    headers = soup.find("div", "search-result").find_all("h2")
    for header in headers:
        if tag_part in header.text:
            tag = header
            break

    if not tag:
        return

    url = None

    if not year:
        url = SITE_DOMAIN + tag.findNext("ul").find("li").div.a.get("href")
    else:
        for t in tag.findNext("ul").findAll("li"):
            if isinstance(t, NavigableString) or not t.div:
                continue
            if str(year) in t.div.a.string:
                url = SITE_DOMAIN + t.div.a.get("href")
                break
        if not url:
            # fallback to non-year results
            logger.info("Falling back to non-year results as year wasn't found (%s)", year)
            url = SITE_DOMAIN + tag.findNext("ul").find("li").div.a.get("href")

    return Film.from_url(url, session=session)
Example #12
Source File: css_match.py From bazarr with GNU General Public License v3.0
def is_navigable_string(obj):
    """Is navigable string."""
    import bs4
    return isinstance(obj, bs4.NavigableString)
Example #13
Source File: soupy.py From soupy with MIT License
def __new__(cls, value, *args, **kwargs):
    if isinstance(value, NavigableString):
        return object.__new__(NavigableStringNode)
    return object.__new__(cls)
Example #14
Source File: html.py From quay with Apache License 2.0
def _html2text(elem):
    for child in elem.children:
        if isinstance(child, Tag):
            _html2text(child)
        elif isinstance(child, NavigableString):
            # No changes necessary
            continue

    if elem.parent:
        if elem.name in _ELEMENT_REPLACER:
            _ELEMENT_REPLACER[elem.name](elem)
Example #15
Source File: default.py From Greynir with GNU General Public License v3.0
def _get_content(self, soup_body):
    """ Find the article content (main text) in the soup """
    content = ScrapeHelper.div_class(soup_body, "article-body")

    # Some sports event pages don't have an article__body
    if not content:
        return BeautifulSoup("", _HTML_PARSER)  # Return empty soup.

    # Get rid of stuff we don't want
    ScrapeHelper.del_tag(content, "h3")
    ScrapeHelper.del_tag(content, "figure")
    ScrapeHelper.del_div_class(content, "embed")

    # First char in first paragraph is wrapped in its own span tag
    # for styling purposes, which separates it from the rest of the word.
    # We extract the character and insert it into the first p tag
    firstchar = ""
    span = content.find("span", {"class": "article-dropcap"})
    if span:
        firstchar = span.get_text()
        span.decompose()

    for div in content.find_all("div", {"class": "read-more-block"}):
        div.decompose()
    for div in content.find_all("div", {"class": "sja-einnig"}):
        div.decompose()
    for div in content.find_all("div", {"class": "img-block"}):
        div.decompose()

    # Insert it in the first paragraph
    ptag = content.find("p")
    if ptag and firstchar:
        ptag.insert(0, NavigableString(firstchar))

    return content
Example #16
Source File: css_match.py From Tautulli with GNU General Public License v3.0
def is_navigable_string(obj):
    """Is navigable string."""
    import bs4
    return isinstance(obj, bs4.NavigableString)
Example #17
Source File: css_match.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def is_navigable_string(obj):
    """Is navigable string."""
    import bs4
    return isinstance(obj, bs4.NavigableString)
Example #18
Source File: extract_tables.py From axcell with Apache License 2.0
def wrap_elem_content(elem, begin, end):
    elem.insert(0, NavigableString(begin))
    elem.append(NavigableString(end))
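A usage sketch with hypothetical wrapper markers, assuming wrap_elem_content and NavigableString are in scope:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<b>bold</b>", "html.parser")
wrap_elem_content(soup.b, "**", "**")
print(soup.b)  # <b>**bold**</b>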
Example #19
Source File: get_references_web_single_group.py From fine-lm with MIT License
def soup_strings(soup):
  paragraph_tags = set(["caption", "details", "h1", "h2", "h3", "h4", "h5",
                        "h6", "li", "p", "td", "div", "span"])

  skip_children = None
  for descendant in soup.descendants:
    # If we've treated a tag as a contiguous paragraph, don't re-emit the
    # children (see below).
    if skip_children is not None:
      try:
        in_skip = descendant in skip_children
      except RecursionError:
        # Possible for this check to hit a nasty infinite recursion because of
        # BeautifulSoup __eq__ checks.
        in_skip = True
      if in_skip:
        continue
      else:
        skip_children = None

    # Treat some tags as contiguous paragraphs, regardless of other tags nested
    # inside (like <a> or <b>).
    if isinstance(descendant, bs4.Tag):
      if descendant.name in paragraph_tags:
        if descendant.find_all(paragraph_tags):
          # If there are nested paragraph tags, don't treat it as a single
          # contiguous tag.
          continue
        skip_children = list(descendant.descendants)
        text = " ".join(descendant.get_text(" ", strip=True).split())
        if text:
          yield text
      continue

    if (isinstance(descendant, bs4.Comment) or
        not isinstance(descendant, bs4.NavigableString)):
      continue

    text = " ".join(descendant.strip().split())
    if text:
      yield text
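A usage sketch on a made-up document: the outer <div> contains nested paragraph tags and is therefore skipped, while the inline <b> is folded into its enclosing <p>:

import bs4

soup = bs4.BeautifulSoup(
    "<div><p>First para</p><p>Second <b>para</b></p></div>", "html.parser")
print(list(soup_strings(soup)))  # ['First para', 'Second para']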
Example #20
Source File: css_match.py From soupsieve with MIT License
def is_navigable_string(obj):
    """Is navigable string."""
    return isinstance(obj, bs4.NavigableString)
Example #21
Source File: make_catena_input.py From NeuralDater with Apache License 2.0
def make_catena_input(src, dest):
    text = open(src).read()
    soup = BeautifulSoup(text, 'xml')

    soup.find('DCT').insert_after(soup.new_tag('TITLE'))
    soup.find('DCT').append(soup.new_tag('TIMEX3', functionInDocument="CREATION_TIME",
                                         temporalFunction="false", tid="t0",
                                         type="DATE", value=""))

    for e in soup.find_all('event'):
        new_e = soup.new_tag('EVENT', **e.attrs)
        new_e.insert(0, NavigableString(e.get_text()))
        e.replaceWith(new_e)

    [s.extract() for s in soup('TLINK')]

    with open(args.dest + src.split('/')[-1] + '.tml', 'w') as f:
        f.write(str(soup))
Example #22
Source File: whitelist.py From wagtail with BSD 3-Clause "New" or "Revised" License
def clean_node(self, doc, node):
    """Clean a BeautifulSoup document in-place"""
    if isinstance(node, NavigableString):
        self.clean_string_node(doc, node)
    elif isinstance(node, Tag):
        self.clean_tag_node(doc, node)
    # This branch is here in case node is a BeautifulSoup object that does
    # not inherit from NavigableString or Tag. I can't find any examples
    # of such a thing at the moment, so this branch is untested.
    else:  # pragma: no cover
        self.clean_unknown_node(doc, node)
Example #23
Source File: amazon_invoice.py From beancount-import with GNU General Public License v2.0
def get_text_lines(parent_node):
    text_lines = ['']
    for node in parent_node.children:
        if isinstance(node, bs4.NavigableString):
            text_lines[-1] += str(node)
        elif node.name == 'br':
            text_lines.append('')
        else:
            text_lines[-1] += node.text
    return text_lines
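A usage sketch (made-up input) showing how <br> tags split the accumulated text into separate lines:

import bs4

soup = bs4.BeautifulSoup("<p>Line one<br/>Line two</p>", "html.parser")
print(get_text_lines(soup.p))  # ['Line one', 'Line two']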
Example #24
Source File: FontRemapProcessors.py From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License
def apply_correction_map(soup, tag, cor_map):
    for item in list(tag.descendants):
        if isinstance(item, bs4.NavigableString):
            origstr = str(item)
            itemstr = origstr
            for fontset in cor_map:
                for badc, goodc in fontset.items():
                    if badc in itemstr:
                        itemstr = itemstr.replace(badc, goodc)
            if origstr != itemstr:
                news = soup.new_string(itemstr)
                item.replace_with(news)
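A usage sketch with a hypothetical one-entry correction map that replaces a remapped glyph with its intended character:

import bs4

soup = bs4.BeautifulSoup("<p>h\u00e9llo</p>", "html.parser")
apply_correction_map(soup, soup.p, [{"\u00e9": "e"}])
print(soup.p)  # <p>hello</p>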
Example #25
Source File: fetchhaodoo.py From nlputils with MIT License
def process_audio(self, abookid, soup):
    table = soup.find('table', border='0', cellspacing='0',
                      cellpadding='0', width="530")
    h4 = table.h4
    font = h4.find('font', color="CC0000")
    a = h4.a
    match = re_audiobooktitle.match(h4.get_text().strip())
    if font:
        author = font.string.strip()
    else:
        author = match.group(1)
    if a:
        title = a.string.strip().strip('《》【】')
        bookid = parse_page(a['href'])
    else:
        title = match.group(2)
        bookid = None
    bookrecorder = match.group(3)
    contenttable = h4.parent.table
    audiofiles = []
    filename = chapter = cnum = cauthor = recorder = add_date = None
    for td in contenttable.find_all('td'):
        if td.audio:
            filename = td.audio.source['src']
            audiofiles.append((filename, abookid, cnum, chapter,
                               cauthor, recorder, add_date))
            filename = chapter = cnum = cauthor = recorder = add_date = None
        else:
            a = td.a
            if a:
                cnum = int(parse_page(a['href']).split(':')[-1])
                chapter = a.string.strip()
                match = re_recorder.match(str(a.next_sibling).strip())
                if match:
                    recorder = match.group(1)
                aleftstr = a.previous_sibling
                if isinstance(aleftstr, bs4.NavigableString):
                    cauthor = str(aleftstr).strip().rstrip('《【')
            else:
                cnum = chapter = recorder = None
            match = re_audiobookchapter.match(td.get_text().strip())
            if not match:
                match = re_audiobookchapter2.match(td.get_text().strip())
            if match:
                # check real no author
                if cauthor is None:
                    cauthor = match.group(1)
                cauthor = cauthor or None
                chapter = chapter or match.group(2)
                recorder = recorder or match.group(3)
                add_date = date_fmt(match.group(4))
    return bookid, title, author, bookrecorder, audiofiles
Example #26
Source File: nekur.py From bazarr with GNU General Public License v3.0
def query(self, title):
    subtitles = []

    data = {
        'ajax': '1',
        'sSearch': title,
    }

    r = self.session.post(self.search_url, data=data, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

    # loop over subtitle cells
    rows = soup.select('tbody > tr')
    for row in rows:
        # title
        title_anchor_el = row.select_one('.title > a')
        title_inner_text = [element for element in title_anchor_el
                            if isinstance(element, NavigableString)]
        title = title_inner_text[0].strip()

        # year
        year = row.select_one('.year').text.strip('()')

        # download link
        href = title_anchor_el.get('href')
        download_link = self.server_url + href

        # imdb id
        imdb_td = row.select_one('td:nth-of-type(4)')
        imdb_link = imdb_td.select_one('a').get('href')
        imdb_id = imdb_link.split('/')[-2]

        # fps
        fps = row.select_one('.fps').text.strip()

        # additional notes
        notes = row.select_one('.notes').text.strip()

        # page link = download link (there is no separate subtitle page link)
        page_link = download_link

        # create/add the subtitle
        subtitle = self.subtitle_class(Language.fromalpha2('lv'), page_link, download_link,
                                       title, year, imdb_id, fps, notes)
        logger.debug('nekur: Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
Example #27
Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License
def GenerateHTML(self, controller, minify=False, prettify=False):
  soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Doctype):
      x.extract()

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Declaration):
      x.extract()

  # Remove all imports.
  imports = soup.findAll('link', rel='import')
  for imp in imports:
    imp.extract()

  # Remove all script links.
  scripts_external = soup.findAll('script', src=True)
  for script in scripts_external:
    script.extract()

  # Remove all in-line scripts.
  scripts_external = soup.findAll('script', src=None)
  for script in scripts_external:
    script.extract()

  # Process all in-line styles.
  inline_styles = soup.findAll('style')
  for style in inline_styles:
    html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
    if html:
      ns = soup.new_tag('style')
      ns.append(bs4.NavigableString(html))
      style.replaceWith(ns)
    else:
      style.extract()

  # Rewrite all external stylesheet hrefs or remove, as needed.
  stylesheet_links = soup.findAll('link', rel='stylesheet')
  for stylesheet_link in stylesheet_links:
    html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
    if html:
      tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
      assert len(tmp) == 1
      stylesheet_link.replaceWith(tmp[0])
    else:
      stylesheet_link.extract()

  # Remove comments if minifying.
  if minify:
    comments = soup.findAll(
        text=lambda text: isinstance(text, bs4.Comment))
    for comment in comments:
      comment.extract()

  if prettify:
    return soup.prettify('utf-8').strip()

  # We are done.
  return unicode(soup).strip()
Example #28
Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License
def GenerateHTML(self, controller, minify=False, prettify=False):
  soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Doctype):
      x.extract()

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Declaration):
      x.extract()

  # Remove all imports.
  imports = soup.findAll('link', rel='import')
  for imp in imports:
    imp.extract()

  # Remove all script links.
  scripts_external = soup.findAll('script', src=True)
  for script in scripts_external:
    script.extract()

  # Remove all in-line scripts.
  scripts_external = soup.findAll('script', src=None)
  for script in scripts_external:
    script.extract()

  # Process all in-line styles.
  inline_styles = soup.findAll('style')
  for style in inline_styles:
    html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
    if html:
      ns = soup.new_tag('style')
      ns.append(bs4.NavigableString(html))
      style.replaceWith(ns)
    else:
      style.extract()

  # Rewrite all external stylesheet hrefs or remove, as needed.
  stylesheet_links = soup.findAll('link', rel='stylesheet')
  for stylesheet_link in stylesheet_links:
    html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
    if html:
      tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
      assert len(tmp) == 1
      stylesheet_link.replaceWith(tmp[0])
    else:
      stylesheet_link.extract()

  # Remove comments if minifying.
  if minify:
    comments = soup.findAll(
        text=lambda text: isinstance(text, bs4.Comment))
    for comment in comments:
      comment.extract()

  if prettify:
    return soup.prettify('utf-8').strip()

  # We are done.
  return unicode(soup).strip()
Example #29
Source File: parse_html_deps.py From Jandroid with BSD 3-Clause "New" or "Revised" License
def GenerateHTML(self, controller, minify=False, prettify=False):
  soup = _CreateSoupWithoutHeadOrBody(unicode(self._soup))

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Doctype):
      x.extract()

  # Remove declaration.
  for x in soup.contents:
    if isinstance(x, bs4.Declaration):
      x.extract()

  # Remove all imports.
  imports = soup.findAll('link', rel='import')
  for imp in imports:
    imp.extract()

  # Remove all script links.
  scripts_external = soup.findAll('script', src=True)
  for script in scripts_external:
    script.extract()

  # Remove all in-line scripts.
  scripts_external = soup.findAll('script', src=None)
  for script in scripts_external:
    script.extract()

  # Process all in-line styles.
  inline_styles = soup.findAll('style')
  for style in inline_styles:
    html = controller.GetHTMLForInlineStylesheet(unicode(style.string))
    if html:
      ns = soup.new_tag('style')
      ns.append(bs4.NavigableString(html))
      style.replaceWith(ns)
    else:
      style.extract()

  # Rewrite all external stylesheet hrefs or remove, as needed.
  stylesheet_links = soup.findAll('link', rel='stylesheet')
  for stylesheet_link in stylesheet_links:
    html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
    if html:
      tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
      assert len(tmp) == 1
      stylesheet_link.replaceWith(tmp[0])
    else:
      stylesheet_link.extract()

  # Remove comments if minifying.
  if minify:
    comments = soup.findAll(
        text=lambda text: isinstance(text, bs4.Comment))
    for comment in comments:
      comment.extract()

  if prettify:
    return soup.prettify('utf-8').strip()

  # We are done.
  return unicode(soup).strip()
Example #30
Source File: html.py From training_results_v0.5 with Apache License 2.0
def _soup_strings(soup):
  """Return text strings in soup."""
  paragraph_tags = set([
      "caption", "details", "h1", "h2", "h3", "h4", "h5", "h6", "li", "p",
      "td", "div", "span"
  ])

  skip_children = None
  for descendant in soup.descendants:
    # If we've treated a tag as a contiguous paragraph, don't re-emit the
    # children (see below).
    if skip_children is not None:
      try:
        in_skip = descendant in skip_children  # pylint: disable=unsupported-membership-test
      except RecursionError:  # pylint: disable=undefined-variable
        # Possible for this check to hit a nasty infinite recursion because of
        # BeautifulSoup __eq__ checks.
        in_skip = True
      if in_skip:
        continue
      else:
        skip_children = None

    # Treat some tags as contiguous paragraphs, regardless of other tags nested
    # inside (like <a> or <b>).
    if isinstance(descendant, bs4.Tag):
      if descendant.name in paragraph_tags:
        if descendant.find_all(paragraph_tags):
          # If there are nested paragraph tags, don't treat it as a single
          # contiguous tag.
          continue
        skip_children = list(descendant.descendants)
        text = " ".join(descendant.get_text(" ", strip=True).split())
        if text:
          yield text
      continue

    if (isinstance(descendant, bs4.Comment) or
        not isinstance(descendant, bs4.NavigableString)):
      continue

    text = " ".join(descendant.strip().split())
    if text:
      yield text