Python bs4.element.Tag() Examples
The following are 28 code examples of bs4.element.Tag(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module bs4.element, or try the search function.
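As a quick orientation before the project-sourced examples, here is a minimal, self-contained sketch of the pattern most of them rely on — checking isinstance(node, Tag) while walking a parse tree. The HTML snippet is invented for illustration only.

from bs4 import BeautifulSoup
from bs4.element import Tag

soup = BeautifulSoup('<div><p class="note">hello</p>plain text</div>', 'html.parser')
for child in soup.div.children:            # mix of Tag and NavigableString nodes
    if isinstance(child, Tag):             # only Tag objects have .name and .attrs
        print(child.name, child.attrs)     # -> p {'class': ['note']}
    else:
        print('non-tag node:', repr(child))  # -> 'plain text'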
Example #1
Source File: scrape_announcements.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 6 votes |
def parse_build_notes(h: Tag) -> None:
    entries = []
    for n in h.next_elements:
        if isinstance(n, Tag) and n.name == 'p':
            if 'posted-in' in n.attrs.get('class', []):
                break
            if n.text:
                entries.append(n.text)
    embed = {
        'title': 'MTGO Build Notes',
        'type': 'rich',
        'description': '\n'.join(entries),
        'url': fetcher.find_announcements()[0],
    }
    if configuration.get_optional_str('bugs_webhook_id') is not None:
        fetch_tools.post_discord_webhook(
            configuration.get_str('bugs_webhook_id'),
            configuration.get_str('bugs_webhook_token'),
            embeds=[embed],
            username='Magic Online Announcements',
            avatar_url='https://magic.wizards.com/sites/mtg/files/styles/auth_small/public/images/person/wizards_authorpic_larger.jpg'
        )
Example #2
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 6 votes |
def parse_changelog(collapsible_block: Tag) -> None:
    # They never show Fixed bugs in the Bug Blog anymore. Fixed bugs are now listed on the Build Notes section of MTGO weekly announcements.
    # This is frustrating.
    for added in collapsible_block.find_all('ul'):
        for item in added.find_all('li'):
            print(item)
            bbt = strings.remove_smartquotes(item.get_text())
            issue = find_issue_by_code(bbt)
            if issue is not None:
                if not repo.is_issue_from_bug_blog(issue):
                    print('Adding Bug Blog to labels')
                    issue.add_to_labels('From Bug Blog')
            elif find_issue_by_name(bbt):
                print('Already exists.')
            else:
                print('Creating new issue')
                text = 'From Bug Blog.\nBug Blog Text: {0}'.format(bbt)
                repo.get_repo().create_issue(bbt, body=strings.remove_smartquotes(text), labels=['From Bug Blog'])
Example #3
Source File: decrypt.py From Anti-Spider with MIT License | 6 votes |
def decrypt_woff_tag(tag, TTGlyphs, d_list):
    contents = tag.contents
    _ = []
    while contents:
        i = contents.pop(0)
        if isinstance(i, Tag):
            if i.name in decrypt_tags:
                text = dec(i.text)
                for index, name in enumerate(TTGlyphs):
                    if text in name:
                        i = d_list[index]
            else:
                continue
        if not isinstance(i, str):
            continue
        _.append(i)
    return ''.join(_)
Example #4
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 6 votes |
def check_if_removed_from_bugblog(bbt: Match, b: Tag, issue: Issue) -> None:
    if bbt is not None:
        text = strings.remove_smartquotes(bbt.group(1).strip())
        for row in b.find_all('tr'):
            data = row.find_all('td')
            rowtext = strings.remove_smartquotes(data[1].text.strip())
            if rowtext == text:
                break
            if strip_squarebrackets(rowtext) == strip_squarebrackets(text):
                # Fix this
                print("Issue #{id}'s bug blog text has differing autocard notation.".format(id=issue.number))
                old_bbt = strings.get_body_field(issue.body, 'Bug Blog Text')
                body = re.sub(BBT_REGEX, 'Bug Blog Text: {0}'.format(rowtext), issue.body, flags=re.MULTILINE)
                new_bbt = strings.get_body_field(body, 'Bug Blog Text')
                issue.edit(body=body)
                print('Updated to `{0}`'.format(rowtext))
                issue.create_comment(f'Changed bug blog text from `{old_bbt}` to `{new_bbt}`')
                break
        else:
            print('{id} is fixed!'.format(id=issue.number))
            repo.create_comment(issue, 'This bug has been removed from the bug blog!')
            issue.edit(state='closed')
Example #5
Source File: deeru_html.py From DeerU with GNU General Public License v3.0 | 6 votes |
def get_tag_from_bs(cls, soup):
    from bs4 import BeautifulSoup as bs
    from bs4.element import Tag as bs_tag
    father = None
    if isinstance(soup, bs):
        father = soup.find()
    elif isinstance(soup, bs_tag):
        father = soup
    if not father or not father.name:
        return None
    tag = cls(father.name, father.text, father.attrs)
    for c in father.children:
        c_tag = cls.get_tag_from_bs(c)
        tag.append(c_tag)
    return tag
Example #6
Source File: helpers.py From robobrowser with BSD 3-Clause "New" or "Revised" License | 6 votes |
def ensure_soup(value, parser=None):
    """Coerce a value (or list of values) to Tag (or list of Tag).

    :param value: String, BeautifulSoup, Tag, or list of the above
    :param str parser: Parser to use; defaults to BeautifulSoup default
    :return: Tag or list of Tags

    """
    if isinstance(value, BeautifulSoup):
        return value.find()
    if isinstance(value, Tag):
        return value
    if isinstance(value, list):
        return [
            ensure_soup(item, parser=parser)
            for item in value
        ]
    parsed = BeautifulSoup(value, features=parser)
    return parsed.find()
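A brief usage sketch for the ensure_soup helper above; the HTML fragments are illustrative only, and the calls assume the definition above is in scope.

from bs4.element import Tag

link = ensure_soup('<a href="/home">home</a>')       # string -> <a> Tag
same = ensure_soup(link)                              # Tag    -> returned unchanged
many = ensure_soup(['<p>one</p>', '<p>two</p>'])      # list   -> list of <p> Tags
print(isinstance(link, Tag), same is link, len(many))  # True True 2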
Example #7
Source File: fetcher.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def parse_article_item_extended(a: Tag) -> Tuple[Tag, str]:
    title = a.find_all('h3')[0]
    link = 'http://magic.wizards.com' + a.find_all('a')[0]['href']
    return (title, link)
Example #8
Source File: Junos.py From assimilator with MIT License | 5 votes |
def get(self):
    if not self.dev.connected:
        logger.error("{0}: Firewall timed out or incorrect device credentials.".format(self.firewall_config['name']))
        return {'error' : 'Could not connect to device.'}, 504
    else:
        logger.info("{0}: Connected successfully.".format(self.firewall_config['name']))
    try:
        rpc = etree.tostring(str(jns.rpc.get_security_policies_hit_count()), encoding='unicode')
    except Exception as e:
        logger.error("Error parsing rpc: {0}".format(str(e)))
        return {'error' : 'Error parsing soup.'}, 500
    finally:
        self.dev.close()
    soup = BS(rpc,'xml')
    entries = list()
    for hitcount in soup.find('policy-hit-count').children:
        if type(hitcount) != Tag or hitcount.name != 'policy-hit-count-entry':
            continue
        aux = {
            'count' : int(hitcount.find('policy-hit-count-count').text),
            'from' : hitcount.find('policy-hit-count-from-zone').text,
            'to' : hitcount.find('policy-hit-count-to-zone').text,
            'policy' : hitcount.find('policy-hit-count-policy-name').text
        }
        entries.append(aux)
    return {'len' : len(entries), 'hitcount' : entries}
Example #9
Source File: Junos.py From assimilator with MIT License | 5 votes |
def get(self):
    if not self.dev.connected:
        logger.error("{0}: Firewall timed out or incorrect device credentials.".format(self.firewall_config['name']))
        return {'error' : 'Could not connect to device.'}, 504
    else:
        logger.info("{0}: Connected successfully.".format(self.firewall_config['name']))
    rpc = etree.tostring(self.dev.rpc.get_commit_information(), encoding='unicode')
    soup = BS(rpc,'xml')
    entries = list()
    logger.debug("soup: {0}".format(str(soup)))
    for entry in soup.find('commit-information').children:
        if type(entry) != Tag:
            continue
        entries.append({'user' : entry.user.text, 'sequence' : entry.find('sequence-number').text,
                        'date' : entry.find('date-time').text, 'comment' : entry.log.text if entry.log else None})
    return {'len' : len(entries), 'commit' : entries}
Example #10
Source File: helpers.py From NBAsh with MIT License | 5 votes |
def GetTextOfItem(item, default_value=''):
    if item is not None and isinstance(item, element.Tag):
        return item.get_text()
    else:
        return default_value
Example #11
Source File: filter_callouts.py From dactyl with MIT License | 5 votes |
def filter_soup(soup, currentpage={}, config={}, **kwargs):
    """
    Find patterns that look like callouts, for example **Note:**, and add
    callout classes to their parent elements (usually <p>)
    """
    # callout classes are defined by page>target>config>default
    callout_classes = currentpage.get(CALLOUT_TYPES_FIELD,
                                      config.get(CALLOUT_TYPES_FIELD, DEFAULT_CALLOUT_TYPES))
    callout_intro = re.compile(r"("+"|".join(callout_classes)+"):?$", re.I)
    callout_base_class = currentpage.get(CALLOUT_CLASS_FIELD,
                                         config.get(CALLOUT_CLASS_FIELD, DEFAULT_CALLOUT_CLASS))
    callouts = soup.find_all(name=["strong","em"], string=callout_intro)
    for c in callouts:
        if not c.previous_sibling: #This callout starts a block
            callout_type = c.string.replace(":","").lower()
            if callout_type in callout_classes:
                if (c.parent.parent.name == "blockquote"
                        and Tag not in [type(u) for u in c.parent.previous_siblings]):
                    # Special case for blockquotes, to allow multiline callouts.
                    # First element of BQ must start with a callout keyword
                    callout_el = c.parent.parent
                else:
                    callout_el = c.parent
                callout_el["class"] = [callout_base_class, callout_type]
Example #12
Source File: test_feedback.py From notifications-admin with MIT License | 5 votes |
def test_email_address_required_for_problems_and_questions(
    client_request,
    mocker,
    data,
    ticket_type,
):
    mocker.patch('app.main.views.feedback.zendesk_client')
    client_request.logout()
    page = client_request.post(
        'main.feedback',
        ticket_type=ticket_type,
        _data=data,
        _expected_status=200
    )
    assert isinstance(page.find('span', {'class': 'error-message'}), element.Tag)
Example #13
Source File: tbentries.py From open-context-py with GNU General Public License v3.0 | 5 votes |
def check_fix_photobox(self, node):
    """ gets rid of empty tags """
    if isinstance(node, Tag):
        if node.get('class') is not None:
            node_clases = node.get('class')
            if node_clases[0] == self.pc_photobox_class:
                # print('Check: '+ node_clases[0])
                make_span = False
                show_img = False
                img_nodes = node.find_all(['img'])
                if len(img_nodes) < 1:
                    make_span = True
                else:
                    for img in img_nodes:
                        # print('img style: ' + str(img.get('style')))
                        if img.get('style') is None:
                            show_img = True
                        else:
                            img_styles = img.get('style')
                            if 'display:none;' in img_styles:
                                # print('ok img..')
                                pass
                            else:
                                # print('crap!')
                                show_img = True
                    if show_img is False:
                        # print('img ok to span transform')
                        make_span = True
                if make_span:
                    # print('Span transform')
                    node.name = 'span'
Example #14
Source File: babelnovel.py From lightnovel-crawler with Apache License 2.0 | 5 votes |
def download_chapter_body(self, chapter):
    logger.info('Visiting %s', chapter['json_url'])
    data = self.get_json(chapter['json_url'])

    soup = BeautifulSoup(data['data']['content'], 'lxml')
    if self.bad_selectors:
        for tag in soup.select(self.bad_selectors):
            tag.extract()
        # end for
    # end if

    body = soup.find('body')
    self.clean_contents(body)
    for tag in body.contents:
        if not str(tag).strip():
            tag.extract()
        elif isinstance(tag, Tag):
            tag.name = 'p'
        # end if
    # end for

    # body = data['data']['content']
    result = str(body)
    result = re.sub(r'\n\n', '<br><br>', result)
    return result
# end def
# end class
Example #15
Source File: helpers.py From robobrowser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def lowercase_attr_names(tag):
    """Lower-case all attribute names of the provided BeautifulSoup tag.
    Note: this mutates the tag's attribute names and does not return a new
    tag.

    :param Tag: BeautifulSoup tag

    """
    # Use list comprehension instead of dict comprehension for 2.6 support
    tag.attrs = dict([
        (key.lower(), value)
        for key, value in iteritems(tag.attrs)
    ])
Example #16
Source File: visual_linker.py From fonduer with MIT License | 5 votes |
def _coordinates_from_HTML(
    self, page: Tag, page_num: int
) -> Tuple[
    List[PdfWord],
    Dict[PdfWordId, Bbox],
]:
    pdf_word_list: List[PdfWord] = []
    coordinate_map: Dict[PdfWordId, Bbox] = {}
    block_coordinates: Dict[PdfWordId, Tuple[int, int]] = {}
    blocks = page.find_all("block")
    i = 0  # counter for word_id in page_num
    for block in blocks:
        x_min_block = int(float(block.get("xmin")))
        y_min_block = int(float(block.get("ymin")))
        lines = block.find_all("line")
        for line in lines:
            y_min_line = int(float(line.get("ymin")))
            y_max_line = int(float(line.get("ymax")))
            words = line.find_all("word")
            for word in words:
                xmin = int(float(word.get("xmin")))
                xmax = int(float(word.get("xmax")))
                for content in self.separators.split(word.getText()):
                    if len(content) > 0:  # Ignore empty characters
                        word_id: PdfWordId = (page_num, i)
                        pdf_word_list.append((word_id, content))
                        coordinate_map[word_id] = Bbox(
                            page_num,
                            y_min_line,
                            y_max_line,
                            xmin,
                            xmax,
                        )
                        block_coordinates[word_id] = (y_min_block, x_min_block)
                        i += 1
    # sort pdf_word_list by page, block top then block left, top, then left
    pdf_word_list = sorted(
        pdf_word_list,
        key=lambda word_id__: block_coordinates[word_id__[0]]
        + (coordinate_map[word_id__[0]].top, coordinate_map[word_id__[0]].left),
    )
    return pdf_word_list, coordinate_map
Example #17
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def find_bbt_in_issue_title(issue: Issue, known_issues: Tag) -> None:
    title = strip_squarebrackets(issue.title).replace(' ', '')
    for row in known_issues.find_all('tr'):
        data = row.find_all('td')
        row_text = strip_squarebrackets(data[1].text.strip()).replace(' ', '')
        if row_text == title:
            body = issue.body
            body += '\nBug Blog Text: {0}'.format(data[1].text.strip())
            if body != issue.body:
                issue.edit(body=body)
            return
Example #18
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def check_for_missing_bugs(b: Tag) -> None:
    for row in b.find_all('tr'):
        data = row.find_all('td')
        row_text = data[1].text.strip()
        if row_text == 'Description':  # BS4 is bad.
            continue
        issue = find_issue_by_code(row_text)
        if issue:
            labels = [c.name for c in issue.labels]
            categories = [c for c in labels if c in strings.METACATS]
            if categories:
                continue
            bbcat = re.match(strings.REGEX_BBCAT, data[2].text.strip())
            if bbcat is None:
                continue
            g1 = bbcat.group(1).strip()
            if g1 in strings.METACATS:
                issue.add_to_labels(g1)
                continue
            if bbcat.group(2) is not None:
                g2 = bbcat.group(2).strip()
                if g2 in strings.METACATS:
                    issue.add_to_labels(g2)
                    continue
            print(f'Unknown BBCat: {bbcat.group(0)}')
            continue
        print('Could not find issue for `{row}`'.format(row=row_text))
        text = 'From Bug Blog.\nBug Blog Text: {0}'.format(row_text)
        repo.get_repo().create_issue(strings.remove_smartquotes(row_text), body=strings.remove_smartquotes(text), labels=['From Bug Blog'])
Example #19
Source File: scrape_announcements.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def parse_downtimes(h: Tag) -> None:
    for n in h.next_elements:
        if isinstance(n, Tag) and n.text:
            with open('downtimes.txt', 'w', encoding='utf-8') as f:
                txt = n.text.strip()
                txt = txt.replace("Please note that there are no more 'extended' or 'normal' downtimes; in the new world with fewer downtimes, they're all the same length of time.", '')
                print(txt)
                f.write(txt)
            break
Example #20
Source File: scrape_announcements.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def parse_header(h: Tag) -> None:
    txt = h.text
    if txt.startswith('Downtime'):
        parse_downtimes(h)
    elif txt.startswith('Build Notes'):
        parse_build_notes(h)
Example #21
Source File: fetcher.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def get_article_archive() -> List[Tuple[Tag, str]]:
    try:
        html = fetch_tools.fetch('http://magic.wizards.com/en/articles/archive/184956')
    except fetch_tools.FetchException:
        html = fetch_tools.fetch('http://magic.wizards.com/en/articles/archive/')
    soup = BeautifulSoup(html, 'html.parser')
    return [parse_article_item_extended(a) for a in soup.find_all('div', class_='article-item-extended')]
Example #22
Source File: doc.py From bot with MIT License | 5 votes |
def _match_end_tag(tag: Tag) -> bool:
    """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
    for attr in SEARCH_END_TAG_ATTRS:
        if attr in tag.get("class", ()):
            return True
    return tag.name == "table"
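A small self-contained sketch of the same class-matching pattern; the SEARCH_END_TAG_ATTRS value below is a made-up placeholder, not the bot project's actual constant.

from bs4 import BeautifulSoup
from bs4.element import Tag

SEARCH_END_TAG_ATTRS = ("section", "highlight")  # placeholder class names for illustration

def match_end_tag(tag: Tag) -> bool:
    # True when the tag carries one of the "end" classes, or is a table
    return tag.name == "table" or any(attr in tag.get("class", ()) for attr in SEARCH_END_TAG_ATTRS)

soup = BeautifulSoup('<div class="section"></div><p>text</p><table></table>', "html.parser")
print([t.name for t in soup.find_all(True) if match_end_tag(t)])  # ['div', 'table']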
Example #23
Source File: xss_utils.py From ITWSV with MIT License | 4 votes |
def study(bs_node, parent=None, keyword=""):
    entries = []

    # if parent is None:
    #     print("Keyword is: {0}".format(keyword))
    if keyword in str(bs_node).lower():
        if isinstance(bs_node, element.Tag):
            if keyword in str(bs_node.attrs):
                for k, v in bs_node.attrs.items():
                    if keyword in v:
                        # print("Found in attribute value {0} of tag {1}".format(k, bs_node.name))
                        noscript = close_noscript(bs_node)
                        d = {"type": "attrval", "name": k, "tag": bs_node.name, "noscript": noscript}
                        if d not in entries:
                            entries.append(d)
                    if keyword in k:
                        # print("Found in attribute name {0} of tag {1}".format(k, bs_node.name))
                        noscript = close_noscript(bs_node)
                        d = {"type": "attrname", "name": k, "tag": bs_node.name, "noscript": noscript}
                        if d not in entries:
                            entries.append(d)
            elif keyword in bs_node.name:
                # print("Found in tag name")
                noscript = close_noscript(bs_node)
                d = {"type": "tag", "value": bs_node.name, "noscript": noscript}
                if d not in entries:
                    entries.append(d)
            # recursively search injection points for the same variable
            for x in bs_node.contents:
                for entry in study(x, parent=bs_node, keyword=keyword):
                    if entry not in entries:
                        entries.append(entry)
        elif isinstance(bs_node, element.Comment):
            # print("Found in comment, tag {0}".format(parent.name))
            noscript = close_noscript(bs_node)
            d = {"type": "comment", "parent": parent.name, "noscript": noscript}
            if d not in entries:
                entries.append(d)
        elif isinstance(bs_node, element.NavigableString):
            # print("Found in text, tag {0}".format(parent.name))
            noscript = close_noscript(bs_node)
            d = {"type": "text", "parent": parent.name, "noscript": noscript}
            if d not in entries:
                entries.append(d)
    return entries

# generate a list of payloads based on where in the webpage the js-code will be injected
Example #24
Source File: tbentries.py From open-context-py with GNU General Public License v3.0 | 4 votes |
def remove_empty_node(self, node):
    """ gets rid of empty tags """
    if isinstance(node, Tag):
        keep_nodes = ['img', 'td', 'tr', 'th']
        if node.name.lower() not in keep_nodes:
            remove_node = False
            no_child_remove_tags = ['a', 'img', 'th', 'tr', 'td', 'strong',
                                    'ul', 'ol', 'li', 'em', 'i', 'u', 'b',
                                    'sup', 'sub', 'mark', 'q', 'samp', 'small']
            ok_child_nodes = node.find_all(no_child_remove_tags)
            if len(ok_child_nodes) < 1:
                # ok no images check for text
                all_string = ''
                for act_string in node.stripped_strings:
                    all_string += str(act_string)
                for d_child in node.descendants:
                    if isinstance(d_child, Tag):
                        for act_string in d_child.stripped_strings:
                            all_string += str(act_string)
                all_string = all_string.strip()
                # print('Check on: <' + node.name.lower() + '> with: ' + str(all_string))
                if len(all_string) < 1:
                    remove_node = True
                if isinstance(node.string, str):
                    n_string = node.string
                    n_string = n_string.strip()
                    if len(n_string) < 1:
                        remove_node = True
                    else:
                        remove_node = False
            if remove_node:
                # print('Removing a: <' + node.name.lower() + '>')
                node.extract()
Example #25
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 4 votes |
def parse_knownbugs(b: Tag) -> None:
    # attempt to find all the fixed bugs
    all_codes = b.find_all(string=lambda text: isinstance(text, Comment))
    all_codes = [str(code).replace('\t', ' ') for code in all_codes]
    for issue in repo.get_repo().get_issues():
        # code = re.search(CODE_REGEX, issue.body, re.MULTILINE)
        bbt = re.search(BBT_REGEX, issue.body, re.MULTILINE)
        if bbt is None:
            cards = strings.get_cards_from_string(issue.title)
            if repo.is_issue_from_bug_blog(issue):
                find_bbt_in_body_or_comments(issue)
                find_bbt_in_issue_title(issue, b)
                bbt = re.search(BBT_REGEX, issue.body, re.MULTILINE)
                if bbt is None:
                    print('Issue #{id} {cards} has no Bug Blog text!'.format(id=issue.number, cards=cards))
                    issue.add_to_labels('Invalid Bug Blog')
                continue
            if not cards:
                continue
            lines = b.find_all(string=re.compile(r'\[' + cards[0] + r'\]'))
            if not lines:
                continue
            for line in lines:
                parent = line.parent
                bb_text = parent.get_text().strip()
                if find_issue_by_code(bb_text) is not None:
                    print('Already assigned.')
                    continue
                text = ''.join(parent.strings)
                print(text)
                repo.create_comment(issue, 'Found in bug blog.\nBug Blog Text: {0}'.format(text))
                if not repo.is_issue_from_bug_blog(issue):
                    issue.add_to_labels('From Bug Blog')
            continue
        if 'Invalid Bug Blog' in [i.name for i in issue.labels]:
            issue.remove_from_labels('Invalid Bug Blog')
        if repo.is_issue_from_bug_blog(issue):  # Don't check for Bug Blog Text if it's not marked as a BB issue (Maybe because it was reopened)
            check_if_removed_from_bugblog(bbt, b, issue)
    check_for_missing_bugs(b)
Example #26
Source File: extract.py From dart-fss with MIT License | 4 votes |
def seek_table(tables: List, includes: Pattern,
               excludes: Union[Pattern, None] = None) -> Tuple[Union[str, None], Union[str, None], Union[str, None]]:
    """ Table search """
    regex = re.compile(r'\d{4}(.*?)\d{2}(.*?)\d{2}')
    for table in tables:
        for tag in table.previous_siblings:
            if tag in tables:
                break
            if isinstance(tag, Tag):
                children = tag.findChildren(text=includes)
                for child in children:
                    title = child
                    if title:
                        title = re.sub(r'\s+', '', title)
                        if excludes and excludes.search(title):
                            continue
                        if len(title) > 12:
                            continue
                        header = table.find_previous('table', class_='nb')
                        if header is None:
                            continue
                        tr_list = header.find_all('tr')
                        if len(tr_list) < 2:
                            continue
                        tr_cnt = 0
                        for tr in tr_list:
                            if regex.search(tr.text):
                                tr_cnt += 1
                        if tr_cnt == 0:
                            found = table.find_previous(text=re.compile(r'\d{4}(.*?)\d{2}(.*?)\d{2}'))
                            if found is None:
                                continue
                            header = found.parent
                            extract_text = re.sub('<.*?>', '\n', str(header))
                            extract_text = extract_text.split('\n')
                            html = '<table class="nb"><tbody>'
                            error = False
                            for t in extract_text:
                                if t.strip() == '':
                                    pass
                                else:
                                    if len(t) > 100:
                                        error = True
                                        break
                                    html += '<tr><td>' + t + '</td></tr>'
                            if error:
                                continue
                            html += '</tbody></table>'
                            header = BeautifulSoup(html, 'html.parser')
                        return title, header, table
    return None, None, None
Example #27
Source File: Junos.py From assimilator with MIT License | 4 votes |
def get(self,args):
    logger.debug("class rules(JUNOS).get({0})".format(str(args)))
    if not self.dev.connected:
        logger.error("{0}: Firewall timed out or incorrect device credentials.".format(self.firewall_config['name']))
        return {'error' : 'Could not connect to device.'}, 504
    else:
        logger.info("{0}: Connected successfully.".format(self.firewall_config['name']))
    try:
        soup = BS(str(etree.tostring(self.dev.rpc.get_firewall_policies(), encoding='unicode')),'xml')
        logger.debug("soup: " + str(soup))
    except Exception as e:
        logger.error("Error parsing soup: {0}".format(str(e)))
        return {'error' : 'Error parsing soup.'}, 500
    finally:
        logger.debug("Closing device...")
        self.dev.close()
    entries = list()
    for context in soup.find("security-policies").children:
        if type(context) != Tag:
            continue
        elif context.name == "default-policy":
            continue
        else:
            logger.debug("context: {0}".format(str(context)))
            src_zone = context.find("context-information").find("source-zone-name").text
            dst_zone = context.find("context-information").find("destination-zone-name").text
            logger.debug("src_zone: {0}\ndst_zone: {1}\n".format(src_zone,dst_zone))
            for rule in context.children:
                logger.debug("Rule: {0}".format(str(rule)))
                if rule.name == "context-information" or type(rule) != Tag:
                    continue
                aux = {
                    "enabled" : True if rule.find('policy-state').text == 'enabled' else False,
                    "id" : int(rule.find('policy-identifier').text),
                    "action": rule.find('policy-information').find('policy-action').find('action-type').text,
                    "destination": list(),
                    "from": src_zone,
                    "logging": False if rule.find('policy-information').find('policy-action').find('log') else rule.find('policy-information').find('policy-action').find('log'),
                    "name": rule.find('policy-information').find('policy-name').text,
                    "application": list(),
                    "source": list(),
                    "to": dst_zone
                }
                for addr in rule.find('source-addresses').children:
                    if type(addr) != Tag:
                        continue
                    aux['source'].append(addr.find('address-name').text)
                for addr in rule.find('destination-addresses').children:
                    if type(addr) != Tag:
                        continue
                    aux['destination'].append(addr.find('address-name').text)
                for addr in rule.find('applications').children:
                    if type(addr) != Tag:
                        continue
                    aux['application'].append(addr.find('application-name').text)
                entries.append(aux)
    #entries = self.filter(args,entries)
    return {'len' : len(entries), 'rules' : entries}
Example #28
Source File: mod16.py From RHEAS with MIT License | 4 votes |
def download(dbname, dts, bbox):
    """Downloads the MODIS evapotranspiration data product MOD16 for
    a set of dates *dt* and imports them into the PostGIS database *dbname*."""
    log = logging.getLogger(__name__)
    res = 0.01
    urlbase = "http://files.ntsg.umt.edu"
    tiles = modis.findTiles(bbox)
    if tiles is not None:
        for dt in [dts[0] + timedelta(dti) for dti in range((dts[-1] - dts[0]).days + 1)]:
            url = "{0}/data/NTSG_Products/MOD16/MOD16A2.105_MERRAGMAO/Y{1}".format(urlbase, dt.year)
            resp_year = requests.get(url)
            try:
                assert resp_year.status_code == 200
                days = [link for link in BeautifulSoup(resp_year.text, parse_only=SoupStrainer('a'))
                        if isinstance(link, Tag) and link.text.find(dt.strftime("%j")) >= 0]
                assert len(days) > 0
                resp_day = requests.get("{0}{1}".format(urlbase, days[0].get('href')))
                assert resp_day.status_code == 200
                files = [link.get('href') for link in BeautifulSoup(resp_day.text, parse_only=SoupStrainer('a'))
                         if isinstance(link, Tag) and link.text.find("hdf") > 0]
                files = [f for f in files if any(f.find("h{0:02d}v{1:02d}".format(t[1], t[0])) > 0 for t in tiles)]
                outpath = tempfile.mkdtemp()
                for fname in files:
                    resp_file = requests.get("{0}{1}".format(urlbase, fname))
                    filename = fname.split("/")[-1]
                    with open("{0}/{1}".format(outpath, filename), 'wb') as fout:
                        for chunk in resp_file:
                            fout.write(chunk)
                    proc = subprocess.Popen(["gdal_translate", "HDF4_EOS:EOS_GRID:{0}/{1}:MOD_Grid_MOD16A2:ET_1km".format(
                        outpath, filename), "{0}/{1}".format(outpath, filename).replace("hdf", "tif")],
                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                    out, err = proc.communicate()
                    log.debug(out)
                tifs = glob.glob("{0}/*.tif".format(outpath))
                proc = subprocess.Popen(
                    ["gdal_merge.py", "-o", "{0}/et.tif".format(outpath)] + tifs,
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                out, err = proc.communicate()
                log.debug(out)
                proc = subprocess.Popen(["gdal_calc.py", "-A", "{0}/et.tif".format(outpath), "--outfile={0}/et1.tif".format(
                    outpath), "--NoDataValue=-9999", "--calc=(A<32701)*(0.1*A+9999)-9999"],
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                out, err = proc.communicate()
                log.debug(out)
                proc = subprocess.Popen(["gdalwarp", "-t_srs", "+proj=latlong +ellps=sphere", "-tr", str(
                    res), str(-res), "{0}/et1.tif".format(outpath), "{0}/et2.tif".format(outpath)],
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                out, err = proc.communicate()
                log.debug(out)
                if bbox is None:
                    pstr = []
                else:
                    pstr = ["-projwin", str(bbox[0]), str(bbox[3]), str(bbox[2]), str(bbox[1])]
                proc = subprocess.Popen(["gdal_translate"] + pstr + ["-a_srs", "epsg:4326",
                                        "{0}/et2.tif".format(outpath), "{0}/et3.tif".format(outpath)],
                                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                out, err = proc.communicate()
                log.debug(out)
                dbio.ingest(
                    dbname, "{0}/et3.tif".format(outpath), dt, table, False)
                shutil.rmtree(outpath)
            except:
                log.warning("MOD16 data not available for {0}. Skipping download!".format(
                    dt.strftime("%Y-%m-%d")))