Python bs4.element.Tag() Examples
The following are 28 code examples of bs4.element.Tag(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module bs4.element, or try the search function.
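As a quick orientation before the project-sourced examples, here is a minimal, self-contained sketch of the pattern most of them rely on — checking isinstance(node, Tag) while walking a parse tree. The HTML snippet is invented for illustration only.

from bs4 import BeautifulSoup
from bs4.element import Tag

soup = BeautifulSoup('<div><p class="note">hello</p>plain text</div>', 'html.parser')
for child in soup.div.children:            # mix of Tag and NavigableString nodes
    if isinstance(child, Tag):             # only Tag objects have .name and .attrs
        print(child.name, child.attrs)     # -> p {'class': ['note']}
    else:
        print('non-tag node:', repr(child))  # -> 'plain text'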
Example #1
Source File: scrape_announcements.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 6 votes |
def parse_build_notes(h: Tag) -> None:
    entries = []
    for n in h.next_elements:
        if isinstance(n, Tag) and n.name == 'p':
            if 'posted-in' in n.attrs.get('class', []):
                break
            if n.text:
                entries.append(n.text)
    embed = {
        'title': 'MTGO Build Notes',
        'type': 'rich',
        'description': '\n'.join(entries),
        'url': fetcher.find_announcements()[0],
    }
    if configuration.get_optional_str('bugs_webhook_id') is not None:
        fetch_tools.post_discord_webhook(
            configuration.get_str('bugs_webhook_id'),
            configuration.get_str('bugs_webhook_token'),
            embeds=[embed],
            username='Magic Online Announcements',
            avatar_url='https://magic.wizards.com/sites/mtg/files/styles/auth_small/public/images/person/wizards_authorpic_larger.jpg'
        )
Example #2
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 6 votes |
def parse_changelog(collapsible_block: Tag) -> None:
    # They never show Fixed bugs in the Bug Blog anymore. Fixed bugs are now listed on the Build Notes section of MTGO weekly announcements.
    # This is frustrating.
    for added in collapsible_block.find_all('ul'):
        for item in added.find_all('li'):
            print(item)
            bbt = strings.remove_smartquotes(item.get_text())
            issue = find_issue_by_code(bbt)
            if issue is not None:
                if not repo.is_issue_from_bug_blog(issue):
                    print('Adding Bug Blog to labels')
                    issue.add_to_labels('From Bug Blog')
            elif find_issue_by_name(bbt):
                print('Already exists.')
            else:
                print('Creating new issue')
                text = 'From Bug Blog.\nBug Blog Text: {0}'.format(bbt)
                repo.get_repo().create_issue(bbt, body=strings.remove_smartquotes(text), labels=['From Bug Blog'])
Example #3
Source File: decrypt.py From Anti-Spider with MIT License | 6 votes |
def decrypt_woff_tag(tag, TTGlyphs, d_list):
    contents = tag.contents
    _ = []
    while contents:
        i = contents.pop(0)
        if isinstance(i, Tag):
            if i.name in decrypt_tags:
                text = dec(i.text)
                for index, name in enumerate(TTGlyphs):
                    if text in name:
                        i = d_list[index]
            else:
                continue
        if not isinstance(i, str):
            continue
        _.append(i)
    return ''.join(_)
Example #4
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 6 votes |
def check_if_removed_from_bugblog(bbt: Match, b: Tag, issue: Issue) -> None:
    if bbt is not None:
        text = strings.remove_smartquotes(bbt.group(1).strip())
        for row in b.find_all('tr'):
            data = row.find_all('td')
            rowtext = strings.remove_smartquotes(data[1].text.strip())
            if rowtext == text:
                break
            if strip_squarebrackets(rowtext) == strip_squarebrackets(text):
                # Fix this
                print("Issue #{id}'s bug blog text has differing autocard notation.".format(id=issue.number))
                old_bbt = strings.get_body_field(issue.body, 'Bug Blog Text')
                body = re.sub(BBT_REGEX, 'Bug Blog Text: {0}'.format(rowtext), issue.body, flags=re.MULTILINE)
                new_bbt = strings.get_body_field(body, 'Bug Blog Text')
                issue.edit(body=body)
                print('Updated to `{0}`'.format(rowtext))
                issue.create_comment(f'Changed bug blog text from `{old_bbt}` to `{new_bbt}`')
                break
        else:
            print('{id} is fixed!'.format(id=issue.number))
            repo.create_comment(issue, 'This bug has been removed from the bug blog!')
            issue.edit(state='closed')
Example #5
Source File: deeru_html.py From DeerU with GNU General Public License v3.0 | 6 votes |
def get_tag_from_bs(cls, soup):
    from bs4 import BeautifulSoup as bs
    from bs4.element import Tag as bs_tag
    father = None
    if isinstance(soup, bs):
        father = soup.find()
    elif isinstance(soup, bs_tag):
        father = soup
    if not father or not father.name:
        return None
    tag = cls(father.name, father.text, father.attrs)
    for c in father.children:
        c_tag = cls.get_tag_from_bs(c)
        tag.append(c_tag)
    return tag
Example #6
Source File: helpers.py From robobrowser with BSD 3-Clause "New" or "Revised" License | 6 votes |
def ensure_soup(value, parser=None):
    """Coerce a value (or list of values) to Tag (or list of Tag).

    :param value: String, BeautifulSoup, Tag, or list of the above
    :param str parser: Parser to use; defaults to BeautifulSoup default
    :return: Tag or list of Tags

    """
    if isinstance(value, BeautifulSoup):
        return value.find()
    if isinstance(value, Tag):
        return value
    if isinstance(value, list):
        return [
            ensure_soup(item, parser=parser)
            for item in value
        ]
    parsed = BeautifulSoup(value, features=parser)
    return parsed.find()
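A brief usage sketch for the ensure_soup helper above; the HTML fragments are illustrative only, and the calls assume the definition above is in scope.

from bs4.element import Tag

link = ensure_soup('<a href="/home">home</a>')       # string -> <a> Tag
same = ensure_soup(link)                              # Tag    -> returned unchanged
many = ensure_soup(['<p>one</p>', '<p>two</p>'])      # list   -> list of <p> Tags
print(isinstance(link, Tag), same is link, len(many))  # True True 2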
Example #7
Source File: fetcher.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def parse_article_item_extended(a: Tag) -> Tuple[Tag, str]:
    title = a.find_all('h3')[0]
    link = 'http://magic.wizards.com' + a.find_all('a')[0]['href']
    return (title, link)
Example #8
Source File: Junos.py From assimilator with MIT License | 5 votes |
def get(self):
    if not self.dev.connected:
        logger.error("{0}: Firewall timed out or incorrect device credentials.".format(self.firewall_config['name']))
        return {'error' : 'Could not connect to device.'}, 504
    else:
        logger.info("{0}: Connected successfully.".format(self.firewall_config['name']))
    try:
        rpc = etree.tostring(str(jns.rpc.get_security_policies_hit_count()), encoding='unicode')
    except Exception as e:
        logger.error("Error parsing rpc: {0}".format(str(e)))
        return {'error' : 'Error parsing soup.'}, 500
    finally:
        self.dev.close()
    soup = BS(rpc,'xml')
    entries = list()
    for hitcount in soup.find('policy-hit-count').children:
        if type(hitcount) != Tag or hitcount.name != 'policy-hit-count-entry':
            continue
        aux = {
            'count' : int(hitcount.find('policy-hit-count-count').text),
            'from' : hitcount.find('policy-hit-count-from-zone').text,
            'to' : hitcount.find('policy-hit-count-to-zone').text,
            'policy' : hitcount.find('policy-hit-count-policy-name').text
        }
        entries.append(aux)
    return {'len' : len(entries), 'hitcount' : entries}
Example #9
Source File: Junos.py From assimilator with MIT License | 5 votes |
def get(self):
    if not self.dev.connected:
        logger.error("{0}: Firewall timed out or incorrect device credentials.".format(self.firewall_config['name']))
        return {'error' : 'Could not connect to device.'}, 504
    else:
        logger.info("{0}: Connected successfully.".format(self.firewall_config['name']))
    rpc = etree.tostring(self.dev.rpc.get_commit_information(), encoding='unicode')
    soup = BS(rpc,'xml')
    entries = list()
    logger.debug("soup: {0}".format(str(soup)))
    for entry in soup.find('commit-information').children:
        if type(entry) != Tag:
            continue
        entries.append({'user' : entry.user.text, 'sequence' : entry.find('sequence-number').text,
                        'date' : entry.find('date-time').text, 'comment' : entry.log.text if entry.log else None})
    return {'len' : len(entries), 'commit' : entries}
Example #10
Source File: helpers.py From NBAsh with MIT License | 5 votes |
def GetTextOfItem(item, default_value=''):
    if item is not None and isinstance(item, element.Tag):
        return item.get_text()
    else:
        return default_value
Example #11
Source File: filter_callouts.py From dactyl with MIT License | 5 votes |
def filter_soup(soup, currentpage={}, config={}, **kwargs):
    """
    Find patterns that look like callouts, for example **Note:**, and add
    callout classes to their parent elements (usually <p>)
    """
    # callout classes are defined by page>target>config>default
    callout_classes = currentpage.get(CALLOUT_TYPES_FIELD,
                                      config.get(CALLOUT_TYPES_FIELD, DEFAULT_CALLOUT_TYPES))
    callout_intro = re.compile(r"("+"|".join(callout_classes)+"):?$", re.I)
    callout_base_class = currentpage.get(CALLOUT_CLASS_FIELD,
                                         config.get(CALLOUT_CLASS_FIELD, DEFAULT_CALLOUT_CLASS))
    callouts = soup.find_all(name=["strong","em"], string=callout_intro)
    for c in callouts:
        if not c.previous_sibling: #This callout starts a block
            callout_type = c.string.replace(":","").lower()
            if callout_type in callout_classes:
                if (c.parent.parent.name == "blockquote"
                        and Tag not in [type(u) for u in c.parent.previous_siblings]):
                    # Special case for blockquotes, to allow multiline callouts.
                    # First element of BQ must start with a callout keyword
                    callout_el = c.parent.parent
                else:
                    callout_el = c.parent
                callout_el["class"] = [callout_base_class, callout_type]
Example #12
Source File: test_feedback.py From notifications-admin with MIT License | 5 votes |
def test_email_address_required_for_problems_and_questions(
    client_request,
    mocker,
    data,
    ticket_type,
):
    mocker.patch('app.main.views.feedback.zendesk_client')
    client_request.logout()
    page = client_request.post(
        'main.feedback',
        ticket_type=ticket_type,
        _data=data,
        _expected_status=200
    )
    assert isinstance(page.find('span', {'class': 'error-message'}), element.Tag)
Example #13
Source File: tbentries.py From open-context-py with GNU General Public License v3.0 | 5 votes |
def check_fix_photobox(self, node):
    """ gets rid of empty tags """
    if isinstance(node, Tag):
        if node.get('class') is not None:
            node_clases = node.get('class')
            if node_clases[0] == self.pc_photobox_class:
                # print('Check: '+ node_clases[0])
                make_span = False
                show_img = False
                img_nodes = node.find_all(['img'])
                if len(img_nodes) < 1:
                    make_span = True
                else:
                    for img in img_nodes:
                        # print('img style: ' + str(img.get('style')))
                        if img.get('style') is None:
                            show_img = True
                        else:
                            img_styles = img.get('style')
                            if 'display:none;' in img_styles:
                                # print('ok img..')
                                pass
                            else:
                                # print('crap!')
                                show_img = True
                    if show_img is False:
                        # print('img ok to span transform')
                        make_span = True
                if make_span:
                    # print('Span transform')
                    node.name = 'span'
Example #14
Source File: babelnovel.py From lightnovel-crawler with Apache License 2.0 | 5 votes |
def download_chapter_body(self, chapter):
    logger.info('Visiting %s', chapter['json_url'])
    data = self.get_json(chapter['json_url'])

    soup = BeautifulSoup(data['data']['content'], 'lxml')
    if self.bad_selectors:
        for tag in soup.select(self.bad_selectors):
            tag.extract()
        # end for
    # end if

    body = soup.find('body')
    self.clean_contents(body)
    for tag in body.contents:
        if not str(tag).strip():
            tag.extract()
        elif isinstance(tag, Tag):
            tag.name = 'p'
        # end if
    # end for

    # body = data['data']['content']
    result = str(body)
    result = re.sub(r'\n\n', '<br><br>', result)
    return result
# end def
# end class
Example #15
Source File: helpers.py From robobrowser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def lowercase_attr_names(tag):
    """Lower-case all attribute names of the provided BeautifulSoup tag.
    Note: this mutates the tag's attribute names and does not return a new
    tag.

    :param Tag: BeautifulSoup tag

    """
    # Use list comprehension instead of dict comprehension for 2.6 support
    tag.attrs = dict([
        (key.lower(), value)
        for key, value in iteritems(tag.attrs)
    ])
Example #16
Source File: visual_linker.py From fonduer with MIT License | 5 votes |
def _coordinates_from_HTML(
    self, page: Tag, page_num: int
) -> Tuple[
    List[PdfWord],
    Dict[PdfWordId, Bbox],
]:
    pdf_word_list: List[PdfWord] = []
    coordinate_map: Dict[PdfWordId, Bbox] = {}
    block_coordinates: Dict[PdfWordId, Tuple[int, int]] = {}
    blocks = page.find_all("block")
    i = 0  # counter for word_id in page_num
    for block in blocks:
        x_min_block = int(float(block.get("xmin")))
        y_min_block = int(float(block.get("ymin")))
        lines = block.find_all("line")
        for line in lines:
            y_min_line = int(float(line.get("ymin")))
            y_max_line = int(float(line.get("ymax")))
            words = line.find_all("word")
            for word in words:
                xmin = int(float(word.get("xmin")))
                xmax = int(float(word.get("xmax")))
                for content in self.separators.split(word.getText()):
                    if len(content) > 0:  # Ignore empty characters
                        word_id: PdfWordId = (page_num, i)
                        pdf_word_list.append((word_id, content))
                        coordinate_map[word_id] = Bbox(
                            page_num,
                            y_min_line,
                            y_max_line,
                            xmin,
                            xmax,
                        )
                        block_coordinates[word_id] = (y_min_block, x_min_block)
                        i += 1
    # sort pdf_word_list by page, block top then block left, top, then left
    pdf_word_list = sorted(
        pdf_word_list,
        key=lambda word_id__: block_coordinates[word_id__[0]]
        + (coordinate_map[word_id__[0]].top, coordinate_map[word_id__[0]].left),
    )
    return pdf_word_list, coordinate_map
Example #17
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def find_bbt_in_issue_title(issue: Issue, known_issues: Tag) -> None:
    title = strip_squarebrackets(issue.title).replace(' ', '')
    for row in known_issues.find_all('tr'):
        data = row.find_all('td')
        row_text = strip_squarebrackets(data[1].text.strip()).replace(' ', '')
        if row_text == title:
            body = issue.body
            body += '\nBug Blog Text: {0}'.format(data[1].text.strip())
            if body != issue.body:
                issue.edit(body=body)
            return
Example #18
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def check_for_missing_bugs(b: Tag) -> None:
    for row in b.find_all('tr'):
        data = row.find_all('td')
        row_text = data[1].text.strip()
        if row_text == 'Description':  # BS4 is bad.
            continue
        issue = find_issue_by_code(row_text)
        if issue:
            labels = [c.name for c in issue.labels]
            categories = [c for c in labels if c in strings.METACATS]
            if categories:
                continue
            bbcat = re.match(strings.REGEX_BBCAT, data[2].text.strip())
            if bbcat is None:
                continue
            g1 = bbcat.group(1).strip()
            if g1 in strings.METACATS:
                issue.add_to_labels(g1)
                continue
            if bbcat.group(2) is not None:
                g2 = bbcat.group(2).strip()
                if g2 in strings.METACATS:
                    issue.add_to_labels(g2)
                    continue
            print(f'Unknown BBCat: {bbcat.group(0)}')
            continue
        print('Could not find issue for `{row}`'.format(row=row_text))
        text = 'From Bug Blog.\nBug Blog Text: {0}'.format(row_text)
        repo.get_repo().create_issue(strings.remove_smartquotes(row_text), body=strings.remove_smartquotes(text), labels=['From Bug Blog'])
Example #19
Source File: scrape_announcements.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def parse_downtimes(h: Tag) -> None:
    for n in h.next_elements:
        if isinstance(n, Tag) and n.text:
            with open('downtimes.txt', 'w', encoding='utf-8') as f:
                txt = n.text.strip()
                txt = txt.replace("Please note that there are no more 'extended' or 'normal' downtimes; in the new world with fewer downtimes, they're all the same length of time.", '')
                print(txt)
                f.write(txt)
            break
Example #20
Source File: scrape_announcements.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def parse_header(h: Tag) -> None:
    txt = h.text
    if txt.startswith('Downtime'):
        parse_downtimes(h)
    elif txt.startswith('Build Notes'):
        parse_build_notes(h)
Example #21
Source File: fetcher.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 5 votes |
def get_article_archive() -> List[Tuple[Tag, str]]:
    try:
        html = fetch_tools.fetch('http://magic.wizards.com/en/articles/archive/184956')
    except fetch_tools.FetchException:
        html = fetch_tools.fetch('http://magic.wizards.com/en/articles/archive/')
    soup = BeautifulSoup(html, 'html.parser')
    return [parse_article_item_extended(a) for a in soup.find_all('div', class_='article-item-extended')]
Example #22
Source File: doc.py From bot with MIT License | 5 votes |
def _match_end_tag(tag: Tag) -> bool:
    """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
    for attr in SEARCH_END_TAG_ATTRS:
        if attr in tag.get("class", ()):
            return True
    return tag.name == "table"
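A small self-contained sketch of the same class-matching pattern; the SEARCH_END_TAG_ATTRS value below is a made-up placeholder, not the bot project's actual constant.

from bs4 import BeautifulSoup
from bs4.element import Tag

SEARCH_END_TAG_ATTRS = ("section", "highlight")  # placeholder class names for illustration

def match_end_tag(tag: Tag) -> bool:
    # True when the tag carries one of the "end" classes, or is a table
    return tag.name == "table" or any(attr in tag.get("class", ()) for attr in SEARCH_END_TAG_ATTRS)

soup = BeautifulSoup('<div class="section"></div><p>text</p><table></table>', "html.parser")
print([t.name for t in soup.find_all(True) if match_end_tag(t)])  # ['div', 'table']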
Example #23
Source File: xss_utils.py From ITWSV with MIT License | 4 votes |
def study(bs_node, parent=None, keyword=""):
    entries = []

    # if parent is None:
    #     print("Keyword is: {0}".format(keyword))
    if keyword in str(bs_node).lower():
        if isinstance(bs_node, element.Tag):
            if keyword in str(bs_node.attrs):
                for k, v in bs_node.attrs.items():
                    if keyword in v:
                        # print("Found in attribute value {0} of tag {1}".format(k, bs_node.name))
                        noscript = close_noscript(bs_node)
                        d = {"type": "attrval", "name": k, "tag": bs_node.name, "noscript": noscript}
                        if d not in entries:
                            entries.append(d)
                    if keyword in k:
                        # print("Found in attribute name {0} of tag {1}".format(k, bs_node.name))
                        noscript = close_noscript(bs_node)
                        d = {"type": "attrname", "name": k, "tag": bs_node.name, "noscript": noscript}
                        if d not in entries:
                            entries.append(d)
            elif keyword in bs_node.name:
                # print("Found in tag name")
                noscript = close_noscript(bs_node)
                d = {"type": "tag", "value": bs_node.name, "noscript": noscript}
                if d not in entries:
                    entries.append(d)
            # recursively search injection points for the same variable
            for x in bs_node.contents:
                for entry in study(x, parent=bs_node, keyword=keyword):
                    if entry not in entries:
                        entries.append(entry)
        elif isinstance(bs_node, element.Comment):
            # print("Found in comment, tag {0}".format(parent.name))
            noscript = close_noscript(bs_node)
            d = {"type": "comment", "parent": parent.name, "noscript": noscript}
            if d not in entries:
                entries.append(d)
        elif isinstance(bs_node, element.NavigableString):
            # print("Found in text, tag {0}".format(parent.name))
            noscript = close_noscript(bs_node)
            d = {"type": "text", "parent": parent.name, "noscript": noscript}
            if d not in entries:
                entries.append(d)
    return entries

# generate a list of payloads based on where in the webpage the js-code will be injected
Example #24
Source File: tbentries.py From open-context-py with GNU General Public License v3.0 | 4 votes |
def remove_empty_node(self, node):
    """ gets rid of empty tags """
    if isinstance(node, Tag):
        keep_nodes = ['img', 'td', 'tr', 'th']
        if node.name.lower() not in keep_nodes:
            remove_node = False
            no_child_remove_tags = ['a', 'img', 'th', 'tr', 'td', 'strong',
                                    'ul', 'ol', 'li', 'em', 'i', 'u', 'b',
                                    'sup', 'sub', 'mark', 'q', 'samp', 'small']
            ok_child_nodes = node.find_all(no_child_remove_tags)
            if len(ok_child_nodes) < 1:
                # ok no images check for text
                all_string = ''
                for act_string in node.stripped_strings:
                    all_string += str(act_string)
                for d_child in node.descendants:
                    if isinstance(d_child, Tag):
                        for act_string in d_child.stripped_strings:
                            all_string += str(act_string)
                all_string = all_string.strip()
                # print('Check on: <' + node.name.lower() + '> with: ' + str(all_string))
                if len(all_string) < 1:
                    remove_node = True
                if isinstance(node.string, str):
                    n_string = node.string
                    n_string = n_string.strip()
                    if len(n_string) < 1:
                        remove_node = True
                    else:
                        remove_node = False
            if remove_node:
                # print('Removing a: <' + node.name.lower() + '>')
                node.extract()
Example #25
Source File: scrape_bugblog.py From Penny-Dreadful-Tools with GNU General Public License v3.0 | 4 votes |
def parse_knownbugs(b: Tag) -> None:
    # attempt to find all the fixed bugs
    all_codes = b.find_all(string=lambda text: isinstance(text, Comment))
    all_codes = [str(code).replace('\t', ' ') for code in all_codes]
    for issue in repo.get_repo().get_issues():
        # code = re.search(CODE_REGEX, issue.body, re.MULTILINE)
        bbt = re.search(BBT_REGEX, issue.body, re.MULTILINE)
        if bbt is None:
            cards = strings.get_cards_from_string(issue.title)
            if repo.is_issue_from_bug_blog(issue):
                find_bbt_in_body_or_comments(issue)
                find_bbt_in_issue_title(issue, b)
                bbt = re.search(BBT_REGEX, issue.body, re.MULTILINE)
                if bbt is None:
                    print('Issue #{id} {cards} has no Bug Blog text!'.format(id=issue.number, cards=cards))
                    issue.add_to_labels('Invalid Bug Blog')
                continue
            if not cards:
                continue
            lines = b.find_all(string=re.compile(r'\[' + cards[0] + r'\]'))
            if not lines:
                continue
            for line in lines:
                parent = line.parent
                bb_text = parent.get_text().strip()
                if find_issue_by_code(bb_text) is not None:
                    print('Already assigned.')
                    continue
                text = ''.join(parent.strings)
                print(text)
                repo.create_comment(issue, 'Found in bug blog.\nBug Blog Text: {0}'.format(text))
                if not repo.is_issue_from_bug_blog(issue):
                    issue.add_to_labels('From Bug Blog')
            continue
        if 'Invalid Bug Blog' in [i.name for i in issue.labels]:
            issue.remove_from_labels('Invalid Bug Blog')
        if repo.is_issue_from_bug_blog(issue):  # Don't check for Bug Blog Text if it's not marked as a BB issue (Maybe because it was reopened)
            check_if_removed_from_bugblog(bbt, b, issue)
    check_for_missing_bugs(b)
Example #26
Source File: extract.py From dart-fss with MIT License | 4 votes |
def seek_table(tables: List, includes: Pattern,
               excludes: Union[Pattern, None] = None) -> Tuple[Union[str, None], Union[str, None], Union[str, None]]:
    """ Table search """
    regex = re.compile(r'\d{4}(.*?)\d{2}(.*?)\d{2}')
    for table in tables:
        for tag in table.previous_siblings:
            if tag in tables:
                break
            if isinstance(tag, Tag):
                children = tag.findChildren(text=includes)
                for child in children:
                    title = child
                    if title:
                        title = re.sub(r'\s+', '', title)
                        if excludes and excludes.search(title):
                            continue
                        if len(title) > 12:
                            continue
                        header = table.find_previous('table', class_='nb')
                        if header is None:
                            continue
                        tr_list = header.find_all('tr')
                        if len(tr_list) < 2:
                            continue
                        tr_cnt = 0
                        for tr in tr_list:
                            if regex.search(tr.text):
                                tr_cnt += 1
                        if tr_cnt == 0:
                            found = table.find_previous(text=re.compile(r'\d{4}(.*?)\d{2}(.*?)\d{2}'))
                            if found is None:
                                continue
                            header = found.parent
                            extract_text = re.sub('<.*?>', '\n', str(header))
                            extract_text = extract_text.split('\n')
                            html = '<table class="nb"><tbody>'
                            error = False
                            for t in extract_text:
                                if t.strip() == '':
                                    pass
                                else:
                                    if len(t) > 100:
                                        error = True
                                        break
                                    html += '<tr><td>' + t + '</td></tr>'
                            if error:
                                continue
                            html += '</tbody></table>'
                            header = BeautifulSoup(html, 'html.parser')
                        return title, header, table
    return None, None, None
Example #27
Source File: Junos.py From assimilator with MIT License | 4 votes |
def get(self,args):
    logger.debug("class rules(JUNOS).get({0})".format(str(args)))
    if not self.dev.connected:
        logger.error("{0}: Firewall timed out or incorrect device credentials.".format(self.firewall_config['name']))
        return {'error' : 'Could not connect to device.'}, 504
    else:
        logger.info("{0}: Connected successfully.".format(self.firewall_config['name']))
    try:
        soup = BS(str(etree.tostring(self.dev.rpc.get_firewall_policies(), encoding='unicode')),'xml')
        logger.debug("soup: " + str(soup))
    except Exception as e:
        logger.error("Error parsing soup: {0}".format(str(e)))
        return {'error' : 'Error parsing soup.'}, 500
    finally:
        logger.debug("Closing device...")
        self.dev.close()
    entries = list()
    for context in soup.find("security-policies").children:
        if type(context) != Tag:
            continue
        elif context.name == "default-policy":
            continue
        else:
            logger.debug("context: {0}".format(str(context)))
            src_zone = context.find("context-information").find("source-zone-name").text
            dst_zone = context.find("context-information").find("destination-zone-name").text
            logger.debug("src_zone: {0}\ndst_zone: {1}\n".format(src_zone,dst_zone))
            for rule in context.children:
                logger.debug("Rule: {0}".format(str(rule)))
                if rule.name == "context-information" or type(rule) != Tag:
                    continue
                aux = {
                    "enabled" : True if rule.find('policy-state').text == 'enabled' else False,
                    "id" : int(rule.find('policy-identifier').text),
                    "action": rule.find('policy-information').find('policy-action').find('action-type').text,
                    "destination": list(),
                    "from": src_zone,
                    "logging": False if rule.find('policy-information').find('policy-action').find('log') else rule.find('policy-information').find('policy-action').find('log'),
                    "name": rule.find('policy-information').find('policy-name').text,
                    "application": list(),
                    "source": list(),
                    "to": dst_zone
                }
                for addr in rule.find('source-addresses').children:
                    if type(addr) != Tag:
                        continue
                    aux['source'].append(addr.find('address-name').text)
                for addr in rule.find('destination-addresses').children:
                    if type(addr) != Tag:
                        continue
                    aux['destination'].append(addr.find('address-name').text)
                for addr in rule.find('applications').children:
                    if type(addr) != Tag:
                        continue
                    aux['application'].append(addr.find('application-name').text)
                entries.append(aux)
    #entries = self.filter(args,entries)
    return {'len' : len(entries), 'rules' : entries}
Example #28
Source File: mod16.py From RHEAS with MIT License | 4 votes |
def download(dbname, dts, bbox):
    """Downloads the MODIS evapotranspiration data product MOD16 for
    a set of dates *dt* and imports them into the PostGIS database *dbname*."""
    log = logging.getLogger(__name__)
    res = 0.01
    urlbase = "http://files.ntsg.umt.edu"
    tiles = modis.findTiles(bbox)
    if tiles is not None:
        for dt in [dts[0] + timedelta(dti) for dti in range((dts[-1] - dts[0]).days + 1)]:
            url = "{0}/data/NTSG_Products/MOD16/MOD16A2.105_MERRAGMAO/Y{1}".format(urlbase, dt.year)
            resp_year = requests.get(url)
            try:
                assert resp_year.status_code == 200
                days = [link for link in BeautifulSoup(resp_year.text, parse_only=SoupStrainer('a'))
                        if isinstance(link, Tag) and link.text.find(dt.strftime("%j")) >= 0]
                assert len(days) > 0
                resp_day = requests.get("{0}{1}".format(urlbase, days[0].get('href')))
                assert resp_day.status_code == 200
                files = [link.get('href') for link in BeautifulSoup(resp_day.text, parse_only=SoupStrainer('a'))
                         if isinstance(link, Tag) and link.text.find("hdf") > 0]
                files = [f for f in files if any(f.find("h{0:02d}v{1:02d}".format(t[1], t[0])) > 0 for t in tiles)]
                outpath = tempfile.mkdtemp()
                for fname in files:
                    resp_file = requests.get("{0}{1}".format(urlbase, fname))
                    filename = fname.split("/")[-1]
                    with open("{0}/{1}".format(outpath, filename), 'wb') as fout:
                        for chunk in resp_file:
                            fout.write(chunk)
                    proc = subprocess.Popen(["gdal_translate", "HDF4_EOS:EOS_GRID:{0}/{1}:MOD_Grid_MOD16A2:ET_1km".format(
                        outpath, filename), "{0}/{1}".format(outpath, filename).replace("hdf", "tif")],
                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                    out, err = proc.communicate()
                    log.debug(out)
                tifs = glob.glob("{0}/*.tif".format(outpath))
                proc = subprocess.Popen(
                    ["gdal_merge.py", "-o", "{0}/et.tif".format(outpath)] + tifs,
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                out, err = proc.communicate()
                log.debug(out)
                proc = subprocess.Popen(["gdal_calc.py", "-A", "{0}/et.tif".format(outpath), "--outfile={0}/et1.tif".format(
                    outpath), "--NoDataValue=-9999", "--calc=(A<32701)*(0.1*A+9999)-9999"],
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                out, err = proc.communicate()
                log.debug(out)
                proc = subprocess.Popen(["gdalwarp", "-t_srs", "+proj=latlong +ellps=sphere", "-tr", str(
                    res), str(-res), "{0}/et1.tif".format(outpath), "{0}/et2.tif".format(outpath)],
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                out, err = proc.communicate()
                log.debug(out)
                if bbox is None:
                    pstr = []
                else:
                    pstr = ["-projwin", str(bbox[0]), str(bbox[3]), str(bbox[2]), str(bbox[1])]
                proc = subprocess.Popen(["gdal_translate"] + pstr + ["-a_srs", "epsg:4326",
                                        "{0}/et2.tif".format(outpath), "{0}/et3.tif".format(outpath)],
                                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                out, err = proc.communicate()
                log.debug(out)
                dbio.ingest(
                    dbname, "{0}/et3.tif".format(outpath), dt, table, False)
                shutil.rmtree(outpath)
            except:
                log.warning("MOD16 data not available for {0}. Skipping download!".format(
                    dt.strftime("%Y-%m-%d")))