Python bs4.SoupStrainer() Examples
The following are 30 code examples of bs4.SoupStrainer().
Each example notes its original project, source file, and license above the code.
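Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: passing a SoupStrainer to BeautifulSoup via the parse_only argument so that only matching tags are parsed. The sample HTML and variable names below are illustrative only and are not taken from any of the projects.

from bs4 import BeautifulSoup, SoupStrainer

html = """
<html><body>
  <a href="https://example.com/one">one</a>
  <p>ignored paragraph</p>
  <a href="https://example.com/two">two</a>
</body></html>
"""

# Restrict parsing to <a> tags only; everything else is discarded early,
# which saves time and memory on large documents.
only_links = SoupStrainer("a")
soup = BeautifulSoup(html, "html.parser", parse_only=only_links)

for link in soup.find_all("a"):
    print(link.get("href"))

Note that parse_only is only honored by tree builders that support it (html.parser and lxml); the html5lib builder parses the full document regardless, which is why Example #10 below only passes the strainer to lxml.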
Example #1
Source File: _session.py From fbchat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def find_form_request(html: str):
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=bs4.SoupStrainer("form"))

    form = soup.form
    if not form:
        raise _exception.ParseError("Could not find form to submit", data=html)

    url = form.get("action")
    if not url:
        raise _exception.ParseError("Could not find url to submit to", data=form)

    # From what I've seen, it'll always do this!
    if url.startswith("/"):
        url = "https://www.facebook.com" + url

    # It's okay to set missing values to something crap, the values are localized, and
    # hence are not available in the raw HTML
    data = {
        x["name"]: x.get("value", "[missing]")
        for x in form.find_all(["input", "button"])
    }

    return url, data
Example #2
Source File: interface.py From hfut with MIT License | 6 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table', width='650')
    bs = GlobalFeaturedSoup(page, parse_only=ss)
    trs = bs.find_all('tr')
    keys = tuple(trs[1].stripped_strings)
    if len(keys) != 6:
        log_result_not_found(response)
        return []
    value_list = parse_tr_strs(trs[2:])
    teaching_plan = []
    for values in value_list:
        code = values[1].upper()
        if teaching_plan and teaching_plan[-1]['课程代码'] == code:
            # Public elective queries for the Xuancheng campus return many duplicate rows
            continue
        plan = dict(safe_zip(keys, values))
        plan.pop('序号')
        plan['课程代码'] = code
        plan['学时'] = int(plan['学时'])
        plan['学分'] = float(plan['学分'])
        teaching_plan.append(plan)
    return teaching_plan
Example #3
Source File: interface.py From hfut with MIT License | 6 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table', width='650')
    bs = GlobalFeaturedSoup(page, parse_only=ss)
    title = bs.find('tr', bgcolor='#FB9E04')
    trs = bs.find_all('tr', bgcolor=re.compile(r'#D6D3CE|#B4B9B9'))
    if title:
        courses = []
        keys = tuple(title.stripped_strings)
        value_list = parse_tr_strs(trs)
        for values in value_list:
            course = dict(safe_zip(keys, values))
            course.pop('序号')
            course['课程代码'] = course['课程代码'].upper()
            course['班级容量'] = int(course['班级容量'])
            courses.append(course)
        return courses
    else:
        log_result_not_found(response)
        return []
Example #4
Source File: voc_fetcher1.0.py From VOC with GNU General Public License v3.0 | 6 votes |
def __pre_process(self, page):
    # As BeautifulSoup will cause memory I/O error when the page is too large
    if page.find('<dl') > 0:
        data = page.split('<dl')
        tag_dd = SoupStrainer('dd')
        for idx in xrange(1, len(data)):
            count = data[idx].count('<dd')
            if count > 5:
                parts = data[idx].split('</dl>')
                dds = parts[0].split('</dd>')
                data[idx] = ''.join([dds[0], '</dd> <dx>%d</dx>' % idx, dds[-1], '</dl>', ''.join(parts[1:])])
                self.__dd_ext[str(idx)] = []
                for item in dds[1:-1]:
                    dd = BeautifulSoup(item, parse_only=tag_dd).dd
                    assert dd
                    self.__dd_ext[str(idx)].append(dd)
        return '<dl'.join(data)
    else:
        return page
Example #5
Source File: interface.py From hfut with MIT License | 6 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table')
    bs = GlobalFeaturedSoup(page, parse_only=ss)
    if not bs.text:
        log_result_not_found(response)
        return {}
    value_list = parse_tr_strs(bs.find_all('tr'))
    # The first row ends with a photo ('照片') item
    teacher_info = {'照片': value_list[0].pop()}
    # The fifth row ends with two blank entries
    value_list[4] = value_list[4][:2]
    # The sixth row contains two '联系电话' (phone number) keys
    phone = value_list[5]
    teacher_info['联系电话'] = phone[1::2]
    value_list.remove(phone)
    # Parse the remaining items
    for v in value_list:
        for i in range(0, len(v), 2):
            teacher_info[v[i]] = v[i + 1]
    return teacher_info
Example #6
Source File: jenkins.py From hacker-scripts with MIT License | 6 votes |
def __get_version(self):
    '''
    get jenkins version
    :return:
    '''
    try:
        html = urllib2.urlopen(self.url + '/login?from=%2F').read()
        links = SoupStrainer('a', href=re.compile(VERSION_TAG))
        version_text = BeautifulSoup(html, "html.parser", parse_only=links)
        if version_text.text != "":
            color_output("[+]....jenkins version is %s" % version_text.text)
            version_re = re.findall(u"ver.\s(.*)", version_text.text)
            if len(version_re) != 0:
                if version_re[0][0:4] >= self.check_version:
                    self.user_link = ASYNCH_PEOPEL_PERFIX
                else:
                    self.user_link = PEOPLE_PERFIX
        else:
            color_output("[-]....can't get jenkins version!")
            sys.exit()
    except urllib2.URLError, e:
        color_output("[-]....can't get jenkins version!")
        sys.exit()
Example #7
Source File: voc_fetcher1.0.py From VOC with GNU General Public License v3.0 | 6 votes |
def __initdef(self, word, data):
    data = self.__pre_process(data)
    wpg = SoupStrainer('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
    soup = BeautifulSoup(data, parse_only=wpg)
    div = soup.find('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
    assert div
    self.__getwordtitle(div.h1)
    if word != self.__title:
        self.__title = None
        return False
    div = soup.find('div', {'class': 'section blurb'})
    if div:
        self.__hasblurb = True
        self.__getblurb(div)
    tags = soup.find_all(re.compile(r'div|h2'), class_='sectionHeader')
    tag = find_fulldefine(tags, re.compile(r'DEFINITIONS OF', re.I))
    if tag:
        self.__getfulldef(tag.parent)
    else:
        print("WARNING: %s HAS NO FULLDEFINITION" % self.__title)
        assert tag  # to raise error and break
    div = soup.find('div', {'class': 'section family'})
    if div:
        self.__getwordfamily(div)
    return True
Example #8
Source File: get_witnesses_for_bill.py From influence-texas with GNU General Public License v2.0 | 6 votes |
def get_witnesses_for_bill(bill_id, session):
    # ex: "HB 864" -> "https://capitol.texas.gov/tlodocs/86R/witlistbill/html/HB00864H.htm"
    parsed_bill_id = re.search(r"(\w+)\s+(\d+)", bill_id)
    bill_type = parsed_bill_id.group(1)
    bill_number = parsed_bill_id.group(2).zfill(5)
    url_prefix = f"https://capitol.texas.gov/tlodocs/{session}R/witlistbill/html/{bill_type}{bill_number}"
    house_url = f"{url_prefix}H.htm"
    senate_url = f"{url_prefix}S.htm"

    res = requests.get(house_url)

    # ##### Basic Test
    # # parsing all <p/> blocks up front may not be efficient
    # filter = SoupStrainer('p')  # only <p/> tags contain text that we care about
    # text_blocks = BeautifulSoup(res.content, "html.parser", parse_only=filter)
    # selecting = None;
    # for block in text_blocks:
    #     text = block.get_text(strip=True)
    #     print(f"[{text}]")

    return parse_witness_list_html(res.content)
Example #9
Source File: fox.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    last_start_time = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time = common.smart_utf8(line['begin'].replace('.', ','))
            try:
                end_time = common.smart_utf8(line['end'].replace('.', ','))
            except:
                continue
            if last_start_time != start_time:
                if i != 0:
                    str_output += '\n\n'
                str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub
            else:
                str_output += '\n' + sub
            last_start_time = start_time
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example #10
Source File: html_pbp.py From Hockey-Scraper with GNU General Public License v3.0 | 6 votes |
def get_contents(game_html):
    """
    Uses Beautiful soup to parses the html document.
    Some parsers work for some pages but don't work for others....I'm not sure why so I just try them all here in order

    :param game_html: html doc

    :return: "soupified" html
    """
    parsers = ["lxml", "html.parser", "html5lib"]
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})

    for parser in parsers:
        # parse_only only works with lxml for some reason
        if parser == "lxml":
            soup = BeautifulSoup(game_html, parser, parse_only=strainer)
        else:
            soup = BeautifulSoup(game_html, parser)

        tds = soup.find_all("td", {"class": re.compile('.*bborder.*')})

        if len(tds) > 0:
            break

    return tds
Example #11
Source File: pbs.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time = common.smart_utf8(line['begin'].replace('.', ','))
            if ',' not in start_time:
                start_time = start_time + ',00'
            end_time = common.smart_utf8(line['end'].replace('.', ','))
            if ',' not in end_time:
                end_time = end_time + ',00'
            str_output += str(i + 1) + '\n' + start_time[:11] + ' --> ' + end_time[:11] + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example #12
Source File: oxygen.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time_rest, start_time_msec = line['begin'].rsplit(':', 1)
            start_time = common.smart_utf8(start_time_rest + ',' + start_time_msec)
            try:
                end_time_rest, end_time_msec = line['end'].rsplit(':', 1)
                end_time = common.smart_utf8(end_time_rest + ',' + end_time_msec)
            except:
                continue
            str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example #13
Source File: webscraper_beautifulsoup.py From Python4Pentesters with MIT License | 6 votes |
def getLinks(text, url=""):
    if url and url[-1] == "/":
        url = url[:-1]
    links = []
    if text:
        for link in BeautifulSoup(text, "html.parser", parse_only=SoupStrainer("a", href=True)):
            if link.has_attr('href'):
                if (link['href']):
                    href = link['href'].strip()
                    if not href.startswith("http://") and not href.startswith("https://") and not href.startswith("mailto:") and not href.startswith("tel:"):
                        if not href.startswith('/'):
                            href = "/" + href
                        href = url + href
                    links.append(href)
    return links


#-----------------------------------------------------------------------------
# MAIN
#-----------------------------------------------------------------------------
Example #14
Source File: collect.py From web_page_classification with MIT License | 6 votes |
def get_child_urls(main_page, max_child=20):
    """retrieve urls from giving html page.
    args:
        main_page(str): html file.
        max_child(int): max number of return urls.
    return:
        list of url string.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page, "html.parser", parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children
Example #15
Source File: main_nbcu.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time_rest, start_time_msec = line['begin'].rsplit(':', 1)
            start_time = common.smart_utf8(start_time_rest + ',' + start_time_msec)
            try:
                end_time_rest, end_time_msec = line['end'].rsplit(':', 1)
                end_time = common.smart_utf8(end_time_rest + ',' + end_time_msec)
            except:
                continue
            str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example #16
Source File: main_abcdisney.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time_hours, start_time_rest = line['begin'].split(':', 1)
            start_time_hours = '%02d' % (int(start_time_hours) - 1)
            start_time = common.smart_utf8(start_time_hours + ':' + start_time_rest.replace('.', ','))
            end_time_hours, end_time_rest = line['end'].split(':', 1)
            end_time_hours = '%02d' % (int(end_time_hours) - 1)
            end_time = common.smart_utf8(end_time_hours + ':' + end_time_rest.replace('.', ','))
            str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
    return True
Example #17
Source File: interface.py From hfut with MIT License | 6 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table', id='TableXKJG')
    bs = GlobalFeaturedSoup(page, parse_only=ss)

    courses = []
    keys = tuple(bs.find('tr', bgcolor='#296DBD').stripped_strings)
    # value_list = [tr.stripped_strings for tr in bs.find_all('tr', bgcolor='#D6D3CE')]
    value_list = parse_tr_strs(bs.find_all('tr', bgcolor='#D6D3CE'))
    for values in value_list:
        course = dict(safe_zip(keys, values))
        course['课程代码'] = course['课程代码'].upper()
        course['学分'] = float(course['学分'])
        course['费用'] = float(course['费用'])
        courses.append(course)
    return courses
Example #18
Source File: main_natgeo.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def episodes(SITE, episode_url = common.args.url):
    episodes = []
    if '#' in episode_url:
        episode_url = episode_url.split('#')[1]
    episode_data = connection.getURL(episode_url)
    episode_tree = BeautifulSoup(episode_data, 'html.parser', parse_only = SoupStrainer('div', class_ = 'show'))
    try:
        episodes = add_videos(episode_tree, SITE)
    except:
        print "Can't add video"
    more = episode_tree.find('a', class_ = 'load-more')
    if more:
        episode_data = connection.getURL(BASE + more['href'])
        episode_tree = BeautifulSoup(episode_data, 'html.parser')
        episodes = add_videos(episode_tree, SITE)
    return episodes
Example #19
Source File: main_natgeo.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def masterlist(SITE, SHOWS, SPECIALS = None):
    master_db = []
    root_dict = {}
    root_url = SHOWS
    root_data = connection.getURL(root_url)
    root_tree = BeautifulSoup(root_data, 'html.parser', parse_only = SoupStrainer('div', id = 'grid-frame'))
    root_menu = root_tree.find_all('div', class_ = 'media-module')
    for root_item in root_menu:
        root_name = root_item.find('div', class_ = 'title').text
        season_url = BASE + root_item.a['href']
        if '-1' not in season_url:
            tvdb_name = common.get_show_data(root_name, SITE, 'seasons')[-1]
            root_name = root_name + '#' + season_url
            if tvdb_name not in root_dict.keys():
                root_dict[tvdb_name] = root_name
            else:
                root_dict[tvdb_name] = root_dict[tvdb_name] + '|' + root_name
    for root_name in root_dict:
        season_url = root_dict[root_name]
        master_db.append((root_name, SITE, 'seasons', season_url))
    more = root_tree.find('a', class_ = 'load-more')
    if more:
        master_db.extend(masterlist(SITE, BASE + more['href']))
    return master_db
Example #20
Source File: interface.py From hfut with MIT License | 5 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table', id='KCTable')
    bs = GlobalFeaturedSoup(page, parse_only=ss)

    courses = []
    trs = bs.find_all('tr')
    value_list = [tuple(tr.stripped_strings) for tr in trs]
    for values in value_list:
        course = {'课程代码': values[0].upper(),
                  '课程名称': values[1],
                  '课程类型': values[2],
                  '开课院系': values[3],
                  '学分': float(values[4])}
        courses.append(course)
    return courses
Example #21
Source File: teams.py From sports.py with MIT License | 5 votes |
def _get_team_links(base_url, table_id):
    links = SoupStrainer('table', {'id': table_id})
    return BeautifulSoup(requests.get(base_url).content, 'html.parser', parse_only=links)
Example #22
Source File: doc_dict_gen.py From zenpy with GNU General Public License v3.0 | 5 votes |
def parse_link(link):
    namespace = link.split('/')[-2]
    print("Parsing {} link: {}".format(namespace, link))
    response = requests.get(base_url + link)
    table_attr = SoupStrainer("table")
    soup = BeautifulSoup(response.content, 'lxml', parse_only=table_attr)
    table = soup.find('table')
    if not table:
        return {}
    rows = table.findAll('tr')
    header = [data.text.lower() for data in rows[0].findAll('th')]
    object_name = os.path.basename(os.path.normpath(link))
    object_info = defaultdict(dict)
    object_info[object_name] = defaultdict(dict)
    for row in rows[1:]:
        columns = [data.text for data in row.findAll('td')]
        row_data = dict(zip(header, columns))
        name = row_data.pop('name', None)
        if name:
            object_info[object_name][name].update(row_data)
    print("Parsing Completed for: " + link)
    return namespace, object_info
Example #23
Source File: html.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def __init__(self, *args, **kwargs):
    super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, **kwargs)
    from bs4 import SoupStrainer
    self._strainer = SoupStrainer('table')
Example #24
Source File: score.py From gxgk-wechat-server with MIT License | 5 votes |
def score_page(studentid, url, session, proxy):
    """In-school grades page."""
    url = url + studentid
    # First fetch the __VIEWSTATE value required for the score query
    if not proxy:
        pre_score = session.get(url, allow_redirects=False, timeout=5)
    else:
        pre_score = session.get(url, allow_redirects=False, timeout=5,
                                proxies=app.config['SCHOOL_LAN_PROXIES'])
    pre_score_soup = BeautifulSoup(
        pre_score.text, "html.parser", parse_only=SoupStrainer("input"))
    score_view_state = pre_score_soup.find(
        attrs={"name": "__VIEWSTATE"})['value']
    # Query the scores
    payload = {
        '__VIEWSTATE': score_view_state,
        'Button2': u'在校学习成绩查询',
        'ddlXN': '',
        'ddlXQ': ''
    }
    if not proxy:
        score_res = session.post(url, data=payload, allow_redirects=False,
                                 timeout=5)
    else:
        score_res = session.post(url, data=payload, allow_redirects=False,
                                 proxies=app.config['SCHOOL_LAN_PROXIES'],
                                 timeout=5)
    return score_res
Example #25
Source File: voc_fetcher1.0.py From VOC with GNU General Public License v3.0 | 5 votes |
def __getmore(self, link):
    page = getpage(link)
    article = SoupStrainer('div', class_='articlebody')
    soup = BeautifulSoup(page, parse_only=article)
    div = soup.find('div', {'class': 'articlebody'})
    assert div
    self.__transfchswdBd(div, link)
Example #26
Source File: html.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, **kwargs)
    from bs4 import SoupStrainer
    self._strainer = SoupStrainer('table')
Example #27
Source File: html.py From elasticintel with GNU General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, **kwargs)
    from bs4 import SoupStrainer
    self._strainer = SoupStrainer('table')
Example #28
Source File: tipue_search.py From ford with GNU General Public License v3.0 | 5 votes |
def __init__(self, output_path, project_url):
    self.output_path = output_path
    self.siteurl = project_url
    self.json_nodes = []
    self.only_text = SoupStrainer('div', id="text")
    self.only_title = SoupStrainer('title')
Example #29
Source File: _session.py From fbchat with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_error_data(html: str) -> Optional[str]:
    """Get error message from a request."""
    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("form", id="login_form")
    )
    # Attempt to extract and format the error string
    return " ".join(list(soup.stripped_strings)[1:3]) or None
Example #30
Source File: webscraper.py From plugin.video.vrt.nu with GNU General Public License v3.0 | 5 votes |
def get_categories():
    """Return a list of categories by scraping the VRT NU website"""
    cache_file = 'categories.json'
    categories = []

    # Try the cache if it is fresh
    categories = get_cache(cache_file, ttl=7 * 24 * 60 * 60)

    # Try to scrape from the web
    if not valid_categories(categories):
        from bs4 import BeautifulSoup, SoupStrainer
        log(2, 'URL get: https://www.vrt.be/vrtnu/categorieen/')
        response = urlopen('https://www.vrt.be/vrtnu/categorieen/')
        tiles = SoupStrainer('nui-list--content')
        soup = BeautifulSoup(response.read(), 'html.parser', parse_only=tiles)
        categories = []
        for tile in soup.find_all('nui-tile'):
            categories.append(dict(
                id=tile.get('href').split('/')[-2],
                thumbnail=get_category_thumbnail(tile),
                name=get_category_title(tile),
            ))
    if categories:
        from json import dumps
        update_cache('categories.json', dumps(categories))
    return categories