Python bs4.SoupStrainer() Examples

The following are 30 code examples of bs4.SoupStrainer(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module bs4, or try the search function.
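Before the individual project examples, here is a minimal, self-contained sketch of the pattern they all share: a SoupStrainer is passed as parse_only so BeautifulSoup only builds tree nodes for the matching tags. The HTML string is made up for illustration.

from bs4 import BeautifulSoup, SoupStrainer

html = '<html><body><a href="/a">A</a><p>ignored</p><a href="/b">B</a></body></html>'

# Only <a> tags are parsed into the tree; every other tag is skipped.
only_links = SoupStrainer('a')
soup = BeautifulSoup(html, 'html.parser', parse_only=only_links)
print([tag['href'] for tag in soup.find_all('a')])  # ['/a', '/b']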
Example #1
Source File: _session.py    From fbchat with BSD 3-Clause "New" or "Revised" License
def find_form_request(html: str):
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=bs4.SoupStrainer("form"))

    form = soup.form
    if not form:
        raise _exception.ParseError("Could not find form to submit", data=html)

    url = form.get("action")
    if not url:
        raise _exception.ParseError("Could not find url to submit to", data=form)

    # From what I've seen, it'll always do this!
    if url.startswith("/"):
        url = "https://www.facebook.com" + url

    # It's okay to set missing values to something crap, the values are localized, and
    # hence are not available in the raw HTML
    data = {
        x["name"]: x.get("value", "[missing]")
        for x in form.find_all(["input", "button"])
    }
    return url, data 
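A hypothetical usage sketch of the function above; the form markup is invented for illustration and is much simpler than a real Facebook login page.

# Hypothetical input; a real fbchat page is far more complex.
html = ('<form action="/login/device-based/regular/login/">'
        '<input name="lsd" value="token"/>'
        '<button name="login" value="1">Log In</button></form>')
url, data = find_form_request(html)
print(url)   # https://www.facebook.com/login/device-based/regular/login/
print(data)  # {'lsd': 'token', 'login': '1'}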
Example #2
Source File: interface.py    From hfut with MIT License
def parse(response):
        page = response.text
        ss = SoupStrainer('table', width='650')
        bs = GlobalFeaturedSoup(page, parse_only=ss)
        trs = bs.find_all('tr')
        keys = tuple(trs[1].stripped_strings)
        if len(keys) != 6:
            log_result_not_found(response)
            return []

        value_list = parse_tr_strs(trs[2:])
        teaching_plan = []
        for values in value_list:
            code = values[1].upper()
            if teaching_plan and teaching_plan[-1]['课程代码'] == code:
                # querying public elective courses on the Xuancheng campus returns a lot of duplicates
                continue
            plan = dict(safe_zip(keys, values))
            plan.pop('序号')
            plan['课程代码'] = code
            plan['学时'] = int(plan['学时'])
            plan['学分'] = float(plan['学分'])
            teaching_plan.append(plan)
        return teaching_plan 
Example #3
Source File: interface.py    From hfut with MIT License
def parse(response):
        page = response.text
        ss = SoupStrainer('table', width='650')
        bs = GlobalFeaturedSoup(page, parse_only=ss)
        title = bs.find('tr', bgcolor='#FB9E04')
        trs = bs.find_all('tr', bgcolor=re.compile(r'#D6D3CE|#B4B9B9'))
        if title:
            courses = []
            keys = tuple(title.stripped_strings)
            value_list = parse_tr_strs(trs)
            for values in value_list:
                course = dict(safe_zip(keys, values))
                course.pop('序号')
                course['课程代码'] = course['课程代码'].upper()
                course['班级容量'] = int(course['班级容量'])
                courses.append(course)
            return courses
        else:
            log_result_not_found(response)
            return [] 
Example #4
Source File: voc_fetcher1.0.py    From VOC with GNU General Public License v3.0
def __pre_process(self, page):
    # As BeautifulSoup will cause memory I/O error when the page is too large
        if page.find('<dl')>0:
            data = page.split('<dl')
            tag_dd = SoupStrainer('dd')
            for idx in xrange(1, len(data)):
                count = data[idx].count('<dd')
                if count > 5:
                    parts = data[idx].split('</dl>')
                    dds = parts[0].split('</dd>')
                    data[idx] = ''.join([dds[0], '</dd> <dx>%d</dx>'%idx,
                        dds[-1], '</dl>', ''.join(parts[1:])])
                    self.__dd_ext[str(idx)] =[]
                    for item in dds[1:-1]:
                        dd = BeautifulSoup(item, parse_only=tag_dd).dd
                        assert dd
                        self.__dd_ext[str(idx)].append(dd)
            return '<dl'.join(data)
        else:
            return page 
Example #5
Source File: interface.py    From hfut with MIT License
def parse(response):
        page = response.text
        ss = SoupStrainer('table')
        bs = GlobalFeaturedSoup(page, parse_only=ss)
        if not bs.text:
            log_result_not_found(response)
            return {}
        value_list = parse_tr_strs(bs.find_all('tr'))
        # the last cell of the first row is the photo field
        teacher_info = {'照片': value_list[0].pop()}
        # the fifth row ends with two blank cells
        value_list[4] = value_list[4][:2]
        # the sixth row contains two '联系电话' (contact phone) keys
        phone = value_list[5]
        teacher_info['联系电话'] = phone[1::2]
        value_list.remove(phone)
        # parse the remaining key/value pairs
        for v in value_list:
            for i in range(0, len(v), 2):
                teacher_info[v[i]] = v[i + 1]
        return teacher_info 
Example #6
Source File: jenkins.py    From hacker-scripts with MIT License
def __get_version(self):
        '''
        get jenkins version
        :return:
        '''
        try:
            html = urllib2.urlopen(self.url + '/login?from=%2F').read()
            links = SoupStrainer('a' ,href = re.compile(VERSION_TAG))
            version_text = BeautifulSoup(html, "html.parser", parse_only= links)
            if version_text.text != "":
                color_output("[+]....jenkins version is %s" % version_text.text)
                version_re = re.findall(u"ver.\s(.*)" ,version_text.text)
                if len(version_re) != 0:
                    if version_re[0][0:4] >= self.check_version:
                        self.user_link = ASYNCH_PEOPEL_PERFIX
                    else:
                        self.user_link = PEOPLE_PERFIX
            else:
                color_output("[-]....can't get jenkins version!")
                sys.exit()
        except urllib2.URLError,e:
            color_output("[-]....can't get jenkins version!")
            sys.exit() 
Example #7
Source File: voc_fetcher1.0.py    From VOC with GNU General Public License v3.0
def __initdef(self, word, data):
        data = self.__pre_process(data)
        wpg = SoupStrainer('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
        soup = BeautifulSoup(data, parse_only=wpg)
        div = soup.find('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
        assert div
        self.__getwordtitle(div.h1)
        if word != self.__title:
            self.__title = None
            return False
        div = soup.find('div', {'class': 'section blurb'})
        if div:
            self.__hasblurb = True
            self.__getblurb(div)
        tags = soup.find_all(re.compile(r'div|h2'), class_='sectionHeader')
        tag = find_fulldefine(tags, re.compile(r'DEFINITIONS OF', re.I))
        if tag:
            self.__getfulldef(tag.parent)
        else:
            print("WARNING: %s HAS NO FULLDEFINITION" % self.__title)
            assert tag # to raise error and break
        div = soup.find('div', {'class': 'section family'})
        if div:
            self.__getwordfamily(div)
        return True 
Example #8
Source File: get_witnesses_for_bill.py    From influence-texas with GNU General Public License v2.0
def get_witnesses_for_bill(bill_id, session):
    # ex: "HB 864" -> "https://capitol.texas.gov/tlodocs/86R/witlistbill/html/HB00864H.htm"
    parsed_bill_id = re.search(r"(\w+)\s+(\d+)", bill_id)
    bill_type = parsed_bill_id.group(1)
    bill_number = parsed_bill_id.group(2).zfill(5)
    url_prefix = f"https://capitol.texas.gov/tlodocs/{session}R/witlistbill/html/{bill_type}{bill_number}"
    house_url = f"{url_prefix}H.htm"
    senate_url = f"{url_prefix}S.htm"

    res = requests.get(house_url)

    # ##### Basic Test
    # # parsing all <p/> blocks up front may not be efficient
    # filter = SoupStrainer('p') # only <p/> tags contain text that we care about
    # text_blocks = BeautifulSoup(res.content, "html.parser", parse_only=filter)
    # selecting = None;
    # for block in text_blocks:
    #     text = block.get_text(strip=True)
    #     print(f"[{text}]")

    return parse_witness_list_html(res.content) 
Example #9
Source File: fox.py    From plugin.video.ustvvod with GNU General Public License v2.0
def convert_subtitles(closedcaption):
	str_output = ''
	last_start_time = ''
	subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
	subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
	lines = subtitle_data.find_all('p')
	for i, line in enumerate(lines):
		if line is not None:
			sub = clean_subs(common.smart_utf8(line))
			start_time = common.smart_utf8(line['begin'].replace('.', ','))
			try:
				end_time = common.smart_utf8(line['end'].replace('.', ','))
			except:
				continue
			if last_start_time != start_time:
				if i != 0:
					str_output += '\n\n'
				str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub
			else:
				str_output += '\n' + sub 
			last_start_time = start_time
	file = open(ustvpaths.SUBTITLE, 'w')
	file.write(str_output)
	file.close() 
Example #10
Source File: html_pbp.py    From Hockey-Scraper with GNU General Public License v3.0
def get_contents(game_html):
    """
    Uses BeautifulSoup to parse the html document.
    Some parsers work for some pages but don't work for others....I'm not sure why so I just try them all here in order
    
    :param game_html: html doc
    
    :return: "soupified" html 
    """
    parsers = ["lxml", "html.parser", "html5lib"]
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})

    for parser in parsers:
        # parse_only only works with lxml for some reason
        if parser == "lxml":
            soup = BeautifulSoup(game_html, parser, parse_only=strainer)
        else:
            soup = BeautifulSoup(game_html, parser)

        tds = soup.find_all("td", {"class": re.compile('.*bborder.*')})

        if len(tds) > 0:
            break

    return tds 
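A hypothetical call to the function above, assuming lxml is installed so the first parser in the list succeeds; the snippet stands in for a real NHL play-by-play page.

game_html = ('<table><tr><td class="bborder">PERIOD 1</td>'
             '<td class="bborder">FAC</td></tr></table>')
tds = get_contents(game_html)
print([td.get_text() for td in tds])  # ['PERIOD 1', 'FAC']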
Example #11
Source File: pbs.py    From plugin.video.ustvvod with GNU General Public License v2.0
def convert_subtitles(closedcaption):
	str_output = ''
	subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
	subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
	lines = subtitle_data.find_all('p')
	for i, line in enumerate(lines):
		if line is not None:
			sub = clean_subs(common.smart_utf8(line))
			start_time = common.smart_utf8(line['begin'].replace('.', ','))
			if ',' not in start_time:
				start_time = start_time + ',00'
			end_time = common.smart_utf8(line['end'].replace('.', ','))
			if ',' not in end_time:
				end_time = end_time + ',00'
			str_output += str(i + 1) + '\n' + start_time[:11] + ' --> ' + end_time[:11] + '\n' + sub + '\n\n'
	file = open(ustvpaths.SUBTITLE, 'w')
	file.write(str_output)
	file.close() 
Example #12
Source File: oxygen.py    From plugin.video.ustvvod with GNU General Public License v2.0
def convert_subtitles(closedcaption):
	str_output = ''
	subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
	subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
	lines = subtitle_data.find_all('p')
	for i, line in enumerate(lines):
		if line is not None:
			sub = clean_subs(common.smart_utf8(line))
			start_time_rest, start_time_msec = line['begin'].rsplit(':',1)
			start_time = common.smart_utf8(start_time_rest + ',' + start_time_msec)
			try:
				end_time_rest, end_time_msec = line['end'].rsplit(':',1)
				end_time = common.smart_utf8(end_time_rest + ',' + end_time_msec)
			except:
				continue
			str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
	file = open(ustvpaths.SUBTITLE, 'w')
	file.write(str_output)
	file.close() 
Example #13
Source File: webscraper_beautifulsoup.py    From Python4Pentesters with MIT License
def getLinks(text, url=""):
    if url and url[-1] == "/":
        url = url[:-1]

    links = []
    if text:
        for link in BeautifulSoup(text, "html.parser", parse_only=SoupStrainer("a", href=True)):
            if link.has_attr('href'):
                if (link['href']):
                    href = link['href'].strip()
                    if not href.startswith("http://") and not href.startswith("https://") and not href.startswith("mailto:") and not href.startswith("tel:"):                                                     
                        if not href.startswith('/'):
                            href = "/" + href
                        href = url + href
                    links.append(href)
    return links
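A hypothetical usage sketch of getLinks; the markup and the target URL are made up for illustration.

html = ('<a href="https://example.org/">external</a>'
        '<a href="about.html">relative</a>'
        '<a href="mailto:admin@example.org">mail</a>')
print(getLinks(html, url="http://target.local/"))
# ['https://example.org/', 'http://target.local/about.html', 'mailto:admin@example.org']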

#-----------------------------------------------------------------------------
# MAIN
#----------------------------------------------------------------------------- 
Example #14
Source File: collect.py    From web_page_classification with MIT License
def get_child_urls(main_page, max_child=20):
    """retrieve urls from giving html page.
    args:
        main_page(str): html file.
        max_child(int): max number of return urls.
    return:
        list of url string.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page,
                              "html.parser",
                              parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children 
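A hypothetical usage sketch of get_child_urls; the page content is invented to show both the http-only filtering and the max_child cut-off.

page = ('<a href="http://example.com/a">a</a>'
        '<a href="/relative">skipped</a>'
        '<a href="http://example.com/b">b</a>')
print(get_child_urls(page, max_child=1))  # ['http://example.com/a']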
Example #15
Source File: main_nbcu.py    From plugin.video.ustvvod with GNU General Public License v2.0
def convert_subtitles(closedcaption):
	str_output = ''
	subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
	subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
	lines = subtitle_data.find_all('p')
	for i, line in enumerate(lines):
		if line is not None:
			sub = clean_subs(common.smart_utf8(line))
			start_time_rest, start_time_msec = line['begin'].rsplit(':',1)
			start_time = common.smart_utf8(start_time_rest + ',' + start_time_msec)
			try:
				end_time_rest, end_time_msec = line['end'].rsplit(':',1)
				end_time = common.smart_utf8(end_time_rest + ',' + end_time_msec)
			except:
				continue
			str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
	file = open(ustvpaths.SUBTITLE, 'w')
	file.write(str_output)
	file.close() 
Example #16
Source File: main_abcdisney.py    From plugin.video.ustvvod with GNU General Public License v2.0
def convert_subtitles(closedcaption):
	str_output = ''
	subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
	subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
	lines = subtitle_data.find_all('p')
	for i, line in enumerate(lines):
		if line is not None:
			sub = clean_subs(common.smart_utf8(line))
			start_time_hours, start_time_rest = line['begin'].split(':', 1)
			start_time_hours = '%02d' % (int(start_time_hours) - 1)
			start_time = common.smart_utf8(start_time_hours + ':' + start_time_rest.replace('.', ','))
			end_time_hours, end_time_rest = line['end'].split(':', 1)
			end_time_hours = '%02d' % (int(end_time_hours) - 1)
			end_time = common.smart_utf8(end_time_hours + ':' + end_time_rest.replace('.', ','))
			str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
	file = open(ustvpaths.SUBTITLE, 'w')
	file.write(str_output)
	file.close()
	return True 
Example #17
Source File: interface.py    From hfut with MIT License
def parse(response):
        page = response.text
        ss = SoupStrainer('table', id='TableXKJG')
        bs = GlobalFeaturedSoup(page, parse_only=ss)

        courses = []
        keys = tuple(bs.find('tr', bgcolor='#296DBD').stripped_strings)
        # value_list = [tr.stripped_strings for tr in bs.find_all('tr', bgcolor='#D6D3CE')]
        value_list = parse_tr_strs(bs.find_all('tr', bgcolor='#D6D3CE'))
        for values in value_list:
            course = dict(safe_zip(keys, values))
            course['课程代码'] = course['课程代码'].upper()
            course['学分'] = float(course['学分'])
            course['费用'] = float(course['费用'])
            courses.append(course)
        return courses 
Example #18
Source File: main_natgeo.py    From plugin.video.ustvvod with GNU General Public License v2.0
def episodes(SITE, episode_url = common.args.url):
	episodes = []
	if '#' in episode_url:
		episode_url = episode_url.split('#')[1]
	episode_data = connection.getURL(episode_url)
	episode_tree = BeautifulSoup(episode_data, 'html.parser', parse_only = SoupStrainer('div', class_ = 'show'))
	try:
		episodes = add_videos(episode_tree, SITE)
	except:
		print "Can't add video"
	more = episode_tree.find('a', class_ = 'load-more')
	if more:
		episode_data = connection.getURL(BASE + more['href'])
		episode_tree = BeautifulSoup(episode_data, 'html.parser')
		episodes = add_videos(episode_tree, SITE)
	return episodes 
Example #19
Source File: main_natgeo.py    From plugin.video.ustvvod with GNU General Public License v2.0
def masterlist(SITE, SHOWS, SPECIALS = None):
	master_db = []
	root_dict = {}
	root_url = SHOWS
	root_data = connection.getURL(root_url)
	root_tree = BeautifulSoup(root_data, 'html.parser', parse_only = SoupStrainer('div', id = 'grid-frame'))
	root_menu = root_tree.find_all('div', class_ = 'media-module')
	for root_item in root_menu:
		root_name = root_item.find('div', class_ = 'title').text
		season_url = BASE + root_item.a['href']
		if '-1' not in season_url:
			tvdb_name = common.get_show_data(root_name, SITE, 'seasons')[-1]
			root_name = root_name + '#' + season_url 
			if tvdb_name not in root_dict.keys():
				root_dict[tvdb_name] = root_name
			else:
				root_dict[tvdb_name] = root_dict[tvdb_name] + '|' + root_name
	for root_name in root_dict:
		season_url = root_dict[root_name]
		master_db.append((root_name, SITE, 'seasons', season_url))
	more = root_tree.find('a', class_ = 'load-more')
	if more:
		master_db.extend(masterlist(SITE, BASE + more['href']))
	return master_db 
Example #20
Source File: interface.py    From hfut with MIT License
def parse(response):
        page = response.text
        ss = SoupStrainer('table', id='KCTable')
        bs = GlobalFeaturedSoup(page, parse_only=ss)
        courses = []
        trs = bs.find_all('tr')
        value_list = [tuple(tr.stripped_strings) for tr in trs]
        for values in value_list:
            course = {'课程代码': values[0].upper(),
                      '课程名称': values[1],
                      '课程类型': values[2],
                      '开课院系': values[3],
                      '学分': float(values[4])}
            courses.append(course)
        return courses 
Example #21
Source File: teams.py    From sports.py with MIT License
def _get_team_links(base_url, table_id):
    links = SoupStrainer('table', {'id': table_id})
    return BeautifulSoup(requests.get(base_url).content, 'html.parser', parse_only=links) 
Example #22
Source File: doc_dict_gen.py    From zenpy with GNU General Public License v3.0
def parse_link(link):
    namespace = link.split('/')[-2]

    print("Parsing {} link: {}".format(namespace, link))
    response = requests.get(base_url + link)

    table_attr = SoupStrainer("table")
    soup = BeautifulSoup(response.content, 'lxml', parse_only=table_attr)

    table = soup.find('table')
    if not table:
        return {}

    rows = table.findAll('tr')
    header = [data.text.lower() for data in rows[0].findAll('th')]

    object_name = os.path.basename(os.path.normpath(link))
    object_info = defaultdict(dict)
    object_info[object_name] = defaultdict(dict)
    for row in rows[1:]:
        columns = [data.text for data in row.findAll('td')]
        row_data = dict(zip(header, columns))
        name = row_data.pop('name', None)
        if name:
            object_info[object_name][name].update(row_data)

    print("Parsing Completed for: " + link)
    return namespace, object_info 
Example #23
Source File: html.py    From predictive-maintenance-using-machine-learning with Apache License 2.0
def __init__(self, *args, **kwargs):
        super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
                                                                **kwargs)
        from bs4 import SoupStrainer
        self._strainer = SoupStrainer('table') 
Example #24
Source File: score.py    From gxgk-wechat-server with MIT License
def score_page(studentid, url, session, proxy):
    """在校成绩页面"""
    url = url + studentid
    # first fetch the VIEWSTATE value required for the score query
    if not proxy:
        pre_score = session.get(url, allow_redirects=False, timeout=5)
    else:
        pre_score = session.get(url, allow_redirects=False, timeout=5,
                                proxies=app.config['SCHOOL_LAN_PROXIES'])
    pre_score_soup = BeautifulSoup(
        pre_score.text, "html.parser", parse_only=SoupStrainer("input"))
    score_view_state = pre_score_soup.find(
        attrs={"name": "__VIEWSTATE"})['value']
    # query the scores
    payload = {
        '__VIEWSTATE': score_view_state,
        'Button2': u'在校学习成绩查询',
        'ddlXN': '',
        'ddlXQ': ''
    }
    if not proxy:
        score_res = session.post(url, data=payload, allow_redirects=False,
                                 timeout=5)
    else:
        score_res = session.post(url, data=payload, allow_redirects=False,
                                 proxies=app.config['SCHOOL_LAN_PROXIES'],
                                 timeout=5)
    return score_res 
Example #25
Source File: voc_fetcher1.0.py    From VOC with GNU General Public License v3.0
def __getmore(self, link):
        page = getpage(link)
        article = SoupStrainer('div', class_='articlebody')
        soup = BeautifulSoup(page, parse_only=article)
        div = soup.find('div', {'class': 'articlebody'})
        assert div
        self.__transfchswdBd(div, link) 
Example #26
Source File: html.py    From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self, *args, **kwargs):
        super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
                                                                **kwargs)
        from bs4 import SoupStrainer
        self._strainer = SoupStrainer('table') 
Example #27
Source File: html.py    From elasticintel with GNU General Public License v3.0
def __init__(self, *args, **kwargs):
        super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args,
                                                                **kwargs)
        from bs4 import SoupStrainer
        self._strainer = SoupStrainer('table') 
Example #28
Source File: tipue_search.py    From ford with GNU General Public License v3.0
def __init__(self, output_path, project_url):

        self.output_path = output_path
        self.siteurl = project_url
        self.json_nodes = []
        self.only_text = SoupStrainer('div', id="text")
        self.only_title = SoupStrainer('title') 
Example #29
Source File: _session.py    From fbchat with BSD 3-Clause "New" or "Revised" License
def get_error_data(html: str) -> Optional[str]:
    """Get error message from a request."""
    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("form", id="login_form")
    )
    # Attempt to extract and format the error string
    return " ".join(list(soup.stripped_strings)[1:3]) or None 
Example #30
Source File: webscraper.py    From plugin.video.vrt.nu with GNU General Public License v3.0
def get_categories():
    """Return a list of categories by scraping the VRT NU website"""

    cache_file = 'categories.json'
    categories = []

    # Try the cache if it is fresh
    categories = get_cache(cache_file, ttl=7 * 24 * 60 * 60)

    # Try to scrape from the web
    if not valid_categories(categories):
        from bs4 import BeautifulSoup, SoupStrainer
        log(2, 'URL get: https://www.vrt.be/vrtnu/categorieen/')
        response = urlopen('https://www.vrt.be/vrtnu/categorieen/')
        tiles = SoupStrainer('nui-list--content')
        soup = BeautifulSoup(response.read(), 'html.parser', parse_only=tiles)

        categories = []
        for tile in soup.find_all('nui-tile'):
            categories.append(dict(
                id=tile.get('href').split('/')[-2],
                thumbnail=get_category_thumbnail(tile),
                name=get_category_title(tile),
            ))
        if categories:
            from json import dumps
            update_cache('categories.json', dumps(categories))

    return categories