Python bs4.SoupStrainer() Examples
The following are 30 code examples of bs4.SoupStrainer().
Each example notes its original project, source file, and license above the code.
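Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: passing a SoupStrainer to BeautifulSoup via the parse_only argument so that only matching tags are parsed. The sample HTML and variable names below are illustrative only and are not taken from any of the projects.

from bs4 import BeautifulSoup, SoupStrainer

html = """
<html><body>
  <a href="https://example.com/one">one</a>
  <p>ignored paragraph</p>
  <a href="https://example.com/two">two</a>
</body></html>
"""

# Restrict parsing to <a> tags only; everything else is discarded early,
# which saves time and memory on large documents.
only_links = SoupStrainer("a")
soup = BeautifulSoup(html, "html.parser", parse_only=only_links)

for link in soup.find_all("a"):
    print(link.get("href"))

Note that parse_only is only honored by tree builders that support it (html.parser and lxml); the html5lib builder parses the full document regardless, which is why Example #10 below only passes the strainer to lxml.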
Example #1
Source File: _session.py From fbchat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def find_form_request(html: str):
    soup = bs4.BeautifulSoup(html, "html.parser", parse_only=bs4.SoupStrainer("form"))

    form = soup.form
    if not form:
        raise _exception.ParseError("Could not find form to submit", data=html)

    url = form.get("action")
    if not url:
        raise _exception.ParseError("Could not find url to submit to", data=form)

    # From what I've seen, it'll always do this!
    if url.startswith("/"):
        url = "https://www.facebook.com" + url

    # It's okay to set missing values to something crap, the values are localized, and
    # hence are not available in the raw HTML
    data = {
        x["name"]: x.get("value", "[missing]")
        for x in form.find_all(["input", "button"])
    }

    return url, data
Example #2
Source File: interface.py From hfut with MIT License | 6 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table', width='650')
    bs = GlobalFeaturedSoup(page, parse_only=ss)
    trs = bs.find_all('tr')
    keys = tuple(trs[1].stripped_strings)
    if len(keys) != 6:
        log_result_not_found(response)
        return []
    value_list = parse_tr_strs(trs[2:])
    teaching_plan = []
    for values in value_list:
        code = values[1].upper()
        if teaching_plan and teaching_plan[-1]['课程代码'] == code:
            # Public elective queries for the Xuancheng campus return many duplicate rows
            continue
        plan = dict(safe_zip(keys, values))
        plan.pop('序号')
        plan['课程代码'] = code
        plan['学时'] = int(plan['学时'])
        plan['学分'] = float(plan['学分'])
        teaching_plan.append(plan)
    return teaching_plan
Example #3
Source File: interface.py From hfut with MIT License | 6 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table', width='650')
    bs = GlobalFeaturedSoup(page, parse_only=ss)
    title = bs.find('tr', bgcolor='#FB9E04')
    trs = bs.find_all('tr', bgcolor=re.compile(r'#D6D3CE|#B4B9B9'))
    if title:
        courses = []
        keys = tuple(title.stripped_strings)
        value_list = parse_tr_strs(trs)
        for values in value_list:
            course = dict(safe_zip(keys, values))
            course.pop('序号')
            course['课程代码'] = course['课程代码'].upper()
            course['班级容量'] = int(course['班级容量'])
            courses.append(course)
        return courses
    else:
        log_result_not_found(response)
        return []
Example #4
Source File: voc_fetcher1.0.py From VOC with GNU General Public License v3.0 | 6 votes |
def __pre_process(self, page):
    # As BeautifulSoup will cause memory I/O error when the page is too large
    if page.find('<dl') > 0:
        data = page.split('<dl')
        tag_dd = SoupStrainer('dd')
        for idx in xrange(1, len(data)):
            count = data[idx].count('<dd')
            if count > 5:
                parts = data[idx].split('</dl>')
                dds = parts[0].split('</dd>')
                data[idx] = ''.join([dds[0], '</dd> <dx>%d</dx>' % idx, dds[-1], '</dl>', ''.join(parts[1:])])
                self.__dd_ext[str(idx)] = []
                for item in dds[1:-1]:
                    dd = BeautifulSoup(item, parse_only=tag_dd).dd
                    assert dd
                    self.__dd_ext[str(idx)].append(dd)
        return '<dl'.join(data)
    else:
        return page
Example #5
Source File: interface.py From hfut with MIT License | 6 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table')
    bs = GlobalFeaturedSoup(page, parse_only=ss)
    if not bs.text:
        log_result_not_found(response)
        return {}
    value_list = parse_tr_strs(bs.find_all('tr'))
    # The first row ends with a photo ('照片') item
    teacher_info = {'照片': value_list[0].pop()}
    # The fifth row ends with two blank entries
    value_list[4] = value_list[4][:2]
    # The sixth row contains two '联系电话' (phone number) keys
    phone = value_list[5]
    teacher_info['联系电话'] = phone[1::2]
    value_list.remove(phone)
    # Parse the remaining items
    for v in value_list:
        for i in range(0, len(v), 2):
            teacher_info[v[i]] = v[i + 1]
    return teacher_info
Example #6
Source File: jenkins.py From hacker-scripts with MIT License | 6 votes |
def __get_version(self):
    '''
    get jenkins version
    :return:
    '''
    try:
        html = urllib2.urlopen(self.url + '/login?from=%2F').read()
        links = SoupStrainer('a', href=re.compile(VERSION_TAG))
        version_text = BeautifulSoup(html, "html.parser", parse_only=links)
        if version_text.text != "":
            color_output("[+]....jenkins version is %s" % version_text.text)
            version_re = re.findall(u"ver.\s(.*)", version_text.text)
            if len(version_re) != 0:
                if version_re[0][0:4] >= self.check_version:
                    self.user_link = ASYNCH_PEOPEL_PERFIX
                else:
                    self.user_link = PEOPLE_PERFIX
        else:
            color_output("[-]....can't get jenkins version!")
            sys.exit()
    except urllib2.URLError, e:
        color_output("[-]....can't get jenkins version!")
        sys.exit()
Example #7
Source File: voc_fetcher1.0.py From VOC with GNU General Public License v3.0 | 6 votes |
def __initdef(self, word, data):
    data = self.__pre_process(data)
    wpg = SoupStrainer('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
    soup = BeautifulSoup(data, parse_only=wpg)
    div = soup.find('div', class_=re.compile('[^<>]*?wordPage[^<>]*?'))
    assert div
    self.__getwordtitle(div.h1)
    if word != self.__title:
        self.__title = None
        return False
    div = soup.find('div', {'class': 'section blurb'})
    if div:
        self.__hasblurb = True
        self.__getblurb(div)
    tags = soup.find_all(re.compile(r'div|h2'), class_='sectionHeader')
    tag = find_fulldefine(tags, re.compile(r'DEFINITIONS OF', re.I))
    if tag:
        self.__getfulldef(tag.parent)
    else:
        print("WARNING: %s HAS NO FULLDEFINITION" % self.__title)
        assert tag  # to raise error and break
    div = soup.find('div', {'class': 'section family'})
    if div:
        self.__getwordfamily(div)
    return True
Example #8
Source File: get_witnesses_for_bill.py From influence-texas with GNU General Public License v2.0 | 6 votes |
def get_witnesses_for_bill(bill_id, session):
    # ex: "HB 864" -> "https://capitol.texas.gov/tlodocs/86R/witlistbill/html/HB00864H.htm"
    parsed_bill_id = re.search(r"(\w+)\s+(\d+)", bill_id)
    bill_type = parsed_bill_id.group(1)
    bill_number = parsed_bill_id.group(2).zfill(5)
    url_prefix = f"https://capitol.texas.gov/tlodocs/{session}R/witlistbill/html/{bill_type}{bill_number}"
    house_url = f"{url_prefix}H.htm"
    senate_url = f"{url_prefix}S.htm"

    res = requests.get(house_url)

    # ##### Basic Test
    # # parsing all <p/> blocks up front may not be efficient
    # filter = SoupStrainer('p')  # only <p/> tags contain text that we care about
    # text_blocks = BeautifulSoup(res.content, "html.parser", parse_only=filter)
    # selecting = None;
    # for block in text_blocks:
    #     text = block.get_text(strip=True)
    #     print(f"[{text}]")

    return parse_witness_list_html(res.content)
Example #9
Source File: fox.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    last_start_time = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time = common.smart_utf8(line['begin'].replace('.', ','))
            try:
                end_time = common.smart_utf8(line['end'].replace('.', ','))
            except:
                continue
            if last_start_time != start_time:
                if i != 0:
                    str_output += '\n\n'
                str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub
            else:
                str_output += '\n' + sub
            last_start_time = start_time
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example #10
Source File: html_pbp.py From Hockey-Scraper with GNU General Public License v3.0 | 6 votes |
def get_contents(game_html):
    """
    Uses Beautiful soup to parses the html document.
    Some parsers work for some pages but don't work for others....I'm not sure why so I just try them all here in order

    :param game_html: html doc

    :return: "soupified" html
    """
    parsers = ["lxml", "html.parser", "html5lib"]
    strainer = SoupStrainer('td', attrs={'class': re.compile(r'bborder')})

    for parser in parsers:
        # parse_only only works with lxml for some reason
        if parser == "lxml":
            soup = BeautifulSoup(game_html, parser, parse_only=strainer)
        else:
            soup = BeautifulSoup(game_html, parser)

        tds = soup.find_all("td", {"class": re.compile('.*bborder.*')})

        if len(tds) > 0:
            break

    return tds
Example #11
Source File: pbs.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time = common.smart_utf8(line['begin'].replace('.', ','))
            if ',' not in start_time:
                start_time = start_time + ',00'
            end_time = common.smart_utf8(line['end'].replace('.', ','))
            if ',' not in end_time:
                end_time = end_time + ',00'
            str_output += str(i + 1) + '\n' + start_time[:11] + ' --> ' + end_time[:11] + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example #12
Source File: oxygen.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time_rest, start_time_msec = line['begin'].rsplit(':', 1)
            start_time = common.smart_utf8(start_time_rest + ',' + start_time_msec)
            try:
                end_time_rest, end_time_msec = line['end'].rsplit(':', 1)
                end_time = common.smart_utf8(end_time_rest + ',' + end_time_msec)
            except:
                continue
            str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example #13
Source File: webscraper_beautifulsoup.py From Python4Pentesters with MIT License | 6 votes |
def getLinks(text, url=""):
    if url and url[-1] == "/":
        url = url[:-1]
    links = []
    if text:
        for link in BeautifulSoup(text, "html.parser", parse_only=SoupStrainer("a", href=True)):
            if link.has_attr('href'):
                if (link['href']):
                    href = link['href'].strip()
                    if not href.startswith("http://") and not href.startswith("https://") and not href.startswith("mailto:") and not href.startswith("tel:"):
                        if not href.startswith('/'):
                            href = "/" + href
                        href = url + href
                    links.append(href)
    return links


#-----------------------------------------------------------------------------
# MAIN
#-----------------------------------------------------------------------------
Example #14
Source File: collect.py From web_page_classification with MIT License | 6 votes |
def get_child_urls(main_page, max_child=20):
    """retrieve urls from giving html page.
    args:
        main_page(str): html file.
        max_child(int): max number of return urls.
    return:
        list of url string.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    children = []
    for link in BeautifulSoup(main_page, "html.parser", parse_only=SoupStrainer('a')):
        if link.has_attr('href') and link['href'].startswith("http"):
            children.append(link['href'])
    if len(children) > max_child:
        children = children[:max_child]
    return children
Example #15
Source File: main_nbcu.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time_rest, start_time_msec = line['begin'].rsplit(':', 1)
            start_time = common.smart_utf8(start_time_rest + ',' + start_time_msec)
            try:
                end_time_rest, end_time_msec = line['end'].rsplit(':', 1)
                end_time = common.smart_utf8(end_time_rest + ',' + end_time_msec)
            except:
                continue
            str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
Example #16
Source File: main_abcdisney.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def convert_subtitles(closedcaption):
    str_output = ''
    subtitle_data = connection.getURL(closedcaption, connectiontype = 0)
    subtitle_data = BeautifulSoup(subtitle_data, 'html.parser', parse_only = SoupStrainer('div'))
    lines = subtitle_data.find_all('p')
    for i, line in enumerate(lines):
        if line is not None:
            sub = clean_subs(common.smart_utf8(line))
            start_time_hours, start_time_rest = line['begin'].split(':', 1)
            start_time_hours = '%02d' % (int(start_time_hours) - 1)
            start_time = common.smart_utf8(start_time_hours + ':' + start_time_rest.replace('.', ','))
            end_time_hours, end_time_rest = line['end'].split(':', 1)
            end_time_hours = '%02d' % (int(end_time_hours) - 1)
            end_time = common.smart_utf8(end_time_hours + ':' + end_time_rest.replace('.', ','))
            str_output += str(i + 1) + '\n' + start_time + ' --> ' + end_time + '\n' + sub + '\n\n'
    file = open(ustvpaths.SUBTITLE, 'w')
    file.write(str_output)
    file.close()
    return True
Example #17
Source File: interface.py From hfut with MIT License | 6 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table', id='TableXKJG')
    bs = GlobalFeaturedSoup(page, parse_only=ss)

    courses = []
    keys = tuple(bs.find('tr', bgcolor='#296DBD').stripped_strings)
    # value_list = [tr.stripped_strings for tr in bs.find_all('tr', bgcolor='#D6D3CE')]
    value_list = parse_tr_strs(bs.find_all('tr', bgcolor='#D6D3CE'))
    for values in value_list:
        course = dict(safe_zip(keys, values))
        course['课程代码'] = course['课程代码'].upper()
        course['学分'] = float(course['学分'])
        course['费用'] = float(course['费用'])
        courses.append(course)
    return courses
Example #18
Source File: main_natgeo.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def episodes(SITE, episode_url = common.args.url):
    episodes = []
    if '#' in episode_url:
        episode_url = episode_url.split('#')[1]
    episode_data = connection.getURL(episode_url)
    episode_tree = BeautifulSoup(episode_data, 'html.parser', parse_only = SoupStrainer('div', class_ = 'show'))
    try:
        episodes = add_videos(episode_tree, SITE)
    except:
        print "Can't add video"
    more = episode_tree.find('a', class_ = 'load-more')
    if more:
        episode_data = connection.getURL(BASE + more['href'])
        episode_tree = BeautifulSoup(episode_data, 'html.parser')
        episodes = add_videos(episode_tree, SITE)
    return episodes
Example #19
Source File: main_natgeo.py From plugin.video.ustvvod with GNU General Public License v2.0 | 6 votes |
def masterlist(SITE, SHOWS, SPECIALS = None):
    master_db = []
    root_dict = {}
    root_url = SHOWS
    root_data = connection.getURL(root_url)
    root_tree = BeautifulSoup(root_data, 'html.parser', parse_only = SoupStrainer('div', id = 'grid-frame'))
    root_menu = root_tree.find_all('div', class_ = 'media-module')
    for root_item in root_menu:
        root_name = root_item.find('div', class_ = 'title').text
        season_url = BASE + root_item.a['href']
        if '-1' not in season_url:
            tvdb_name = common.get_show_data(root_name, SITE, 'seasons')[-1]
            root_name = root_name + '#' + season_url
            if tvdb_name not in root_dict.keys():
                root_dict[tvdb_name] = root_name
            else:
                root_dict[tvdb_name] = root_dict[tvdb_name] + '|' + root_name
    for root_name in root_dict:
        season_url = root_dict[root_name]
        master_db.append((root_name, SITE, 'seasons', season_url))
    more = root_tree.find('a', class_ = 'load-more')
    if more:
        master_db.extend(masterlist(SITE, BASE + more['href']))
    return master_db
Example #20
Source File: interface.py From hfut with MIT License | 5 votes |
def parse(response):
    page = response.text
    ss = SoupStrainer('table', id='KCTable')
    bs = GlobalFeaturedSoup(page, parse_only=ss)

    courses = []
    trs = bs.find_all('tr')
    value_list = [tuple(tr.stripped_strings) for tr in trs]
    for values in value_list:
        course = {'课程代码': values[0].upper(),
                  '课程名称': values[1],
                  '课程类型': values[2],
                  '开课院系': values[3],
                  '学分': float(values[4])}
        courses.append(course)
    return courses
Example #21
Source File: teams.py From sports.py with MIT License | 5 votes |
def _get_team_links(base_url, table_id):
    links = SoupStrainer('table', {'id': table_id})
    return BeautifulSoup(requests.get(base_url).content, 'html.parser', parse_only=links)
Example #22
Source File: doc_dict_gen.py From zenpy with GNU General Public License v3.0 | 5 votes |
def parse_link(link):
    namespace = link.split('/')[-2]
    print("Parsing {} link: {}".format(namespace, link))
    response = requests.get(base_url + link)
    table_attr = SoupStrainer("table")
    soup = BeautifulSoup(response.content, 'lxml', parse_only=table_attr)
    table = soup.find('table')
    if not table:
        return {}
    rows = table.findAll('tr')
    header = [data.text.lower() for data in rows[0].findAll('th')]
    object_name = os.path.basename(os.path.normpath(link))
    object_info = defaultdict(dict)
    object_info[object_name] = defaultdict(dict)
    for row in rows[1:]:
        columns = [data.text for data in row.findAll('td')]
        row_data = dict(zip(header, columns))
        name = row_data.pop('name', None)
        if name:
            object_info[object_name][name].update(row_data)
    print("Parsing Completed for: " + link)
    return namespace, object_info
Example #23
Source File: html.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def __init__(self, *args, **kwargs):
    super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, **kwargs)
    from bs4 import SoupStrainer
    self._strainer = SoupStrainer('table')
Example #24
Source File: score.py From gxgk-wechat-server with MIT License | 5 votes |
def score_page(studentid, url, session, proxy):
    """In-school grades page."""
    url = url + studentid
    # First fetch the __VIEWSTATE value required for the score query
    if not proxy:
        pre_score = session.get(url, allow_redirects=False, timeout=5)
    else:
        pre_score = session.get(url, allow_redirects=False, timeout=5,
                                proxies=app.config['SCHOOL_LAN_PROXIES'])
    pre_score_soup = BeautifulSoup(
        pre_score.text, "html.parser", parse_only=SoupStrainer("input"))
    score_view_state = pre_score_soup.find(
        attrs={"name": "__VIEWSTATE"})['value']
    # Query the scores
    payload = {
        '__VIEWSTATE': score_view_state,
        'Button2': u'在校学习成绩查询',
        'ddlXN': '',
        'ddlXQ': ''
    }
    if not proxy:
        score_res = session.post(url, data=payload, allow_redirects=False,
                                 timeout=5)
    else:
        score_res = session.post(url, data=payload, allow_redirects=False,
                                 proxies=app.config['SCHOOL_LAN_PROXIES'],
                                 timeout=5)
    return score_res
Example #25
Source File: voc_fetcher1.0.py From VOC with GNU General Public License v3.0 | 5 votes |
def __getmore(self, link):
    page = getpage(link)
    article = SoupStrainer('div', class_='articlebody')
    soup = BeautifulSoup(page, parse_only=article)
    div = soup.find('div', {'class': 'articlebody'})
    assert div
    self.__transfchswdBd(div, link)
Example #26
Source File: html.py From Splunking-Crime with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, **kwargs)
    from bs4 import SoupStrainer
    self._strainer = SoupStrainer('table')
Example #27
Source File: html.py From elasticintel with GNU General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, **kwargs)
    from bs4 import SoupStrainer
    self._strainer = SoupStrainer('table')
Example #28
Source File: tipue_search.py From ford with GNU General Public License v3.0 | 5 votes |
def __init__(self, output_path, project_url):
    self.output_path = output_path
    self.siteurl = project_url
    self.json_nodes = []
    self.only_text = SoupStrainer('div', id="text")
    self.only_title = SoupStrainer('title')
Example #29
Source File: _session.py From fbchat with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_error_data(html: str) -> Optional[str]:
    """Get error message from a request."""
    soup = bs4.BeautifulSoup(
        html, "html.parser", parse_only=bs4.SoupStrainer("form", id="login_form")
    )
    # Attempt to extract and format the error string
    return " ".join(list(soup.stripped_strings)[1:3]) or None
Example #30
Source File: webscraper.py From plugin.video.vrt.nu with GNU General Public License v3.0 | 5 votes |
def get_categories():
    """Return a list of categories by scraping the VRT NU website"""
    cache_file = 'categories.json'
    categories = []

    # Try the cache if it is fresh
    categories = get_cache(cache_file, ttl=7 * 24 * 60 * 60)

    # Try to scrape from the web
    if not valid_categories(categories):
        from bs4 import BeautifulSoup, SoupStrainer
        log(2, 'URL get: https://www.vrt.be/vrtnu/categorieen/')
        response = urlopen('https://www.vrt.be/vrtnu/categorieen/')
        tiles = SoupStrainer('nui-list--content')
        soup = BeautifulSoup(response.read(), 'html.parser', parse_only=tiles)
        categories = []
        for tile in soup.find_all('nui-tile'):
            categories.append(dict(
                id=tile.get('href').split('/')[-2],
                thumbnail=get_category_thumbnail(tile),
                name=get_category_title(tile),
            ))
    if categories:
        from json import dumps
        update_cache('categories.json', dumps(categories))
    return categories