Python pyquery.PyQuery() Examples
The following are 30 code examples of pyquery.PyQuery(), drawn from open-source projects. Each example notes its original project, source file, and license. You may also want to check out the other available functions and classes of the pyquery module.
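
Before the examples, a minimal sketch of the PyQuery API they all build on (the markup string here is purely illustrative):

from pyquery import PyQuery

# PyQuery accepts raw markup; it can also load from a url= or filename= keyword.
doc = PyQuery('<div><p class="greeting">Hello</p><p>World</p></div>')

print(doc('p.greeting').text())         # Hello
print(doc('p').eq(1).text())            # World
print(doc('p.greeting').attr('class'))  # greeting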
Example #1
Source File: csdn.py From AILearners with Apache License 2.0 | 6 votes |
def getOriginalArticalNums(self, proxies):
    main_response = requests.get(self.blogurl, proxies=proxies)
    # Check whether the fetch succeeded (based on the status code)
    if main_response.status_code == 200:
        print('fetch succeeded')
        self.main_html = main_response.text
        main_doc = pq(self.main_html)
        mainpage_str = main_doc.text()  # page text with markup stripped
        origin_position = mainpage_str.index('原创')  # find the position of '原创' ('original')
        end_position = mainpage_str.index('原创', origin_position + 1)  # the second occurrence; the post count follows it
        self.blog_nums = ''
        # Collect the number of blog posts written
        for num in range(3, 10):
            # Break out of the loop at the first whitespace character
            if mainpage_str[end_position + num].isspace() == True:
                break
            self.blog_nums += mainpage_str[end_position + num]
        print(type(str(self.blog_nums)))
        cur_blog_nums = int(self.blog_nums)  # current number of blog posts
        return cur_blog_nums  # return the post count
    else:
        print('crawl failed')
        return 0  # return 0: either there are no posts or the crawl failed
Example #2
Source File: Proxies.py From Proxies with MIT License | 6 votes |
def fetch_urls(self, queue, quantity):
    while not queue.empty():
        url = queue.get()
        html = self.s.get(url, headers=self.headers).text
        pq = PyQuery(html)
        size = pq.find('tbody tr').size()
        for index in range(size):
            item = pq.find('tbody tr').eq(index)
            ip = item.find('td').eq(0).text()
            port = item.find('td').eq(1).text()
            _type = item.find('td').eq(3).text()
            self.result_arr.append({
                str(_type).lower(): '{0}://{1}:{2}'.format(str(_type).lower(), ip, port)
            })
            if len(self.result_arr) >= quantity:
                break
Example #3
Source File: getchu_get_urls.py From create-girls-moe-pytorch with MIT License | 6 votes |
def get_url_and_date(I: str, O: str, id_data_output_path: str) -> None:
    '''
    Get image url and date. Saved in the resource directory with names of
    `O` and `id_data_output_path`.
    :param I:
    :param O:
    :param id_data_output_path:
    :return: None
    '''
    with open(I, encoding='utf-8') as fin:
        doc = pyquery.PyQuery(fin.read())
    table = doc.attr('id', 'query_result_main')('tbody')
    id_data = []
    with open(O, 'w', encoding='utf-8') as fout:
        for line in table.items():
            for tr in line('tr').items():
                lst = re.findall(ID_PATTERN, tr.text())
                data = re.findall(DATA_PATTERN, tr.text())
                if len(lst) == 0:
                    continue
                fout.write('http://www.getchu.com/soft.phtml?id={}&gc=gc\n'.format(lst[-1]))
                id_data.append([lst[-1], data[-1]])
    with open(id_data_output_path, 'w', encoding='utf-8') as fout:
        for each in id_data:
            fout.write('{} {}\n'.format(each[0], each[1]))
Example #4
Source File: handlers.py From doufen with MIT License | 6 votes |
def get(self, douban_id):
    try:
        subject = db.Note.get(db.Note.douban_id == douban_id)
        history = db.NoteHistorical.select().where(db.NoteHistorical.id == subject.id)
    except db.Note.DoesNotExist:
        raise tornado.web.HTTPError(404)

    comments = db.Comment.select().join(db.User).where(
        db.Comment.target_type == 'note',
        db.Comment.target_douban_id == subject.douban_id
    )

    dom = PyQuery(subject.content)
    dom_iframe = dom('iframe')
    # '站外视频' renders as an "off-site video" link above the removed iframe
    dom_iframe.before('<p class="title"><a href="{0}" class="external-link">站外视频</a></p>'.format(dom_iframe.attr('src')))
    dom_iframe.remove()
    dom('a').add_class('external-link')

    self.render('note.html', note=subject, comments=comments, content=dom)
Example #5
Source File: tasks.py From doufen with MIT License | 6 votes |
def fetch_note_list(self):
    url = self.account.user.alt + 'notes'
    notes = []
    while True:
        response = self.fetch_url_content(url)
        if not response:
            break
        dom = PyQuery(response.text)
        note_items = dom('#content .article>.note-container')
        for note_item in note_items:
            notes.append(PyQuery(note_item).attr('data-url'))
        next_page = dom('#content .article>.paginator>.next>a')
        if next_page:
            url = next_page.attr('href')
        else:
            break
    return notes
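
One idiom worth flagging in the example above: iterating a PyQuery selection yields bare lxml elements, which is why each note_item is re-wrapped in PyQuery(...) before .attr() is called. A minimal sketch of that idiom, and of pyquery's .items() alternative, on illustrative markup:

from pyquery import PyQuery

doc = PyQuery('<ul><li data-url="/a">A</li><li data-url="/b">B</li></ul>')

# Iterating a selection yields lxml elements; re-wrap to use the pyquery API.
urls = [PyQuery(li).attr('data-url') for li in doc('li')]

# .items() yields PyQuery objects directly, avoiding the re-wrap.
urls_alt = [li.attr('data-url') for li in doc('li').items()]

assert urls == urls_alt == ['/a', '/b']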
Example #6
Source File: py_mstr.py From py-mstr with MIT License | 6 votes |
def get_attribute(self, attribute_id):
    """ Returns the attribute object for the given attribute id.

    Args:
        attribute_id (str): the attribute guid

    Returns:
        Attribute: Attribute object for this guid

    Raises:
        MstrClientException: if no attribute id is supplied
    """
    if not attribute_id:
        raise MstrClientException("You must provide an attribute id")
    arguments = {
        'taskId': 'getAttributeForms',
        'attributeID': attribute_id,
        'sessionState': self._session
    }
    response = self._request(arguments)
    d = pq(response)
    return Attribute(d('dssid')[0].text, d('n')[0].text)
Example #7
Source File: tabelog.py From tabebot with MIT License | 6 votes |
def parse_reviews_and_users(self, response):
    if not self.is_tabelog(response):
        return Request(url=response.url, dont_filter=True)

    dom = PyQuery(response.body)
    review_nodes = dom('div.rvw-item')
    business_id = int(re.findall(r'[a-z]+/A\d{4}/A\d{6}/(\d+)/dtlrvwlst/', response.url)[0])

    reviews_and_users = []
    for review_node in review_nodes:
        user_id = self._extract_user_id(review_node)
        review = self._generate_review(review_node, business_id, user_id)
        if review:
            reviews_and_users.append(review)
        user = self._generate_user(review_node, user_id)
        if user:
            reviews_and_users.append(user)
    return reviews_and_users
Example #8
Source File: response.py From pledgeservice with Apache License 2.0 | 6 votes |
def pyquery(self):
    """
    Returns the response as a `PyQuery <http://pyquery.org/>`_ object.

    Only works with HTML and XML responses; other content-types raise
    AttributeError.
    """
    if 'html' not in self.content_type and 'xml' not in self.content_type:
        raise AttributeError(
            "Not an HTML or XML response body (content-type: %s)"
            % self.content_type)
    try:
        from pyquery import PyQuery
    except ImportError:  # pragma: no cover
        raise ImportError(
            "You must have PyQuery installed to use response.pyquery")
    d = PyQuery(self.testbody)
    return d
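
This property follows WebTest's response API (pledgeservice vendors a copy of WebTest). A hypothetical usage sketch, assuming WebTest's TestApp and a trivial WSGI app:

from webtest import TestApp

def wsgi_app(environ, start_response):
    # A trivial WSGI app used only for illustration.
    start_response('200 OK', [('Content-Type', 'text/html')])
    return [b'<html><body><h1>Hello</h1></body></html>']

app = TestApp(wsgi_app)
resp = app.get('/')
assert resp.pyquery('h1').text() == 'Hello'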
Example #9
Source File: cover_query.py From QMusic with GNU Lesser General Public License v2.1 | 6 votes |
def query_album_cover_from_xiami(artist_name, album_name):
    if not is_network_connected():
        return None
    if not artist_name and not album_name:
        return False
    xiami_album_search_url = 'http://www.xiami.com/search/album?key=' + artist_name + '+' + album_name
    html = public_curl.get(xiami_album_search_url)
    try:
        search_result_object = PyQuery(html)
        album_info_element = search_result_object(
            'div.albumBlock_list div.album_item100_block p.cover a.CDcover100 img')
        info_href_attr = album_info_element.attr('src')
        if not info_href_attr:
            return False
        return info_href_attr.replace("_1", "_2")
    except:
        return False
Example #10
Source File: utils.py From cryptoCMD with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_coin_id(coin_code):
    """
    This method fetches the name (id) of a currency from the given code.
    :param coin_code: coin code of a cryptocurrency e.g. btc
    :return: coin-id for the cryptocurrency on coinmarketcap.com
    """
    try:
        url = "https://coinmarketcap.com/all/views/all/"
        html = get_url_data(url).text
        raw_data = pq(html)
        coin_code = coin_code.upper()
        for _row in raw_data("tr")[1:]:
            symbol = _row.cssselect("td.text-left.col-symbol")[0].text_content()
            coin_id = _row.values()[0][3:]
            if symbol == coin_code:
                return coin_id
        raise InvalidCoinCode("'{}' coin code is unavailable on coinmarketcap.com".format(coin_code))
    except Exception as e:
        raise e
Example #11
Source File: crawler.py From odds-portal-scraper with The Unlicense | 6 votes |
def get_seasons_for_league(self, main_league_results_url):
    """
    Params:
        (str) main_league_results_url e.g. https://www.oddsportal.com/hockey/usa/nhl/results/

    Returns:
        (list) urls to each season for given league
    """
    seasons = []
    logger.info('Getting all seasons for league via %s', main_league_results_url)
    if not self.go_to_link(main_league_results_url):
        logger.error('League results URL loaded unsuccessfully %s', main_league_results_url)
        # Going to send back empty list so this is not processed further
        return seasons
    html_source = self.get_html_source()
    html_querying = pyquery(html_source)
    season_links = html_querying.find('div.main-menu2.main-menu-gray > ul.main-filter > li > span > strong > a')
    logger.info('Extracted links to %d seasons', len(season_links))
    for season_link in season_links:
        this_season = Season(season_link.text)
        # Start the Season's list of URLs with just the root one
        this_season_url = self.base_url + season_link.attrib['href']
        this_season.urls.append(this_season_url)
        seasons.append(this_season)
    return seasons
Example #12
Source File: tests.py From django-ra-erp with GNU Affero General Public License v3.0 | 6 votes |
def test_report_movement_redirect(self):
    """
    When showing a report, if it contains transactions, the slug of each
    transaction is transformed into an <a> element; here we test that the
    <a> redirects to an actual change form.
    :return:
    """
    self.client.login(username='super', password='secret')
    response = self.client.get(reverse('ra_admin:report', args=('client', 'clientdetailedstatement')),
                               data={'client_id': self.client1.pk},
                               HTTP_X_REQUESTED_WITH='XMLHttpRequest')
    data = response.json()
    a_elem = pq(data['data'][0]['slug'])
    doc_type = data['data'][0]['doc_type']
    url = a_elem.attr('href')
    response = self.client.get(url, follow=True)
    self.assertEqual(response.status_code, 200)
    instance = response.context['original']
    self.assertEqual(instance.slug, a_elem.text())
    self.assertEqual(instance.doc_type, doc_type)
Example #13
Source File: pdb.py From bioservices with GNU General Public License v3.0 | 6 votes |
def get_ligands(self, identifier):
    """List the ligands that can be found in a PDB entry

    :param identifier: a valid PDB identifier (e.g., 4HHB)
    :return: xml document

    >>> from bioservices import PDB
    >>> s = PDB()
    >>> s.get_ligands("4HHB")

    Then, ::

        x = s.get_ligands("4HHB")
        from pyquery import PyQuery as pq
        d = pq(x)

    """
    res = self.services.http_get("rest/ligandInfo", frmt='xml',
                                 params={'structureId': identifier})
    return res
Example #14
Source File: FakeUA.py From FakeUA with MIT License | 6 votes |
async def getTypesL2(target, types, href):
    """
    Fetch the second-level categories.
    """
    loger.info(colored(f'fetching {href}', 'yellow'))
    resp = await spiderSession.get(href)
    async with trio.open_nursery() as nursery:
        for item in jq(resp.text)("body > div.content-base > section > div > table > tbody > tr").items():
            name = item('td:nth-child(1)>a').text().strip().replace(' ', '_').lower()
            target[name] = {}
            url = urljoin(href, item('td:nth-child(1)>a').attr('href'))
            nums = int(item('td:nth-child(2)').text().strip())
            target[name]['url'] = url
            target[name]['nums'] = nums
            target[name]['UA_list'] = []
            for page in range(1, math.ceil(nums / PERPAGE) + 1):
                TASKS.add('__'.join([
                    types,
                    name,
                    f"{url}{page}"
                ]))
Example #15
Source File: pastebin_crawler.py From pastebin-monitor with GNU General Public License v2.0 | 6 votes |
def check_paste(self, paste_id):
    paste_url = self.PASTEBIN_URL + paste_id
    try:
        paste_txt = PyQuery(url=paste_url)('#paste_code').text()

        for regex, file, directory in self.regexes:
            if re.match(regex, paste_txt, re.IGNORECASE):
                Logger().log('Found a matching paste: ' + paste_url + ' (' + file + ')', True, 'CYAN')
                self.save_result(paste_url, paste_id, file, directory)
                return True
        Logger().log('Not matching paste: ' + paste_url)
    except KeyboardInterrupt:
        raise
    except:
        Logger().log('Error reading paste (probably a 404 or encoding issue).', True, 'YELLOW')
    return False
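
Note the constructor form used above: PyQuery(url=...) fetches and parses the page in one step. A minimal sketch, assuming network access to example.com:

from pyquery import PyQuery

# PyQuery can fetch a remote page itself and parse the result.
doc = PyQuery(url='https://example.com/')
print(doc('title').text())  # Example Domain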
Example #16
Source File: coursera_offline.py From coursera-offline with GNU General Public License v3.0 | 5 votes |
def get_vid_sub_links(anchor_elems):
    vid_link = None
    sub_link = None
    other_links = []
    for anchor_elem in anchor_elems:
        temp = pq(anchor_elem)
        href = temp.attr('href')
        if 'subtitles' in href and 'format=srt' in href:
            sub_link = href
        elif 'download.mp4' in href:
            vid_link = href
        elif any([ext in href for ext in SUPPORTED_OTHER_FILE_EXTENSIONS]):
            other_links.append(href)
    return vid_link, sub_link, other_links
Example #17
Source File: example2_quotes_authors.py From Hands-On-Web-Scraping-with-Python with MIT License | 5 votes |
def read_url(url):
    """Read the given URL; returns a pyquery object for the page content."""
    pageSource = pq(url)
    return pq(pageSource)
Example #18
Source File: coursera_offline.py From coursera-offline with GNU General Public License v3.0 | 5 votes |
def parse_week_info(i, e, sl, j):
    de = pq(e)
    wt = de('h3').text()
    parsed_json = {'title': wt, 'links': []}
    for li in sl('li'):
        _li = pq(li)
        _as = _li('a')
        vl, sl, ol = get_vid_sub_links(_as)
        parsed_json['links'].append({
            'title': pq(_as[0]).text(),
            'link': vl,
            'sub_link': sl,
            'other_links': ol
        })
    j['data'].append(parsed_json)
Example #19
Source File: utils.py From cryptoCMD with BSD 3-Clause "New" or "Revised" License | 5 votes |
def extract_data(html):
    """
    Extract the price history from the HTML.

    :param html: html having historical price data
    :return: end_date, start_date, headers(column name of data), rows(price data)
    """
    raw_data = pq(html)

    headers = [col.text_content().strip("*") for col in raw_data("table:first>thead>tr>th")]

    rows = []
    for _row in raw_data("table tbody>tr"):
        row = [
            _native_type(_replace(col.text_content().strip(), ",-*?"))
            for col in _row.findall("td")
        ]
        # change format of date ('Aug 24 2017' to '24-08-2017')
        row[0] = datetime.datetime.strptime(row[0], "%b %d %Y").strftime("%d-%m-%Y")
        rows.append(row)

    end_date, start_date = rows[0][0], rows[-1][0]
    return end_date, start_date, headers, rows
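
The example above mixes pyquery's two layers: the CSS selection returns raw lxml elements, whose own API (text_content(), findall()) is then used directly. A small sketch of that split, on illustrative markup:

from pyquery import PyQuery

doc = PyQuery('<table><tbody><tr><td> 1,000 </td><td>2</td></tr></tbody></table>')

for tr in doc('table tbody>tr'):  # tr is a bare lxml element here
    cells = [td.text_content().strip() for td in tr.findall('td')]
    print(cells)  # ['1,000', '2']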
Example #20
Source File: example3_AHL.py From Hands-On-Web-Scraping-with-Python with MIT License | 5 votes |
def read_url(url):
    """Read the given URL; returns a pyquery object for the page content."""
    pageSource = pq(url)
    return pq(pageSource)
Example #21
Source File: tm_gensim.py From MoAL with Apache License 2.0 | 5 votes |
def get_random_sites(total):
    return [Pq(get_random_site()).html().split('\n') for _ in range(total)]
Example #22
Source File: py_mstr.py From py-mstr with MIT License | 5 votes |
def _parse_elements(self, response):
    d = pq(response)
    result = []
    for attr in d('block'):
        if attr.find('n').text:
            result.append(attr.find('n').text)
    return result
Example #23
Source File: py_mstr.py From py-mstr with MIT License | 5 votes |
def _parse_folder_contents(self, response):
    d = pq(response)
    result = []
    for folder in d('folders').find('obj'):
        result.append({
            'name': folder.find('n').text,
            'description': folder.find('d').text,
            'id': folder.find('id').text,
            'type': folder.find('t').text
        })
    return result
Example #24
Source File: py_mstr.py From py-mstr with MIT License | 5 votes |
def _login(self, project_source, project_name, username, password):
    arguments = {
        'taskId': 'login',
        'server': project_source,
        'project': project_name,
        'userid': username,
        'password': password
    }
    logger.info("logging in.")
    response = self._request(arguments)
    d = pq(response)
    return d[0][0].find('sessionState').text
Example #25
Source File: 0008.py From My-Solutions-For-Show-Me-the-Code with Mozilla Public License 2.0 | 5 votes |
def get_result(content):
    content = content.decode('utf-8')
    jq = pq(content)
    l = jq('p')
    result = []
    for string in l:
        result.append(pq(string).text())
    return result
Example #26
Source File: msg.py From wechat-dump with GNU General Public License v3.0 | 5 votes |
def get_emoji_product_id(self):
    assert self.type == TYPE_EMOJI, "Wrong call to get_emoji_product_id()!"
    pq = PyQuery(self.content_xml_ready, parser='xml')
    emoji = pq('emoji')
    if not emoji:
        return None
    return emoji.attrs['productid']
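
pyquery parses with lxml's HTML parser by default; passing parser='xml', as above, is what keeps an XML payload intact. A minimal sketch (the tag and attribute names are illustrative, not WeChat's actual schema):

from pyquery import PyQuery

xml = '<msg><emoji productid="abc123" md5="d41d8cd9"/></msg>'
doc = PyQuery(xml, parser='xml')

emoji = doc('emoji')
print(emoji.attr('productid'))  # abc123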
Example #27
Source File: libchathelper.py From wechat-dump with GNU General Public License v3.0 | 5 votes |
def _get_image(self, msg):
    """ get image content and type from a message"""
    if msg.type == TYPE_IMG:
        # imgPath was original THUMBNAIL_DIRPATH://th_xxxxxxxxx
        imgpath = msg.imgPath.split('_')[-1]
        if not imgpath:
            logger.warn(
                'No imgpath in an image message. Perhaps a bug in wechat: {}'.format(msg))
            return '', ''
        bigimgpath = self.parser.imginfo.get(msg.msgSvrId)
        img = self.res.get_img([imgpath, bigimgpath])
        if not img:
            logger.warn("No image found for {}".format(imgpath))
        return img, 'jpeg'
    elif msg.type == TYPE_EMOJI:
        md5 = msg.imgPath
        if md5:
            emoji_img, format = self.res.get_emoji_by_md5(md5)
            return emoji_img, format
        else:
            return '', ''
    elif msg.type == TYPE_CUSTOM_EMOJI:
        pq = PyQuery(msg.content)
        md5 = pq('emoticonmd5').text()
        if md5:
            img, format = self.res.get_emoji(md5, None)
            return img, format
        else:
            return '', ''
    else:
        return '', ''
Example #28
Source File: test_city.py From tor_access with MIT License | 5 votes |
def get_citys(self, fid=0):
    url = self._url.format(fid)
    doc = pyq(url=url)
    text = doc.text()[21:-1]
    try:
        return json.loads(text)
    except:
        print text
        return []
Example #29
Source File: tests.py From django-ra-erp with GNU Affero General Public License v3.0 | 5 votes |
def test_reset_password_link(self):
    self.client.login(username='super', password='secret')
    url = reverse('ra_admin:auth_user_change', args=(self.user.pk,))
    response = self.client.get(url)
    doc = pq(response.content)
    reset_password_url = doc('a.reset-password').attr('href')
    abs_url = urljoin(url, reset_password_url)
    response = self.client.get(abs_url)
    self.assertEqual(response.status_code, 200,
                     "%s %s" % (response.status_code, abs_url))
Example #30
Source File: wordcloud.py From flask_jsondash with MIT License | 5 votes |
def url2wordcloud(url, requests_kwargs={}, exclude_punct=True,
                  normalized=True, limit=None, size=1, min_len=None):
    """Convert the text content of a url's html to a wordcloud config.

    Args:
        url (str): The url to load.
        requests_kwargs (dict, optional): The kwargs to pass to the
            requests library (e.g. auth, headers, mimetypes).
        exclude_punct (bool, optional): exclude punctuation
        min_len (int, optional): the minimum required length, if any
        limit (int, optional): the number of items to limit
            (by most common), if any
        normalized (bool, optional): normalize data by lowercasing and
            stripping whitespace

    Returns:
        same value as :func:`~format_4_wordcloud`
    """
    resp = requests.get(url, **requests_kwargs)
    if not resp.status_code == 200:
        return []
    resp = Pq(resp.content).find('body').text().split(' ')
    if exclude_punct:
        resp = [
            re.sub(r'[^a-zA-Z0-9]+', '', w) for w in resp
            if w not in punctuation
        ]
    if min_len is not None:
        resp = [w for w in resp if len(w) >= min_len]
    if normalized:
        resp = [w.lower() for w in resp]
    words = get_word_freq_distribution(resp)
    if limit is not None:
        words = words.most_common(limit)
    else:
        words = [(k, v) for k, v in words.items()]
    return format_4_wordcloud(words, size_multiplier=size)