Python pyquery.PyQuery() Examples

The following are 30 code examples of pyquery.PyQuery(), drawn from open-source projects. Each example lists its source file, project, and license. You may also want to check out all available functions and classes of the pyquery module.
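Before the examples, here is a minimal sketch of the constructor forms PyQuery accepts: an HTML string, a url= keyword, or a filename= keyword. The URL and file name below are placeholders:

from pyquery import PyQuery as pq

doc = pq('<p id="hello">Hello</p>')      # parse an HTML string
print(doc('p#hello').text())             # jQuery-style selection -> Hello
# other constructor forms (placeholder arguments):
# doc = pq(url='http://example.com/')    # fetch and parse a URL
# doc = pq(filename='page.html')         # parse a local file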
Example #1
Source File: csdn.py    From AILearners with Apache License 2.0
def getOriginalArticalNums(self,proxies):
        # assumes: import requests; from pyquery import PyQuery as pq
        main_response = requests.get(self.blogurl,proxies=proxies)
        # Check whether the fetch succeeded, based on the status code
        if main_response.status_code == 200:
            print('Fetched successfully')
            self.main_html = main_response.text
            main_doc = pq(self.main_html)
            mainpage_str = main_doc.text() # page text with markup stripped
            origin_position = mainpage_str.index('原创') # find the first occurrence of '原创' ("original"); the literal must stay Chinese to match the page
            end_position = mainpage_str.index('原创',origin_position+1) # the second occurrence; the post count follows it
            self.blog_nums = ''
            # collect the digits of the blog post count
            for num in range(3,10):
                # stop at the first whitespace character
                if mainpage_str[end_position + num].isspace():
                    break
                self.blog_nums += mainpage_str[end_position + num]
            print(type(str(self.blog_nums)))  # leftover debug output
            cur_blog_nums = int(self.blog_nums) # current number of blog posts
            return cur_blog_nums # return the post count
        else:
            print('Fetch failed')
            return 0 # return 0: either there are no posts or the fetch failed
Example #2
Source File: Proxies.py    From Proxies with MIT License
def fetch_urls(self, queue, quantity):
        while not queue.empty():
            url = queue.get()
            html = self.s.get(url, headers=self.headers).text
            pq = PyQuery(html)
            size = pq.find('tbody tr').size()
            for index in range(size):
                item = pq.find('tbody tr').eq(index)
                ip = item.find('td').eq(0).text()
                port = item.find('td').eq(1).text()
                _type = item.find('td').eq(3).text()
                self.result_arr.append({
                    str(_type).lower(): '{0}://{1}:{2}'.format(str(_type).lower(), ip, port)
                })
                if len(self.result_arr) >= quantity:
                    break  # note: exits only the inner for loop; the outer while keeps draining the queue
Example #3
Source File: getchu_get_urls.py    From create-girls-moe-pytorch with MIT License
def get_url_and_date(I: str, O: str, id_data_output_path: str) -> None:
  '''
  Get image URLs and release dates.
  Results are saved in the resource directory under the names `O` and `id_data_output_path`.
  :param I: path of the input HTML file
  :param O: path of the output file listing image URLs
  :param id_data_output_path: path of the output file for (id, date) pairs
  :return: None
  '''
  with open(I, encoding='utf-8') as fin:
    doc = pyquery.PyQuery(fin.read())
  table = doc.attr('id', 'query_result_main')('tbody')
  id_data = []
  with open(O, 'w', encoding='utf-8') as fout:
    for line in table.items():
      for tr in line('tr').items():
        lst = re.findall(ID_PATTERN, tr.text())
        data = re.findall(DATA_PATTERN, tr.text())
        if len(lst) == 0:
          continue
        fout.write('http://www.getchu.com/soft.phtml?id={}&gc=gc\n'.format(lst[-1]))
        id_data.append([lst[-1], data[-1]])
  with open(id_data_output_path, 'w', encoding='utf-8') as fout:
    for each in id_data:
      fout.write('{} {}\n'.format(each[0], each[1])) 
Example #4
Source File: handlers.py    From doufen with MIT License
def get(self, douban_id):
        try:
            subject = db.Note.get(db.Note.douban_id == douban_id)
            history = db.NoteHistorical.select().where(db.NoteHistorical.id == subject.id)
        except db.Note.DoesNotExist:
            raise tornado.web.HTTPError(404)

        comments = db.Comment.select().join(db.User).where(
            db.Comment.target_type == 'note',
            db.Comment.target_douban_id == subject.douban_id
        )

        dom = PyQuery(subject.content)
        dom_iframe = dom('iframe')
        dom_iframe.before('<p class="title"><a href="{0}" class="external-link">站外视频</a></p>'.format(dom_iframe.attr('src')))  # 站外视频 = "off-site video"
        dom_iframe.remove()
        dom('a').add_class('external-link')

        self.render('note.html', note=subject, comments=comments, content=dom) 
Example #5
Source File: tasks.py    From doufen with MIT License
def fetch_note_list(self):
        url = self.account.user.alt + 'notes'
        notes = []
        while True:
            response = self.fetch_url_content(url)
            if not response:
                break
            dom = PyQuery(response.text)
            note_items = dom('#content .article>.note-container')
            for note_item in note_items:
                notes.append(PyQuery(note_item).attr('data-url'))
            next_page = dom('#content .article>.paginator>.next>a')
            if next_page:
                url = next_page.attr('href')
            else:
                break
        return notes 
Example #6
Source File: py_mstr.py    From py-mstr with MIT License
def get_attribute(self, attribute_id):
        """ Returns the attribute object for the given attribute id.

        Args:
            attribute_id (str): the attribute guid

        Returns:
            Attribute: Attribute object for this guid

        Raises:
            MstrClientException: if no attribute id is supplied
        """

        if not attribute_id:
            raise MstrClientException("You must provide an attribute id")
        arguments = {'taskId': 'getAttributeForms', 'attributeID': attribute_id, 'sessionState': self._session}
        response = self._request(arguments)
        d = pq(response)
        return Attribute(d('dssid')[0].text, d('n')[0].text) 
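To make the XML-selection pattern above concrete, here is a small self-contained sketch with an invented payload (the real MicroStrategy task response is more elaborate):

from pyquery import PyQuery as pq

xml = '<attrs><dssid>ABC123</dssid><n>Region</n></attrs>'  # invented payload
d = pq(xml, parser='xml')
print(d('dssid')[0].text, d('n')[0].text)  # -> ABC123 Region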
Example #7
Source File: tabelog.py    From tabebot with MIT License
def parse_reviews_and_users(self, response):
        if not self.is_tabelog(response):
            return Request(url=response.url, dont_filter=True)

        dom = PyQuery(response.body)
        review_nodes = dom('div.rvw-item')
        business_id = int(re.findall(r'[a-z]+/A\d{4}/A\d{6}/(\d+)/dtlrvwlst/', response.url)[0])

        reviews_and_users = []
        for review_node in review_nodes:
            user_id = self._extract_user_id(review_node)
            review = self._generate_review(review_node, business_id, user_id)
            if review:
                reviews_and_users.append(review)
            user = self._generate_user(review_node, user_id)
            if user:
                reviews_and_users.append(user)
        return reviews_and_users 
Example #8
Source File: response.py    From pledgeservice with Apache License 2.0
def pyquery(self):
        """
        Returns the response as a `PyQuery <http://pyquery.org/>`_ object.

        Only works with HTML and XML responses; other content-types raise
        AttributeError.
        """
        if 'html' not in self.content_type and 'xml' not in self.content_type:
            raise AttributeError(
                "Not an HTML or XML response body (content-type: %s)"
                % self.content_type)
        try:
            from pyquery import PyQuery
        except ImportError:  # pragma: no cover
            raise ImportError(
                "You must have PyQuery installed to use response.pyquery")
        d = PyQuery(self.testbody)
        return d 
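For context, a minimal sketch of how such a property is typically exercised, assuming the WebTest package (whose TestResponse exposes the same pyquery property); the WSGI app here is a stub:

from webtest import TestApp

def wsgi_app(environ, start_response):
    start_response('200 OK', [('Content-Type', 'text/html')])
    return [b'<html><body><h1>Welcome</h1></body></html>']

app = TestApp(wsgi_app)
resp = app.get('/')
print(resp.pyquery('h1').text())  # -> Welcome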
Example #9
Source File: cover_query.py    From QMusic with GNU Lesser General Public License v2.1
def query_album_cover_from_xiami(artist_name, album_name):
    if not is_network_connected():
        return None

    if not artist_name and not album_name:
        return False
    xiami_album_search_url = 'http://www.xiami.com/search/album?key=' + artist_name + '+' + album_name
    html = public_curl.get(xiami_album_search_url)
    try:
        search_result_object = PyQuery(html)
        album_info_element = search_result_object(
            'div.albumBlock_list div.album_item100_block p.cover a.CDcover100 img')
        info_href_attr = album_info_element.attr('src')
        if not info_href_attr:
            return False
        return info_href_attr.replace("_1", "_2")
    except Exception:
        return False
Example #10
Source File: utils.py    From cryptoCMD with BSD 3-Clause "New" or "Revised" License
def get_coin_id(coin_code):
    """
    This method fetches the name(id) of currency from the given code
    :param coin_code: coin code of a cryptocurrency e.g. btc
    :return: coin-id for the a cryptocurrency on the coinmarketcap.com
    """

    try:
        url = "https://coinmarketcap.com/all/views/all/"

        html = get_url_data(url).text
        raw_data = pq(html)

        coin_code = coin_code.upper()

        for _row in raw_data("tr")[1:]:
            symbol = _row.cssselect("td.text-left.col-symbol")[0].text_content()
            coin_id = _row.values()[0][3:]
            if symbol == coin_code:
                return coin_id
        raise InvalidCoinCode("'{}' coin code is unavailable on coinmarketcap.com".format(coin_code))
    except Exception:
        raise
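A hypothetical call, assuming network access and that get_url_data and InvalidCoinCode are importable from the same module:

coin_id = get_coin_id('btc')  # e.g. something like 'bitcoin'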
Example #11
Source File: crawler.py    From odds-portal-scraper with The Unlicense
def get_seasons_for_league(self, main_league_results_url):
        """
        Params:
            (str) main_league_results_url e.g. https://www.oddsportal.com/hockey/usa/nhl/results/

        Returns:
            (list) urls to each season for given league
        """
        seasons = []
        logger.info('Getting all seasons for league via %s', main_league_results_url)
        if not self.go_to_link(main_league_results_url):
            logger.error('League results URL loaded unsuccessfully %s', main_league_results_url)
            # Going to send back empty list so this is not processed further
            return seasons
        html_source = self.get_html_source()
        html_querying = pyquery(html_source)
        season_links = html_querying.find('div.main-menu2.main-menu-gray > ul.main-filter > li > span > strong > a')
        logger.info('Extracted links to %d seasons', len(season_links))
        for season_link in season_links:
            this_season = Season(season_link.text)
            # Start the Season's list of URLs with just the root one
            this_season_url = self.base_url + season_link.attrib['href']
            this_season.urls.append(this_season_url)
            seasons.append(this_season)
        return seasons 
Example #12
Source File: tests.py    From django-ra-erp with GNU Affero General Public License v3.0
def test_report_movement_redirect(self):
        """
        When showing a report, if it contains transactions the slug of the transaction is transformed into an
        <a> elem, here we test that the <a redirect to an actual change form
        :return:
        """
        self.client.login(username='super', password='secret')
        response = self.client.get(reverse('ra_admin:report', args=('client', 'clientdetailedstatement')),
                                   data={'client_id': self.client1.pk},
                                   HTTP_X_REQUESTED_WITH='XMLHttpRequest')
        data = response.json()
        a_elem = pq(data['data'][0]['slug'])
        doc_type = data['data'][0]['doc_type']
        url = a_elem.attr('href')
        response = self.client.get(url, follow=True)
        self.assertEqual(response.status_code, 200)
        instance = response.context['original']
        self.assertEqual(instance.slug, a_elem.text())
        self.assertEqual(instance.doc_type, doc_type) 
Example #13
Source File: pdb.py    From bioservices with GNU General Public License v3.0
def get_ligands(self, identifier):
        """List the ligands that can be found in a PDB entry

        :param identifier: a valid PDB identifier (e.g., 4HHB)
        :return: xml document


            >>> from bioservices import PDB
            >>> s = PDB()
            >>> s.get_ligands("4HHB")

        Then, ::

            x = s.get_ligands("4HHB")
            from pyquery import PyQuery as pq
            d = pq(x)


        """

        res = self.services.http_get("rest/ligandInfo", frmt='xml',
                params={'structureId': identifier})
        return res 
Example #14
Source File: FakeUA.py    From FakeUA with MIT License
async def getTypesL2(target, types, href):
    """
        Fetch the second-level categories.
    """
    loger.info(colored(f'fetching {href}', 'yellow'))
    resp = await spiderSession.get(href)
    async with trio.open_nursery() as nursery:
        for item in jq(resp.text)("body > div.content-base > section > div > table > tbody > tr").items():
            name = item(
                'td:nth-child(1)>a').text().strip().replace(' ', '_').lower()
            target[name] = {}
            url = urljoin(href, item('td:nth-child(1)>a').attr('href'))
            nums = int(item('td:nth-child(2)').text().strip())
            target[name]['url'] = url
            target[name]['nums'] = nums
            target[name]['UA_list'] = []
            for page in range(1, math.ceil(nums/PERPAGE)+1):
                TASKS.add('__'.join([
                    types,
                    name,
                    f"{url}{page}"
                ])) 
Example #15
Source File: pastebin_crawler.py    From pastebin-monitor with GNU General Public License v2.0
def check_paste ( self, paste_id ):
        paste_url = self.PASTEBIN_URL + paste_id
        try:
            paste_txt = PyQuery ( url = paste_url )('#paste_code').text()

            for regex,file,directory in self.regexes:
                if re.match ( regex, paste_txt, re.IGNORECASE ):
                    Logger ().log ( 'Found a matching paste: ' + paste_url + ' (' + file + ')', True, 'CYAN' )
                    self.save_result ( paste_url,paste_id,file,directory )
                    return True
            Logger ().log ( 'Not matching paste: ' + paste_url )
        except KeyboardInterrupt:
            raise
        except:
            Logger ().log ( 'Error reading paste (probably a 404 or encoding issue).', True, 'YELLOW')
        return False 
Example #16
Source File: coursera_offline.py    From coursera-offline with GNU General Public License v3.0
def get_vid_sub_links(anchor_elems):
    vid_link = None
    sub_link = None
    other_links = []
    for anchor_elem in anchor_elems:
        temp = pq(anchor_elem)
        href = temp.attr('href')
        if not href:
            continue  # guard: skip anchors without an href
        if 'subtitles' in href and 'format=srt' in href:
            sub_link = href
        elif 'download.mp4' in href:
            vid_link = href
        elif any([ext in href for ext in SUPPORTED_OTHER_FILE_EXTENSIONS]):
            other_links.append(href)
    return vid_link, sub_link, other_links 
Example #17
Source File: example2_quotes_authors.py    From Hands-On-Web-Scraping-with-Python with MIT License
def read_url(url):
    """Read given Url , Returns pyquery object for page content"""
    pageSource = pq(url)
    return pq(pageSource) 
Example #18
Source File: coursera_offline.py    From coursera-offline with GNU General Public License v3.0
def parse_week_info(i, e, sl, j):
    de = pq(e)
    wt = de('h3').text()
    parsed_json = {'title': wt, 'links': []}
    for li in sl('li'):
        _li = pq(li)
        _as = _li('a')
        vl, sl, ol = get_vid_sub_links(_as)  # note: rebinds `sl`, shadowing the parameter used as the loop source
        parsed_json['links'].append({
            'title': pq(_as[0]).text(),
            'link': vl,
            'sub_link': sl,
            'other_links': ol
        })
    j['data'].append(parsed_json) 
Example #19
Source File: utils.py    From cryptoCMD with BSD 3-Clause "New" or "Revised" License
def extract_data(html):
    """
    Extract the price history from the HTML.

    :param html: html having historical price data
    :return: end_date, start_date, headers(column name of data), rows(price data)
    """

    raw_data = pq(html)

    headers = [col.text_content().strip("*") for col in raw_data("table:first>thead>tr>th")]

    rows = []

    for _row in raw_data("table tbody>tr"):
        row = [
            _native_type(_replace(col.text_content().strip(), ",-*?"))
            for col in _row.findall("td")
        ]

        # change format of date ('Aug 24 2017' to '24-08-2017')
        row[0] = datetime.datetime.strptime(row[0], "%b %d %Y").strftime("%d-%m-%Y")

        rows.append(row)

    end_date, start_date = rows[0][0], rows[-1][0]

    return end_date, start_date, headers, rows 
Example #20
Source File: example3_AHL.py    From Hands-On-Web-Scraping-with-Python with MIT License
def read_url(url):
    """Read given Url , Returns pyquery object for page content"""
    pageSource = pq(url)
    return pq(pageSource) 
Example #21
Source File: tm_gensim.py    From MoAL with Apache License 2.0
def get_random_sites(total):
    return [Pq(get_random_site()).html().split('\n') for _ in range(total)] 
Example #22
Source File: py_mstr.py    From py-mstr with MIT License
def _parse_elements(self, response):
        d = pq(response)
        result = []
        for attr in d('block'):
            if attr.find('n').text:
                result.append(attr.find('n').text)
        return result 
Example #23
Source File: py_mstr.py    From py-mstr with MIT License
def _parse_folder_contents(self, response):
        d = pq(response)
        result = []
        for folder in d('folders').find('obj'):
            result.append({
                'name': folder.find('n').text,
                'description': folder.find('d').text,
                'id': folder.find('id').text,
                'type': folder.find('t').text
            })
        return result 
Example #24
Source File: py_mstr.py    From py-mstr with MIT License
def _login(self, project_source, project_name, username, password):
        arguments = {
            'taskId': 'login',
            'server': project_source,
            'project': project_name,
            'userid': username,
            'password': password
        }
        logger.info("logging in.")
        response = self._request(arguments)
        d = pq(response)
        return d[0][0].find('sessionState').text 
Example #25
Source File: 0008.py    From My-Solutions-For-Show-Me-the-Code with Mozilla Public License 2.0
def get_result(content):
    content = content.decode('utf-8')
    jq = pq(content)
    l = jq('p')
    result = []
    for string in l:
        result.append(pq(string).text()) 
    return result 
Example #26
Source File: msg.py    From wechat-dump with GNU General Public License v3.0
def get_emoji_product_id(self):
        assert self.type == TYPE_EMOJI, "Wrong call to get_emoji_product_id()!"
        pq = PyQuery(self.content_xml_ready, parser='xml')
        emoji = pq('emoji')
        if not emoji:
            return None
        return emoji.attrs['productid'] 
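The same pattern with the documented attr() accessor, on an invented payload; parser='xml' keeps tag names and attributes intact instead of running them through the HTML parser:

from pyquery import PyQuery

xml = '<msg><emoji productid="abc123"/></msg>'  # invented payload
doc = PyQuery(xml, parser='xml')
print(doc('emoji').attr('productid'))  # -> abc123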
Example #27
Source File: libchathelper.py    From wechat-dump with GNU General Public License v3.0
def _get_image(self, msg):
        """ get image content and type from a message"""
        if msg.type == TYPE_IMG:
            # imgPath was original THUMBNAIL_DIRPATH://th_xxxxxxxxx
            imgpath = msg.imgPath.split('_')[-1]
            if not imgpath:
                logger.warn(
                    'No imgpath in an image message. Perhaps a bug in wechat: {}'.format(msg))
                return '', ''
            bigimgpath = self.parser.imginfo.get(msg.msgSvrId)
            img = self.res.get_img([imgpath, bigimgpath])
            if not img:
                logger.warn("No image found for {}".format(imgpath))
            return img, 'jpeg'
        elif msg.type == TYPE_EMOJI:
            md5 = msg.imgPath
            if md5:
                emoji_img, format = self.res.get_emoji_by_md5(md5)
                return emoji_img, format
            else:
                return '', ''
        elif msg.type == TYPE_CUSTOM_EMOJI:
            pq = PyQuery(msg.content)
            md5 = pq('emoticonmd5').text()
            if md5:
                img, format = self.res.get_emoji(md5, None)
                return img, format
            else:
                return '', ''
        else:
            return '', '' 
Example #28
Source File: test_city.py    From tor_access with MIT License
def get_citys(self, fid=0):
        url = self._url.format(fid)
        doc = pyq(url=url)
        text = doc.text()[21:-1]
        try:
            return json.loads(text)
        except:
            print(text)
            return [] 
Example #29
Source File: tests.py    From django-ra-erp with GNU Affero General Public License v3.0
def test_reset_password_link(self):
        self.client.login(username='super', password='secret')
        url = reverse('ra_admin:auth_user_change', args=(self.user.pk,))
        response = self.client.get(url)
        doc = pq(response.content)
        reset_password_url = doc('a.reset-password').attr('href')
        abs_url = urljoin(url, reset_password_url)
        response = self.client.get(abs_url)
        self.assertEqual(response.status_code, 200, "%s %s" % (response.status_code, abs_url)) 
Example #30
Source File: wordcloud.py    From flask_jsondash with MIT License
def url2wordcloud(url, requests_kwargs={},
                  exclude_punct=True,
                  normalized=True,
                  limit=None,
                  size=1,
                  min_len=None):
    """Convert the text content of a urls' html to a wordcloud config.

    Args:
        url (str): The url to load.
        requests_kwargs (dict, optional): The kwargs to pass to the
            requests library. (e.g. auth, headers, mimetypes)
        exclude_punc (bool, optional): exclude punctuation
        min_length (int, optional): the minimum required length, if any
        limit (int, optional): the number of items to limit
            (by most common), if any
        normalized (bool, optional): normalize data by
            lowercasing and strippping whitespace

    Returns:
        same value as :func:`~format_4_wordcloud`
    """
    resp = requests.get(url, **requests_kwargs)
    if resp.status_code != 200:
        return []
    resp = Pq(resp.content).find('body').text().split(' ')
    if exclude_punct:
        resp = [
            re.sub(r'[^a-zA-Z0-9]+', '', w) for w
            in resp if w not in punctuation
        ]
    if min_len is not None:
        resp = [w for w in resp if len(w) >= min_len]
    if normalized:
        resp = [w.lower() for w in resp]
    words = get_word_freq_distribution(resp)
    if limit is not None:
        words = words.most_common(limit)
    else:
        words = [(k, v) for k, v in words.items()]
    return format_4_wordcloud(words, size_multiplier=size)
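A hypothetical call, assuming get_word_freq_distribution and format_4_wordcloud are defined in the same module:

config = url2wordcloud('http://example.com/', limit=50, min_len=4)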