Python wikipedia.search() Examples

The following are 19 code examples of wikipedia.search(), drawn from open source projects. The source file, project, and license are noted above each example. You may also want to check out the other functions and classes available in the wikipedia module.
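
Before the first example, a minimal sketch of the basic call may help. This snippet is illustrative only and is not taken from any of the projects below; it assumes the wikipedia package from PyPI is installed, and the query string is arbitrary:

import wikipedia

# search() returns a list of matching page titles (up to `results` of them)
titles = wikipedia.search("Python (programming language)", results=5)

if titles:
    # summary() fetches the lead section of the best-matching page
    print(wikipedia.summary(titles[0], sentences=2))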
Example #1
Source File: fetch_wiki.py    From adam_qas with GNU General Public License v3.0    8 votes
def search_wiki(keywords, number_of_search, wiki_pages):
    suggestion = False

    for word in keywords:
        # print(word, ">>")
        result_set = wikipedia.search(word, number_of_search, suggestion)
        for term in result_set:

            try:
                page = wikipedia.page(term, preload=False)
                page_title = page.title
                # page_summary = page.summary
                page_content = page.content
                wiki_pages[page_title] = page_content

            except wikipedia.exceptions.DisambiguationError as error:
                pass
            except wikipedia.exceptions.PageError as error:
                pass
                # print(error.options)

            # print(page_title, len(page_content), type(page_content))

    return wiki_pages 
Example #2
Source File: gen_corpus.py    From Living-Audio-Dataset with Apache License 2.0    6 votes
def get_articles(language, no_words, max_no_articles, search, **kwargs):
    """ Retrieve articles from Wikipedia """
    wikipedia.set_rate_limiting(True) # be polite
    wikipedia.set_lang(language)

    if search is not None:
        titles = wikipedia.search(search, results = max_no_articles)
    else:
        titles = wikipedia.random(pages = max_no_articles)

    articles = []
    current_no_words = 0
    for title in titles:
        print("INFO: loading {}".format(title))
        page = wikipedia.page(title=title)
        content = page.content
        article_no_words = len(content.split())
        current_no_words += article_no_words
        print("INFO: article contains {} words".format(article_no_words))
        articles.append((title, content))
        if current_no_words >= no_words:
            break

    return articles 
Example #3
Source File: fetch_tax_info.py    From idseq-dag with MIT License    6 votes
def fetch_wiki_content(num_threads, taxid2wikidict, taxid2wikicontent, id2namedict):
        ''' Fetch wikipedia content based on taxid2wikidict '''
        threads = []
        semaphore = threading.Semaphore(num_threads)
        mutex = TraceLock("fetch_wiki_content", threading.RLock())
        for taxid, url in taxid2wikidict.items():
            m = re.search(r"curid=(\d+)", url)
            pageid = None
            if m:
                pageid = m[1]
            name = id2namedict.get(taxid)
            if pageid or name:
                semaphore.acquire()
                t = threading.Thread(
                    target=PipelineStepFetchTaxInfo.
                    get_wiki_content_for_page,
                    args=[taxid, pageid, name, taxid2wikicontent, mutex, semaphore]
                )
                t.start()
                threads.append(t)
        for t in threads:
            t.join() 
Example #4
Source File: BuscadorPersonas.py    From osint-suite-tools with GNU General Public License v3.0    6 votes
def search_google_(target):
    engine = Google()
    results = engine.search("'" + target + "'")
    for r in results:
        print ("|--[INFO][GOOGLE][RESULTS][>] " + r["title"] + " | " + r["text"] + " | " + r["link"])
        
        try:
            tsd, td, tsu = extract(r["link"])
            domain = td + '.' + tsu

            web = requests.get(r["link"], timeout=3)
            print ("|----[INFO][WEB][HTTP CODE][>] " + str(web.status_code) + "\n")

            if 200 <= web.status_code < 300:

                if domain not in config.BL_parserPhone:
                    TEXT = er.remove_tags(str(web.text))
                    parser.parserMAIN(TEXT)

        except Exception as e:
            print ("|----[ERROR][HTTP CONNECTION][>] " + str(e)) 
Example #5
Source File: search.py    From W.I.L.L with MIT License    6 votes
def main(data):
    '''Start the search'''
    response = {"text": None, "data":{}, "type": "success"}
    query = data["command"]
    log.info("In main search function with query {0}".format(query))
    db = data["db"]
    answer = False
    wolfram_key = tools.load_key("wolfram", db)
    wolfram_response = search_wolfram(query, wolfram_key)
    # If it found an answer answer will be set to that, if not it'll still be false
    answer = wolfram_response
    if answer:
        response["text"] = answer
    else:
        response["text"]=search_google(query)
    return response 
Example #6
Source File: search.py    From W.I.L.L with MIT License    6 votes
def is_search(event):
    '''Determine whether it's a search command'''
    command = event["command"]
    if "search" in event["verbs"]:
        return True
    question_words = [
        "what",
        "when",
        "why",
        "how",
        "who",
        "are",
        "is"
    ]
    first_word = command.split(" ")[0].lower()
    log.debug("First word in command is {0}".format(first_word))
    if first_word in question_words:
        return True
    return False 
Example #7
Source File: wiki.py    From Jarvis with MIT License    6 votes
def __call__(self, jarvis, s):
        k = s.split(' ', 1)
        if len(k) == 1:
            jarvis.say(
                "Do you mean:\n1. wiki search <subject>\n2. wiki summary <subject>\n3. wiki content <subject>")
        else:
            data = None
            if k[0] == "search":
                data = self.search(" ".join(k[1:]))
            elif k[0] == "summary":
                data = self.summary(" ".join(k[1:]))
            elif k[0] == "content":
                data = self.content(" ".join(k[1:]))
            else:
                jarvis.say("I don't know what you mean")
                return

            if isinstance(data, list):
                print("\nDid you mean one of these pages?\n")
                for d in range(len(data)):
                    print(str(d + 1) + ": " + data[d])
            else:
                print("\n" + data) 
Example #8
Source File: gen_corpus.py    From Living-Audio-Dataset with Apache License 2.0    6 votes
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--max-no-articles", type = int, default=10,
                        help = "maximum number of articles to download")
    parser.add_argument("-w", "--no-words", type = int, default=1000000,
                        help = "target number of words")

    parser.add_argument("-s", "--search",
                        help = "if specified will use this search term")

    parser.add_argument("language",
                        help = "2 letter language code")

    parser.add_argument("output", type = argparse.FileType('w'),
                        help = "output file")

    args = parser.parse_args()
    articles = get_articles(**vars(args))
    corpusxml = articles2xml(articles)

    xmlstr = lxml.etree.tostring(corpusxml,
                                 pretty_print=True,
                                 xml_declaration=True,
                                 encoding='utf-8')
    args.output.write(xmlstr.decode('utf-8')) 
Example #9
Source File: temporal_lobe.py    From rpi_ai with MIT License    5 votes
def playMusic(query):
	
	# get YouTube list
	pattern = re.compile('([^\s\w]|_)+')
	b_string = re.sub(pattern, '', query)
	phrase=b_string
	pattern = re.compile("\\b(some|play)\\W", re.I)
	query = [pattern.sub("", phrase)] 
	# get YouTube list
	query = query[0]
	print query
	url = "https://www.googleapis.com/youtube/v3/search?part=snippet&key="+keyring.get_password('google','api_secret')+"&q="+urllib.quote_plus(query)+"&type=video"
	response = urllib2.urlopen(url)
	jsonResp = response.read()
	decoded = json.loads(jsonResp)
	#os.system('echo \''+url+'\' > url.txt') #for debugging
	url = 'http://youtube.com/watch?v=' + decoded['items'][0]['id']['videoId']
	theSongName = decoded['items'][0]['snippet']['title']
	pattern = re.compile("([^a-zA-Z\d\s:,.']|_)+")
	theSongName = re.sub(pattern, '', theSongName)
	#for x in range(1,len(decoded['items'])):
	#url = url + ' ' + 'http://youtube.com/watch?v=' + decoded['items'][x]['id']['videoId']
	permission = audio_cortex.getUserPermission("Do you want to hear " + theSongName)
	if permission:
		vlc = 'cvlc --no-video --volume 270 -A alsa,none --alsa-audio-device hw:1' + ' ' + url + ' --play-and-exit &'
		print url
		os.system(vlc)
		print "started music.."
		return "Sure I'll play " + theSongName
	else:
		return "Okay, I will play nothing."

# Look up declarative knowledge with Wolfram 
Example #10
Source File: temporal_lobe.py    From rpi_ai with MIT License    5 votes
def wikipediaLookUp(a_string,num_sentences):
	print a_string
	pattern = re.compile('([^\s\w]|_)+')
	b_string = re.sub(pattern, '', a_string)
	phrase=b_string
	print phrase
	pattern = re.compile("\\b(lot|lots|a|an|who|can|you|what|is|info|somethings|whats|have|i|something|to|know|like|Id|information|about|tell|me)\\W", re.I)
	phrase_noise_removed = [pattern.sub("", phrase)] 
	print phrase_noise_removed[0]
	a = wikipedia.search(phrase_noise_removed[0])
	print a[0]
	the_summary = (wikipedia.summary(a[0], sentences=num_sentences))
	print the_summary
	return the_summary 
Example #11
Source File: wiki.py    From Jarvis with MIT License    5 votes
def search(self, query, count=10, suggestion=False):
        """Do a Wikipedia search for a query, returns a list of 10 related items."""
        items = wikipedia.search(query, count, suggestion)
        if isinstance(items, list) and items:
            return items
        return "No articles with that name, try another item." 
Example #12
Source File: BuscadorPersonas.py    From osint-suite-tools with GNU General Public License v3.0    5 votes
def searchWikipedia(target):

    try:
        wikipedia.set_lang("es")
        d0 = wikipedia.search(target)

        if d0:
            print()
            print("|----[INFO][WIKIPEDIA][>] ")
            print("     |----[INFO][SEARCH][>] ")
            print("     - Resultados encontrados: ")
            for r in d0:
                print("     - " + r)
        else:
            print("|----[INFO][WIKIPEDIA][>] No aparecen resultados en WIKIPEDIA.")

    except:
        print("[!][WARNING][WIKIPEDIA][>] Error en la API...")

    try:
        d1 = wikipedia.page(target)

        linksWIKI = d1.links
        urlWIKI = d1.url

        if d1:
            print("     |----[INFO][TAGS][>] ")
            for l in linksWIKI:
                print("     - " + l)
            print("|----[FUENTES][WIKIPEDIA][>] ")
            print("     - " + urlWIKI)
            config.wikipediaData_list.append(urlWIKI)
        else:
            print("|----[INFO][WIKIPEDIA][>] No aparecen resultados en WIKIPEDIA.")
    
    except:
        print("[!][WARNING][WIKIPEDIA][>] Error en la API o no aparecen resultados...")

# Functions for searching YouTube
Example #13
Source File: wikipedia.py    From pyconjpbot with MIT License    5 votes
def wikipedia_page(message, option, query):
    """
    Return the result of searching Wikipedia for the query
    """
    if query == 'help':
        return

    # set language
    lang = 'ja'
    if option:
        _, lang = option.split('-')
    wikipedia.set_lang(lang)

    try:
        # search with query
        results = wikipedia.search(query)
    except:
        botsend(message, '指定された言語 `{}` は存在しません'.format(lang))
        return

    # get first result
    if results:
        page = wikipedia.page(results[0])

        attachments = [{
            'fallback': 'Wikipedia: {}'.format(page.title),
            'pretext': 'Wikipedia: <{}|{}>'.format(page.url, page.title),
            'text': page.summary,
        }]
        botwebapi(message, attachments)
    else:
        botsend(message, '`{}` に該当するページはありません'.format(query)) 
Example #14
Source File: BuscadorPersonas.py    From osint-suite-tools with GNU General Public License v3.0    5 votes
def search_dogpile_(target):
    engine = Dogpile()
    results = engine.search("'" + target + "'")
    for r in results:
        print ("|--[INFO][DOGPILE][RESULTS][>] " + r["title"] + " | " + r["text"] + " | " + r["link"])
        
        try:
            web = requests.get(r["link"], timeout=3)
            print ("|----[INFO][WEB][HTTP CODE][>] " + str(web.status_code) + "\n")
            if 200 <= web.status_code < 300:
                TEXT = er.remove_tags(str(web.text))
                parser.parserMAIN(TEXT)

        except Exception as e:
            print ("|----[ERROR][HTTP CONNECTION][>] " + str(e)) 
Example #15
Source File: views.py    From Microsoft-chatbot with MIT License    5 votes
def who_is(query, session_id="general"):
    try:
        return wikipedia.summary(query)
    except requests.exceptions.SSLError:
        return "Sorry I could not search online due to SSL error"
    except:
        pass
    for new_query in wikipedia.search(query):
        try:
            return wikipedia.summary(new_query)
        except:
            pass
    return "Sorry I could not find any data related to '%s'" % query 
Example #16
Source File: wiki_search.py    From NeuralTripleTranslation with Apache License 2.0    5 votes
def get(query):
    return find_candidates(find_json(query))

# with open("testoutput.txt", "w") as text_file:

# for i in get('video-assisted thoracoscopic'):
#     try:
#             # print get('video-assisted thoracoscopic')
#             # print wikipedia.page(i)
#             # allwiki.append(wikipedia.page(i).content)
#         text_file.write('%s\n\n' % wikipedia.page(i).content.encode('utf8'))
#         # print type(wikipedia.page(i).content.encode('utf8'))
#         # print wikipedia.page(i).content
#     except:
#         print "Unexpected error:", sys.exc_info()[0]


# print get('video-assisted thoracoscopic')

# for keys in world_dict.keys():
#     for term in world_dict.get(keys, keys):
#         if wikipedia.search(term):
#             term = wikipedia.search(term)[0]
#             wikipage = wikipedia.page(term)
#             content = wikipage.content
#             allwiki.append(content)
#
# print (wikipedia.page("Georgia_(country)").content)
#
# print wikipedia.search('fraction of inspired o2')[0]


# text_file.close() 
Example #17
Source File: fetch_tax_info.py    From idseq-dag with MIT License    5 votes
def get_wiki_content_for_page(taxid, pageid, taxname, taxid2wikicontent, mutex, semaphore, max_attempt=3):
        ''' Fetch wiki content for pageid '''
        for attempt in range(max_attempt):
            try:
                page = None
                if pageid:
                    log.write(f"fetching wiki {pageid} for {taxid}")
                    page = wikipedia.page(pageid=pageid)
                elif taxname:
                    search_results = wikipedia.search(taxname)
                    if len(search_results) > 0:
                        wikiname = str(search_results[0])
                        if taxname.lower() == wikiname.lower():
                            page = wikipedia.page(wikiname)
                    if not page:
                        # query the page directly
                        try:
                            page = wikipedia.page(taxname.replace(" ", "_"))
                        except:
                            page = None

                if page:
                    output = {
                        "pageid": page.pageid,
                        "description": page.content[:1000],
                        "title": page.title,
                        "summary": page.summary
                    }
                    with mutex:
                        taxid2wikicontent[taxid] = output
                break
            except:
                log.write(f"having trouble fetching {taxid} wiki {pageid} attempt {attempt}")
        semaphore.release() 
Example #18
Source File: fetch_tax_info.py    From idseq-dag with MIT License    5 votes
def get_taxid_mapping_for_batch(taxids, taxid2wikidict, mutex, semaphore, max_attempt=3):
        ''' Get wiki mapping for a list of taxids '''
        taxid_str = ",".join(taxids)
        log.write(f"fetching batch {taxid_str}")
        parsed = {}  # defined up front so the update below works even if every attempt fails
        for attempt in range(max_attempt):
            try:
                handle = Entrez.elink(dbfrom="taxonomy", id=taxid_str, cmd="llinks")
                record = Entrez.read(handle)
                handle.close()

                results = record[0]['IdUrlList']['IdUrlSet']
                for result in results:
                    taxid = result['Id']
                    wikiurl = ""
                    for link in result['ObjUrl']:
                        url = str(link['Url'])
                        if re.search('wikipedia.org', url):
                            wikiurl = url
                            break
                    parsed[taxid] = wikiurl
                break
            except:
                log.write(f"failed batch attempt {attempt}")
                time.sleep(5)
        semaphore.release()
        with mutex:
            taxid2wikidict.update(parsed) 
Example #19
Source File: search.py    From W.I.L.L with MIT License    4 votes
def search_google(query):
    '''Search google and determine if wikipedia is in it'''
    search_object = google.search(query)
    # Determine if a wikipedia url is in the first 5 search results
    urls = []
    for i in range(5):
        url = search_object.__next__()
        urls.append(url)
        if "wikipedia.org/wiki" in url:
            wikipedia_search = wikipedia.search(query)[0]
            url = wikipedia.page(wikipedia_search).url
            response = wikipedia.summary(wikipedia_search) + " ({0})".format(url)
            return response
    #If there were no wikipedia pages
    first_url = urls[0]
    try:
        article = Article(first_url)
        article.download()
        article.parse()
        article.nlp()
        article_summary = article.summary
        article_title = article.title
        return "{0}\n{1} - ({2})".format(
            article_summary, article_title, first_url
        )

    except Exception as article_exception:
        try:
            log.debug("Got error {0}, {1} while using newspaper, switching to bs4".format(
            article_exception.message,article_exception.args
            ))
            html = requests.get(first_url).text
            #Parse the html using bs4
            soup = BeautifulSoup(html, "html.parser")
            [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
            text = soup.getText()
            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            # drop blank lines
            soup_text = '\n'.join(chunk for chunk in chunks if " " in chunk)
            response = format(soup_text) + " ({0})".format(first_url)
            return response
        except Exception as search_exception:
            log.info("Error {0},{1} occurred while searching query {2}".format(
                search_exception.message, search_exception.args, query
            ))
            return "Error encountered on query {0}".format(query)