Python fuzzywuzzy.process.extract() Examples
The following are 25 code examples of fuzzywuzzy.process.extract(), drawn from open-source projects. The original project, source file, and license are noted above each example. You may also want to check out all other available functions and classes of the fuzzywuzzy.process module.
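Before the examples, here is a minimal sketch of a typical process.extract() call for reference; the query string and candidate list below are made-up illustrations, and the exact scores depend on the scorer chosen.

from fuzzywuzzy import fuzz, process

# Hypothetical candidate strings to match against.
choices = ["new york jets", "new york giants", "dallas cowboys"]

# extract() returns the best-scoring choices as (choice, score) tuples,
# scored 0-100 and ordered best first; limit caps the number of results
# and scorer selects the similarity function (fuzz.WRatio is the default).
matches = process.extract("new york", choices, scorer=fuzz.partial_ratio, limit=2)
# e.g. [('new york jets', 100), ('new york giants', 100)]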
Example #1
Source File: qa_to_oie.py From supervised-oie with MIT License | 6 votes |
def fuzzy_match_word(word, words, limit):
    """
    Fuzzy find the indexes of word in words, returns a list of indexes which
    match the best return from fuzzy. limit controls the number of choices to allow.
    """
    # Try finding exact matches
    exact_matches = set([i for (i, w) in enumerate(words) if w == word])
    if exact_matches:
        logging.debug("Found exact match for {}".format(word))

    # Else, return fuzzy matching
    logging.debug("No exact match for: {}".format(word))
    # Allow some variance which extractOne misses
    # For example: "Armstrong World Industries Inc" in "Armstrong World Industries Inc. agreed in principle to sell its carpet operations to Shaw Industries Inc ."
    best_matches = [w for (w, s)
                    in process.extract(word, words, processor = semi_process, limit = limit)
                    if (s > 70)]
    logging.debug("Best matches = {}".format(best_matches))
    return list(exact_matches.union([i for (i, w) in enumerate(words) if w in best_matches]))

# Flatten a list of lists
Example #2
Source File: main.py From squeeze-alexa with GNU General Public License v3.0 | 6 votes |
def _genres_from_slots(self, slots: Iterable[str], genres: Iterable[str]):
    def genres_from(g):
        if not g:
            return set()
        res = process.extract(g, genres)[:MAX_GUESSES_PER_SLOT]
        print_d("Raw genre results: {data}", data=res)
        for g, c in res:
            # Exact(ish) matches shouldn't allow other genres
            if c > MinConfidences.SINGLE_GENRE:
                return {g}
        return {g for g, c in res
                if g and int(c) >= MinConfidences.MULTI_GENRE}

    # Grr where's my foldl
    results = set()
    for slot in slots:
        results |= genres_from(slot)
    return results
Example #3
Source File: chatbot_fuzzy.py From nlp_xiaojiang with MIT License | 6 votes |
def fuzzy_fuzzywuzzy(fuzz, user_input, collection):
    '''Edit distance. Fairly slow, but unlike exact matching it can handle differing characters.'''
    collection_new = []
    len_user_input = len(user_input)
    for coll in collection:  # keep candidates that share at least one character with the input; if none do, give up
        for i in range(len_user_input):
            if user_input[i] in coll:
                collection_new.append(coll)
    if not collection_new:
        return None
    collection_new = list(set(collection_new))
    same_char_list = []
    for collection_new_one in collection_new:  # rank candidates by how many characters they share with the input
        count_same_char_one = count_same_char(user_input, collection_new_one)
        same_char_list.append((collection_new_one, count_same_char_one))
    same_char_list.sort(key=lambda x: x[1], reverse=True)
    if len(same_char_list) >= 500:
        same_char_list = same_char_list[0: 500]
    result = process.extract(user_input, same_char_list, scorer=fuzz.token_set_ratio, limit=20)
    return result
Example #4
Source File: internal.py From cheat.sh with MIT License | 6 votes |
def _get_page(self, topic, request_options=None):
    topics_list = self.get_topics_list()
    if topic.startswith(':'):
        topics_list = [x for x in topics_list if x.startswith(':')]
    else:
        topics_list = [x for x in topics_list if not x.startswith(':')]

    if _USING_FUZZYWUZZY:
        possible_topics = process.extract(topic, topics_list, scorer=fuzz.ratio)[:3]
    else:
        possible_topics = process.extract(topic, topics_list, limit=3, scorer=fuzz.ratio)
    possible_topics_text = "\n".join([(" * %s %s" % x) for x in possible_topics])
    return """
Unknown topic.
Do you mean one of these topics maybe?

%s
    """ % possible_topics_text
Example #5
Source File: tag_ner.py From TaskBot with GNU General Public License v3.0 | 5 votes |
def extract(self, context):
    entities = process.extract(context["query"], self.keywords)
    print(entities)
    entities = filter(lambda x: x[1] >= self.threshold, entities)
    entities = sorted(entities, key=lambda x: x[1] + len(x[0]) / 10, reverse=True)
    entities = list(map(lambda x: Tag(TAGMAP[x[0]]), entities))
    if len(entities) == 0:
        return None
    return entities[0]
Example #6
Source File: smart_bubble.py From Persimmon with MIT License | 5 votes |
def search(self, string: str):
    if string:
        results = process.extract(string, self.cache, limit=len(self.cache))
        self.rv.data = [{'cls_name': block[0], 'cls_': block[2], 'bub': self,
                         'backdrop': self.backdrop, 'pin': self.pin,
                         'block_pos': self.pos}
                        for block in results if block[1] > 50]
    else:
        self.rv.data = [{'cls_name': name, 'cls_': class_, 'bub': self,
                         'backdrop': self.backdrop, 'pin': self.pin,
                         'block_pos': self.pos}
                        for class_, name in self.cache.items()]
Example #7
Source File: fuzzy_search.py From VideoHub with MIT License | 5 votes |
def fuzzy(search_key, videos, video_titles):
    """
    - Returns a list of closest matching video IDs.
    """
    best_matches = process.extract(search_key, video_titles, limit=10)
    best_match_titles = []
    for match in best_matches:
        best_match_titles.append(match[0])
    best_match_IDs = []
    for title in best_match_titles:
        for ID in videos:
            if title == videos[ID]:
                best_match_IDs.append(ID)
    return best_match_IDs
Example #8
Source File: gnome-pass-search-provider.py From gnome-pass-search-provider with GNU General Public License v3.0 | 5 votes |
def get_result_set(self, terms):
    if terms[0] == "otp":
        field = terms[0]
    elif terms[0].startswith(":"):
        field = terms[0][1:]
        terms = terms[1:]
    else:
        field = None

    name = "".join(terms)
    password_list = []
    for root, dirs, files in walk(self.password_store):
        dir_path = root[len(self.password_store) + 1 :]
        if dir_path.startswith("."):
            continue
        for filename in files:
            if filename[-4:] != ".gpg":
                continue
            path = path_join(dir_path, filename)[:-4]
            password_list.append(path)

    results = [
        e[0]
        for e in process.extract(
            name, password_list, limit=5, scorer=fuzz.partial_ratio
        )
    ]
    if field == "otp":
        results = [f"otp {r}" for r in results]
    elif field is not None:
        results = [f":{field} {r}" for r in results]
    return results
Example #9
Source File: provider.py From feeluown-core with MIT License | 5 votes |
def search(self, keyword, **kwargs):
    limit = kwargs.get('limit', 10)
    repr_song_map = dict()
    for song in self.songs:
        key = song.title + ' ' + song.artists_name + str(song.identifier)
        repr_song_map[key] = song
    choices = repr_song_map.keys()
    result = process.extract(keyword, choices, limit=limit)
    result_songs = []
    for each, score in result:
        # if score > 80, keyword is almost included in song key
        if score > 80:
            result_songs.append(repr_song_map[each])
    return LSearchModel(q=keyword, songs=result_songs)
Example #10
Source File: bot.py From app_rasa_chat_bot with MIT License | 5 votes |
def fuzzy_match_ents(ents, choices, limit=2, thresh=80):
    fuzz_matches_out = []
    for ent in ents:
        top_matches = process.extract(
            ent, set(choices), limit=limit, scorer=fuzz.partial_ratio)
        for match, score in top_matches:
            if score >= thresh:
                fuzz_matches_out.append(match)
    return fuzz_matches_out
Example #11
Source File: ida_fuzzy.py From IDAFuzzy with MIT License | 5 votes |
def OnFormChange(self, fid):
    if fid == -1:  # initialize
        pass
    elif fid == -2:  # terminate
        pass
    elif fid == self.cEChooser.id:
        self.selected_id = self.GetControlValue(self.cEChooser)[0]
    elif fid == self.iStr1.id:
        self.s = self.GetControlValue(self.iStr1)
        self.EChooser.items = []
        if self.s == '':
            self.RefreshField(self.cEChooser)
            return 1
        self.fst.stop()
        self.fst.quit()  # if you type speedy, FuzzySearch which executed before is not finished here.
        self.fst.terminate_event.set()
        self.fst.wait()
        # self.fst.terminate()  # but last time's FuzzySearch is meaningless, so terminate this. <- little dangerous?
        # stop and quit take time. (and maybe non-blocking)
        # So if you type speedy, some start() call will be ignored.
        # re-create thread solve this.
        self.fst = FuzzySearchThread()
        self.fst.refresh_list.connect(self.refresh_list)
        self.fst.finished.connect(self.finished)
        self.fst.setup(self.s)
        self.fst.start()
        # extracts = process.extract(s, names, limit=10)
        # f.iStr1.value won't change until Form.Execute() returns.
    else:
        pass
    return 1
Example #12
Source File: ida_fuzzy.py From IDAFuzzy with MIT License | 5 votes |
def run(self):
    f = functools.partial(hooked_scorer, terminate_event=self.terminate_event)
    try:
        res = process.extract(self.s, names, limit=LISTLEN, scorer=f)
        # f.iStr1.value won't change until Form.Execute() returns.
        extracts = []
        for i in res:
            extracts.append(i[0])
        for i in range(10 - len(res)):
            extracts.append("")
        self.refresh_list.emit(*extracts)  # call main Thread's UI function.
    except TerminateException:
        pass
    self.stop()
    self.finished.emit()

# --------------------------------------------------------------------------
Example #13
Source File: utilities.py From estimagic with BSD 3-Clause "New" or "Revised" License | 5 votes |
def propose_algorithms(requested_algo, algos, number=3):
    """Propose a number of algorithms based on similarity to the requested algorithm.

    Args:
        requested_algo (str): The algorithm requested by the user.
        algos (dict(str, list(str))): Dictionary where keys are the package and
            values are lists of algorithms.
        number (int): Number of proposals.

    Returns:
        proposals (list(str)): List of proposed algorithms.

    Example:
        >>> algos = {"scipy": ["L-BFGS-B", "TNC"], "nlopt": ["lbfgsb"]}
        >>> propose_algorithms("scipy_L-BFGS-B", algos, number=1)
        ['scipy_L-BFGS-B']
        >>> propose_algorithms("L-BFGS-B", algos, number=2)
        ['scipy_L-BFGS-B', 'nlopt_lbfgsb']

    """
    possibilities = [
        "_".join([origin, algo_name]) for origin in algos for algo_name in algos[origin]
    ]
    proposals_w_probs = fw_process.extract(requested_algo, possibilities, limit=number)
    proposals = [proposal[0] for proposal in proposals_w_probs]
    return proposals
Example #14
Source File: bolt.py From Bolt with GNU General Public License v3.0 | 5 votes |
def fuzzy(tokens):
    averages = []
    for token in tokens:
        sameTokenRemoved = False
        result = process.extract(token, tokens, scorer=fuzz.partial_ratio)
        scores = []
        for each in result:
            score = each[1]
            if score == 100 and not sameTokenRemoved:
                sameTokenRemoved = True
                continue
            scores.append(score)
        average = statistics.mean(scores)
        averages.append(average)
    return statistics.mean(averages)
Example #15
Source File: pass-filter.py From alfred-pass with GNU General Public License v3.0 | 5 votes |
def search_passwords_fuzzy(query):
    ''' Search passwords using the Fuzzy search method using fuzzywuzzy'''
    passwords = list_passwords()
    return [entry[0] for entry in process.extract(query, passwords)]
Example #16
Source File: ADUserCreds.py From armory with GNU General Public License v3.0 | 5 votes |
def search_term(self, txt, pw_count):
    pws = pw_count.keys()
    if type(txt) == str:
        txt = [txt]
    total_matches = 0
    for t in txt:
        matches = [r[0] for r in process.extract(t, pws, limit=None) if r[1] > 75]
        total_matches += sum([pw_count[p]["count"] for p in matches])
    return total_matches
Example #17
Source File: pyinrail.py From pyinrail with MIT License | 5 votes |
def search_train(self, query):
    """
    search train by name or number
    """
    return [x[0] for x in process.extract(query, self.trains.values())]
Example #18
Source File: pyinrail.py From pyinrail with MIT License | 5 votes |
def search_station(self, query):
    """
    search station by name or code
    """
    return [x[0] for x in process.extract(query, self.stations)]
Example #19
Source File: util.py From rules-bot with GNU Affero General Public License v3.0 | 5 votes |
def search(self, query):
    def processor(x):
        if isinstance(x, Issue):
            x = x.title
        return x.strip().lower()

    # We don't care about the score, so return first element
    # This must not happen while updating the self.issues dict so acquire the lock
    with self.issues_lock:
        return [result[0] for result in process.extract(query, self.issues, scorer=fuzz.partial_ratio,
                                                         processor=processor, limit=5)]
Example #20
Source File: twistmoe.py From anime-downloader with The Unlicense | 5 votes |
def search(self, query):
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.46 Safari/537.36',
        'x-access-token': '1rj2vRtegS8Y60B3w3qNZm5T2Q0TN2NR'
    }
    # soup = helpers.soupify(helpers.get('https://twist.moe/', allow_redirects=True, headers=headers))
    req = helpers.get('https://twist.moe/api/anime', headers=headers)
    if 'being redirected' in req.text:
        logger.debug('Trying to extract cookie')
        cookie = get_cookie(req)
        logger.debug('Got cookie: ' + cookie)
        headers['cookie'] = cookie
        # XXX: Can't use helpers.get here because that one is cached. Investigate
        req = helpers.get('https://twist.moe/api/anime', headers=headers)

    all_anime = req.json()
    animes = []
    for anime in all_anime:
        animes.append(SearchResult(
            title=anime['title'],
            url='https://twist.moe/a/' + anime['slug']['slug'] + '/',
        ))
    animes = [ani[0] for ani in process.extract(query, animes)]
    return animes
Example #21
Source File: tag_ner.py From TaskBot with GNU General Public License v3.0 | 5 votes |
def transform(self, context):
    return self.extract(context)
Example #22
Source File: searcher.py From todxpy with GNU General Public License v2.0 | 5 votes |
def find_index_tag(tag, tlist):
    """
    Returns a list with first element as tag and rest indexes of todos with that tag
    """
    index_list = []
    similar_tags = []
    for i, todo in enumerate(tlist):
        similar_tags = process.extract(tag, todo.tags)
        if len(similar_tags) > 0:
            if len(index_list) == 0:
                index_list.append(similar_tags[0][0])
            if similar_tags[0][1] > 70:
                index_list.append(i)
    return index_list
Example #23
Source File: sudoku_guessing.py From songoku with MIT License | 4 votes |
def solve_approximate(self, approximate=False):
    'If it finds a sudoku similar to one it has already done, uses its solution'
    string = self.as_string()
    if string in self.already_solved.keys():
        return self.already_solved[string], self.already_solved_numbers[string]
    else:
        # We save the attempts that we already did but were unsuccessful
        if string in self.already_solved_false:
            solved = False
        else:
            solved = sudoku_solving.solve(string)

        # If the sudoku is unsolvable but very similar to one we already did
        # we assume it's the same one but we couldn't quite catch some numbers
        # Approximate is percent-based, 90 = 90%
        if solved is False:
            # Saves this sudoku as false so we don't have to try to solve it every frame
            self.already_solved_false.append(string)
            if self.already_solved.keys():
                guesses = process.extract(string, self.already_solved.keys())
                if guesses:
                    # Prioritizes length, then similarity to the guess
                    if approximate is False:
                        best = max(guesses, key=lambda x: (x[1], len(self.already_solved_numbers[x[0]])))[0]
                        return self.already_solved[best], self.already_solved_numbers[best]
                    else:
                        sorty = sorted(guesses, key=lambda x: (len(self.already_solved_numbers[x[0]]), x[1]), reverse=True)
                        for item in sorty:
                            if item[1] > approximate:
                                # Sort them by length and then get the one with biggest length that has adequate ratio?
                                return self.already_solved[item[0]], self.already_solved_numbers[item[0]]
                        else:
                            best = max(guesses, key=lambda x: (x[1], len(self.already_solved_numbers[x[0]])))[0]
                            return self.already_solved[best], self.already_solved_numbers[best]

        # Only saves correct solutions
        if solved is not False:
            # also save the numbers that already exist in the array
            # (so we don't write over them if we can't see them)
            self.already_solved_numbers[string] = self.get_existing_numbers()
            self.already_solved[string] = solved
            return solved, self.already_solved_numbers[string]

        return False, False
Example #24
Source File: chatbot_fuzzy.py From nlp_xiaojiang with MIT License | 4 votes |
def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50):
    '''Edit distance. Fairly slow, but unlike exact matching it can handle differing characters.'''
    start_time = time.time()
    # user_input_set = set([user_input_one for user_input_one in user_input])
    user_input_set = [user_input_one for user_input_one in user_input]

    same_char_list = []
    max_data = 0
    max_data_list = []
    count_collection_new_one = 0
    for collection_new_one in collection:  # keep the questions that share the most characters with the input
        count_same_char_one = len([x for x in user_input_set if x in collection_new_one])
        if count_same_char_one > 0:
            same_char_list.append((count_collection_new_one, count_same_char_one))
        if count_same_char_one > max_data:
            max_data_list.append(count_same_char_one)
            max_data = count_same_char_one
        count_collection_new_one += 1
    end_time1 = time.time()

    list_max_count = []
    len_max_data_list = len(max_data_list)
    for x in range(len_max_data_list):  # collect the top-20-ranked candidates
        for k, l in same_char_list:
            if l == max_data_list[len_max_data_list - 1 - x]:
                list_max_count.append(qa_list[k])  # the question text is pulled out of the QA list here
        if len(list_max_count) >= 5000:
            list_max_count = list_max_count[0:5000]
            break
    end_time2 = time.time()

    # end_time1: 0.34090662002563477
    # end_time2: 0.4080846309661865
    # end_time1: 0.06417036056518555
    # end_time2: 0.08422374725341797

    # same_char_list.sort(key=lambda x: x[1], reverse=True)
    # if len(same_char_list) >= 20:
    #     same_char_list = same_char_list[0: 20]

    result = process.extract(user_input, list_max_count, scorer=fuzz.token_set_ratio, limit=topn)
    end_time3 = time.time()
    # print('end_time1: ' + str(end_time1 - start_time))
    # print('end_time2: ' + str(end_time2 - start_time))
    # print('end_time3: ' + str(end_time3 - start_time))
    return result
    # [fuzz.WRatio, fuzz.QRatio,
    #  fuzz.token_set_ratio, fuzz.token_sort_ratio,
    #  fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
    #  fuzz.UWRatio, fuzz.UQRatio]
Example #25
Source File: functions.py From avrae with GNU General Public License v3.0 | 4 votes |
def search(list_to_search: list, value, key, cutoff=5, return_key=False, strict=False):
    """Fuzzy searches a list for an object
    result can be either an object or list of objects

    :param list_to_search: The list to search.
    :param value: The value to search for.
    :param key: A function defining what to search for.
    :param cutoff: The scorer cutoff value for fuzzy searching.
    :param return_key: Whether to return the key of the object that matched or the object itself.
    :param strict: If True, will only search for exact matches.
    :returns: A two-tuple (result, strict)"""
    # there is nothing to search
    if len(list_to_search) == 0:
        return [], False

    # full match, return result
    exact_matches = [a for a in list_to_search if value.lower() == key(a).lower()]
    if not (exact_matches or strict):
        partial_matches = [a for a in list_to_search if value.lower() in key(a).lower()]
        if len(partial_matches) > 1 or not partial_matches:
            names = [key(d).lower() for d in list_to_search]
            fuzzy_map = {key(d).lower(): d for d in list_to_search}
            fuzzy_results = [r for r in process.extract(value.lower(), names, scorer=fuzz.ratio) if r[1] >= cutoff]
            fuzzy_sum = sum(r[1] for r in fuzzy_results)
            fuzzy_matches_and_confidences = [(fuzzy_map[r[0]], r[1] / fuzzy_sum) for r in fuzzy_results]

            # display the results in order of confidence
            weighted_results = []
            weighted_results.extend((match, confidence) for match, confidence in fuzzy_matches_and_confidences)
            weighted_results.extend((match, len(value) / len(key(match))) for match in partial_matches)
            sorted_weighted = sorted(weighted_results, key=lambda e: e[1], reverse=True)

            # build results list, unique
            results = []
            for r in sorted_weighted:
                if r[0] not in results:
                    results.append(r[0])
        else:
            results = partial_matches
    else:
        results = exact_matches

    if len(results) > 1:
        if return_key:
            return [key(r) for r in results], False
        else:
            return results, False
    elif not results:
        return [], False
    else:
        if return_key:
            return key(results[0]), True
        else:
            return results[0], True