Python fuzzywuzzy.process.extract() Examples
The following are 25 code examples of fuzzywuzzy.process.extract(), drawn from open-source projects. The original project, source file, and license are noted above each example. You may also want to check out all other available functions and classes of the fuzzywuzzy.process module.
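Before the examples, here is a minimal sketch of a typical process.extract() call for reference; the query string and candidate list below are made-up illustrations, and the exact scores depend on the scorer chosen.

from fuzzywuzzy import fuzz, process

# Hypothetical candidate strings to match against.
choices = ["new york jets", "new york giants", "dallas cowboys"]

# extract() returns the best-scoring choices as (choice, score) tuples,
# scored 0-100 and ordered best first; limit caps the number of results
# and scorer selects the similarity function (fuzz.WRatio is the default).
matches = process.extract("new york", choices, scorer=fuzz.partial_ratio, limit=2)
# e.g. [('new york jets', 100), ('new york giants', 100)]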
Example #1
Source File: qa_to_oie.py From supervised-oie with MIT License | 6 votes |
def fuzzy_match_word(word, words, limit):
    """
    Fuzzy find the indexes of word in words, returns a list of indexes which
    match the best return from fuzzy. limit controls the number of choices to allow.
    """
    # Try finding exact matches
    exact_matches = set([i for (i, w) in enumerate(words) if w == word])
    if exact_matches:
        logging.debug("Found exact match for {}".format(word))

    # Else, return fuzzy matching
    logging.debug("No exact match for: {}".format(word))
    # Allow some variance which extractOne misses
    # For example: "Armstrong World Industries Inc" in "Armstrong World Industries Inc. agreed in principle to sell its carpet operations to Shaw Industries Inc ."
    best_matches = [w for (w, s)
                    in process.extract(word, words, processor = semi_process, limit = limit)
                    if (s > 70)]
    logging.debug("Best matches = {}".format(best_matches))
    return list(exact_matches.union([i for (i, w) in enumerate(words) if w in best_matches]))

# Flatten a list of lists
Example #2
Source File: main.py From squeeze-alexa with GNU General Public License v3.0 | 6 votes |
def _genres_from_slots(self, slots: Iterable[str], genres: Iterable[str]):
    def genres_from(g):
        if not g:
            return set()
        res = process.extract(g, genres)[:MAX_GUESSES_PER_SLOT]
        print_d("Raw genre results: {data}", data=res)
        for g, c in res:
            # Exact(ish) matches shouldn't allow other genres
            if c > MinConfidences.SINGLE_GENRE:
                return {g}
        return {g for g, c in res
                if g and int(c) >= MinConfidences.MULTI_GENRE}

    # Grr where's my foldl
    results = set()
    for slot in slots:
        results |= genres_from(slot)
    return results
Example #3
Source File: chatbot_fuzzy.py From nlp_xiaojiang with MIT License | 6 votes |
def fuzzy_fuzzywuzzy(fuzz, user_input, collection):
    '''Edit distance. Fairly slow, but unlike exact matching it can handle differing characters.'''
    collection_new = []
    len_user_input = len(user_input)
    for coll in collection:  # keep candidates that share at least one character with the input; if none do, give up
        for i in range(len_user_input):
            if user_input[i] in coll:
                collection_new.append(coll)
    if not collection_new:
        return None
    collection_new = list(set(collection_new))
    same_char_list = []
    for collection_new_one in collection_new:  # rank candidates by how many characters they share with the input
        count_same_char_one = count_same_char(user_input, collection_new_one)
        same_char_list.append((collection_new_one, count_same_char_one))
    same_char_list.sort(key=lambda x: x[1], reverse=True)
    if len(same_char_list) >= 500:
        same_char_list = same_char_list[0: 500]
    result = process.extract(user_input, same_char_list, scorer=fuzz.token_set_ratio, limit=20)
    return result
Example #4
Source File: internal.py From cheat.sh with MIT License | 6 votes |
def _get_page(self, topic, request_options=None):
    topics_list = self.get_topics_list()
    if topic.startswith(':'):
        topics_list = [x for x in topics_list if x.startswith(':')]
    else:
        topics_list = [x for x in topics_list if not x.startswith(':')]

    if _USING_FUZZYWUZZY:
        possible_topics = process.extract(topic, topics_list, scorer=fuzz.ratio)[:3]
    else:
        possible_topics = process.extract(topic, topics_list, limit=3, scorer=fuzz.ratio)
    possible_topics_text = "\n".join([(" * %s %s" % x) for x in possible_topics])
    return """
Unknown topic.
Do you mean one of these topics maybe?

%s
    """ % possible_topics_text
Example #5
Source File: tag_ner.py From TaskBot with GNU General Public License v3.0 | 5 votes |
def extract(self, context):
    entities = process.extract(context["query"], self.keywords)
    print(entities)
    entities = filter(lambda x: x[1] >= self.threshold, entities)
    entities = sorted(entities, key=lambda x: x[1] + len(x[0]) / 10, reverse=True)
    entities = list(map(lambda x: Tag(TAGMAP[x[0]]), entities))
    if len(entities) == 0:
        return None
    return entities[0]
Example #6
Source File: smart_bubble.py From Persimmon with MIT License | 5 votes |
def search(self, string: str):
    if string:
        results = process.extract(string, self.cache, limit=len(self.cache))
        self.rv.data = [{'cls_name': block[0], 'cls_': block[2], 'bub': self,
                         'backdrop': self.backdrop, 'pin': self.pin,
                         'block_pos': self.pos}
                        for block in results if block[1] > 50]
    else:
        self.rv.data = [{'cls_name': name, 'cls_': class_, 'bub': self,
                         'backdrop': self.backdrop, 'pin': self.pin,
                         'block_pos': self.pos}
                        for class_, name in self.cache.items()]
Example #7
Source File: fuzzy_search.py From VideoHub with MIT License | 5 votes |
def fuzzy(search_key, videos, video_titles):
    """
    - Returns a list of closest matching video IDs.
    """
    best_matches = process.extract(search_key, video_titles, limit=10)
    best_match_titles = []
    for match in best_matches:
        best_match_titles.append(match[0])
    best_match_IDs = []
    for title in best_match_titles:
        for ID in videos:
            if title == videos[ID]:
                best_match_IDs.append(ID)
    return best_match_IDs
Example #8
Source File: gnome-pass-search-provider.py From gnome-pass-search-provider with GNU General Public License v3.0 | 5 votes |
def get_result_set(self, terms):
    if terms[0] == "otp":
        field = terms[0]
    elif terms[0].startswith(":"):
        field = terms[0][1:]
        terms = terms[1:]
    else:
        field = None

    name = "".join(terms)
    password_list = []
    for root, dirs, files in walk(self.password_store):
        dir_path = root[len(self.password_store) + 1 :]
        if dir_path.startswith("."):
            continue
        for filename in files:
            if filename[-4:] != ".gpg":
                continue
            path = path_join(dir_path, filename)[:-4]
            password_list.append(path)

    results = [
        e[0]
        for e in process.extract(
            name, password_list, limit=5, scorer=fuzz.partial_ratio
        )
    ]
    if field == "otp":
        results = [f"otp {r}" for r in results]
    elif field is not None:
        results = [f":{field} {r}" for r in results]
    return results
Example #9
Source File: provider.py From feeluown-core with MIT License | 5 votes |
def search(self, keyword, **kwargs):
    limit = kwargs.get('limit', 10)
    repr_song_map = dict()
    for song in self.songs:
        key = song.title + ' ' + song.artists_name + str(song.identifier)
        repr_song_map[key] = song
    choices = repr_song_map.keys()
    result = process.extract(keyword, choices, limit=limit)
    result_songs = []
    for each, score in result:
        # if score > 80, keyword is almost included in song key
        if score > 80:
            result_songs.append(repr_song_map[each])
    return LSearchModel(q=keyword, songs=result_songs)
Example #10
Source File: bot.py From app_rasa_chat_bot with MIT License | 5 votes |
def fuzzy_match_ents(ents, choices, limit=2, thresh=80):
    fuzz_matches_out = []
    for ent in ents:
        top_matches = process.extract(
            ent, set(choices), limit=limit, scorer=fuzz.partial_ratio)
        for match, score in top_matches:
            if score >= thresh:
                fuzz_matches_out.append(match)
    return fuzz_matches_out
Example #11
Source File: ida_fuzzy.py From IDAFuzzy with MIT License | 5 votes |
def OnFormChange(self, fid):
    if fid == -1:  # initialize
        pass
    elif fid == -2:  # terminate
        pass
    elif fid == self.cEChooser.id:
        self.selected_id = self.GetControlValue(self.cEChooser)[0]
    elif fid == self.iStr1.id:
        self.s = self.GetControlValue(self.iStr1)
        self.EChooser.items = []
        if self.s == '':
            self.RefreshField(self.cEChooser)
            return 1
        self.fst.stop()
        self.fst.quit()  # if you type speedy, FuzzySearch which executed before is not finished here.
        self.fst.terminate_event.set()
        self.fst.wait()
        # self.fst.terminate()  # but last time's FuzzySearch is meaningless, so terminate this. <- little dangerous?
        # stop and quit take time. (and maybe non-blocking)
        # So if you type speedy, some start() call will be ignored.
        # re-create thread solve this.
        self.fst = FuzzySearchThread()
        self.fst.refresh_list.connect(self.refresh_list)
        self.fst.finished.connect(self.finished)
        self.fst.setup(self.s)
        self.fst.start()
        # extracts = process.extract(s, names, limit=10)
        # f.iStr1.value won't change until Form.Execute() returns.
    else:
        pass
    return 1
Example #12
Source File: ida_fuzzy.py From IDAFuzzy with MIT License | 5 votes |
def run(self):
    f = functools.partial(hooked_scorer, terminate_event=self.terminate_event)
    try:
        res = process.extract(self.s, names, limit=LISTLEN, scorer=f)
        # f.iStr1.value won't change until Form.Execute() returns.
        extracts = []
        for i in res:
            extracts.append(i[0])
        for i in range(10 - len(res)):
            extracts.append("")
        self.refresh_list.emit(*extracts)  # call main Thread's UI function.
    except TerminateException:
        pass
    self.stop()
    self.finished.emit()

# --------------------------------------------------------------------------
Example #13
Source File: utilities.py From estimagic with BSD 3-Clause "New" or "Revised" License | 5 votes |
def propose_algorithms(requested_algo, algos, number=3):
    """Propose a number of algorithms based on similarity to the requested algorithm.

    Args:
        requested_algo (str): The algorithm requested by the user.
        algos (dict(str, list(str))): Dictionary where keys are the package and
            values are lists of algorithms.
        number (int): Number of proposals.

    Returns:
        proposals (list(str)): List of proposed algorithms.

    Example:
        >>> algos = {"scipy": ["L-BFGS-B", "TNC"], "nlopt": ["lbfgsb"]}
        >>> propose_algorithms("scipy_L-BFGS-B", algos, number=1)
        ['scipy_L-BFGS-B']
        >>> propose_algorithms("L-BFGS-B", algos, number=2)
        ['scipy_L-BFGS-B', 'nlopt_lbfgsb']

    """
    possibilities = [
        "_".join([origin, algo_name]) for origin in algos for algo_name in algos[origin]
    ]
    proposals_w_probs = fw_process.extract(requested_algo, possibilities, limit=number)
    proposals = [proposal[0] for proposal in proposals_w_probs]
    return proposals
Example #14
Source File: bolt.py From Bolt with GNU General Public License v3.0 | 5 votes |
def fuzzy(tokens):
    averages = []
    for token in tokens:
        sameTokenRemoved = False
        result = process.extract(token, tokens, scorer=fuzz.partial_ratio)
        scores = []
        for each in result:
            score = each[1]
            if score == 100 and not sameTokenRemoved:
                sameTokenRemoved = True
                continue
            scores.append(score)
        average = statistics.mean(scores)
        averages.append(average)
    return statistics.mean(averages)
Example #15
Source File: pass-filter.py From alfred-pass with GNU General Public License v3.0 | 5 votes |
def search_passwords_fuzzy(query):
    ''' Search passwords using the Fuzzy search method using fuzzywuzzy'''
    passwords = list_passwords()
    return [entry[0] for entry in process.extract(query, passwords)]
Example #16
Source File: ADUserCreds.py From armory with GNU General Public License v3.0 | 5 votes |
def search_term(self, txt, pw_count):
    pws = pw_count.keys()
    if type(txt) == str:
        txt = [txt]
    total_matches = 0
    for t in txt:
        matches = [r[0] for r in process.extract(t, pws, limit=None) if r[1] > 75]
        total_matches += sum([pw_count[p]["count"] for p in matches])
    return total_matches
Example #17
Source File: pyinrail.py From pyinrail with MIT License | 5 votes |
def search_train(self, query):
    """
    search train by name or number
    """
    return [x[0] for x in process.extract(query, self.trains.values())]
Example #18
Source File: pyinrail.py From pyinrail with MIT License | 5 votes |
def search_station(self, query):
    """
    search station by name or code
    """
    return [x[0] for x in process.extract(query, self.stations)]
Example #19
Source File: util.py From rules-bot with GNU Affero General Public License v3.0 | 5 votes |
def search(self, query):
    def processor(x):
        if isinstance(x, Issue):
            x = x.title
        return x.strip().lower()

    # We don't care about the score, so return first element
    # This must not happen while updating the self.issues dict so acquire the lock
    with self.issues_lock:
        return [result[0] for result in process.extract(query, self.issues, scorer=fuzz.partial_ratio,
                                                         processor=processor, limit=5)]
Example #20
Source File: twistmoe.py From anime-downloader with The Unlicense | 5 votes |
def search(self, query):
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.46 Safari/537.36',
        'x-access-token': '1rj2vRtegS8Y60B3w3qNZm5T2Q0TN2NR'
    }
    # soup = helpers.soupify(helpers.get('https://twist.moe/', allow_redirects=True, headers=headers))
    req = helpers.get('https://twist.moe/api/anime', headers=headers)
    if 'being redirected' in req.text:
        logger.debug('Trying to extract cookie')
        cookie = get_cookie(req)
        logger.debug('Got cookie: ' + cookie)
        headers['cookie'] = cookie
        # XXX: Can't use helpers.get here because that one is cached. Investigate
        req = helpers.get('https://twist.moe/api/anime', headers=headers)

    all_anime = req.json()
    animes = []
    for anime in all_anime:
        animes.append(SearchResult(
            title=anime['title'],
            url='https://twist.moe/a/' + anime['slug']['slug'] + '/',
        ))
    animes = [ani[0] for ani in process.extract(query, animes)]
    return animes
Example #21
Source File: tag_ner.py From TaskBot with GNU General Public License v3.0 | 5 votes |
def transform(self, context):
    return self.extract(context)
Example #22
Source File: searcher.py From todxpy with GNU General Public License v2.0 | 5 votes |
def find_index_tag(tag, tlist):
    """
    Returns a list with first element as tag and rest indexes of todos with that tag
    """
    index_list = []
    similar_tags = []
    for i, todo in enumerate(tlist):
        similar_tags = process.extract(tag, todo.tags)
        if len(similar_tags) > 0:
            if len(index_list) == 0:
                index_list.append(similar_tags[0][0])
            if similar_tags[0][1] > 70:
                index_list.append(i)
    return index_list
Example #23
Source File: sudoku_guessing.py From songoku with MIT License | 4 votes |
def solve_approximate(self, approximate=False):
    'If it finds a sudoku similar to one it has already done, uses its solution'
    string = self.as_string()
    if string in self.already_solved.keys():
        return self.already_solved[string], self.already_solved_numbers[string]
    else:
        # We save the attempts that we already did but were unsuccessful
        if string in self.already_solved_false:
            solved = False
        else:
            solved = sudoku_solving.solve(string)

        # If the sudoku is unsolvable but very similar to one we already did
        # we assume it's the same one but we couldn't quite catch some numbers
        # Approximate is percent-based, 90 = 90%
        if solved is False:
            # Saves this sudoku as false so we don't have to try to solve it every frame
            self.already_solved_false.append(string)
            if self.already_solved.keys():
                guesses = process.extract(string, self.already_solved.keys())
                if guesses:
                    # Prioritizes length, then similarity to the guess
                    if approximate is False:
                        best = max(guesses, key=lambda x: (x[1], len(self.already_solved_numbers[x[0]])))[0]
                        return self.already_solved[best], self.already_solved_numbers[best]
                    else:
                        sorty = sorted(guesses, key=lambda x: (len(self.already_solved_numbers[x[0]]), x[1]), reverse=True)
                        for item in sorty:
                            if item[1] > approximate:
                                # Sort them by length and then get the one with biggest length that has adequate ratio?
                                return self.already_solved[item[0]], self.already_solved_numbers[item[0]]
                        else:
                            best = max(guesses, key=lambda x: (x[1], len(self.already_solved_numbers[x[0]])))[0]
                            return self.already_solved[best], self.already_solved_numbers[best]

        # Only saves correct solutions
        if solved is not False:
            # also save the numbers that already exist in the array
            # (so we don't write over them if we can't see them)
            self.already_solved_numbers[string] = self.get_existing_numbers()
            self.already_solved[string] = solved
            return solved, self.already_solved_numbers[string]

        return False, False
Example #24
Source File: chatbot_fuzzy.py From nlp_xiaojiang with MIT License | 4 votes |
def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50):
    '''Edit distance. Fairly slow, but unlike exact matching it can handle differing characters.'''
    start_time = time.time()
    # user_input_set = set([user_input_one for user_input_one in user_input])
    user_input_set = [user_input_one for user_input_one in user_input]

    same_char_list = []
    max_data = 0
    max_data_list = []
    count_collection_new_one = 0
    for collection_new_one in collection:  # keep the questions that share the most characters with the input
        count_same_char_one = len([x for x in user_input_set if x in collection_new_one])
        if count_same_char_one > 0:
            same_char_list.append((count_collection_new_one, count_same_char_one))
        if count_same_char_one > max_data:
            max_data_list.append(count_same_char_one)
            max_data = count_same_char_one
        count_collection_new_one += 1
    end_time1 = time.time()

    list_max_count = []
    len_max_data_list = len(max_data_list)
    for x in range(len_max_data_list):  # collect the top-20-ranked candidates
        for k, l in same_char_list:
            if l == max_data_list[len_max_data_list - 1 - x]:
                list_max_count.append(qa_list[k])  # the question text is pulled out of the QA list here
        if len(list_max_count) >= 5000:
            list_max_count = list_max_count[0:5000]
            break
    end_time2 = time.time()

    # end_time1: 0.34090662002563477
    # end_time2: 0.4080846309661865
    # end_time1: 0.06417036056518555
    # end_time2: 0.08422374725341797

    # same_char_list.sort(key=lambda x: x[1], reverse=True)
    # if len(same_char_list) >= 20:
    #     same_char_list = same_char_list[0: 20]

    result = process.extract(user_input, list_max_count, scorer=fuzz.token_set_ratio, limit=topn)
    end_time3 = time.time()
    # print('end_time1: ' + str(end_time1 - start_time))
    # print('end_time2: ' + str(end_time2 - start_time))
    # print('end_time3: ' + str(end_time3 - start_time))
    return result
    # [fuzz.WRatio, fuzz.QRatio,
    #  fuzz.token_set_ratio, fuzz.token_sort_ratio,
    #  fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
    #  fuzz.UWRatio, fuzz.UQRatio]
Example #25
Source File: functions.py From avrae with GNU General Public License v3.0 | 4 votes |
def search(list_to_search: list, value, key, cutoff=5, return_key=False, strict=False):
    """Fuzzy searches a list for an object
    result can be either an object or list of objects

    :param list_to_search: The list to search.
    :param value: The value to search for.
    :param key: A function defining what to search for.
    :param cutoff: The scorer cutoff value for fuzzy searching.
    :param return_key: Whether to return the key of the object that matched or the object itself.
    :param strict: If True, will only search for exact matches.
    :returns: A two-tuple (result, strict)"""
    # there is nothing to search
    if len(list_to_search) == 0:
        return [], False

    # full match, return result
    exact_matches = [a for a in list_to_search if value.lower() == key(a).lower()]
    if not (exact_matches or strict):
        partial_matches = [a for a in list_to_search if value.lower() in key(a).lower()]
        if len(partial_matches) > 1 or not partial_matches:
            names = [key(d).lower() for d in list_to_search]
            fuzzy_map = {key(d).lower(): d for d in list_to_search}
            fuzzy_results = [r for r in process.extract(value.lower(), names, scorer=fuzz.ratio) if r[1] >= cutoff]
            fuzzy_sum = sum(r[1] for r in fuzzy_results)
            fuzzy_matches_and_confidences = [(fuzzy_map[r[0]], r[1] / fuzzy_sum) for r in fuzzy_results]

            # display the results in order of confidence
            weighted_results = []
            weighted_results.extend((match, confidence) for match, confidence in fuzzy_matches_and_confidences)
            weighted_results.extend((match, len(value) / len(key(match))) for match in partial_matches)
            sorted_weighted = sorted(weighted_results, key=lambda e: e[1], reverse=True)

            # build results list, unique
            results = []
            for r in sorted_weighted:
                if r[0] not in results:
                    results.append(r[0])
        else:
            results = partial_matches
    else:
        results = exact_matches

    if len(results) > 1:
        if return_key:
            return [key(r) for r in results], False
        else:
            return results, False
    elif not results:
        return [], False
    else:
        if return_key:
            return key(results[0]), True
        else:
            return results[0], True