Python Levenshtein.ratio() Examples

The following are 27 code examples of Levenshtein.ratio(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Levenshtein, or try the search function.
Example #1
Source File: alignment.py    From errant with MIT License 6 votes vote down vote up
def get_sub_cost(self, o, c):
        """Return the cost of substituting token ``o`` with token ``c``.

        The cost is 0 when both tokens have the same ``lower`` attribute.
        Otherwise it is the sum of three components:

        - lemma cost: 0 if the lemmas are equal, else 0.499
        - POS cost: 0 if the tags are equal, 0.25 if both tags are in
          ``self._open_pos``, else 0.5
        - character cost: 1 minus the Levenshtein ratio of the token texts
        """
        # Tokens that differ only by case are free to substitute.
        if o.lower == c.lower:
            return 0

        # Lemma component.
        lemma_cost = 0 if o.lemma == c.lemma else 0.499

        # POS component.
        if o.pos == c.pos:
            pos_cost = 0
        elif o.pos in self._open_pos and c.pos in self._open_pos:
            pos_cost = 0.25
        else:
            pos_cost = 0.5

        # Character component: dissimilarity of the surface forms.
        char_cost = 1 - Levenshtein.ratio(o.text, c.text)

        return lemma_cost + pos_cost + char_cost

    # Get the cheapest alignment sequence and indices from the op matrix
    # align_seq = [(op, o_start, o_end, c_start, c_end), ...] 
Example #2
Source File: filters.py    From KTSpeechCrawler with MIT License 6 votes vote down vote up
def __call__(self, input):
        """Discard a sample's subtitles when ASR transcripts disagree with them.

        A random subset of ``input["subtitles"]`` (at most
        ``self.num_samples_to_test`` items) is transcribed with
        ``_get_transcript_google_web_asr``. The mean ``ratio`` between each
        subtitle's lower-cased ``"phrase"`` and its lower-cased transcript must
        exceed ``self.mean_wer_threshold``; otherwise, or when no transcript
        could be obtained at all, ``input["subtitles"]`` is replaced by an
        empty list.

        Returns the same ``input`` dict, modified in place.
        """
        subtitles = input["subtitles"]
        # random.sample raises ValueError when asked for more items than the
        # population holds, so never request more than are available.
        sample_size = min(self.num_samples_to_test, len(subtitles))
        subset = random.sample(subtitles, sample_size)

        # Pair every sampled subtitle with its transcript and drop the pairs
        # for which no transcript came back.
        transcripts = [(sub, _get_transcript_google_web_asr(sub)) for sub in subset]
        transcripts = [(sub, text) for (sub, text) in transcripts if text is not None]
        if not transcripts:
            # filter removes all the subtitles, as potentially unreliable sample
            subtitles = []
        else:
            overlap_ratio = [ratio(sub["phrase"].lower(), text.lower()) for (sub, text) in transcripts]
            passed_threshold = sum(overlap_ratio) / len(overlap_ratio) > self.mean_wer_threshold
            if not passed_threshold:
                # removing all subtitles, as potentially unreliable
                subtitles = []
        input["subtitles"] = subtitles
        return input
Example #3
Source File: status_notifications.py    From intake with MIT License 6 votes vote down vote up
def get_message_change_ratio(status_update):
    """Measure how heavily a status update's notification message was edited.

    The author's intro text is stripped from both the notification's base
    message and its sent message; the result is one minus the Levenshtein
    ratio of what remains (0.0 unchanged, 1.0 completely changed).
    Returns None when the status update has no ``notification`` attribute.
    https://github.com/ztane/python-Levenshtein
    """
    if not hasattr(status_update, 'notification'):
        return None

    intro_text = get_notification_intro(status_update.author.profile) + '\n\n'
    notification = status_update.notification
    base_body = notification.base_message.replace(intro_text, '')
    sent_body = notification.sent_message.replace(intro_text, '')
    return 1.0 - Levenshtein.ratio(base_body, sent_body)
Example #4
Source File: generate_accuracy_report.py    From namsel with MIT License 6 votes vote down vote up
def _get_compare_data(tif_txt_pair):
    """Run OCR on a page image and compare the output with its reference text.

    Args:
        tif_txt_pair: two-item sequence ``(tif, txt)`` holding the path of the
            image to recognise and the path of the reference text file. Both
            paths must be equal once their last four characters are dropped.

    Returns:
        dict with the keys ``'edit_distance'``, ``'edit_ratio'``,
        ``'filename'`` (basename of the image path) and ``'html'`` (diff
        markup produced by ``_make_html_diff``).

    Raises:
        ValueError: if the two paths do not share the same stem. Previously
            this case fell through to an unbound ``data`` variable.
    """
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]
    if tif[:-4] != txt[:-4]:  # The stems should always match
        raise ValueError(
            'mismatched tif/txt pair: %r vs %r' % (tif, txt))

    # run_main is invoked with its default configuration.
    ocr = run_main(tif, text=True).strip()

    # Close the reference file deterministically instead of leaking the handle.
    with open(txt, 'r') as txt_file:
        reference = _normalize_input(txt_file.read())

    edit_dist = L.distance(reference, ocr)
    edit_ratio = L.ratio(reference, ocr)
    html = _make_html_diff(reference, ocr)
    return {'edit_distance': edit_dist,
            'edit_ratio': edit_ratio,
            'filename': os.path.basename(tif),
            'html': html
            }
Example #5
Source File: string_utils.py    From ph0neutria with Apache License 2.0 6 votes vote down vote up
def similar_string_fast(first_string, second_string):
    """Decide whether two strings are similar, using two fuzz scorers.

    The strings count as similar when the larger of ``fuzz.ratio`` and
    ``fuzz.token_set_ratio`` reaches ``SCORE_THRESHOLD_FAST``.

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: bool) match result.
    """
    best_score = max(
        fuzz.ratio(first_string, second_string),
        fuzz.token_set_ratio(first_string, second_string),
    )
    return best_score >= SCORE_THRESHOLD_FAST
Example #6
Source File: scorer.py    From nmt-chatbot with GNU General Public License v3.0 5 votes vote down vote up
def answer_subsentence_similarity_by_ratio(index, question, answer):
    """Score an answer by how similar its sub-sentences are to each other.

    Returns 0 when the check is disabled (modifier value is None), the answer
    is shorter than the configured sentence length, ``valid_emoticon`` is
    truthy, or the highest pairwise Levenshtein ratio between sub-sentences
    is below the configured threshold. Otherwise returns the configured
    modifier value ('value' mode) or that value scaled by how far the ratio
    exceeds the threshold ('multiplier' mode); any other mode yields 0.
    """
    global valid_emoticon

    modifier_value = score_settings['answer_subsentence_similarity_modifier_value']

    # Disabled or short or char emoticon
    if modifier_value is None or len(answer) < score_settings['answer_subsentence_similarity_sentence_len'] or valid_emoticon:
        return 0

    # Split response into non-empty subsentences
    parts = list(filter(None, re.split(score_settings['subsentence_dividers'], answer)))

    # Highest similarity over every unordered pair of subsentences
    max_ratio = 0
    for position, left in enumerate(parts, start=1):
        for right in parts[position:]:
            max_ratio = max(max_ratio, Levenshtein.ratio(left, right))

    # Not similar
    threshold = score_settings['answer_subsentence_similarity_threshold']
    if max_ratio < threshold:
        return 0

    modifier = score_settings['answer_subsentence_similarity_modifier']

    # Apply value
    if modifier == 'value':
        return modifier_value

    # Apply multiplier
    if modifier == 'multiplier':
        return (max_ratio - threshold) / (1 - threshold) * modifier_value

    return 0
Example #7
Source File: predicate_alignment.py    From MultiKE with MIT License 5 votes vote down vote up
def init_predicate_alignment(predicate_local_name_dict_1, predicate_local_name_dict_2, predicate_init_sim):
    """Pair up predicates of two dictionaries by mutual best local-name match.

    Each argument dictionary maps a predicate to its local name. A predicate
    ``p1`` from the first dictionary is paired with ``p2`` from the second
    when ``p2`` is the best Levenshtein-ratio match of ``p1`` and ``p1`` is
    the best match of ``p2``.

    Args:
        predicate_local_name_dict_1: predicate -> local name (first side).
        predicate_local_name_dict_2: predicate -> local name (second side).
        predicate_init_sim: similarity a mutual pair must exceed to be put
            into the returned set.

    Returns:
        A tuple ``(match_pairs, latent_pairs)`` where ``match_pairs`` is a set
        of ``(p1, p2, similarity)`` for mutual pairs above
        ``predicate_init_sim`` and ``latent_pairs`` maps every mutual pair
        ``(p1, p2)`` to its similarity.
    """
    def get_predicate_match_dict(p_ln_dict_1, p_ln_dict_2):
        """Map each predicate of the first dict to its best match in the second.

        A predicate with no candidate scoring above 0 is mapped to ``''``
        with similarity 0.
        """
        predicate_match_dict, sim_dict = {}, {}
        for p1, ln1 in p_ln_dict_1.items():
            match_p2 = ''
            max_sim = 0
            for p2, ln2 in p_ln_dict_2.items():
                sim_p2 = Levenshtein.ratio(ln1, ln2)
                if sim_p2 > max_sim:
                    match_p2, max_sim = p2, sim_p2
            predicate_match_dict[p1] = match_p2
            sim_dict[p1] = max_sim
        return predicate_match_dict, sim_dict

    match_dict_1_2, sim_dict_1 = get_predicate_match_dict(predicate_local_name_dict_1, predicate_local_name_dict_2)
    match_dict_2_1, _ = get_predicate_match_dict(predicate_local_name_dict_2, predicate_local_name_dict_1)

    predicate_match_pairs_set = set()
    predicate_latent_match_pairs_similarity_dict = {}
    for p1, p2 in match_dict_1_2.items():
        # p2 is the '' placeholder when p1 found no match at all; such a p2 is
        # not a key of the reverse mapping, so it must be skipped rather than
        # indexed (which used to raise KeyError).
        if p2 not in match_dict_2_1 or match_dict_2_1[p2] != p1:
            continue
        similarity = sim_dict_1[p1]
        predicate_latent_match_pairs_similarity_dict[(p1, p2)] = similarity
        if similarity > predicate_init_sim:
            predicate_match_pairs_set.add((p1, p2, similarity))
    return predicate_match_pairs_set, predicate_latent_match_pairs_similarity_dict
Example #8
Source File: dist_utils.py    From tensorflow-DSMM with MIT License 5 votes vote down vote up
def _count_stats(s1, s2):
    """Build a float32 feature vector of length and overlap statistics.

    The 16 features are, in order: both lengths and their relative
    difference; both unique-element counts and their relative difference;
    the unique/total ratio of each input; intersection size, union size,
    Jaccard coefficient and dice coefficient of the element sets; the common
    count from ``_common_num`` and that count divided by the mean, the
    smaller and the larger of the two lengths. Every division goes through
    ``np_utils._try_divide``.
    """
    divide = np_utils._try_divide

    # Raw lengths, difference measured against the mean length.
    l1, l2 = len(s1), len(s2)
    len_diff = divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # Distinct elements of each input.
    s1_set, s2_set = set(s1), set(s2)
    l1_unique, l2_unique = len(s1_set), len(s2_set)
    len_diff_unique = divide(np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # Share of distinct elements within each input.
    r1_unique = divide(l1_unique, l1)
    r2_unique = divide(l2_unique, l2)

    # Set overlap measures.
    li = len(s1_set & s2_set)
    lu = len(s1_set | s2_set)
    jaccard_coef = divide(li, lu)
    # NOTE(review): computed without the conventional factor 2 in the
    # numerator; kept exactly as the features were originally defined.
    dice_coef = divide(li, l1_unique + l2_unique)

    # Common-element count, normalised three ways ("max" divides by the
    # shorter length, "min" by the longer one).
    common_ = _common_num(s1, s2)
    common_ratio_avg = divide(common_, (l1 + l2) / 2.)
    common_ratio_max = divide(common_, min(l1, l2))
    common_ratio_min = divide(common_, max(l1, l2))

    features = [
        l1, l2, len_diff,
        l1_unique, l2_unique, len_diff_unique,
        r1_unique, r2_unique,
        li, lu, jaccard_coef, dice_coef,
        common_, common_ratio_avg, common_ratio_max, common_ratio_min,
    ]
    return np.array(features, dtype=np.float32)
Example #9
Source File: dist_utils.py    From tensorflow-DSMM with MIT License 5 votes vote down vote up
def _edit_dist(str1, str2):
    """Return a normalised edit distance between ``str1`` and ``str2``.

    The primary path is the Levenshtein distance divided by the length of the
    longer input. If that fails for any ordinary reason (for example a
    ZeroDivisionError when both inputs are empty), the function falls back to
    one minus difflib's ``SequenceMatcher`` ratio, treating spaces as junk.
    Identical inputs yield 0.
    """
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
        # that KeyboardInterrupt and SystemExit still propagate.
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #10
Source File: twitter_markov.py    From twitter_markov with GNU General Public License v3.0 5 votes vote down vote up
def check_tweet(self, text):
        '''Return True if the text may be tweeted, False if it is rejected.

        The text is stripped and lower-cased, then rejected when it is empty,
        blacklisted by self.wordfilter, longer than 280 according to
        tbu.helpers.length, contained in a recent tweet, or when its
        Levenshtein ratio to a recent tweet (non-word characters removed)
        reaches LEVENSHTEIN_LIMIT. Every rejection is logged.
        '''
        candidate = text.strip().lower()

        if not candidate:
            self.log.info("Rejected (empty)")
            return False

        if self.wordfilter.blacklisted(candidate):
            self.log.info("Rejected (blacklisted)")
            return False

        if tbu.helpers.length(candidate) > 280:
            self.log.info("Rejected (too long)")
            return False

        # The candidate's normalised form is the same for every comparison.
        candidate_chars = re.sub(r'\W+', '', candidate)

        for line in self.recently_tweeted:
            if candidate in line.strip().lower():
                self.log.info("Rejected (Identical)")
                return False

            line_chars = re.sub(r'\W+', '', line.lower())
            if Levenshtein.ratio(candidate_chars, line_chars) >= LEVENSHTEIN_LIMIT:
                self.log.info("Rejected (Levenshtein.ratio)")
                return False

        return True
Example #11
Source File: predicates_computer.py    From lang2program with Apache License 2.0 5 votes vote down vote up
def similarity_ratio(x, y, threshold=FuzzyMatchGenerator.SIMILARITY_THRESHOLD):
    """Return the similarity ratio of two strings, or 0 if it is too low.

    The ratio is returned only when it is strictly greater than the
    threshold; otherwise the result is 0.

    The similarity ratio is given by
        1 - (levenshtein distance with substitution cost = 2) / (total length)
    """
    score = Levenshtein.ratio(x, y)
    if score > threshold:
        return score
    return 0.


################################
# NERValueGenerator 
Example #12
Source File: distance_text_or_vec.py    From nlp_xiaojiang with MIT License 5 votes vote down vote up
def ratio_levenshtein(str1, str2):
    """Return ``Leven.ratio`` of the two strings (a similarity score)."""
    similarity = Leven.ratio(str1, str2)
    return similarity
Example #13
Source File: submissions.py    From intake with MIT License 5 votes vote down vote up
def get_name_similarity_ratio(a, b):
    """Return the Levenshtein ratio of the full lowercase names of ``a`` and ``b``.

    Both names are obtained through ``get_full_lowercase_name``.
    """
    name_a = get_full_lowercase_name(a)
    name_b = get_full_lowercase_name(b)
    return Levenshtein.ratio(name_a, name_b)
Example #14
Source File: dist_utils.py    From BERT with Apache License 2.0 5 votes vote down vote up
def _edit_dist(str1, str2):
    """Return a normalised edit distance between ``str1`` and ``str2``.

    The primary path is the Levenshtein distance divided by the length of the
    longer input. If that fails for any ordinary reason (for example a
    ZeroDivisionError when both inputs are empty), the function falls back to
    one minus difflib's ``SequenceMatcher`` ratio, treating spaces as junk.
    Identical inputs yield 0.
    """
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
        # that KeyboardInterrupt and SystemExit still propagate.
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #15
Source File: generate_accuracy_report.py    From namsel with MIT License 5 votes vote down vote up
def do_pairwise_comparison(origflpath, ocrflpath):
    """Return ``L.ratio`` between a reference text file and an OCR output file.

    Args:
        origflpath: path of the reference text; its content is used as read.
        ocrflpath: path of the OCR output; its content is passed through
            ``_normalize_input`` before the comparison.

    Returns:
        The value of ``L.ratio(reference, normalised_ocr)``.
    """
    # Context managers close both files even if reading or normalising fails.
    with open(origflpath, 'r') as orig_file:
        o = orig_file.read()
    with open(ocrflpath, 'r') as ocr_file:
        s = _normalize_input(ocr_file.read())

    return L.ratio(o, s)
    
    
#data = {'csrfmiddlewaretoken':s.cookies['csrftoken'], 
#        'edit_distance': edit_dist, 
#        'filename': os.path.basename(tif), 
#        'sample_set': t, 'html': html, 'timestamp': timestamp,
#        'comment': comment
#    } 
Example #16
Source File: dist_utils.py    From BERT with Apache License 2.0 5 votes vote down vote up
def _count_stats(s1, s2):
    """Build a float32 feature vector of length and overlap statistics.

    The 16 features are, in order: both lengths and their relative
    difference; both unique-element counts and their relative difference;
    the unique/total ratio of each input; intersection size, union size,
    Jaccard coefficient and dice coefficient of the element sets; the common
    count from ``_common_num`` and that count divided by the mean, the
    smaller and the larger of the two lengths. Every division goes through
    ``np_utils._try_divide``.
    """
    divide = np_utils._try_divide

    # Raw lengths, difference measured against the mean length.
    l1, l2 = len(s1), len(s2)
    len_diff = divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # Distinct elements of each input.
    s1_set, s2_set = set(s1), set(s2)
    l1_unique, l2_unique = len(s1_set), len(s2_set)
    len_diff_unique = divide(np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # Share of distinct elements within each input.
    r1_unique = divide(l1_unique, l1)
    r2_unique = divide(l2_unique, l2)

    # Set overlap measures.
    li = len(s1_set & s2_set)
    lu = len(s1_set | s2_set)
    jaccard_coef = divide(li, lu)
    # NOTE(review): computed without the conventional factor 2 in the
    # numerator; kept exactly as the features were originally defined.
    dice_coef = divide(li, l1_unique + l2_unique)

    # Common-element count, normalised three ways ("max" divides by the
    # shorter length, "min" by the longer one).
    common_ = _common_num(s1, s2)
    common_ratio_avg = divide(common_, (l1 + l2) / 2.)
    common_ratio_max = divide(common_, min(l1, l2))
    common_ratio_min = divide(common_, max(l1, l2))

    features = [
        l1, l2, len_diff,
        l1_unique, l2_unique, len_diff_unique,
        r1_unique, r2_unique,
        li, lu, jaccard_coef, dice_coef,
        common_, common_ratio_avg, common_ratio_max, common_ratio_min,
    ]
    return np.array(features, dtype=np.float32)
Example #17
Source File: dist_utils.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def _edit_dist(str1, str2):
    """Return a normalised edit distance between ``str1`` and ``str2``.

    The primary path is the Levenshtein distance divided by the length of the
    longer input. If that fails for any ordinary reason (for example a
    ZeroDivisionError when both inputs are empty), the function falls back to
    one minus difflib's ``SequenceMatcher`` ratio, treating spaces as junk.
    Identical inputs yield 0.
    """
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except Exception:
        # Best-effort fallback. ``Exception`` rather than a bare ``except`` so
        # that KeyboardInterrupt and SystemExit still propagate.
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #18
Source File: scorer.py    From nmt-chatbot with GNU General Public License v3.0 5 votes vote down vote up
def question_answer_similarity_by_ratio(index, question, answer):
    """Score an answer by how closely it (or a part of it) repeats the question.

    Returns 0 when the check is disabled (modifier value is None), the answer
    is shorter than the configured sentence length, ``valid_emoticon`` is
    truthy, or the highest Levenshtein ratio between the question and any
    sub-sentence (or the whole answer) is below the configured threshold.
    Otherwise returns the configured modifier value ('value' mode) or that
    value scaled by how far the ratio exceeds the threshold ('multiplier'
    mode); any other mode yields 0.
    """
    global valid_emoticon

    modifier_value = score_settings['question_answer_similarity_modifier_value']

    # Disabled or short or char emoticon
    if modifier_value is None or len(answer) < score_settings['question_answer_similarity_sentence_len'] or valid_emoticon:
        return 0

    # Divide response into non-empty subsentences, keeping the full answer too
    candidates = list(filter(None, re.split(score_settings['subsentence_dividers'], answer)))
    candidates.append(answer)

    # Calculate similarity for every candidate, keep the maximum one
    ratio = max(Levenshtein.ratio(question, candidate) for candidate in candidates)

    # Not similar
    threshold = score_settings['question_answer_similarity_threshold']
    if ratio < threshold:
        return 0

    modifier = score_settings['question_answer_similarity_modifier']

    # Apply value
    if modifier == 'value':
        return modifier_value

    # Apply multiplier
    if modifier == 'multiplier':
        return (ratio - threshold) / (1 - threshold) * modifier_value

    return 0
Example #19
Source File: merger.py    From errant with MIT License 5 votes vote down vote up
def char_cost(a, b):
    """Return the Levenshtein ratio of ``a.text`` and ``b.text``.

    Note that, despite the name, the value returned is the ratio itself.
    """
    similarity = Levenshtein.ratio(a.text, b.text)
    return similarity
    
# Merge the input alignment sequence to a single edit span 
Example #20
Source File: test_string_distances.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_compare_implementations():
    """Check the pure-Python string distances against python-Levenshtein.

    Skipped when ``Levenshtein`` is False. Jaro, Jaro-Winkler and the
    Levenshtein ratio are compared on 50 pairs from each pair generator.
    """
    if Levenshtein is False:
        raise unittest.SkipTest

    def check_pair(string1, string2):
        # Each pure-Python result must equal the python-Levenshtein one.
        assert (string_distances._jaro_winkler(string1, string2,
                                               winkler=False)
                == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(string1, string2,
                                               winkler=True)
                == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(string1, string2)
                == Levenshtein.ratio(string1, string2))

    # First strings with randomly placed common char, then random strings.
    for make_pairs in (_random_common_char_pairs, _random_string_pairs):
        for string1, string2 in make_pairs(n_pairs=50):
            check_pair(string1, string2)
Example #21
Source File: string_utils.py    From ph0neutria with Apache License 2.0 5 votes vote down vote up
def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).

    A Levenshtein-based score is computed first: when the first string is
    shorter than the second, it is compared against every same-length window
    of the second and the best window wins; otherwise the two strings are
    compared directly. That score is then combined with four ``fuzz`` scores
    by taking the maximum. A maximum below 75 is replaced by 0, and the
    result is multiplied by 0.85.

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: float) score.
    """
    def blended(left, right, ratio_pct):
        # Mean of a distance-based penalty score and the ratio counted twice.
        return statistics.mean(
            [100 - Levenshtein.distance(left, right) * 15, ratio_pct, ratio_pct])

    score = 0

    if len(first_string) < len(second_string):
        shorter, longer = first_string, second_string
        window_length = len(shorter)

        for position in range(len(longer) - window_length + 1):
            window = longer[position:position + window_length]
            l_ratio = Levenshtein.ratio(window, shorter) * 100

            # Only sufficiently similar windows get the blended score.
            result = blended(window, shorter, l_ratio) if l_ratio > 60 else l_ratio

            if result > score:
                score = result

    else:
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        score = blended(first_string, second_string, l_ratio)

    candidates = [
        score,
        fuzz.ratio(first_string, second_string),
        fuzz.partial_ratio(first_string, second_string),
        fuzz.token_sort_ratio(first_string, second_string),
        fuzz.token_set_ratio(first_string, second_string),
    ]
    score = max(candidates)

    if score < 75:
        score = 0

    return score * 0.85
Example #22
Source File: food_string_matching.py    From extract_recipe with Apache License 2.0 5 votes vote down vote up
def worker(num,total,foodStrings):
  """Process worker: match food strings against one slice of foodList.

  Only the keys of foodList whose enumeration index i satisfies
  i % total == num are examined. For every such key whose Levenshtein ratio
  to a food string exceeds 0.5, the tuple
  (key, foodList[key], token_set_ratio, levenshtein_ratio) is collected.
  The collected list is pickled to the file '<num>.p' in the current
  directory. Returns None.
  """
  stringMatches = []
  for foodString in foodStrings:
    for (i,key) in enumerate(foodList):
      if i%total!=num:
        continue
      leven2 = Levenshtein.ratio(foodString,key)
      if leven2>0.5:
        # The token-set score is only needed for keys that are kept.
        leven1 = fuzz.token_set_ratio(key,foodString)
        stringMatches.append((key,foodList[key],leven1,leven2))
  # Close the output file deterministically so the parent can read it back.
  with open(str(num)+'.p','wb') as resultFile:
    pickle.dump(stringMatches,resultFile)
  return
Example #23
Source File: initsimfeat.py    From holoclean with Apache License 2.0 5 votes vote down vote up
def gen_feat_tensor(input, classes, total_attrs):
    """Build a similarity feature tensor of shape (1, classes, total_attrs).

    ``input`` is indexed as (vid, attribute index, initial value, domain
    string), where the domain string holds candidate values separated by
    '|||'. For the candidate at position ``idx`` the entry
    ``[0][idx][attr_idx]`` is -1.0 when the candidate equals the initial
    value, and ``2 * Levenshtein.ratio(candidate, initial) - 1`` otherwise.
    All other entries stay zero.
    """
    vid = int(input[0])
    attr_idx, init_value = input[1], input[2]
    # TODO: To add more similarity metrics increase the last dimension of tensor.
    tensor = torch.zeros(1, classes, total_attrs)
    for idx, val in enumerate(input[3].split('|||')):
        if val != init_value:
            sim = (2 * Levenshtein.ratio(val, init_value)) - 1
        else:
            sim = -1.0
        tensor[0][idx][attr_idx] = sim
    return tensor
Example #24
Source File: trivia.py    From pajbot with MIT License 5 votes vote down vote up
def on_message(self, source, message, whisper, **rest):
        """Check a chat message against the current trivia question.

        Empty messages and whispers are ignored, as is any message while no
        question is set. Answers of five characters or fewer must match
        exactly (case-insensitively); longer answers are accepted when their
        Levenshtein ratio to the right answer is at least 0.94. On a correct
        answer the bot announces it, awards self.point_bounty points when
        that is positive, and resets the question state.
        """
        if not message or whisper:
            return

        if not self.question:
            return

        right_answer = self.question["answer"].lower()
        user_answer = message.lower()
        if len(right_answer) > 5:
            # Longer answers tolerate small typos.
            correct = Levenshtein.ratio(right_answer, user_answer) >= 0.94
        else:
            correct = right_answer == user_answer

        if not correct:
            return

        if self.point_bounty > 0:
            self.bot.safe_me(
                f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan They get {self.point_bounty} points! PogChamp"
            )
            source.points += self.point_bounty
        else:
            self.bot.safe_me(
                f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan"
            )

        self.question = None
        self.step = 0
        self.last_question = utils.now()
Example #25
Source File: delex.py    From chimera with MIT License 5 votes vote down vote up
def lev_ratio(s1, s2):
    """Return ``ratio(s1, s2)`` for the two given strings."""
    similarity = ratio(s1, s2)
    return similarity
Example #26
Source File: main.py    From oabot with MIT License 4 votes vote down vote up
def get_dissemin_paper(reference):
    """
    Given a citation template (as parsed by wikiciteparser and a proposed link)
    get dissemin API information for that link

    The query is attempted up to five times, sleeping five seconds after a
    request or JSON decoding error. The Dissemin paper object is returned
    only when its year equals the first four characters of the reference
    date and the last name of its first author is similar (ratio > 0.75) to
    the last name of the reference's first author. In every other case an
    empty dict is returned.

    Note: unparsed author entries inside ``reference['Authors']`` are
    rewritten in place to the ``{'plain': ...}`` form.
    """
    doi = reference.get('ID_list', {}).get('DOI')
    title = reference.get('Title', '')
    authors = reference.get('Authors', [])
    date = reference.get('Date', '')

    # CS1 represents unparsed authors as {'last':'First Last'}
    for i, author in enumerate(authors):
        if 'first' not in author:
            authors[i] = {'plain': author.get('last', '')}

    args = {
        'title': title,
        'authors': authors,
        'date': date,
        'doi': doi,
    }

    for _ in range(5):
        try:
            req = requests.post('https://dissem.in/api/query/',
                                json=args,
                                headers={'User-Agent': OABOT_USER_AGENT},
                                timeout=10)

            resp = req.json()
            paper_object = resp.get('paper', {})
            if not paper_object:
                return {}

            paper_year = paper_object.get("date", "")[:4]
            # A record without an author list is treated like an empty list,
            # so it reaches the IndexError branch below instead of failing
            # with an uncaught TypeError on None.
            paper_authors = paper_object.get("authors") or []
            paper_authorlast = paper_authors[0].get("name", {}).get("last", "")
            if date[:4] == paper_year and ratio(authors[0].get("last", ""), paper_authorlast) > 0.75:
                return paper_object
            # Fails a basic author/date check, ignore Dissemin record
            return {}
        except (ValueError, requests.exceptions.RequestException):
            # Bad JSON or a transport error: wait, then retry.
            sleep(5)
        except IndexError:
            # The author names are not what expected, give up on a record match
            # TODO: could probably try harder
            return {}
    return {}
Example #27
Source File: food_string_matching.py    From extract_recipe with Apache License 2.0 4 votes vote down vote up
def getStringMatches(foodString):
  """Find foodList entries matching a food description, best matches first.

  The description is printed, commas are replaced by spaces and the text is
  lower-cased. Besides the full description, every 2-word combination (when
  there are more than 2 words) and every 3-word combination (when there are
  more than 3 words) is used as a search string. NUM_PROCESSORS worker
  processes each write their matches to '<i>.p'; the files are read back,
  merged and deleted.

  Returns the merged match tuples sorted in descending order by their
  items at index 2 and 3.
  """
  print(foodString)
  foodString = foodString.replace(',',' ').lower()
  foodStrings = [foodString]
  foodWords = foodString.split()
  if len(foodWords)>2:
    foodStrings.extend(' '.join(words) for words in combinations(foodWords,2))
  if len(foodWords)>3:
    foodStrings.extend(' '.join(words) for words in combinations(foodWords,3))

  # Fan the search out over the worker processes and wait for all of them.
  totalProcesses = NUM_PROCESSORS
  processes = [Process(target=worker, args=(i,totalProcesses,foodStrings,))
               for i in range(totalProcesses)]
  for t in processes:
    t.start()
  for t in processes:
    t.join()

  # Collect each worker's result file, then remove it.
  stringMatches = []
  for i in range(totalProcesses):
    resultPath = str(i)+'.p'
    # NOTE: pickle is only safe here because these files are written by
    # this program's own workers; never load untrusted pickles.
    with open(resultPath,'rb') as resultFile:
      stringMatches.extend(pickle.load(resultFile))
    # os.remove avoids spawning a shell and also works where 'rm' is absent.
    os.remove(resultPath)

  return sorted(stringMatches, key=operator.itemgetter(2, 3), reverse=True)