Python Levenshtein.ratio() Examples
The following are 27 code examples of Levenshtein.ratio(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Levenshtein, or try the search function.
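Before the examples, a minimal sketch of what Levenshtein.ratio() computes may help (illustrative only; it assumes the python-Levenshtein package is installed and the strings are arbitrary). The function returns a normalized similarity in [0, 1], where 1.0 means the strings are identical:

import Levenshtein

# Identical strings score 1.0.
print(Levenshtein.ratio("kitten", "kitten"))   # 1.0
# The classic kitten/sitting pair scores about 0.615.
print(Levenshtein.ratio("kitten", "sitting"))  # 0.6153846153846154
# Strings with no characters in common score 0.0.
print(Levenshtein.ratio("abc", "xyz"))         # 0.0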
Example #1
Source File: alignment.py From errant with MIT License | 6 votes |
def get_sub_cost(self, o, c):
    # Short circuit if the only difference is case
    if o.lower == c.lower:
        return 0
    # Lemma cost
    if o.lemma == c.lemma:
        lemma_cost = 0
    else:
        lemma_cost = 0.499
    # POS cost
    if o.pos == c.pos:
        pos_cost = 0
    elif o.pos in self._open_pos and c.pos in self._open_pos:
        pos_cost = 0.25
    else:
        pos_cost = 0.5
    # Char cost
    char_cost = 1 - Levenshtein.ratio(o.text, c.text)
    # Combine the costs
    return lemma_cost + pos_cost + char_cost

# Get the cheapest alignment sequence and indices from the op matrix
# align_seq = [(op, o_start, o_end, c_start, c_end), ...]
Example #2
Source File: filters.py From KTSpeechCrawler with MIT License | 6 votes |
def __call__(self, input):
    subtitles = input["subtitles"]
    subset = random.sample(subtitles, self.num_samples_to_test)
    transcripts = [(s, _get_transcript_google_web_asr(s)) for s in subset]
    transcripts = [(t, s) for (t, s) in transcripts if s is not None]
    if len(transcripts) == 0:
        # filter removes all the subtitles, as potentially unreliable sample
        subtitles = []
    else:
        overlap_ratio = [ratio(t["phrase"].lower(), s.lower()) for (t, s) in transcripts]
        passed_threshold = sum(overlap_ratio) / len(overlap_ratio) > self.mean_wer_threshold
        if not passed_threshold:
            # removing all subtitles, as potentially unreliable
            subtitles = []
    input["subtitles"] = subtitles
    return input
Example #3
Source File: status_notifications.py From intake with MIT License | 6 votes |
def get_message_change_ratio(status_update):
    """Expects a status update instance, returns a number representing
    how much a message has been edited (1.0 completely changed,
    0.0 unchanged) based on Levenshtein ratio.
    If a status update has no associated notification, returns None
        https://github.com/ztane/python-Levenshtein
    """
    if hasattr(status_update, 'notification'):
        author_profile = status_update.author.profile
        intro_text = get_notification_intro(author_profile) + '\n\n'
        return 1.0 - Levenshtein.ratio(
            *[message.replace(intro_text, '')
              for message in (
                  status_update.notification.base_message,
                  status_update.notification.sent_message)])
    else:
        return None
Example #4
Source File: generate_accuracy_report.py From namsel with MIT License | 6 votes |
def _get_compare_data(tif_txt_pair):
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]
    if tif[:-4] == txt[:-4]:  # This should always be true
#        ocr = run_main(tif, conf=Config(path='/home/zr/letters/conf/443cf9ec-76c7-44bc-95ad-593138d2d5fc.conf'), text=True)
#        ocr = run_main(tif, conf=Config(segmenter='stochastic', recognizer='hmm', break_width=3.6), text=True)
        ocr = run_main(tif, text=True)
#        ocr = run_all_confs_for_page(tif, text = True)
        ocr = ocr.strip()
        txt = open(txt, 'r').read()
        txt = _normalize_input(txt)
        edit_dist = L.distance(txt, ocr)
        edit_ratio = L.ratio(txt, ocr)
        html = _make_html_diff(txt, ocr)
#        sys.exit()
        data = {'edit_distance': edit_dist,
                'edit_ratio': edit_ratio,
                'filename': os.path.basename(tif),
                'html': html
                }
    return data
Example #5
Source File: string_utils.py From ph0neutria with Apache License 2.0 | 6 votes |
def similar_string_fast(first_string, second_string):
    """Determine if two strings are similar (using two most effective methods).

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: bool) match result.
    """
    partial_score = fuzz.ratio(first_string, second_string)
    token_score = fuzz.token_set_ratio(first_string, second_string)

    if max(partial_score, token_score) >= SCORE_THRESHOLD_FAST:
        return True

    return False
Example #6
Source File: scorer.py From nmt-chatbot with GNU General Public License v3.0 | 5 votes |
def answer_subsentence_similarity_by_ratio(index, question, answer):
    global valid_emoticon

    # Disabled or short or char emoticon
    if score_settings['answer_subsentence_similarity_modifier_value'] is None \
            or len(answer) < score_settings['answer_subsentence_similarity_sentence_len'] \
            or valid_emoticon:
        return 0

    # Split response into subsentences
    answer = list(filter(None, re.split(score_settings['subsentence_dividers'], answer)))

    # Find max similarity
    max_ratio = 0
    for num, subsentence in enumerate(answer):
        for subsentence2 in answer[num+1:]:
            max_ratio = max(max_ratio, Levenshtein.ratio(subsentence, subsentence2))

    # Not similar
    if max_ratio < score_settings['answer_subsentence_similarity_threshold']:
        return 0

    # Apply value
    if score_settings['answer_subsentence_similarity_modifier'] == 'value':
        return score_settings['answer_subsentence_similarity_modifier_value']

    # Apply multiplier
    if score_settings['answer_subsentence_similarity_modifier'] == 'multiplier':
        return (max_ratio - score_settings['answer_subsentence_similarity_threshold']) \
               / (1 - score_settings['answer_subsentence_similarity_threshold']) \
               * score_settings['answer_subsentence_similarity_modifier_value']

    return 0
Example #7
Source File: predicate_alignment.py From MultiKE with MIT License | 5 votes |
def init_predicate_alignment(predicate_local_name_dict_1, predicate_local_name_dict_2, predicate_init_sim):
    def get_predicate_match_dict(p_ln_dict_1, p_ln_dict_2):
        predicate_match_dict, sim_dict = {}, {}
        for p1, ln1 in p_ln_dict_1.items():
            match_p2 = ''
            max_sim = 0
            for p2, ln2 in p_ln_dict_2.items():
                sim_p2 = Levenshtein.ratio(ln1, ln2)
                if sim_p2 > max_sim:
                    match_p2 = p2
                    max_sim = sim_p2
            predicate_match_dict[p1] = match_p2
            sim_dict[p1] = max_sim
        return predicate_match_dict, sim_dict

    match_dict_1_2, sim_dict_1 = get_predicate_match_dict(predicate_local_name_dict_1, predicate_local_name_dict_2)
    match_dict_2_1, sim_dict_2 = get_predicate_match_dict(predicate_local_name_dict_2, predicate_local_name_dict_1)

    predicate_match_pairs_set = set()
    predicate_latent_match_pairs_similarity_dict = {}
    for p1, p2 in match_dict_1_2.items():
        if match_dict_2_1[p2] == p1:
            predicate_latent_match_pairs_similarity_dict[(p1, p2)] = sim_dict_1[p1]
            if sim_dict_1[p1] > predicate_init_sim:
                predicate_match_pairs_set.add((p1, p2, sim_dict_1[p1]))
                # print(p1, p2, sim_dict_1[p1], sim_dict_2[p2])
    return predicate_match_pairs_set, predicate_latent_match_pairs_similarity_dict
Example #8
Source File: dist_utils.py From tensorflow-DSMM with MIT License | 5 votes |
def _count_stats(s1, s2):
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # set
    s1_set = set(s1)
    s2_set = set(s2)

    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)

    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)

    # dice coef
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)

    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))

    # over all features
    f = [l1, l2, len_diff,
         l1_unique, l2_unique, len_diff_unique,
         r1_unique, r2_unique,
         li, lu, jaccard_coef, dice_coef,
         common_, common_ratio_avg, common_ratio_max, common_ratio_min]

    return np.array(f, dtype=np.float32)
Example #9
Source File: dist_utils.py From tensorflow-DSMM with MIT License | 5 votes |
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #10
Source File: twitter_markov.py From twitter_markov with GNU General Public License v3.0 | 5 votes |
def check_tweet(self, text):
    '''Check if a string contains blacklisted words or is similar to a recent tweet.'''
    text = text.strip().lower()

    if not text:
        self.log.info("Rejected (empty)")
        return False

    if self.wordfilter.blacklisted(text):
        self.log.info("Rejected (blacklisted)")
        return False

    if tbu.helpers.length(text) > 280:
        self.log.info("Rejected (too long)")
        return False

    for line in self.recently_tweeted:
        if text in line.strip().lower():
            self.log.info("Rejected (Identical)")
            return False

        if Levenshtein.ratio(re.sub(r'\W+', '', text), re.sub(r'\W+', '', line.lower())) >= LEVENSHTEIN_LIMIT:
            self.log.info("Rejected (Levenshtein.ratio)")
            return False

    return True
Example #11
Source File: predicates_computer.py From lang2program with Apache License 2.0 | 5 votes |
def similarity_ratio(x, y, threshold=FuzzyMatchGenerator.SIMILARITY_THRESHOLD):
    """Compute the similarity ratio between two strings.
    If the ratio exceeds the threshold, return it; otherwise, return 0.

    The similarity ratio is given by
        1 - (levenshtein distance with substitution cost = 2) / (total length)
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio if ratio > threshold else 0.


################################
# NERValueGenerator
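As a quick check of the formula in the docstring above (an illustrative sketch, not part of the lang2program project): because substitutions are weighted as 2, ratio() can disagree with the plain distance-based normalization used by the _edit_dist helpers in Examples #9, #14, and #17.

import Levenshtein

# For 'ab' vs 'bc' the cheapest alignment under ratio() is one deletion plus
# one insertion (total cost 2), so ratio = (2 + 2 - 2) / (2 + 2) = 0.5 ...
assert Levenshtein.ratio('ab', 'bc') == 0.5

# ... while the unweighted edit distance is 2, so normalizing by the longer
# length (as _edit_dist does) gives 1 - 2/2 = 0.0 for the same pair.
assert 1 - Levenshtein.distance('ab', 'bc') / max(len('ab'), len('bc')) == 0.0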
Example #12
Source File: distance_text_or_vec.py From nlp_xiaojiang with MIT License | 5 votes |
def ratio_levenshtein(str1, str2):
    return Leven.ratio(str1, str2)
Example #13
Source File: submissions.py From intake with MIT License | 5 votes |
def get_name_similarity_ratio(a, b):
    names = (get_full_lowercase_name(sub) for sub in (a, b))
    return Levenshtein.ratio(*names)
Example #14
Source File: dist_utils.py From BERT with Apache License 2.0 | 5 votes |
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #15
Source File: generate_accuracy_report.py From namsel with MIT License | 5 votes |
def do_pairwise_comparison(origflpath, ocrflpath):
    o = open(origflpath, 'r').read()
    s = open(ocrflpath, 'r').read()
    s = _normalize_input(s)
    return L.ratio(o, s)

#data = {'csrfmiddlewaretoken': s.cookies['csrftoken'],
#        'edit_distance': edit_dist,
#        'filename': os.path.basename(tif),
#        'sample_set': t, 'html': html, 'timestamp': timestamp,
#        'comment': comment
#        }
Example #16
Source File: dist_utils.py From BERT with Apache License 2.0 | 5 votes |
def _count_stats(s1, s2):
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # set
    s1_set = set(s1)
    s2_set = set(s2)

    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)

    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)

    # dice coef
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)

    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))

    # over all features
    f = [l1, l2, len_diff,
         l1_unique, l2_unique, len_diff_unique,
         r1_unique, r2_unique,
         li, lu, jaccard_coef, dice_coef,
         common_, common_ratio_avg, common_ratio_max, common_ratio_min]

    return np.array(f, dtype=np.float32)
Example #17
Source File: dist_utils.py From kaggle-HomeDepot with MIT License | 5 votes |
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #18
Source File: scorer.py From nmt-chatbot with GNU General Public License v3.0 | 5 votes |
def question_answer_similarity_by_ratio(index, question, answer):
    global valid_emoticon

    # Disabled or short or char emoticon
    if score_settings['question_answer_similarity_modifier_value'] is None \
            or len(answer) < score_settings['question_answer_similarity_sentence_len'] \
            or valid_emoticon:
        return 0

    # Divide response into subsentences
    answer = list(filter(None, re.split(score_settings['subsentence_dividers'], answer))) + [answer]

    # Calculate similarity for every subsentence, get the maximum one
    ratio = max([Levenshtein.ratio(question, s) for s in answer])

    # Not similar
    if ratio < score_settings['question_answer_similarity_threshold']:
        return 0

    # Apply value
    if score_settings['question_answer_similarity_modifier'] == 'value':
        return score_settings['question_answer_similarity_modifier_value']

    # Apply multiplier
    if score_settings['question_answer_similarity_modifier'] == 'multiplier':
        return (ratio - score_settings['question_answer_similarity_threshold']) \
               / (1 - score_settings['question_answer_similarity_threshold']) \
               * score_settings['question_answer_similarity_modifier_value']

    return 0
Example #19
Source File: merger.py From errant with MIT License | 5 votes |
def char_cost(a, b):
    return Levenshtein.ratio(a.text, b.text)

# Merge the input alignment sequence to a single edit span
Example #20
Source File: test_string_distances.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_compare_implementations():
    # Compare the implementations of python-Levenshtein to our
    # pure-Python implementations
    if Levenshtein is False:
        raise unittest.SkipTest
    # Test on strings with randomly placed common char
    for string1, string2 in _random_common_char_pairs(n_pairs=50):
        assert (string_distances._jaro_winkler(string1, string2, winkler=False)
                == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(string1, string2, winkler=True)
                == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(string1, string2)
                == Levenshtein.ratio(string1, string2))
    # Test on random strings
    for string1, string2 in _random_string_pairs(n_pairs=50):
        assert (string_distances._jaro_winkler(string1, string2, winkler=False)
                == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(string1, string2, winkler=True)
                == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(string1, string2)
                == Levenshtein.ratio(string1, string2))
Example #21
Source File: string_utils.py From ph0neutria with Apache License 2.0 | 5 votes |
def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: int) score.
    """
    score = 0

    if len(first_string) < len(second_string):
        shorter, longer = (first_string, second_string)
        window_length = len(shorter)
        num_iterations = len(longer) - len(shorter) + 1

        for position in range(0, num_iterations):
            window = longer[position:position + window_length]
            l_ratio = Levenshtein.ratio(window, shorter) * 100

            if l_ratio > 60:
                result = statistics.mean(
                    [100 - Levenshtein.distance(window, shorter) * 15, l_ratio, l_ratio])
            else:
                result = l_ratio

            if result > score:
                score = result
    else:
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        score = statistics.mean(
            [100 - Levenshtein.distance(first_string, second_string) * 15, l_ratio, l_ratio])

    simple = fuzz.ratio(first_string, second_string)
    partial = fuzz.partial_ratio(first_string, second_string)
    sort = fuzz.token_sort_ratio(first_string, second_string)
    set_ratio = fuzz.token_set_ratio(first_string, second_string)

    score = max([score, simple, partial, sort, set_ratio])

    if score < 75:
        score = 0

    return score * 0.85
Example #22
Source File: food_string_matching.py From extract_recipe with Apache License 2.0 | 5 votes |
def worker(num, total, foodStrings):
    """thread worker function"""
    stringMatches = []
    partialList = {}
    for foodString in foodStrings:
        for (i, key) in enumerate(foodList.keys()):
            if i % total == num:
                leven1 = fuzz.token_set_ratio(key, foodString)
                leven2 = Levenshtein.ratio(foodString, key)
                if leven2 > 0.5:
                    stringMatches.append((key, foodList[key], leven1, leven2))
    pickle.dump(stringMatches, open(str(num) + '.p', 'wb'))
    return
Example #23
Source File: initsimfeat.py From holoclean with Apache License 2.0 | 5 votes |
def gen_feat_tensor(input, classes, total_attrs):
    vid = int(input[0])
    attr_idx = input[1]
    init_value = input[2]
    # TODO: To add more similarity metrics increase the last dimension of tensor.
    tensor = torch.zeros(1, classes, total_attrs)
    domain = input[3].split('|||')
    for idx, val in enumerate(domain):
        if val == init_value:
            sim = -1.0
        else:
            sim = (2 * Levenshtein.ratio(val, init_value)) - 1
        tensor[0][idx][attr_idx] = sim
    return tensor
Example #24
Source File: trivia.py From pajbot with MIT License | 5 votes |
def on_message(self, source, message, whisper, **rest):
    if not message or whisper:
        return

    if self.question:
        right_answer = self.question["answer"].lower()
        user_answer = message.lower()

        if len(right_answer) <= 5:
            correct = right_answer == user_answer
        else:
            ratio = Levenshtein.ratio(right_answer, user_answer)
            correct = ratio >= 0.94

        if correct:
            if self.point_bounty > 0:
                self.bot.safe_me(
                    f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan They get {self.point_bounty} points! PogChamp"
                )
                source.points += self.point_bounty
            else:
                self.bot.safe_me(
                    f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan"
                )

            self.question = None
            self.step = 0
            self.last_question = utils.now()
Example #25
Source File: delex.py From chimera with MIT License | 5 votes |
def lev_ratio(s1, s2):
    return ratio(s1, s2)
Example #26
Source File: main.py From oabot with MIT License | 4 votes |
def get_dissemin_paper(reference):
    """
    Given a citation template (as parsed by wikiciteparser and a proposed link)
    get dissemin API information for that link
    """
    doi = reference.get('ID_list', {}).get('DOI')
    title = reference.get('Title', '')
    authors = reference.get('Authors', [])
    date = reference.get('Date', '')

    # CS1 represents unparsed authors as {'last':'First Last'}
    for i in range(len(authors)):
        if 'first' not in authors[i]:
            authors[i] = {'plain': authors[i].get('last', '')}

    args = {
        'title': title,
        'authors': authors,
        'date': date,
        'doi': doi,
    }

    for retry in range(5):
        try:
            req = requests.post('https://dissem.in/api/query/',
                                json=args,
                                headers={'User-Agent': OABOT_USER_AGENT},
                                timeout=10)
            resp = req.json()
            paper_object = resp.get('paper', {})
            if not paper_object:
                return {}
            paper_year = paper_object.get("date", "")[:4]
            paper_authorlast = paper_object.get("authors")[0].get("name", {}).get("last", "")
            if date[:4] == paper_year and ratio(authors[0].get("last", ""), paper_authorlast) > 0.75:
                return paper_object
            else:
                # Fails a basic author/date check, ignore Dissemin record
                return {}
        except (ValueError, requests.exceptions.RequestException) as e:
            sleep(5)
            continue
        except IndexError:
            # The author names are not what expected, give up on a record match
            # TODO: could probably try harder
            return {}
    return {}
Example #27
Source File: food_string_matching.py From extract_recipe with Apache License 2.0 | 4 votes |
def getStringMatches(foodString):
    print(foodString)
    foodString = foodString.replace(',', ' ').lower()
    foodStrings = []
    foodStrings.append(foodString)
    foodWords = foodString.split()
    if len(foodWords) > 2:
        otherFoodWords = combinations(foodWords, 2)
        for words in otherFoodWords:
            foodStrings.append(' '.join(words))
    if len(foodWords) > 3:
        otherFoodWords = combinations(foodWords, 3)
        for words in otherFoodWords:
            foodStrings.append(' '.join(words))
    stringMatches = []
    partialList = {}
    processes = []
    totalProcesses = NUM_PROCESSORS
    for i in range(totalProcesses):
        t = Process(target=worker, args=(i, totalProcesses, foodStrings,))
        processes.append(t)
    for t in processes:
        t.start()
    for t in processes:
        t.join()
    for i in range(totalProcesses):
        foo = pickle.load(open(str(i) + '.p', 'rb'))
        stringMatches = stringMatches + foo
        os.system('rm ' + str(i) + '.p')
    '''
    for foodString in foodStrings:
        for (i,key) in enumerate(foodList.keys()):
            partialList[key] = fuzz.token_set_ratio(key,foodString)
        foo = sorted(partialList.items(), key=operator.itemgetter(1),reverse=True)[:100]
        for result in foo:
            leven=Levenshtein.ratio(foodString,result[0])
            if leven>0.5:
                stringMatches.append((result[0],foodList[result[0]],result[1],leven))
    '''
    matches = (sorted(stringMatches, key=operator.itemgetter(2, 3), reverse=True))
    return matches