Python Levenshtein.ratio() Examples
The following are 27 code examples of Levenshtein.ratio(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Levenshtein, or try the search function.
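Before the examples, a minimal sketch of what Levenshtein.ratio() computes may help (illustrative only; it assumes the python-Levenshtein package is installed and the strings are arbitrary). The function returns a normalized similarity in [0, 1], where 1.0 means the strings are identical:

import Levenshtein

# Identical strings score 1.0.
print(Levenshtein.ratio("kitten", "kitten"))   # 1.0
# The classic kitten/sitting pair scores about 0.615.
print(Levenshtein.ratio("kitten", "sitting"))  # 0.6153846153846154
# Strings with no characters in common score 0.0.
print(Levenshtein.ratio("abc", "xyz"))         # 0.0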
Example #1
Source File: alignment.py From errant with MIT License | 6 votes |
def get_sub_cost(self, o, c):
    # Short circuit if the only difference is case
    if o.lower == c.lower:
        return 0
    # Lemma cost
    if o.lemma == c.lemma:
        lemma_cost = 0
    else:
        lemma_cost = 0.499
    # POS cost
    if o.pos == c.pos:
        pos_cost = 0
    elif o.pos in self._open_pos and c.pos in self._open_pos:
        pos_cost = 0.25
    else:
        pos_cost = 0.5
    # Char cost
    char_cost = 1 - Levenshtein.ratio(o.text, c.text)
    # Combine the costs
    return lemma_cost + pos_cost + char_cost

# Get the cheapest alignment sequence and indices from the op matrix
# align_seq = [(op, o_start, o_end, c_start, c_end), ...]
Example #2
Source File: filters.py From KTSpeechCrawler with MIT License | 6 votes |
def __call__(self, input):
    subtitles = input["subtitles"]
    subset = random.sample(subtitles, self.num_samples_to_test)
    transcripts = [(s, _get_transcript_google_web_asr(s)) for s in subset]
    transcripts = [(t, s) for (t, s) in transcripts if s is not None]
    if len(transcripts) == 0:
        # filter removes all the subtitles, as potentially unreliable sample
        subtitles = []
    else:
        overlap_ratio = [ratio(t["phrase"].lower(), s.lower()) for (t, s) in transcripts]
        passed_threshold = sum(overlap_ratio) / len(overlap_ratio) > self.mean_wer_threshold
        if not passed_threshold:
            # removing all subtitles, as potentially unreliable
            subtitles = []
    input["subtitles"] = subtitles
    return input
Example #3
Source File: status_notifications.py From intake with MIT License | 6 votes |
def get_message_change_ratio(status_update):
    """Expects a status update instance, returns a number representing
    how much a message has been edited (1.0 completely changed,
    0.0 unchanged) based on Levenshtein ratio.
    If a status update has no associated notification, returns None
        https://github.com/ztane/python-Levenshtein
    """
    if hasattr(status_update, 'notification'):
        author_profile = status_update.author.profile
        intro_text = get_notification_intro(author_profile) + '\n\n'
        return 1.0 - Levenshtein.ratio(
            *[message.replace(intro_text, '')
              for message in (
                  status_update.notification.base_message,
                  status_update.notification.sent_message)])
    else:
        return None
Example #4
Source File: generate_accuracy_report.py From namsel with MIT License | 6 votes |
def _get_compare_data(tif_txt_pair):
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]
    if tif[:-4] == txt[:-4]:  # This should always be true
#        ocr = run_main(tif, conf=Config(path='/home/zr/letters/conf/443cf9ec-76c7-44bc-95ad-593138d2d5fc.conf'), text=True)
#        ocr = run_main(tif, conf=Config(segmenter='stochastic', recognizer='hmm', break_width=3.6), text=True)
        ocr = run_main(tif, text=True)
#        ocr = run_all_confs_for_page(tif, text = True)
        ocr = ocr.strip()
        txt = open(txt, 'r').read()
        txt = _normalize_input(txt)
        edit_dist = L.distance(txt, ocr)
        edit_ratio = L.ratio(txt, ocr)
        html = _make_html_diff(txt, ocr)
#        sys.exit()
        data = {'edit_distance': edit_dist,
                'edit_ratio': edit_ratio,
                'filename': os.path.basename(tif),
                'html': html
                }
    return data
Example #5
Source File: string_utils.py From ph0neutria with Apache License 2.0 | 6 votes |
def similar_string_fast(first_string, second_string):
    """Determine if two strings are similar (using two most effective methods).

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: bool) match result.
    """
    partial_score = fuzz.ratio(first_string, second_string)
    token_score = fuzz.token_set_ratio(first_string, second_string)

    if max(partial_score, token_score) >= SCORE_THRESHOLD_FAST:
        return True

    return False
Example #6
Source File: scorer.py From nmt-chatbot with GNU General Public License v3.0 | 5 votes |
def answer_subsentence_similarity_by_ratio(index, question, answer):
    global valid_emoticon

    # Disabled or short or char emoticon
    if score_settings['answer_subsentence_similarity_modifier_value'] is None \
            or len(answer) < score_settings['answer_subsentence_similarity_sentence_len'] \
            or valid_emoticon:
        return 0

    # Split response into subsentences
    answer = list(filter(None, re.split(score_settings['subsentence_dividers'], answer)))

    # Find max similarity
    max_ratio = 0
    for num, subsentence in enumerate(answer):
        for subsentence2 in answer[num+1:]:
            max_ratio = max(max_ratio, Levenshtein.ratio(subsentence, subsentence2))

    # Not similar
    if max_ratio < score_settings['answer_subsentence_similarity_threshold']:
        return 0

    # Apply value
    if score_settings['answer_subsentence_similarity_modifier'] == 'value':
        return score_settings['answer_subsentence_similarity_modifier_value']

    # Apply multiplier
    if score_settings['answer_subsentence_similarity_modifier'] == 'multiplier':
        return (max_ratio - score_settings['answer_subsentence_similarity_threshold']) \
               / (1 - score_settings['answer_subsentence_similarity_threshold']) \
               * score_settings['answer_subsentence_similarity_modifier_value']

    return 0
Example #7
Source File: predicate_alignment.py From MultiKE with MIT License | 5 votes |
def init_predicate_alignment(predicate_local_name_dict_1, predicate_local_name_dict_2, predicate_init_sim):
    def get_predicate_match_dict(p_ln_dict_1, p_ln_dict_2):
        predicate_match_dict, sim_dict = {}, {}
        for p1, ln1 in p_ln_dict_1.items():
            match_p2 = ''
            max_sim = 0
            for p2, ln2 in p_ln_dict_2.items():
                sim_p2 = Levenshtein.ratio(ln1, ln2)
                if sim_p2 > max_sim:
                    match_p2 = p2
                    max_sim = sim_p2
            predicate_match_dict[p1] = match_p2
            sim_dict[p1] = max_sim
        return predicate_match_dict, sim_dict

    match_dict_1_2, sim_dict_1 = get_predicate_match_dict(predicate_local_name_dict_1, predicate_local_name_dict_2)
    match_dict_2_1, sim_dict_2 = get_predicate_match_dict(predicate_local_name_dict_2, predicate_local_name_dict_1)

    predicate_match_pairs_set = set()
    predicate_latent_match_pairs_similarity_dict = {}
    for p1, p2 in match_dict_1_2.items():
        if match_dict_2_1[p2] == p1:
            predicate_latent_match_pairs_similarity_dict[(p1, p2)] = sim_dict_1[p1]
            if sim_dict_1[p1] > predicate_init_sim:
                predicate_match_pairs_set.add((p1, p2, sim_dict_1[p1]))
                # print(p1, p2, sim_dict_1[p1], sim_dict_2[p2])
    return predicate_match_pairs_set, predicate_latent_match_pairs_similarity_dict
Example #8
Source File: dist_utils.py From tensorflow-DSMM with MIT License | 5 votes |
def _count_stats(s1, s2):
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # set
    s1_set = set(s1)
    s2_set = set(s2)

    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)

    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)

    # dice coef
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)

    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))

    # over all features
    f = [l1, l2, len_diff,
         l1_unique, l2_unique, len_diff_unique,
         r1_unique, r2_unique,
         li, lu, jaccard_coef, dice_coef,
         common_, common_ratio_avg, common_ratio_max, common_ratio_min]

    return np.array(f, dtype=np.float32)
Example #9
Source File: dist_utils.py From tensorflow-DSMM with MIT License | 5 votes |
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #10
Source File: twitter_markov.py From twitter_markov with GNU General Public License v3.0 | 5 votes |
def check_tweet(self, text):
    '''Check if a string contains blacklisted words or is similar to a recent tweet.'''
    text = text.strip().lower()

    if not text:
        self.log.info("Rejected (empty)")
        return False

    if self.wordfilter.blacklisted(text):
        self.log.info("Rejected (blacklisted)")
        return False

    if tbu.helpers.length(text) > 280:
        self.log.info("Rejected (too long)")
        return False

    for line in self.recently_tweeted:
        if text in line.strip().lower():
            self.log.info("Rejected (Identical)")
            return False

        if Levenshtein.ratio(re.sub(r'\W+', '', text), re.sub(r'\W+', '', line.lower())) >= LEVENSHTEIN_LIMIT:
            self.log.info("Rejected (Levenshtein.ratio)")
            return False

    return True
Example #11
Source File: predicates_computer.py From lang2program with Apache License 2.0 | 5 votes |
def similarity_ratio(x, y, threshold=FuzzyMatchGenerator.SIMILARITY_THRESHOLD):
    """Compute the similarity ratio between two strings.
    If the ratio exceeds the threshold, return it; otherwise, return 0.

    The similarity ratio is given by
        1 - (levenshtein distance with substitution cost = 2) / (total length)
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio if ratio > threshold else 0.


################################
# NERValueGenerator
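As a quick check of the formula in the docstring above (an illustrative sketch, not part of the lang2program project): because substitutions are weighted as 2, ratio() can disagree with the plain distance-based normalization used by the _edit_dist helpers in Examples #9, #14, and #17.

import Levenshtein

# For 'ab' vs 'bc' the cheapest alignment under ratio() is one deletion plus
# one insertion (total cost 2), so ratio = (2 + 2 - 2) / (2 + 2) = 0.5 ...
assert Levenshtein.ratio('ab', 'bc') == 0.5

# ... while the unweighted edit distance is 2, so normalizing by the longer
# length (as _edit_dist does) gives 1 - 2/2 = 0.0 for the same pair.
assert 1 - Levenshtein.distance('ab', 'bc') / max(len('ab'), len('bc')) == 0.0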
Example #12
Source File: distance_text_or_vec.py From nlp_xiaojiang with MIT License | 5 votes |
def ratio_levenshtein(str1, str2):
    return Leven.ratio(str1, str2)
Example #13
Source File: submissions.py From intake with MIT License | 5 votes |
def get_name_similarity_ratio(a, b):
    names = (get_full_lowercase_name(sub) for sub in (a, b))
    return Levenshtein.ratio(*names)
Example #14
Source File: dist_utils.py From BERT with Apache License 2.0 | 5 votes |
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #15
Source File: generate_accuracy_report.py From namsel with MIT License | 5 votes |
def do_pairwise_comparison(origflpath, ocrflpath):
    o = open(origflpath, 'r').read()
    s = open(ocrflpath, 'r').read()
    s = _normalize_input(s)
    return L.ratio(o, s)

#data = {'csrfmiddlewaretoken': s.cookies['csrftoken'],
#        'edit_distance': edit_dist,
#        'filename': os.path.basename(tif),
#        'sample_set': t, 'html': html, 'timestamp': timestamp,
#        'comment': comment
#        }
Example #16
Source File: dist_utils.py From BERT with Apache License 2.0 | 5 votes |
def _count_stats(s1, s2):
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # set
    s1_set = set(s1)
    s2_set = set(s2)

    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)

    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)

    # dice coef
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)

    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))

    # over all features
    f = [l1, l2, len_diff,
         l1_unique, l2_unique, len_diff_unique,
         r1_unique, r2_unique,
         li, lu, jaccard_coef, dice_coef,
         common_, common_ratio_avg, common_ratio_max, common_ratio_min]

    return np.array(f, dtype=np.float32)
Example #17
Source File: dist_utils.py From kaggle-HomeDepot with MIT License | 5 votes |
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #18
Source File: scorer.py From nmt-chatbot with GNU General Public License v3.0 | 5 votes |
def question_answer_similarity_by_ratio(index, question, answer):
    global valid_emoticon

    # Disabled or short or char emoticon
    if score_settings['question_answer_similarity_modifier_value'] is None \
            or len(answer) < score_settings['question_answer_similarity_sentence_len'] \
            or valid_emoticon:
        return 0

    # Divide response into subsentences
    answer = list(filter(None, re.split(score_settings['subsentence_dividers'], answer))) + [answer]

    # Calculate similarity for every subsentence, get the maximum one
    ratio = max([Levenshtein.ratio(question, s) for s in answer])

    # Not similar
    if ratio < score_settings['question_answer_similarity_threshold']:
        return 0

    # Apply value
    if score_settings['question_answer_similarity_modifier'] == 'value':
        return score_settings['question_answer_similarity_modifier_value']

    # Apply multiplier
    if score_settings['question_answer_similarity_modifier'] == 'multiplier':
        return (ratio - score_settings['question_answer_similarity_threshold']) \
               / (1 - score_settings['question_answer_similarity_threshold']) \
               * score_settings['question_answer_similarity_modifier_value']

    return 0
Example #19
Source File: merger.py From errant with MIT License | 5 votes |
def char_cost(a, b):
    return Levenshtein.ratio(a.text, b.text)

# Merge the input alignment sequence to a single edit span
Example #20
Source File: test_string_distances.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_compare_implementations():
    # Compare the implementations of python-Levenshtein to our
    # pure-Python implementations
    if Levenshtein is False:
        raise unittest.SkipTest
    # Test on strings with randomly placed common char
    for string1, string2 in _random_common_char_pairs(n_pairs=50):
        assert (string_distances._jaro_winkler(string1, string2, winkler=False)
                == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(string1, string2, winkler=True)
                == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(string1, string2)
                == Levenshtein.ratio(string1, string2))
    # Test on random strings
    for string1, string2 in _random_string_pairs(n_pairs=50):
        assert (string_distances._jaro_winkler(string1, string2, winkler=False)
                == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(string1, string2, winkler=True)
                == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(string1, string2)
                == Levenshtein.ratio(string1, string2))
Example #21
Source File: string_utils.py From ph0neutria with Apache License 2.0 | 5 votes |
def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: int) score.
    """
    score = 0

    if len(first_string) < len(second_string):
        shorter, longer = (first_string, second_string)
        window_length = len(shorter)
        num_iterations = len(longer) - len(shorter) + 1

        for position in range(0, num_iterations):
            window = longer[position:position + window_length]
            l_ratio = Levenshtein.ratio(window, shorter) * 100

            if l_ratio > 60:
                result = statistics.mean(
                    [100 - Levenshtein.distance(window, shorter) * 15, l_ratio, l_ratio])
            else:
                result = l_ratio

            if result > score:
                score = result
    else:
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        score = statistics.mean(
            [100 - Levenshtein.distance(first_string, second_string) * 15, l_ratio, l_ratio])

    simple = fuzz.ratio(first_string, second_string)
    partial = fuzz.partial_ratio(first_string, second_string)
    sort = fuzz.token_sort_ratio(first_string, second_string)
    set_ratio = fuzz.token_set_ratio(first_string, second_string)

    score = max([score, simple, partial, sort, set_ratio])

    if score < 75:
        score = 0

    return score * 0.85
Example #22
Source File: food_string_matching.py From extract_recipe with Apache License 2.0 | 5 votes |
def worker(num, total, foodStrings):
    """thread worker function"""
    stringMatches = []
    partialList = {}
    for foodString in foodStrings:
        for (i, key) in enumerate(foodList.keys()):
            if i % total == num:
                leven1 = fuzz.token_set_ratio(key, foodString)
                leven2 = Levenshtein.ratio(foodString, key)
                if leven2 > 0.5:
                    stringMatches.append((key, foodList[key], leven1, leven2))
    pickle.dump(stringMatches, open(str(num) + '.p', 'wb'))
    return
Example #23
Source File: initsimfeat.py From holoclean with Apache License 2.0 | 5 votes |
def gen_feat_tensor(input, classes, total_attrs):
    vid = int(input[0])
    attr_idx = input[1]
    init_value = input[2]
    # TODO: To add more similarity metrics increase the last dimension of tensor.
    tensor = torch.zeros(1, classes, total_attrs)
    domain = input[3].split('|||')
    for idx, val in enumerate(domain):
        if val == init_value:
            sim = -1.0
        else:
            sim = (2 * Levenshtein.ratio(val, init_value)) - 1
        tensor[0][idx][attr_idx] = sim
    return tensor
Example #24
Source File: trivia.py From pajbot with MIT License | 5 votes |
def on_message(self, source, message, whisper, **rest):
    if not message or whisper:
        return

    if self.question:
        right_answer = self.question["answer"].lower()
        user_answer = message.lower()

        if len(right_answer) <= 5:
            correct = right_answer == user_answer
        else:
            ratio = Levenshtein.ratio(right_answer, user_answer)
            correct = ratio >= 0.94

        if correct:
            if self.point_bounty > 0:
                self.bot.safe_me(
                    f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan They get {self.point_bounty} points! PogChamp"
                )
                source.points += self.point_bounty
            else:
                self.bot.safe_me(
                    f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan"
                )

            self.question = None
            self.step = 0
            self.last_question = utils.now()
Example #25
Source File: delex.py From chimera with MIT License | 5 votes |
def lev_ratio(s1, s2):
    return ratio(s1, s2)
Example #26
Source File: main.py From oabot with MIT License | 4 votes |
def get_dissemin_paper(reference):
    """
    Given a citation template (as parsed by wikiciteparser and a proposed link)
    get dissemin API information for that link
    """
    doi = reference.get('ID_list', {}).get('DOI')
    title = reference.get('Title', '')
    authors = reference.get('Authors', [])
    date = reference.get('Date', '')

    # CS1 represents unparsed authors as {'last':'First Last'}
    for i in range(len(authors)):
        if 'first' not in authors[i]:
            authors[i] = {'plain': authors[i].get('last', '')}

    args = {
        'title': title,
        'authors': authors,
        'date': date,
        'doi': doi,
    }

    for retry in range(5):
        try:
            req = requests.post('https://dissem.in/api/query/',
                                json=args,
                                headers={'User-Agent': OABOT_USER_AGENT},
                                timeout=10)
            resp = req.json()
            paper_object = resp.get('paper', {})
            if not paper_object:
                return {}
            paper_year = paper_object.get("date", "")[:4]
            paper_authorlast = paper_object.get("authors")[0].get("name", {}).get("last", "")
            if date[:4] == paper_year and ratio(authors[0].get("last", ""), paper_authorlast) > 0.75:
                return paper_object
            else:
                # Fails a basic author/date check, ignore Dissemin record
                return {}
        except (ValueError, requests.exceptions.RequestException) as e:
            sleep(5)
            continue
        except IndexError:
            # The author names are not what expected, give up on a record match
            # TODO: could probably try harder
            return {}
    return {}
Example #27
Source File: food_string_matching.py From extract_recipe with Apache License 2.0 | 4 votes |
def getStringMatches(foodString):
    print(foodString)
    foodString = foodString.replace(',', ' ').lower()
    foodStrings = []
    foodStrings.append(foodString)
    foodWords = foodString.split()
    if len(foodWords) > 2:
        otherFoodWords = combinations(foodWords, 2)
        for words in otherFoodWords:
            foodStrings.append(' '.join(words))
    if len(foodWords) > 3:
        otherFoodWords = combinations(foodWords, 3)
        for words in otherFoodWords:
            foodStrings.append(' '.join(words))
    stringMatches = []
    partialList = {}
    processes = []
    totalProcesses = NUM_PROCESSORS
    for i in range(totalProcesses):
        t = Process(target=worker, args=(i, totalProcesses, foodStrings,))
        processes.append(t)
    for t in processes:
        t.start()
    for t in processes:
        t.join()
    for i in range(totalProcesses):
        foo = pickle.load(open(str(i) + '.p', 'rb'))
        stringMatches = stringMatches + foo
        os.system('rm ' + str(i) + '.p')
    '''
    for foodString in foodStrings:
        for (i,key) in enumerate(foodList.keys()):
            partialList[key] = fuzz.token_set_ratio(key,foodString)
        foo = sorted(partialList.items(), key=operator.itemgetter(1),reverse=True)[:100]
        for result in foo:
            leven=Levenshtein.ratio(foodString,result[0])
            if leven>0.5:
                stringMatches.append((result[0],foodList[result[0]],result[1],leven))
    '''
    matches = (sorted(stringMatches, key=operator.itemgetter(2, 3), reverse=True))
    return matches