Python difflib.SequenceMatcher() Examples
The following are 30 code examples of difflib.SequenceMatcher().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module difflib, or try the search function.
Example #1
Source File: test_hoeffding_tree_regressor.py From scikit-multiflow with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_hoeffding_tree_regressor_model_description():
    """Train a Hoeffding tree regressor and fuzzy-match its model description.

    The learned description is compared against a reference text with
    ``SequenceMatcher`` and only has to be more than 90% similar, so the
    check does not require a character-for-character match.
    """
    sample_count = 500
    generator = RegressionGenerator(
        n_samples=500, n_features=20, n_informative=15, random_state=1
    )
    regressor = HoeffdingTreeRegressor(leaf_prediction='mean')

    features, targets = generator.next_sample(sample_count)
    regressor.partial_fit(features, targets)

    reference = (
        "if Attribute 6 <= 0.1394515530995348:\n"
        " Leaf = Statistics {0: 276.0000, 1: -21537.4157, 2: 11399392.2187}\n"
        "if Attribute 6 > 0.1394515530995348:\n"
        " Leaf = Statistics {0: 224.0000, 1: 22964.8868, 2: 10433581.2534}\n"
    )

    similarity = SequenceMatcher(
        None, reference, regressor.get_model_description()
    ).ratio()
    assert similarity > 0.9
Example #2
Source File: structural_similarity.py From html-similarity with BSD 3-Clause "New" or "Revised" License | 6 votes |
def structural_similarity(document_1, document_2):
    """
    Compute the structural similarity between two DOM trees.

    Both documents are parsed with lxml, reduced to their tag sequences via
    ``get_tags`` and the two sequences are compared with
    ``difflib.SequenceMatcher``.

    :param document_1: html string
    :param document_2: html string
    :return: the matcher ratio (a float), or 0 when either document
        cannot be parsed (the parsing error is printed)
    """
    try:
        tree_1 = lxml.html.parse(StringIO(document_1))
        tree_2 = lxml.html.parse(StringIO(document_2))
    except Exception as error:
        print(error)
        return 0

    tags_1 = get_tags(tree_1)
    tags_2 = get_tags(tree_2)
    return difflib.SequenceMatcher(None, tags_1, tags_2).ratio()
Example #3
Source File: __init__.py From faces with GNU General Public License v2.0 | 6 votes |
def _sortKeywords(keyword, kwds):
    """Return *kwds* ordered from the best to the worst match for *keyword*.

    Every candidate gets the score computed by ``ratcliff``; candidates that
    start with the searched keyword get a 0.5 bonus, and candidates that
    merely contain it get a 0.3 bonus (the latter only when the searched
    keyword is longer than four characters).
    """
    matcher = SequenceMatcher()
    matcher.set_seq1(keyword.lower())
    # The "contained" bonus is only considered for longer search terms.
    allow_contained_bonus = len(keyword) > 4

    scored = []
    for candidate in kwds:
        score = ratcliff(keyword, candidate, matcher)
        if candidate.startswith(keyword):
            score = score + 0.5
        elif allow_contained_bonus and keyword in candidate:
            score = score + 0.3
        scored.append((score, candidate))

    # Ascending sort followed by a reversal: best score first.
    return [candidate for _, candidate in reversed(sorted(scored))]
Example #4
Source File: helpers.py From faces with GNU General Public License v2.0 | 6 votes |
def getAKAsInLanguage(movie, lang, _searchedTitle=None):
    """Return a list of AKAs of a movie, in the specified language.

    If _searchedTitle is given, the AKAs are sorted by their similarity
    to it (most similar first); AKAs with the same similarity keep their
    original relative order.

    NOTE: ``unicode`` is the Python 2 builtin; text values are encoded to
    UTF-8 byte strings before being compared.
    """
    akas = [aka for language, aka in akasLanguages(movie) if lang == language]
    if _searchedTitle:
        if isinstance(_searchedTitle, unicode):
            _searchedTitle = _searchedTitle.encode('utf8')
        wanted = _searchedTitle.lower()
        scores = []
        for aka in akas:
            m_aka = aka
            # isinstance() needs the type to test against.
            if isinstance(m_aka, unicode):
                m_aka = m_aka.encode('utf8')
            ratio = difflib.SequenceMatcher(None, m_aka.lower(), wanted).ratio()
            # Store a single (score, aka) pair; the score is the numeric
            # ratio, not the matcher object, so the list can be sorted.
            scores.append((ratio, aka))
        scores.sort(key=lambda pair: pair[0], reverse=True)
        akas = [pair[1] for pair in scores]
    return akas
Example #5
Source File: password_validation.py From bioforum with MIT License | 6 votes |
def validate(self, password, user=None):
    """Reject a password that is too similar to one of the user's attributes.

    Each attribute named in ``self.user_attributes`` is read from *user*;
    non-empty string values are split on non-word characters and every
    part (plus the whole value) is compared, case-insensitively, with the
    password. A ``ValidationError`` with code ``password_too_similar`` is
    raised as soon as a comparison reaches ``self.max_similarity``.
    Nothing is checked when no user is given.
    """
    if not user:
        return

    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not (value and isinstance(value, str)):
            continue

        candidates = re.split(r'\W+', value)
        candidates.append(value)
        too_similar = any(
            SequenceMatcher(a=password.lower(), b=candidate.lower()).quick_ratio()
            >= self.max_similarity
            for candidate in candidates
        )
        if not too_similar:
            continue

        # Prefer the field's human readable name in the error message.
        try:
            verbose_name = str(user._meta.get_field(attribute_name).verbose_name)
        except FieldDoesNotExist:
            verbose_name = attribute_name
        raise ValidationError(
            _("The password is too similar to the %(verbose_name)s."),
            code='password_too_similar',
            params={'verbose_name': verbose_name},
        )
Example #6
Source File: emake.py From emake with GNU General Public License v2.0 | 6 votes |
def __java_final(self, home):
    """Build the ``-I`` include flags for a Java home directory.

    The result always contains *home* itself. When *home* has
    sub-directories, the one whose name is most similar to
    ``sys.platform`` is added as a second include path.

    Scanning *home* is best effort: if the directory cannot be listed the
    platform-specific path is simply left out.

    Returns a single string of space-separated ``-I<path>`` flags, each
    path being passed through ``self.pathtext``.
    """
    import difflib  # local import, done once instead of once per entry

    path = [home]
    subdir = []
    try:
        for sub in os.listdir(home):
            if os.path.isdir(os.path.join(home, sub)):
                m = difflib.SequenceMatcher(None, sys.platform, sub)
                subdir.append((m.ratio(), sub))
    except (OSError, TypeError, ValueError):
        # Unreadable or invalid home: keep whatever was collected so far.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        pass
    if subdir:
        # Highest similarity wins (ties resolved by directory name).
        path.append(os.path.join(home, max(subdir)[1]))
    return ' '.join(['-I%s' % self.pathtext(n) for n in path])
# Get the Java configuration
Example #7
Source File: func.py From fastlane with MIT License | 6 votes |
def __show_diff(expected, actual):
    """Render a colourised inline diff between *expected* and *actual*.

    Unchanged text is emitted as is; inserted text (from *actual*) is
    green, deleted text (from *expected*) is red and replaced text is shown
    as the new content in blue. Every coloured piece is followed by a style
    reset.

    Raises RuntimeError if the matcher reports an unknown opcode.
    """
    matcher = difflib.SequenceMatcher(None, expected, actual)
    pieces = [Style.RESET_ALL]
    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if tag == "equal":
            pieces.append(matcher.a[a_lo:a_hi])
            continue

        if tag == "insert":
            colour, text = Fore.GREEN, matcher.b[b_lo:b_hi]
        elif tag == "delete":
            colour, text = Fore.RED, matcher.a[a_lo:a_hi]
        elif tag == "replace":
            colour, text = Fore.BLUE, matcher.b[b_lo:b_hi]
        else:
            raise RuntimeError("unexpected opcode")
        pieces.append(colour + text + Style.RESET_ALL)
    return "".join(pieces)
Example #8
Source File: __init__.py From sneakpeek with MIT License | 6 votes |
def is_needle_in_hay(cls, needle, hay):
    """Return the best similarity between *needle* and any word window of *hay*.

    *hay* is split into words and scanned with n-grams that are 20% longer
    (rounded down) than the number of words in *needle*; each n-gram is
    joined with spaces and compared with *needle*. The highest ratio found
    is returned, or 0 when there is no n-gram to compare.
    """
    needle_words = len(needle.split())
    window = needle_words + int(.2 * needle_words)

    best = 0
    for gram in ngrams(hay.split(), window):
        score = SequenceMatcher(None, u" ".join(gram), needle).ratio()
        if score > best:
            best = score
    return best  # how confident are we that needle was found in hay
# https://stackoverflow.com/a/31505798
# given a string paragraph, return a list of sentences
Example #9
Source File: fuzz.py From PythonOS with MIT License | 5 votes |
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring as a number between 0 and 100."""
    s1, s2 = utils.make_type_consistent(s1, s2)
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    # Each block is (idx_1, idx_2, len): a run of matching characters.
    # The best partial match lines up with at least one of those blocks,
    # e.g. shorter = "abcd", longer = "XXXbcdeEEE" gives block (1, 3, 3),
    # and the best score is ratio("abcd", "Xbcd").
    scores = []
    for block in blocks:
        # Slide the shorter string so that it lines up with this block,
        # never starting before the beginning of the longer string.
        start = max(block[1] - block[0], 0)
        window = longer[start:start + len(shorter)]
        score = SequenceMatcher(None, shorter, window).ratio()
        if score > .995:
            # Close enough to a perfect match: stop early.
            return 100
        scores.append(score)

    return utils.intr(100 * max(scores))


##############################
# Advanced Scoring Functions #
##############################
Example #10
Source File: _fuzzywuzzy_token_sort.py From abydos with GNU General Public License v3.0 | 5 votes |
def sim(self, src: str, tar: str) -> float:
    """Return the FuzzyWuzzy Token Sort similarity of two strings.

    Both strings are tokenized with the configured tokenizer, their tokens
    are sorted and re-joined with single spaces, and the two normalized
    strings are compared with ``SequenceMatcher``.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        FuzzyWuzzy Token Sort similarity

    Examples
    --------
    >>> cmp = FuzzyWuzzyTokenSort()
    >>> cmp.sim('cat', 'hat')
    0.6666666666666666
    >>> cmp.sim('Niall', 'Neil')
    0.6666666666666666
    >>> cmp.sim('aluminum', 'Catalan')
    0.4
    >>> cmp.sim('ATCG', 'TAGC')
    0.5

    .. versionadded:: 0.4.0

    """

    def _sorted_tokens(text: str) -> str:
        tokens = self.params['tokenizer'].tokenize(text).get_list()
        return ' '.join(sorted(tokens))

    normalized_src = _sorted_tokens(src)
    normalized_tar = _sorted_tokens(tar)
    return SequenceMatcher(None, normalized_src, normalized_tar).ratio()
Example #11
Source File: namediff.py From mtgencode with MIT License | 5 votes |
def f_nearest_per_thread(workitem):
    """Compute the nearest names for one worker's share of the work.

    *workitem* is a ``(worknames, names, n)`` tuple. One matcher is built
    per entry of *names* and ``f_nearest`` is called for every entry of
    *worknames* with those matchers and *n*.

    Returns a list with one ``f_nearest`` result per work name. A real
    list is returned (rather than a lazy ``map`` object) so the result can
    be sent back from a worker process on Python 3 as well.
    """
    (worknames, names, n) = workitem
    # each thread (well, process) needs to generate its own matchers
    matchers = [difflib.SequenceMatcher(b=name, autojunk=False) for name in names]
    return [f_nearest(name, matchers, n) for name in worknames]
Example #12
Source File: collation.py From OpenDoor with GNU General Public License v3.0 | 5 votes |
def process(self, response):
    """
    Process data

    Compares the body of the current response with the previously stored
    one(s) and reports responses that look like repeats.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed listing; confirm it against the original file.

    :param response: response object; its ``status`` is checked against
        ``self.DEFAULT_STATUSES``
    :return: ``self.RESPONSE_INDEX`` when the response matches a stored
        one, otherwise None
    """

    if response.status in self.DEFAULT_STATUSES:
        super().process(response)
        length = self.__get_content_length()
        if self.MIN_CONTENT_LENGTH < length:
            # the page is allowed for comparison

            if not self.previous_item:
                # 1st match. Push items for next compare step
                self.previous_item.update({'length': length, 'text': self._body})
                return None
            else:
                if length == self.previous_item.get('length') and self.MIN_CONTENT_LENGTH < length:
                    # identical, seems to drop failed for success
                    return self.RESPONSE_INDEX
                else:
                    # compare the current body with the previously stored one
                    matcher = SequenceMatcher(a=self.previous_item['text'], b=self._body)
                    matcher.get_matching_blocks()

                    if 'length' in self.current_item:
                        # a second reference body is stored: an equal ratio
                        # against both references counts as a match
                        next_matcher = SequenceMatcher(a=self.current_item['text'], b=self._body)
                        if next_matcher.ratio() == matcher.ratio():
                            return self.RESPONSE_INDEX
                    if self.MIN_RATIO_INDEX < matcher.ratio():
                        # similar enough to the previous body
                        return self.RESPONSE_INDEX
                    else:
                        # remember this body as the second reference
                        self.current_item.update({'length': length, 'text': self._body})

        if self.MIN_CONTENT_LENGTH < length:
            # keep the latest sufficiently long body for the next comparison
            self.previous_item.update({'length': length, 'text': self._body})
    return None
Example #13
Source File: london_underground.py From python-astar with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_station_by_name(stations, name):
    """Look up a station by name; the name does not have to be exact.

    *stations* is a mapping whose values expose a ``name`` attribute. The
    comparison is case-insensitive. The station with the highest
    similarity ratio is returned, provided that ratio is above 0.7.

    Returns None when no station is similar enough, or when *stations* is
    empty.
    """
    wanted = name.lower()
    scored = [
        (SequenceMatcher(None, wanted, station.name.lower()).ratio(), station)
        for station in stations.values()
    ]
    if not scored:
        # max() would raise ValueError on an empty sequence.
        return None

    best_ratio, best_station = max(scored, key=lambda pair: pair[0])
    return best_station if best_ratio > 0.7 else None
Example #14
Source File: unique.py From angr with BSD 2-Clause "Simplified" License | 5 votes |
def sequence_matcher_similarity(state_a, state_b):
    """
    Compare two states by the basic block addresses recorded in their history.

    The address sequences of both states are compared with
    `difflib.SequenceMatcher` and the resulting ratio is returned.

    :param state_a: The first state to compare
    :param state_b: The second state to compare
    """
    history_a = tuple(state_a.history.bbl_addrs)
    history_b = tuple(state_b.history.bbl_addrs)
    matcher = SequenceMatcher(a=history_a, b=history_b)
    return matcher.ratio()
Example #15
Source File: TestCmd.py From GYP3 with BSD 3-Clause "New" or "Revised" License | 5 votes |
def simple_diff(a, b, fromfile='', tofile='', fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    """
    Produce a diff in the style of the plain, unadorned 'diff' command.

    The signature matches difflib.context_diff (diff -c) and
    difflib.unified_diff (diff -u) so the three can be used
    interchangeably; only `a` and `b` are used here.

    Returns a list of output lines: a change command such as "3d2",
    "2a3,4" or "1,2c1" followed by the affected lines, prefixed with
    '< ' (from `a`) or '> ' (from `b`).
    """
    a = [to_str(q) for q in a]
    b = [to_str(q) for q in b]

    def comma(x1, x2):
        # A one-line range is written as a single number, otherwise "start,end".
        if x1 + 1 == x2:
            return str(x2)
        return '%s,%s' % (x1 + 1, x2)

    def prefixed(marker, lines):
        return [marker + line for line in lines]

    result = []
    for op, a1, a2, b1, b2 in difflib.SequenceMatcher(None, a, b).get_opcodes():
        removed = a[a1:a2]
        added = b[b1:b2]
        if op == 'delete':
            result.append("%sd%d" % (comma(a1, a2), b1))
            result.extend(prefixed('< ', removed))
        elif op == 'insert':
            result.append("%da%s" % (a1, comma(b1, b2)))
            result.extend(prefixed('> ', added))
        elif op == 'replace':
            result.append("%sc%s" % (comma(a1, a2), comma(b1, b2)))
            result.extend(prefixed('< ', removed))
            result.append('---')
            result.extend(prefixed('> ', added))
    return result
Example #16
Source File: test_difflib.py From oss-ftp with MIT License | 5 votes |
def test_one_insert(self):
    """Inserting one character (at the start or in the middle) is a single 'insert' opcode."""
    original = 'b' * 100
    cases = [
        # insertion at the very beginning
        ('a' + 'b' * 100,
         [('insert', 0, 0, 0, 1),
          ('equal', 0, 100, 1, 101)]),
        # insertion in the middle
        ('b' * 50 + 'a' + 'b' * 50,
         [('equal', 0, 50, 0, 50),
          ('insert', 50, 50, 50, 51),
          ('equal', 50, 100, 51, 101)]),
    ]
    for modified, expected_opcodes in cases:
        sm = difflib.SequenceMatcher(None, original, modified)
        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
        self.assertEqual(list(sm.get_opcodes()), expected_opcodes)
Example #17
Source File: features.py From TaxoRL with MIT License | 5 votes |
def LCS(x, y):
    """Score the longest common contiguous block of *x* and *y* from 0 to 10.

    The length of the longest matching block is normalized by the combined
    length of both sequences (``2 * size / (len(x) + len(y))``), rounded to
    one decimal and scaled to an integer in [0, 10].

    Two empty sequences are treated as identical and score 10, mirroring
    ``SequenceMatcher.ratio()``, instead of dividing by zero.
    """
    total = len(x) + len(y)
    if total == 0:
        return 10
    match = SequenceMatcher(None, x, y).find_longest_match(0, len(x), 0, len(y))
    res = 2.0 * match.size / total  # [0, 1]
    return int(round(res, 1) * 10)  # [0,10]
Example #18
Source File: fuzz.py From PythonOS with MIT License | 5 votes |
def ratio(s1, s2):
    """Return the similarity of the two strings as computed by ``utils.intr(100 * ratio)``.

    Both inputs are first passed through ``utils.make_type_consistent``.
    """
    left, right = utils.make_type_consistent(s1, s2)
    similarity = SequenceMatcher(None, left, right).ratio()
    return utils.intr(100 * similarity)
Example #19
Source File: routing.py From RSSNewsGAE with Apache License 2.0 | 5 votes |
def closest_rule(self, adapter):
    """Return the rule of *adapter*'s map that best resembles this request.

    Rules are scored mostly (98%) on how similar their endpoint is to
    ``self.endpoint``; 1% is granted when all of ``self.values`` are
    arguments of the rule and 1% when the rule allows ``self.method``.

    Returns None when there is no adapter or its map has no rules.
    """
    if not (adapter and adapter.map._rules):
        return None

    def _score_rule(rule):
        endpoint_similarity = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        accepts_values = bool(set(self.values or ()).issubset(rule.arguments))
        allows_method = bool(rule.methods and self.method in rule.methods)
        return sum([
            0.98 * endpoint_similarity,
            0.01 * accepts_values,
            0.01 * allows_method,
        ])

    return max(adapter.map._rules, key=_score_rule)
Example #20
Source File: test_difflib.py From BinderFilter with MIT License | 5 votes |
def test_comparing_empty_lists(self):
    """Diffing two empty lists must yield generators that are exhausted immediately."""
    # Check fix for bug #979794
    group_gen = difflib.SequenceMatcher(None, [], []).get_grouped_opcodes()
    # The built-in next() works on Python 2.6+ and Python 3, whereas the
    # generator's .next attribute only exists on Python 2.
    self.assertRaises(StopIteration, next, group_gen)
    diff_gen = difflib.unified_diff([], [])
    self.assertRaises(StopIteration, next, diff_gen)
Example #21
Source File: test_difflib.py From BinderFilter with MIT License | 5 votes |
def test_ratio_for_null_seqn(self):
    """All three ratio flavours must report 1 for two empty sequences."""
    # Check clearing of SF bug 763023
    s = difflib.SequenceMatcher(None, [], [])
    for measure in (s.ratio, s.quick_ratio, s.real_quick_ratio):
        self.assertEqual(measure(), 1)
Example #22
Source File: test_difflib.py From BinderFilter with MIT License | 5 votes |
def test_one_insert_homogenous_sequence(self):
    """The autojunk heuristic changes the ratio of long homogeneous sequences."""
    seq1 = 'b' * 200
    seq2 = 'a' + 'b' * 200

    # By default autojunk=True and the heuristic kicks in for a sequence
    # of length 200+
    with_heuristic = difflib.SequenceMatcher(None, seq1, seq2)
    self.assertAlmostEqual(with_heuristic.ratio(), 0, places=3)

    # Now turn the heuristic off
    without_heuristic = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
    self.assertAlmostEqual(without_heuristic.ratio(), 0.9975, places=3)
Example #23
Source File: test_difflib.py From BinderFilter with MIT License | 5 votes |
def test_one_delete(self):
    """Removing one character from the middle is reported as a single 'delete' opcode."""
    head = 'a' * 40
    tail = 'b' * 40
    sm = difflib.SequenceMatcher(None, head + 'c' + tail, head + tail)

    self.assertAlmostEqual(sm.ratio(), 0.994, places=3)

    expected_opcodes = [
        ('equal', 0, 40, 0, 40),
        ('delete', 40, 41, 40, 40),
        ('equal', 41, 81, 40, 80),
    ]
    self.assertEqual(list(sm.get_opcodes()), expected_opcodes)
Example #24
Source File: test_difflib.py From BinderFilter with MIT License | 5 votes |
def test_one_insert(self):
    """A single inserted character yields one 'insert' opcode and a ratio of about 0.995."""
    base = 'b' * 100

    def check(changed, expected_opcodes):
        sm = difflib.SequenceMatcher(None, base, changed)
        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
        self.assertEqual(list(sm.get_opcodes()), expected_opcodes)

    # Character inserted in front of the sequence.
    check('a' + 'b' * 100, [
        ('insert', 0, 0, 0, 1),
        ('equal', 0, 100, 1, 101),
    ])
    # Character inserted in the middle of the sequence.
    check('b' * 50 + 'a' + 'b' * 50, [
        ('equal', 0, 50, 0, 50),
        ('insert', 50, 50, 50, 51),
        ('equal', 50, 100, 51, 101),
    ])
Example #25
Source File: test_parser.py From selectolax with MIT License | 5 votes |
def test_nodes():
    """Parse a small document and check the root/body nodes and the serialized output."""
    html = (
        '<div><p id="p1"></p><p id="p2"></p><p id="p3"><a>link</a></p>'
        '<p id="p4"></p><p id="p5">text</p><p id="p6"></p></div>'
    )
    htmlp = HTMLParser(html)

    for node in (htmlp.root, htmlp.body):
        assert isinstance(node, Node)

    html_output = htmlp.html
    # The serialized document is at least as long as the input and must
    # stay more than 80% similar to it.
    assert len(html_output) >= len(html)
    similarity = SequenceMatcher(None, html, html_output).ratio()
    assert similarity > 0.8
Example #26
Source File: dist_utils.py From kaggle-HomeDepot with MIT License | 5 votes |
def _longest_match_ratio(str1, str2):
    """Return the longest matching block's size divided by the shorter string's length.

    Spaces are treated as junk by the matcher. The division is delegated
    to ``np_utils._try_divide``.
    """
    matcher = SequenceMatcher(lambda x: x==" ", str1, str2)
    len1, len2 = len(str1), len(str2)
    longest = matcher.find_longest_match(0, len1, 0, len2)
    return np_utils._try_divide(longest.size, min(len1, len2))
Example #27
Source File: dist_utils.py From kaggle-HomeDepot with MIT License | 5 votes |
def _longest_match_size(str1, str2):
    """Return the size of the longest matching block of the two strings.

    Spaces are treated as junk by the matcher.
    """
    matcher = SequenceMatcher(lambda x: x==" ", str1, str2)
    longest = matcher.find_longest_match(0, len(str1), 0, len(str2))
    return longest.size
Example #28
Source File: dist_utils.py From kaggle-HomeDepot with MIT License | 5 votes |
def _edit_dist(str1, str2):
    """Return a normalized distance between two strings (0 means identical).

    The Levenshtein distance divided by the longer string's length is
    used when it can be computed. If that fails for any reason (for
    example both strings are empty, which makes the divisor zero), the
    function falls back to ``1 - SequenceMatcher.ratio()`` with spaces
    treated as junk.
    """
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
    except Exception:
        # Best-effort fallback; unlike a bare ``except`` this does not
        # swallow KeyboardInterrupt or SystemExit.
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
    return d
Example #29
Source File: homedepot_functions.py From kaggle-HomeDepot with MIT License | 5 votes |
def seq_matcher(s1,s2):
    """Return ``(ratio, scaled_ratio)`` for the two sequences.

    ``ratio`` is the SequenceMatcher ratio rounded to 7 decimals and
    ``scaled_ratio`` is that value multiplied by longer/shorter length,
    also rounded to 7 decimals. Both are 0 when either input is empty.
    """
    len1, len2 = len(s1), len(s2)
    rt = round(difflib.SequenceMatcher(None, s1, s2).ratio(), 7)
    if len1 == 0 or len2 == 0:
        return 0, 0
    rt_scaled = round(rt*max(len1, len2)/min(len1, len2), 7)
    return rt, rt_scaled
Example #30
Source File: __main__.py From telegram-export with Mozilla Public License 2.0 | 5 votes |
def find_dialog(dialogs, query, top=25, threshold=0.7):
    """
    Rank dialogs by how well they match *query* and return the best ones.

    A dialog's score is the highest similarity between the query and its
    name, its entity's username or its entity's phone (the latter two only
    when present). Dialogs scoring above *threshold* are kept, best first.

    Returns a ``(matches, num_not_shown)`` pair: at most *top* matching
    dialogs as a tuple, and the number of further matches left out.
    """
    seq = difflib.SequenceMatcher(b=query, autojunk=False)

    def similarity(text):
        seq.set_seq1(text)
        return seq.ratio()

    scores = []
    for index, dialog in enumerate(dialogs):
        name_score = similarity(dialog.name)
        if query.lower() in dialog.name.lower():
            # If query is a substring of the name, make it a good match.
            # Slightly boost dialogs which were recently active, so not
            # all substring-matched dialogs have exactly the same score.
            boost = (index/len(dialogs))/25
            name_score = max(name_score, 0.75 + boost)

        username = getattr(dialog.entity, 'username', None)
        username_score = similarity(username) if username else 0

        phone = getattr(dialog.entity, 'phone', None)
        phone_score = similarity(phone) if phone else 0

        scores.append((dialog, max(name_score, username_score, phone_score)))

    ranked = sorted(scores, key=lambda t: t[1], reverse=True)
    matches = tuple(dialog for dialog, score in ranked if score > threshold)
    num_not_shown = max(len(matches) - top, 0)
    return matches[:top], num_not_shown