Python difflib.SequenceMatcher() Examples

The following are 30 code examples of difflib.SequenceMatcher(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module difflib, or try the search function.
Example #1
Source File: test_hoeffding_tree_regressor.py    From scikit-multiflow with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_hoeffding_tree_regressor_model_description():
    """Check the textual model description of a freshly trained regressor.

    A regressor with mean leaf predictions is trained on 500 generated
    samples, and its description must be more than 90% similar
    (``SequenceMatcher.ratio()``) to the reference text below.
    """
    generator = RegressionGenerator(
        n_samples=500, n_features=20, n_informative=15, random_state=1
    )
    model = HoeffdingTreeRegressor(leaf_prediction='mean')

    # Train on a single batch of 500 samples.
    features, targets = generator.next_sample(500)
    model.partial_fit(features, targets)

    expected_description = (
        "if Attribute 6 <= 0.1394515530995348:\n"
        "  Leaf = Statistics {0: 276.0000, 1: -21537.4157, 2: 11399392.2187}\n"
        "if Attribute 6 > 0.1394515530995348:\n"
        "  Leaf = Statistics {0: 224.0000, 1: 22964.8868, 2: 10433581.2534}\n"
    )

    # A fuzzy comparison is used instead of strict equality.
    similarity = SequenceMatcher(
        None, expected_description, model.get_model_description()
    ).ratio()
    assert similarity > 0.9
Example #2
Source File: structural_similarity.py    From html-similarity with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def structural_similarity(document_1, document_2):
    """
    Computes the structural similarity between two DOM Trees
    :param document_1: html string
    :param document_2: html string
    :return: the ``difflib.SequenceMatcher`` ratio of the two sequences
        produced by ``get_tags``, or 0 when parsing either document fails
    """
    try:
        trees = [
            lxml.html.parse(StringIO(markup))
            for markup in (document_1, document_2)
        ]
    except Exception as e:
        # Parsing problems are reported and treated as "no similarity".
        print(e)
        return 0

    first_tags, second_tags = get_tags(trees[0]), get_tags(trees[1])
    matcher = difflib.SequenceMatcher()
    matcher.set_seqs(first_tags, second_tags)
    return matcher.ratio()
Example #3
Source File: __init__.py    From faces with GNU General Public License v2.0 6 votes vote down vote up
def _sortKeywords(keyword, kwds):
    """Return the keywords ordered from best to worst match for *keyword*.

    Each keyword is scored with ``ratcliff``; a keyword starting with the
    searched one gets a 0.5 bonus, and (only for searches longer than four
    characters) a keyword merely containing it gets a 0.3 bonus.
    """
    matcher = SequenceMatcher()
    matcher.set_seq1(keyword.lower())
    # Plain containment is only rewarded for long enough searches.
    reward_containment = len(keyword) > 4
    scored = []
    for candidate in kwds:
        score = ratcliff(keyword, candidate, matcher)
        if candidate.startswith(keyword):
            score = score + 0.5
        elif reward_containment and keyword in candidate:
            score = score + 0.3
        scored.append((score, candidate))
    # Ascending sort followed by a reversal: highest score first.
    return [candidate for _, candidate in sorted(scored)[::-1]]
Example #4
Source File: helpers.py    From faces with GNU General Public License v2.0 6 votes vote down vote up
def getAKAsInLanguage(movie, lang, _searchedTitle=None):
    """Return a list of AKAs of a movie, in the specified language.

    If _searchedTitle is given, the AKAs are sorted by their similarity
    to it (most similar first), using difflib.SequenceMatcher.ratio() on
    the lower-cased, utf8-encoded titles."""
    akas = [aka for language, aka in akasLanguages(movie)
            if lang == language]
    if _searchedTitle:
        # NOTE: ``unicode`` is the Python 2 text type; this block is
        # Python 2 code.
        if isinstance(_searchedTitle, unicode):
            _searchedTitle = _searchedTitle.encode('utf8')
        wanted = _searchedTitle.lower()
        scores = []
        for aka in akas:
            m_aka = aka
            # Compare byte strings with byte strings.
            if isinstance(m_aka, unicode):
                m_aka = m_aka.encode('utf8')
            similarity = difflib.SequenceMatcher(None, m_aka.lower(),
                                                 wanted).ratio()
            # Store (score, original aka) pairs so that they can be sorted.
            scores.append((similarity, aka))
        scores.sort(reverse=True)
        akas = [x[1] for x in scores]
    return akas
Example #5
Source File: password_validation.py    From bioforum with MIT License 6 votes vote down vote up
def validate(self, password, user=None):
    """Reject passwords that are too similar to one of the user's attributes.

    Every attribute named in ``self.user_attributes`` that holds a
    non-empty string is compared (as a whole and split on non-word
    characters, case-insensitively) with the password; a
    ``SequenceMatcher.quick_ratio()`` of at least ``self.max_similarity``
    raises ``ValidationError``. Nothing is checked when *user* is falsy.
    """
    if not user:
        return

    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not (value and isinstance(value, str)):
            continue
        for candidate in re.split(r'\W+', value) + [value]:
            matcher = SequenceMatcher(a=password.lower(), b=candidate.lower())
            if matcher.quick_ratio() < self.max_similarity:
                continue
            # Prefer the model field's verbose name for the error message.
            try:
                verbose_name = str(user._meta.get_field(attribute_name).verbose_name)
            except FieldDoesNotExist:
                verbose_name = attribute_name
            raise ValidationError(
                _("The password is too similar to the %(verbose_name)s."),
                code='password_too_similar',
                params={'verbose_name': verbose_name},
            )
Example #6
Source File: emake.py    From emake with GNU General Public License v2.0 6 votes vote down vote up
def __java_final (self, home):
		path = [ home ]
		subdir = []
		try:
			for sub in os.listdir(home):
				newpath = os.path.join(home, sub)
				if os.path.isdir(newpath):
					import difflib
					m = difflib.SequenceMatcher(None, sys.platform, sub)
					subdir.append((m.ratio(), sub))
		except:
			pass
		subdir.sort()
		if subdir:
			path.append(os.path.join(home, subdir[-1][1]))
		return ' '.join([ '-I%s'%self.pathtext(n) for n in path ])

	# Get the Java configuration
Example #7
Source File: func.py    From fastlane with MIT License 6 votes vote down vote up
def __show_diff(expected, actual):
    """Render the difference between two strings as a single colored string.

    Unchanged text is emitted as is, text only in *actual* is green, text
    only in *expected* is red, and for replaced spans the new text (from
    *actual*) is blue. The result starts with a style reset.
    """
    matcher = difflib.SequenceMatcher(None, expected, actual)
    pieces = [Style.RESET_ALL]

    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if tag == "equal":
            pieces.append(matcher.a[a_lo:a_hi])
            continue
        if tag == "insert":
            color, text = Fore.GREEN, matcher.b[b_lo:b_hi]
        elif tag == "delete":
            color, text = Fore.RED, matcher.a[a_lo:a_hi]
        elif tag == "replace":
            color, text = Fore.BLUE, matcher.b[b_lo:b_hi]
        else:
            raise RuntimeError("unexpected opcode")
        pieces.append(color + text + Style.RESET_ALL)

    return "".join(pieces)
Example #8
Source File: __init__.py    From sneakpeek with MIT License 6 votes vote down vote up
def is_needle_in_hay(cls, needle, hay):
    """Return the best similarity between *needle* and any word window of *hay*.

    *hay* is split on whitespace and scanned with word n-grams that are
    20% longer (rounded down) than the needle's word count; each n-gram is
    compared to *needle* with ``SequenceMatcher.ratio()``. The result is 0
    when no window scores above zero.
    """
    word_count = len(needle.split())
    window = word_count + int(.2 * word_count)

    similarities = [
        SequenceMatcher(None, u" ".join(chunk), needle).ratio()
        for chunk in ngrams(hay.split(), window)
    ]
    # The leading 0 is the "nothing found" result and wins ties with 0.0.
    return max([0] + similarities)  # how confident are we that needle was found in hay

    # https://stackoverflow.com/a/31505798
    # given a string paragraph, return a list of sentences 
Example #9
Source File: fuzz.py    From PythonOS with MIT License 5 votes vote down vote up
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring
    as a number between 0 and 100."""
    s1, s2 = utils.make_type_consistent(s1, s2)

    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

    # Every matching block is a (shorter_idx, longer_idx, size) triple.
    # The best partial match is aligned with at least one of those blocks,
    #   e.g. shorter = "abcd", longer = XXXbcdeEEE
    #   block = (1,3,3)
    #   best score === ratio("abcd", "Xbcd")
    alignments = SequenceMatcher(None, shorter, longer).get_matching_blocks()

    candidates = []
    for short_idx, long_idx, _size in alignments:
        # Slide a window of len(shorter) over longer, clamped at its start.
        start = max(long_idx - short_idx, 0)
        window = longer[start:start + len(shorter)]

        score = SequenceMatcher(None, shorter, window).ratio()
        if score > .995:
            # Practically identical: stop early.
            return 100
        candidates.append(score)

    return utils.intr(100 * max(candidates))


##############################
# Advanced Scoring Functions #
############################## 
Example #10
Source File: _fuzzywuzzy_token_sort.py    From abydos with GNU General Public License v3.0 5 votes vote down vote up
def sim(self, src: str, tar: str) -> float:
    """Compute the FuzzyWuzzy Token Sort similarity of two strings.

    Each string is tokenized with ``self.params['tokenizer']``, its tokens
    are sorted and re-joined with single spaces, and the two normalized
    strings are compared with ``SequenceMatcher.ratio()``.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        FuzzyWuzzy Token Sort similarity

    Examples
    --------
    >>> cmp = FuzzyWuzzyTokenSort()
    >>> cmp.sim('cat', 'hat')
    0.6666666666666666
    >>> cmp.sim('Niall', 'Neil')
    0.6666666666666666
    >>> cmp.sim('aluminum', 'Catalan')
    0.4
    >>> cmp.sim('ATCG', 'TAGC')
    0.5


    .. versionadded:: 0.4.0

    """
    def _sorted_tokens(text):
        tokens = self.params['tokenizer'].tokenize(text).get_list()
        return ' '.join(sorted(tokens))

    normalized_src = _sorted_tokens(src)
    normalized_tar = _sorted_tokens(tar)
    return SequenceMatcher(None, normalized_src, normalized_tar).ratio()
Example #11
Source File: namediff.py    From mtgencode with MIT License 5 votes vote down vote up
def f_nearest_per_thread(workitem):
    """Run ``f_nearest`` for every name of one work item.

    *workitem* is a ``(worknames, names, n)`` triple. The matchers for
    *names* are built here, because each worker process needs its own.
    Returns the result of ``map`` over *worknames*.
    """
    worknames, names, n = workitem
    # each thread (well, process) needs to generate its own matchers
    matchers = []
    for candidate in names:
        matchers.append(difflib.SequenceMatcher(b=candidate, autojunk=False))

    def nearest(name):
        return f_nearest(name, matchers, n)

    return map(nearest, worknames)
Example #12
Source File: collation.py    From OpenDoor with GNU General Public License v3.0 5 votes vote down vote up
def process(self, response):
        """
        Compare the response body with previously seen bodies.

        Only responses whose status is in ``self.DEFAULT_STATUSES`` and whose
        content length exceeds ``self.MIN_CONTENT_LENGTH`` are considered.

        :param response: response object; only its ``status`` is read here
            before it is handed to the parent ``process``
        :return: ``self.RESPONSE_INDEX`` when the body has the same length as
            the previous one or is similar enough to a remembered body,
            otherwise None
        """

        if response.status in self.DEFAULT_STATUSES:
            # NOTE(review): self._body is presumably populated by the parent
            # process() call -- confirm against the base class.
            super().process(response)
            length = self.__get_content_length()
            if self.MIN_CONTENT_LENGTH < length:
                # the page is allowed for comparison

                if not self.previous_item:
                    # 1st match. Push items for next compare step
                    self.previous_item.update({'length': length, 'text': self._body})
                    return None
                else:
                    # The second half of this condition is always true here:
                    # it repeats the enclosing MIN_CONTENT_LENGTH check.
                    if length == self.previous_item.get('length') and self.MIN_CONTENT_LENGTH < length:
                        # identical, seems to drop failed for success
                        return self.RESPONSE_INDEX
                    else:
                        # Similarity of the current body to the previous one.
                        matcher = SequenceMatcher(a=self.previous_item['text'], b=self._body)
                        # The returned blocks are discarded here.
                        matcher.get_matching_blocks()

                        if 'length' in self.current_item:
                            # A body was stored in current_item earlier: an
                            # identical ratio against it also counts as a hit.
                            next_matcher = SequenceMatcher(a=self.current_item['text'], b=self._body)
                            if next_matcher.ratio() == matcher.ratio():
                                return self.RESPONSE_INDEX
                        if self.MIN_RATIO_INDEX < matcher.ratio():
                            return self.RESPONSE_INDEX
                        else:
                            # Not similar enough: remember this body in
                            # current_item for later comparisons.
                            self.current_item.update({'length': length, 'text': self._body})

                    # Only reached when nothing was returned above; the body
                    # becomes the new "previous" item (the condition repeats
                    # the enclosing MIN_CONTENT_LENGTH check).
                    if self.MIN_CONTENT_LENGTH < length:
                        self.previous_item.update({'length': length, 'text': self._body})
        return None
Example #13
Source File: london_underground.py    From python-astar with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def get_station_by_name(stations, name):
    """Look up a station by name; the name does not have to be exact.

    :param stations: mapping whose values have a ``name`` attribute
    :param name: the (approximate) station name, compared case-insensitively
    :return: the station whose name has the highest
        ``SequenceMatcher.ratio()`` against *name* if that ratio is above
        0.7, otherwise None (also when *stations* is empty)
    """
    name = name.lower()
    ratios = [(SequenceMatcher(None, name, v.name.lower()).ratio(), v)
              for v in stations.values()]
    if not ratios:
        # max() would raise ValueError on an empty sequence.
        return None
    best_ratio, best_station = max(ratios, key=lambda pair: pair[0])
    return best_station if best_ratio > 0.7 else None
Example #14
Source File: unique.py    From angr with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def sequence_matcher_similarity(state_a, state_b):
    """
    Compare two states by the basic block addresses in their histories,
    using the `difflib.SequenceMatcher` ratio.
    :param state_a: The first state to compare
    :param state_b: The second state to compare
    :return: The ratio between the two ``history.bbl_addrs`` sequences
    """
    histories = [tuple(state.history.bbl_addrs) for state in (state_a, state_b)]
    matcher = SequenceMatcher()
    matcher.set_seqs(histories[0], histories[1])
    return matcher.ratio()
Example #15
Source File: TestCmd.py    From GYP3 with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def simple_diff(a, b, fromfile='', tofile='',
                fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    """
    Return the lines of a plain, unadorned "diff"-style comparison.

    The signature mirrors difflib.context_diff (diff -c) and
    difflib.unified_diff (diff -u); only ``a`` and ``b`` are used here.
    Each element of both sequences is converted with ``to_str`` first.
    """
    a = [to_str(q) for q in a]
    b = [to_str(q) for q in b]

    def span(first, last):
        # A single line is shown as "N", several lines as "N,M" (1-based).
        if first + 1 == last:
            return str(last)
        return '%s,%s' % (first + 1, last)

    def marked(prefix, lines):
        return [prefix + line for line in lines]

    result = []
    for op, a1, a2, b1, b2 in difflib.SequenceMatcher(None, a, b).get_opcodes():
        if op == 'delete':
            result.append("%sd%d" % (span(a1, a2), b1))
            result.extend(marked('< ', a[a1:a2]))
        elif op == 'insert':
            result.append("%da%s" % (a1, span(b1, b2)))
            result.extend(marked('> ', b[b1:b2]))
        elif op == 'replace':
            result.append("%sc%s" % (span(a1, a2), span(b1, b2)))
            result.extend(marked('< ', a[a1:a2]))
            result.append('---')
            result.extend(marked('> ', b[b1:b2]))
    return result
Example #16
Source File: test_difflib.py    From oss-ftp with MIT License 5 votes vote down vote up
def test_one_insert(self):
        sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100)
        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
        self.assertEqual(list(sm.get_opcodes()),
            [   ('insert', 0, 0, 0, 1),
                ('equal', 0, 100, 1, 101)])
        sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
        self.assertEqual(list(sm.get_opcodes()),
            [   ('equal', 0, 50, 0, 50),
                ('insert', 50, 50, 50, 51),
                ('equal', 50, 100, 51, 101)]) 
Example #17
Source File: features.py    From TaxoRL with MIT License 5 votes vote down vote up
def LCS(x, y):
    """Score the longest common contiguous block of *x* and *y* from 0 to 10.

    The size of the longest matching block is normalized as
    ``2 * size / (len(x) + len(y))`` (a value in [0, 1]), rounded to one
    decimal and scaled to an integer in [0, 10]. Two empty sequences
    share no block and score 0.
    """
    total_len = len(x) + len(y)
    if total_len == 0:
        # Avoid dividing by zero when both inputs are empty.
        return 0
    match = SequenceMatcher(None, x, y).find_longest_match(0, len(x), 0, len(y))
    res = 2.0 * match.size / total_len  # [0, 1]
    return int(round(res, 1) * 10)  # [0,10]
Example #18
Source File: fuzz.py    From PythonOS with MIT License 5 votes vote down vote up
def ratio(s1, s2):
    """Return the ``SequenceMatcher`` ratio of two strings, scaled by 100.

    The inputs are first passed through ``utils.make_type_consistent``
    and the scaled ratio through ``utils.intr``.
    """
    s1, s2 = utils.make_type_consistent(s1, s2)

    similarity = SequenceMatcher(None, s1, s2).ratio()
    return utils.intr(100 * similarity)
Example #19
Source File: routing.py    From RSSNewsGAE with Apache License 2.0 5 votes vote down vote up
def closest_rule(self, adapter):
    """Return the rule of *adapter* that best resembles this request.

    Rules are scored mostly (98%) by the similarity of their endpoint to
    ``self.endpoint``; 1% each is added when ``self.values`` is a subset
    of the rule's arguments and when ``self.method`` is among the rule's
    methods. Returns None when there is no adapter or it has no rules.
    """
    if not (adapter and adapter.map._rules):
        return None

    def _score_rule(rule):
        endpoint_score = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        values_fit = bool(set(self.values or ()).issubset(rule.arguments))
        method_fits = bool(rule.methods and self.method in rule.methods)
        return 0.98 * endpoint_score + 0.01 * values_fit + 0.01 * method_fits

    return max(adapter.map._rules, key=_score_rule)
Example #20
Source File: test_difflib.py    From BinderFilter with MIT License 5 votes vote down vote up
def test_comparing_empty_lists(self):
        # Check fix for bug #979794
        group_gen = difflib.SequenceMatcher(None, [], []).get_grouped_opcodes()
        self.assertRaises(StopIteration, group_gen.next)
        diff_gen = difflib.unified_diff([], [])
        self.assertRaises(StopIteration, diff_gen.next) 
Example #21
Source File: test_difflib.py    From BinderFilter with MIT License 5 votes vote down vote up
def test_ratio_for_null_seqn(self):
        # Check clearing of SF bug 763023
        s = difflib.SequenceMatcher(None, [], [])
        self.assertEqual(s.ratio(), 1)
        self.assertEqual(s.quick_ratio(), 1)
        self.assertEqual(s.real_quick_ratio(), 1) 
Example #22
Source File: test_difflib.py    From BinderFilter with MIT License 5 votes vote down vote up
def test_one_insert_homogenous_sequence(self):
        # By default autojunk=True and the heuristic kicks in for a sequence
        # of length 200+
        seq1 = 'b' * 200
        seq2 = 'a' + 'b' * 200

        sm = difflib.SequenceMatcher(None, seq1, seq2)
        self.assertAlmostEqual(sm.ratio(), 0, places=3)

        # Now turn the heuristic off
        sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
        self.assertAlmostEqual(sm.ratio(), 0.9975, places=3) 
Example #23
Source File: test_difflib.py    From BinderFilter with MIT License 5 votes vote down vote up
def test_one_delete(self):
        sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
        self.assertAlmostEqual(sm.ratio(), 0.994, places=3)
        self.assertEqual(list(sm.get_opcodes()),
            [   ('equal', 0, 40, 0, 40),
                ('delete', 40, 41, 40, 40),
                ('equal', 41, 81, 40, 80)]) 
Example #24
Source File: test_difflib.py    From BinderFilter with MIT License 5 votes vote down vote up
def test_one_insert(self):
        sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100)
        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
        self.assertEqual(list(sm.get_opcodes()),
            [   ('insert', 0, 0, 0, 1),
                ('equal', 0, 100, 1, 101)])
        sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
        self.assertEqual(list(sm.get_opcodes()),
            [   ('equal', 0, 50, 0, 50),
                ('insert', 50, 50, 50, 51),
                ('equal', 50, 100, 51, 101)]) 
Example #25
Source File: test_parser.py    From selectolax with MIT License 5 votes vote down vote up
def test_nodes():
    """Parse a small document and sanity-check the parser's output.

    ``root`` and ``body`` must be ``Node`` instances, and the serialized
    HTML must be at least as long as the input and more than 80% similar
    to it.
    """
    html = (
        '<div><p id="p1"></p><p id="p2"></p>'
        '<p id="p3"><a>link</a></p>'
        '<p id="p4"></p><p id="p5">text</p><p id="p6"></p></div>'
    )
    parser = HTMLParser(html)

    assert isinstance(parser.root, Node)
    assert isinstance(parser.body, Node)

    rendered = parser.html
    assert len(rendered) >= len(html)
    similarity = SequenceMatcher(None, html, rendered).ratio()
    assert similarity > 0.8
Example #26
Source File: dist_utils.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def _longest_match_ratio(str1, str2):
    """Return the longest matching block size relative to the shorter input.

    Spaces are treated as junk by the matcher. The division is delegated
    to ``np_utils._try_divide``.
    """
    len1, len2 = len(str1), len(str2)
    matcher = SequenceMatcher(lambda ch: ch == " ", str1, str2)
    longest = matcher.find_longest_match(0, len1, 0, len2).size
    return np_utils._try_divide(longest, min(len1, len2))
Example #27
Source File: dist_utils.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def _longest_match_size(str1, str2):
    sq = SequenceMatcher(lambda x: x==" ", str1, str2)
    match = sq.find_longest_match(0, len(str1), 0, len(str2))
    return match.size 
Example #28
Source File: dist_utils.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
    return d 
Example #29
Source File: homedepot_functions.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def seq_matcher(s1,s2):
    """Return the SequenceMatcher ratio and a length-scaled variant of it.

    The first value is the ratio rounded to 7 decimals; the second is that
    ratio multiplied by longer/shorter length, also rounded to 7 decimals.
    Both are 0 when either input is empty.
    """
    ratio = round(difflib.SequenceMatcher(None, s1, s2).ratio(), 7)
    lengths = (len(s1), len(s2))
    if 0 in lengths:
        return 0, 0
    scaled = round(ratio*max(lengths)/min(lengths), 7)
    return ratio, scaled
Example #30
Source File: __main__.py    From telegram-export with Mozilla Public License 2.0 5 votes vote down vote up
def find_dialog(dialogs, query, top=25, threshold=0.7):
    """
    Rank dialogs by how well they match a query.

    A dialog's score is the best ``SequenceMatcher`` ratio of the query
    against its name, its entity's username and its entity's phone (the
    last two only when present). Returns a tuple of at most ``top``
    dialogs scoring above ``threshold``, best first, together with the
    number of matching dialogs that were left out.
    """
    seq = difflib.SequenceMatcher(b=query, autojunk=False)

    def similarity(text):
        seq.set_seq1(text)
        return seq.ratio()

    ranked = []
    for index, dialog in enumerate(dialogs):
        candidates = [similarity(dialog.name)]
        if query.lower() in dialog.name.lower():
            # If query is a substring of the name, make it a good match.
            # The small index-based boost keeps substring-matched dialogs
            # from all having exactly the same score.
            candidates.append(0.75 + (index/len(dialogs))/25)
        for field in ('username', 'phone'):
            value = getattr(dialog.entity, field, None)
            candidates.append(similarity(value) if value else 0)
        ranked.append((dialog, max(candidates)))

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    matches = tuple(dialog for dialog, score in ranked if score > threshold)
    num_not_shown = max(len(matches) - top, 0)
    return matches[:top], num_not_shown