Python Examples of difflib.SequenceMatcher

Source File: test_hoeffding_tree_regressor.py From scikit-multiflow with BSD 3-Clause "New" or "Revised" License

6 votes

def test_hoeffding_tree_regressor_model_description():
    """Check the textual description of a mean-predicting Hoeffding tree.

    Fits a ``HoeffdingTreeRegressor`` on 500 samples drawn from a seeded
    ``RegressionGenerator`` and requires the generated model description
    to have a ``SequenceMatcher`` ratio above 0.9 against the reference
    text (a fuzzy comparison rather than strict equality).
    """
    n_samples = 500
    learner = HoeffdingTreeRegressor(leaf_prediction='mean')
    stream = RegressionGenerator(
        n_samples=n_samples, n_features=20, n_informative=15, random_state=1
    )

    # Train on the whole stream in a single batch.
    X, y = stream.next_sample(n_samples)
    learner.partial_fit(X, y)

    expected_description = (
        "if Attribute 6 <= 0.1394515530995348:\n"
        "  Leaf = Statistics {0: 276.0000, 1: -21537.4157, 2: 11399392.2187}\n"
        "if Attribute 6 > 0.1394515530995348:\n"
        "  Leaf = Statistics {0: 224.0000, 1: 22964.8868, 2: 10433581.2534}\n"
    )

    similarity = SequenceMatcher(
        None, expected_description, learner.get_model_description()
    ).ratio()
    assert similarity > 0.9

Source File: structural_similarity.py From html-similarity with BSD 3-Clause "New" or "Revised" License

6 votes

def structural_similarity(document_1, document_2):
    """
    Computes the structural similarity between two DOM Trees
    :param document_1: html string
    :param document_2: html string
    :return: the ``SequenceMatcher`` ratio of the two documents' tag
             sequences, or 0 when parsing either document fails
    """
    try:
        tree_1 = lxml.html.parse(StringIO(document_1))
        tree_2 = lxml.html.parse(StringIO(document_2))
    except Exception as e:
        # Best effort: report the parse problem and score the pair as 0.
        print(e)
        return 0

    # Compare the documents by the tag sequences get_tags() extracts.
    matcher = difflib.SequenceMatcher(None, get_tags(tree_1), get_tags(tree_2))
    return matcher.ratio()

Source File: __init__.py From faces with GNU General Public License v2.0

6 votes

def _sortKeywords(keyword, kwds):
    """Return kwds ordered from the best to the worst match for keyword.

    Every candidate gets its ratcliff() score against the keyword; a
    candidate that starts with the keyword earns a 0.5 bonus, and - only
    when the keyword is longer than 4 characters - a candidate that
    merely contains it earns 0.3 instead.
    """
    matcher = SequenceMatcher()
    matcher.set_seq1(keyword.lower())
    # Substring bonuses are only granted for keywords longer than 4 chars.
    bonusForContained = len(keyword) > 4
    scored = []
    for kwd in kwds:
        score = ratcliff(keyword, kwd, matcher)
        if kwd.startswith(keyword):
            score = score + 0.5
        elif bonusForContained and keyword in kwd:
            score = score + 0.3
        scored.append((score, kwd))
    # Ascending sort followed by a reversal: best scores come first.
    scored.sort()
    scored.reverse()
    return [kwd for _score, kwd in scored]

Source File: helpers.py From faces with GNU General Public License v2.0

6 votes

def getAKAsInLanguage(movie, lang, _searchedTitle=None):
    """Return a list of AKAs of a movie, in the specified language.
    If _searchedTitle is given, the AKAs are sorted by their similarity
    to it (most similar first)."""
    akas = []
    for language, aka in akasLanguages(movie):
        if lang == language:
            akas.append(aka)
    if _searchedTitle:
        # Compare utf8-encoded byte strings on both sides.
        if isinstance(_searchedTitle, unicode):
            _searchedTitle = _searchedTitle.encode('utf8')
        searched = _searchedTitle.lower()
        scores = []
        for aka in akas:
            m_aka = aka
            # isinstance() needs the type to test against; calling it with
            # a single argument raises TypeError.
            if isinstance(m_aka, unicode):
                m_aka = m_aka.encode('utf8')
            # Store a (score, aka) pair: list.append() takes exactly one
            # argument, and the numeric ratio is what must be sorted on
            # (not the SequenceMatcher object).
            ratio = difflib.SequenceMatcher(None, m_aka.lower(),
                                            searched).ratio()
            scores.append((ratio, aka))
        scores.sort(reverse=True)
        akas = [x[1] for x in scores]
    return akas

Source File: password_validation.py From bioforum with MIT License

6 votes

def validate(self, password, user=None):
        """Reject a password that is too similar to one of the user's attributes.

        Each attribute named in ``self.user_attributes`` that holds a
        non-empty string is compared, both as a whole and split on
        non-word characters, against the lower-cased password. A
        ``ValidationError`` is raised as soon as a comparison's
        ``quick_ratio()`` reaches ``self.max_similarity``. Nothing is
        checked when no user is given.
        """
        if not user:
            return

        for attribute_name in self.user_attributes:
            value = getattr(user, attribute_name, None)
            if not (value and isinstance(value, str)):
                continue
            # The individual words first, then the complete value.
            candidates = re.split(r'\W+', value)
            candidates.append(value)
            lowered_password = password.lower()
            for candidate in candidates:
                matcher = SequenceMatcher(a=lowered_password, b=candidate.lower())
                if matcher.quick_ratio() >= self.max_similarity:
                    # Prefer the model field's verbose name in the message.
                    try:
                        field = user._meta.get_field(attribute_name)
                        verbose_name = str(field.verbose_name)
                    except FieldDoesNotExist:
                        verbose_name = attribute_name
                    raise ValidationError(
                        _("The password is too similar to the %(verbose_name)s."),
                        code='password_too_similar',
                        params={'verbose_name': verbose_name},
                    )

Source File: emake.py From emake with GNU General Public License v2.0

6 votes

def __java_final (self, home):
		"""Build the '-I' include flags for a Java home directory.

		The result always contains home itself. When home has
		sub-directories, the one whose name is most similar to
		sys.platform (difflib ratio, ties broken by name order) is added
		as a second include path. Every path is passed through
		self.pathtext() before being prefixed with '-I'.
		"""
		import difflib
		path = [ home ]
		subdir = []
		try:
			for sub in os.listdir(home):
				newpath = os.path.join(home, sub)
				if os.path.isdir(newpath):
					m = difflib.SequenceMatcher(None, sys.platform, sub)
					subdir.append((m.ratio(), sub))
		except OSError:
			# Best effort: a missing or unreadable home simply yields no
			# platform sub-directory. Only filesystem errors are ignored, so
			# KeyboardInterrupt/SystemExit are no longer swallowed.
			pass
		subdir.sort()
		if subdir:
			# After the ascending sort the best match is the last entry.
			path.append(os.path.join(home, subdir[-1][1]))
		return ' '.join([ '-I%s'%self.pathtext(n) for n in path ])

	# Get the Java configuration

Source File: func.py From fastlane with MIT License

6 votes

def __show_diff(expected, actual):
    """Return a colourised, single-string diff of expected versus actual.

    Unchanged text is copied as-is, insertions (from actual) are wrapped
    in green, deletions (from expected) in red, and replacements show the
    new text from actual in blue. The string starts with a style reset.
    Raises RuntimeError on an opcode outside those four.
    """
    seqm = difflib.SequenceMatcher(None, expected, actual)
    pieces = [Style.RESET_ALL]

    for tag, a0, a1, b0, b1 in seqm.get_opcodes():
        if tag == "equal":
            pieces.append(seqm.a[a0:a1])
            continue

        if tag == "insert":
            colour, text = Fore.GREEN, seqm.b[b0:b1]
        elif tag == "delete":
            colour, text = Fore.RED, seqm.a[a0:a1]
        elif tag == "replace":
            # Only the replacement text is shown, not what it replaced.
            colour, text = Fore.BLUE, seqm.b[b0:b1]
        else:
            raise RuntimeError("unexpected opcode")
        pieces.append(colour + text + Style.RESET_ALL)

    return "".join(pieces)

Source File: __init__.py From sneakpeek with MIT License

6 votes

def is_needle_in_hay(cls, needle, hay):
        """Return the best similarity between needle and any word window of hay.

        hay is split on whitespace and scanned with n-grams whose length
        is the needle's word count plus 20% (rounded down). Each n-gram
        is joined with single spaces and compared to needle with
        SequenceMatcher.ratio(); the highest ratio seen is returned, or 0
        when no n-gram scores above 0.
        """
        needle_length = len(needle.split())
        window_size = needle_length + int(.2 * needle_length)
        max_sim_val = 0

        for ngram in ngrams(hay.split(), window_size):
            hay_ngram = u" ".join(ngram)
            similarity = SequenceMatcher(None, hay_ngram, needle).ratio()
            # Only the score is tracked; the best-matching window itself
            # was never used by the caller.
            if similarity > max_sim_val:
                max_sim_val = similarity

        return max_sim_val  # how confident are we that needle was found in hay

    # https://stackoverflow.com/a/31505798
    # given a string paragraph, return a list of sentences

Source File: fuzz.py From PythonOS with MIT License

5 votes

def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring
    as a number between 0 and 100."""
    s1, s2 = utils.make_type_consistent(s1, s2)

    # Slide the shorter string over the longer one.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    window = len(shorter)

    # Each matching block is (idx_in_shorter, idx_in_longer, length); the
    # best partial match is aligned with at least one of those blocks.
    #   e.g. shorter = "abcd", longer = XXXbcdeEEE
    #   block = (1,3,3)
    #   best score === ratio("abcd", "Xbcd")
    scores = []
    for short_idx, long_idx, _size in SequenceMatcher(
            None, shorter, longer).get_matching_blocks():
        # Start of the window in `longer` that lines the block up,
        # clamped at the beginning of the string.
        long_start = max(long_idx - short_idx, 0)
        long_substr = longer[long_start:long_start + window]

        r = SequenceMatcher(None, shorter, long_substr).ratio()
        if r > .995:
            # Close enough to a perfect match: stop early.
            return 100
        scores.append(r)

    return utils.intr(100 * max(scores))


##############################
# Advanced Scoring Functions #
##############################

Source File: _fuzzywuzzy_token_sort.py From abydos with GNU General Public License v3.0

5 votes

def sim(self, src: str, tar: str) -> float:
        """Return the FuzzyWuzzy Token Sort similarity of two strings.

        Both strings are tokenized with ``self.params['tokenizer']``,
        their tokens are sorted and re-joined with single spaces, and the
        ``SequenceMatcher`` ratio of the two resulting strings is returned.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            FuzzyWuzzy Token Sort similarity

        Examples
        --------
        >>> cmp = FuzzyWuzzyTokenSort()
        >>> cmp.sim('cat', 'hat')
        0.6666666666666666
        >>> cmp.sim('Niall', 'Neil')
        0.6666666666666666
        >>> cmp.sim('aluminum', 'Catalan')
        0.4
        >>> cmp.sim('ATCG', 'TAGC')
        0.5


        .. versionadded:: 0.4.0

        """
        tokenizer = self.params['tokenizer']
        # Each string is tokenized and fully consumed before the next one.
        sorted_src, sorted_tar = (
            ' '.join(sorted(tokenizer.tokenize(text).get_list()))
            for text in (src, tar)
        )

        return SequenceMatcher(None, sorted_src, sorted_tar).ratio()

Source File: namediff.py From mtgencode with MIT License

5 votes

def f_nearest_per_thread(workitem):
    """Run f_nearest for every work name of one work item.

    workitem is a (worknames, names, n) tuple. One SequenceMatcher is
    built per entry of names (as its second sequence, autojunk disabled)
    and the result of mapping f_nearest over worknames is returned.
    """
    worknames, names, n = workitem
    # each thread (well, process) needs to generate its own matchers
    matchers = []
    for name in names:
        matchers.append(difflib.SequenceMatcher(b=name, autojunk=False))

    def nearest(workname):
        return f_nearest(workname, matchers, n)

    return map(nearest, worknames)

Source File: collation.py From OpenDoor with GNU General Public License v3.0

5 votes

def process(self, response):
        """
        Process data

        Compares the current response body with previously stored bodies
        and reports the response when it looks like a repetition of an
        earlier page (same length, or a sufficiently similar text).

        :param response: response object; only its ``status`` is read here
                         before it is handed to the parent ``process``
        :return: ``self.RESPONSE_INDEX`` when the response is considered a
                 duplicate of a stored page, otherwise None
        """

        # Only responses with one of the default statuses are inspected.
        if response.status in self.DEFAULT_STATUSES:
            super().process(response)
            length = self.__get_content_length()
            if self.MIN_CONTENT_LENGTH < length:
                # the page is allowed for comparison

                if not self.previous_item:
                    # 1st match. Push items for next compare step
                    self.previous_item.update({'length': length, 'text': self._body})
                    return None
                else:
                    # NOTE(review): the second operand repeats the enclosing
                    # MIN_CONTENT_LENGTH check and is always true here.
                    if length == self.previous_item.get('length') and self.MIN_CONTENT_LENGTH < length:
                        # same length as the previously stored page: treated
                        # as identical content
                        return self.RESPONSE_INDEX
                    else:
                        # Fuzzy comparison against the previously stored page.
                        matcher = SequenceMatcher(a=self.previous_item['text'], b=self._body)
                        # Return value is unused.
                        # NOTE(review): presumably warms the matcher's cache
                        # before the ratio() calls below - confirm.
                        matcher.get_matching_blocks()

                        if 'length' in self.current_item:
                            # A second reference page is stored: report the
                            # response when it is exactly as similar to that
                            # page as it is to the previous one.
                            next_matcher = SequenceMatcher(a=self.current_item['text'], b=self._body)
                            if next_matcher.ratio() == matcher.ratio():
                                return self.RESPONSE_INDEX
                        if self.MIN_RATIO_INDEX < matcher.ratio():
                            # Similar enough to the previous page.
                            return self.RESPONSE_INDEX
                        else:
                            # Different page: remember it as the current item.
                            self.current_item.update({'length': length, 'text': self._body})

                    # Reached only when nothing was returned above: the body
                    # becomes the new "previous" page for the next comparison.
                    if self.MIN_CONTENT_LENGTH < length:
                        self.previous_item.update({'length': length, 'text': self._body})
        return None

Source File: london_underground.py From python-astar with BSD 3-Clause "New" or "Revised" License

5 votes

def get_station_by_name(stations, name):
    """lookup by name, the name does not have to be exact.

    Every station in the ``stations`` mapping is scored by the
    case-insensitive SequenceMatcher ratio between ``name`` and the
    station's ``name`` attribute. The best-scoring station is returned
    when its ratio is above 0.7; otherwise (including when ``stations``
    is empty) None is returned.
    """
    name = name.lower()
    ratios = [(SequenceMatcher(None, name, v.name.lower()).ratio(), v)
              for v in stations.values()]
    if not ratios:
        # max() raises ValueError on an empty sequence; "nothing to match"
        # is reported the same way as "no good match".
        return None
    best = max(ratios, key=lambda a: a[0])
    if best[0] > 0.7:
        return best[1]
    return None

Source File: unique.py From angr with BSD 2-Clause "Simplified" License

5 votes

def sequence_matcher_similarity(state_a, state_b):
        """
        Similarity of two states, measured as the `difflib.SequenceMatcher`
        ratio between the basic-block addresses recorded in their histories
        (``history.bbl_addrs``).
        :param state_a: The first state to compare
        :param state_b: The second state to compare
        :return: The ratio reported by `SequenceMatcher.ratio()`
        """
        # Materialise both address histories as tuples before matching.
        first_trace = tuple(state_a.history.bbl_addrs)
        second_trace = tuple(state_b.history.bbl_addrs)
        matcher = SequenceMatcher(None, first_trace, second_trace)
        return matcher.ratio()

Source File: TestCmd.py From GYP3 with BSD 3-Clause "New" or "Revised" License

5 votes

def simple_diff(a, b, fromfile='', tofile='',
                fromfiledate='', tofiledate='', n=3, lineterm='\n'):
    """
    A function with the same calling signature as difflib.context_diff
    (diff -c) and difflib.unified_diff (diff -u) but which returns a
    list of lines formatted like the output of the simple, unadorned
    'diff' command. Only a and b are used; the remaining parameters
    exist for signature compatibility.
    """
    a = [to_str(q) for q in a]
    b = [to_str(q) for q in b]

    def comma(x1, x2):
        # A one-line range is written as a single number, a longer one
        # as "first,last" (1-based first line).
        if x1 + 1 == x2:
            return str(x2)
        return '%s,%s' % (x1 + 1, x2)

    result = []
    opcodes = difflib.SequenceMatcher(None, a, b).get_opcodes()
    for op, a1, a2, b1, b2 in opcodes:
        if op == 'delete':
            result.append("%sd%d" % (comma(a1, a2), b1))
            result.extend('< ' + line for line in a[a1:a2])
        elif op == 'insert':
            result.append("%da%s" % (a1, comma(b1, b2)))
            result.extend('> ' + line for line in b[b1:b2])
        elif op == 'replace':
            result.append("%sc%s" % (comma(a1, a2), comma(b1, b2)))
            result.extend('< ' + line for line in a[a1:a2])
            result.append('---')
            result.extend('> ' + line for line in b[b1:b2])
        # any other opcode (i.e. 'equal') produces no output
    return result

Source File: test_difflib.py From oss-ftp with MIT License

5 votes

def test_one_insert(self):
        sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100)
        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
        self.assertEqual(list(sm.get_opcodes()),
            [   ('insert', 0, 0, 0, 1),
                ('equal', 0, 100, 1, 101)])
        sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
        self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
        self.assertEqual(list(sm.get_opcodes()),
            [   ('equal', 0, 50, 0, 50),
                ('insert', 50, 50, 50, 51),
                ('equal', 50, 100, 51, 101)])

Source File: features.py From TaxoRL with MIT License

5 votes

def LCS(x, y):
    """Score the longest common contiguous block of x and y from 0 to 10.

    The length of the longest matching block is normalised by the
    average length of the two sequences, rounded to one decimal and
    scaled to an integer bucket.
    """
    matcher = SequenceMatcher(None, x, y)
    _, _, size = matcher.find_longest_match(0, len(x), 0, len(y))
    res = 2.0 * size / (len(x) + len(y))  # [0, 1]
    bucket = round(res, 1) * 10
    return int(bucket)  # [0,10]

Source File: fuzz.py From PythonOS with MIT License

5 votes

def ratio(s1, s2):
    """Return the SequenceMatcher similarity of s1 and s2 scaled by 100.

    Both inputs are first passed through utils.make_type_consistent();
    the scaled ratio is converted with utils.intr().
    """
    s1, s2 = utils.make_type_consistent(s1, s2)

    similarity = SequenceMatcher(None, s1, s2).ratio()
    return utils.intr(100 * similarity)

Source File: routing.py From RSSNewsGAE with Apache License 2.0

5 votes

def closest_rule(self, adapter):
        """Return the rule of ``adapter.map`` that best matches this object.

        Rules are scored mostly (98%) by how similar their endpoint is to
        ``self.endpoint``; 1% is added when all of ``self.values`` are
        among the rule's arguments and another 1% when ``self.method`` is
        one of the rule's methods. Returns None when there is no adapter
        or the adapter's map has no rules.
        """
        if not (adapter and adapter.map._rules):
            return None

        def _score_rule(rule):
            endpoint_score = 0.98 * difflib.SequenceMatcher(
                None, rule.endpoint, self.endpoint
            ).ratio()
            values_score = 0.01 * bool(
                set(self.values or ()).issubset(rule.arguments)
            )
            method_score = 0.01 * bool(
                rule.methods and self.method in rule.methods
            )
            return endpoint_score + values_score + method_score

        return max(adapter.map._rules, key=_score_rule)