Python ssdeep.compare() Examples

The following are 16 code examples of ssdeep.compare(), drawn from open-source projects. Each example lists the project, source file, and license it comes from, so you can follow it back to the original code. You may also want to check out the other available functions and classes of the ssdeep module.
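Before the project examples, here is a minimal, self-contained sketch of how ssdeep.hash() and ssdeep.compare() are typically used together. The sample strings and the score threshold are illustrative only and do not come from any of the projects below.

import ssdeep

# Two similar inputs: fuzzy hashing is designed to tolerate small edits.
text_a = "The quick brown fox jumps over the lazy dog. " * 20
text_b = "The quick brown fox leaps over the lazy dog. " * 20

hash_a = ssdeep.hash(text_a)  # a CTPH signature of the form "blocksize:chunk:double_chunk"
hash_b = ssdeep.hash(text_b)

# compare() takes two ssdeep signatures and returns an integer in [0, 100]:
# 0 means no measurable similarity, 100 means a near-identical match.
score = ssdeep.compare(hash_a, hash_b)
print(score)

Most of the examples below build on exactly this pattern: hash once, compare many times against stored signatures, and act on a score threshold.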
Example #1
Source File: game.py    From DueUtil with GNU General Public License v3.0
def get_spam_level(player, message_content):
    """
    Gets a spam level for a message using a
    fuzzy hash; a score > 50% means it's probably spam.
    """

    message_hash = ssdeep.hash(message_content)
    spam_level = 0
    spam_levels = [ssdeep.compare(message_hash, prior_hash) for prior_hash in player.last_message_hashes if
                   prior_hash is not None]
    if len(spam_levels) > 0:
        spam_level = max(spam_levels)
    player.last_message_hashes.append(message_hash)
    if spam_level > SPAM_TOLERANCE:
        player.spam_detections += 1
    return spam_level 
Example #2
Source File: apifuzz.py    From codex-backend with MIT License
def searchFuzzy(fuzz, limit, thresh):
    client = MongoClient(envget('metadata.host'), envget('metadata.port'))
    db = client[envget('db_metadata_name')]
    coll_meta = db["db_metadata_collection"]

    f1 = coll_meta.find({}, {"file_id": 1, "fuzzy_hash": 1}).limit(limit)
    l = []
    for f in f1:
        l.append(f)

    ret = {}
    for a in l:
        res = -1
        try:
            res = ssdeep.compare(a["fuzzy_hash"], fuzz)
        except InternalError:
            print(str(res) + "------" +
                  str(a["fuzzy_hash"]) + "-----" + str(a["file_id"]))
            continue
        if(res >= thresh):
            ret[a["file_id"]] = res

    return ret 
Example #3
Source File: SearchModule.py    From codex-backend with MIT License
def fuzz_search_fast(id, p, fuzz):
    block = int(fuzz.split(':')[0])
    lap = 500
    coll_meta = db[envget("db_metadata_collection")]

    f1 = coll_meta.find({}, {"file_id": 1, p: 1})
    l = []
    for f in f1:
        l.append(f)
    dic = {}
    for a in l:
        res = -1
        try:
            f_comp = a[p]
            block_comp = int(f_comp.split(':')[0])
            if(block_comp <= block + lap and block_comp >= block - lap):
                res = ssdeep.compare(f_comp, fuzz)
                if(res > 0):
                    dic[a["file_id"]] = res
        except Exception as e:
            logging.exception(
                "fuzz_search_fast(id=" + str(id) + ",p=" + str(p) + ",fuzz=" + str(fuzz))
            continue 
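Example #3 narrows the search by parsing the leading block size out of each stored signature (the part before the first ':') and only calling ssdeep.compare() on hashes whose block size falls inside a window around the query's. Below is a simplified, standalone sketch of the same idea; the function name, the candidate dictionary, and the default threshold are invented for illustration, and the filter uses the equal/double/half block-size rule that ssdeep itself requires for a non-zero score, rather than the fixed ±500 window used above.

import ssdeep

def prefiltered_matches(query_sig, candidate_sigs, threshold=50):
    """Compare query_sig only against candidates with a compatible block size.

    An ssdeep signature has the form "blocksize:chunk:double_chunk". A match
    is only possible when the block sizes are equal, double, or half of each
    other, so everything else can be skipped without calling compare().
    """
    query_block = int(query_sig.split(':', 1)[0])
    compatible = {query_block, query_block * 2, query_block // 2}

    matches = {}
    # candidate_sigs maps an identifier (e.g. a file id) to an ssdeep signature string
    for name, sig in candidate_sigs.items():
        if int(sig.split(':', 1)[0]) not in compatible:
            continue
        score = ssdeep.compare(query_sig, sig)
        if score >= threshold:
            matches[name] = score
    return matches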
Example #4
Source File: ssdeep_analytics.py    From multiscanner with Mozilla Public License 2.0
def main():
    parser = argparse.ArgumentParser(description='Script to interact with '
        'Multiscanner\'s Elasticsearch datastore to run analytics based on '
        'ssdeep hash.')
    group = parser.add_mutually_exclusive_group(required=True)
    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                        help='Increase output to stdout')
    group.add_argument('-c', '--compare', dest='compare', action='store_true',
        help='Run ssdeep.compare using a few optimizations based on ssdeep'
        ' hash structure.')
    group.add_argument('-g', '--group', dest='group', action='store_true',
        help='Returns group of samples based on ssdeep hash.')

    args = parser.parse_args()

    ssdeep_analytic = SSDeepAnalytic(debug=args.verbose)

    if args.compare:
        ssdeep_analytic.ssdeep_compare()
        print('[*] Success')
    elif args.group:
        pprint(ssdeep_analytic.ssdeep_group())
        print('[*] Success') 
Example #5
Source File: gitgot.py    From GitGot with GNU Lesser General Public License v3.0
def should_parse(repo, state, is_gist=False):
    owner_login = repo.owner.login if is_gist else repo.repository.owner.login
    if owner_login in state.bad_users:
        print(bcolors.FAIL + "Failed check: Ignore User" + bcolors.ENDC)
        return False
    if not is_gist and repo.repository.name in state.bad_repos:
        print(bcolors.FAIL + "Failed check: Ignore Repo" + bcolors.ENDC)
        return False
    if not is_gist and repo.name in state.bad_files:
        print(bcolors.FAIL + "Failed check: Ignore File" + bcolors.ENDC)
        return False

    # Fuzzy Hash Comparison
    try:
        if not is_gist:
            # Temporary fix for PyGithub until fixed upstream (PyGithub#1178)
            repo._url.value = repo._url.value.replace(
                repo._path.value,
                urllib.parse.quote(repo._path.value))

        candidate_sig = ssdeep.hash(repo.decoded_content)
        for sig in state.bad_signatures:
            similarity = ssdeep.compare(candidate_sig, sig)
            if similarity > SIMILARITY_THRESHOLD:
                print(
                    bcolors.FAIL +
                    "Failed check: Ignore Fuzzy Signature on Contents "
                    "({}% Similarity)".format(similarity) +
                    bcolors.ENDC)
                return False
    except github.UnknownObjectException:
        print(
            bcolors.FAIL +
            "API Error: File no longer exists on github.com" +
            bcolors.ENDC)
        return False
    return True 
Example #6
Source File: vectorization.py    From IntroductionToMachineLearningForSecurityPros with GNU General Public License v3.0
def vectorize(feature_set, c2_data):
    vector = np.zeros((len(feature_set),), dtype=np.float)
    for index, (offset, code, ssdeep_hash) in enumerate(feature_set):
        if offset not in c2_data:
            continue
        if c2_data[offset]["code"] == code:
            d = ssdeep.compare(c2_data[offset]["content_ssdeep"], ssdeep_hash)
            d = float(d) / float(100.0)
            vector[index] = d

    return vector 
Example #7
Source File: vectorization.py    From IntroductionToMachineLearningForSecurityPros with GNU General Public License v3.0
def vectorize_with_sparse_features(sparse_feature_set, feature_count, c2_data):
    vector = lil_matrix((1, feature_count), dtype=np.float)
    for index, (offset, code, ssdeep_hash) in sparse_feature_set:
        if offset not in c2_data:
            continue
        if c2_data[offset]["code"] == code:
            d = ssdeep.compare(c2_data[offset]["content_ssdeep"], ssdeep_hash)
            d = float(d) / float(100.0)
            vector[0, index] = d

    return vector 
Example #8
Source File: cfire.py    From Security-Research with BSD 3-Clause "New" or "Revised" License
def ssdeepcompare(target, IP):
    try:
        ss_target = requests.get('http://{}/'.format(target))
        ssdeep_target_fuzz = ssdeep.hash(ss_target.text)
        print target, ssdeep_target_fuzz
        content = requests.get('https://{}'.format(IP), verify=False, timeout = 5, headers = {'Host': target})
        ssdeep_fuzz = ssdeep.hash(content.text)
        print IP, ssdeep_fuzz
        print "ssdeep score for", IP, "is", ssdeep.compare(ssdeep_target_fuzz, ssdeep_fuzz)
    except(requests.exceptions.ConnectionError):
        print "cant connect to", IP 
Example #9
Source File: hash.py    From FACT_core with GNU General Public License v3.0
def get_ssdeep_comparison(first, second):
    return ssdeep.compare(first, second) 
Example #10
Source File: file_coverage.py    From FACT_core with GNU General Public License v3.0
def _find_similar_file_for(self, file_uid: str, parent_uid: str, comparison_fo: FileObject):
    hash_one = self.database.get_ssdeep_hash(file_uid)
    if hash_one:
        id1 = self._get_similar_file_id(file_uid, parent_uid)
        for potential_match in comparison_fo.files_included:
            id2 = self._get_similar_file_id(potential_match, comparison_fo.uid)
            hash_two = self.database.get_ssdeep_hash(potential_match)
            if hash_two:
                ssdeep_similarity = ssdeep.compare(hash_one, hash_two)
                if ssdeep_similarity > self.ssdeep_ignore_threshold:
                    yield (id1, id2), ssdeep_similarity
Example #11
Source File: vectorization.py    From IDPanel with MIT License
def vectorize(feature_set, c2_data):
    vector = np.zeros((len(feature_set),), dtype=np.float)
    for index, (offset, code, ssdeep_hash) in enumerate(feature_set):
        if offset not in c2_data:
            continue
        if c2_data[offset]["code"] == code:
            d = ssdeep.compare(c2_data[offset]["content_ssdeep"], ssdeep_hash)
            d = float(d) / float(100.0)
            vector[index] = d

    return vector 
Example #12
Source File: vectorization.py    From IDPanel with MIT License
def vectorize_with_sparse_features(sparse_feature_set, feature_count, c2_data):
    vector = lil_matrix((1, feature_count), dtype=np.float)
    for index, (offset, code, ssdeep_hash) in sparse_feature_set:
        if offset not in c2_data:
            continue
        if c2_data[offset]["code"] == code:
            d = ssdeep.compare(c2_data[offset]["content_ssdeep"], ssdeep_hash)
            d = float(d) / float(100.0)
            vector[0, index] = d

    return vector 
Example #13
Source File: ssdeep_querying.py    From ssdeep-elastic with MIT License
def get_matching_items_by_ssdeep(ssdeep_value, threshold_grade):
    """
    A function that finds matching items by ssdeep comparison with optimizations using ElasticSearch
    :param ssdeep_value: The ssdeep hash value of the item
    :param threshold_grade: The grade being used as a threshold, only items that pass this grade will be returned
    :return: A List of matching items (in this case, a list of sha256 hash values)
    """
    chunksize, chunk, double_chunk = ssdeep_value.split(':')
    chunksize = int(chunksize)

    es = elasticsearch.Elasticsearch(['localhost:9200'])

    query = {
        'query': {
            'bool': {
                'must': [
                    {
                        'terms': {
                            'chunksize': [chunksize, chunksize * 2, int(chunksize / 2)]
                        }
                    },
                    {
                        'bool': {
                            'should': [
                                {
                                    'match': {
                                        'chunk': {
                                            'query': chunk
                                        }
                                    }
                                },
                                {
                                    'match': {
                                        'double_chunk': {
                                            'query': double_chunk
                                        }
                                    }
                                }
                            ],
                            'minimum_should_match': 1
                        }
                    }
                ]
            }
        }
    }

    results = es.search('ssdeep-index', body=query)

    sha256_list_to_return = []

    for record in results['hits']['hits']:
        record_ssdeep = record['_source']['ssdeep']
        ssdeep_grade = ssdeep.compare(record_ssdeep, ssdeep_value)

        if ssdeep_grade >= threshold_grade:
            sha256_list_to_return.append(record['_source']['sha256'])

    return sha256_list_to_return 
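Example #13 only works if the index already stores each signature's three components in separate fields. The helper below is a hypothetical sketch (not part of ssdeep-elastic) of how such a document might be built before indexing; the field names mirror the query above, but the actual index mapping and analyzer setup are assumptions.

import ssdeep

def build_ssdeep_document(data, sha256):
    """Split an ssdeep signature into the fields the query in Example #13 expects.

    A signature has the form "chunksize:chunk:double_chunk", where double_chunk
    is the hash computed at twice the chunk size. Storing the parts separately
    lets the search engine prefilter candidates by chunksize (x, 2x, x/2) and by
    textual overlap of the chunk strings, so ssdeep.compare() only has to run on
    a short list of plausible matches instead of the whole collection.
    """
    signature = ssdeep.hash(data)
    chunksize, chunk, double_chunk = signature.split(':')
    return {
        'sha256': sha256,
        'ssdeep': signature,
        'chunksize': int(chunksize),
        'chunk': chunk,
        'double_chunk': double_chunk,
    }

Indexing one such document per sample, and keeping the full signature in the ssdeep field, is all the query above needs in order to recover candidates for the final ssdeep.compare() pass.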
Example #14
Source File: parse_ssdeep.py    From android-malware-analysis with GNU General Public License v3.0
def main():
    all_hashes = {'malicious': [], 'benign': []}
    app_malicious_map = {} # mapping from android app names to 1 or 0 for malware or goodware
    similarity_buckets = ['similarity_limit_0', 'similarity_limit_0.2', 'similarity_limit_0.4', 'similarity_limit_0.6', 'similarity_limit_0.8', 'similarity_limit_1.0']
    root_dir = os.getcwd()
    for i, directory in enumerate(['benign_apk', 'malicious_apk']):
        os.chdir(directory)
        with open(directory.split('_')[0] + '_apk_ssdeep.csv') as hashes:
            for j, line in enumerate(hashes):
                if j == 0: continue
                b64hash = line.split(',')[0]
                app_name = line.split(',')[-1].split('/')[-1][:-2]
                app_malicious_map[app_name] = [1,0] if i else [0,1]
                all_hashes['malicious' if i else 'benign'].append((app_name, b64hash))
        os.chdir(root_dir)
    all_apps = {} # mapping from each app to its similarity score and classification
    num_zero = {}
    num_each = {}
    for category in all_hashes:
        num_zero[category] = 0
        num_each[category] = 0
        for app_and_hash in all_hashes[category]:
            similarity_scores = []
            this_score = app_and_hash[1]
            for i in range(1000):
                other_score = random.choice(all_hashes[category])[1]
                similarity_scores.append(ssdeep.compare(this_score, other_score))
            score = numpy.mean(similarity_scores)
            num_each[category] += 1
            if score == 0: num_zero[category] += 1
            bit_vector = []
            last_limit = -0.01
            for limit in similarity_buckets:
                float_limit = float(limit.split('_')[-1])
                if score <= float_limit and score > last_limit:
                    bit_vector.append(1)
                else:
                    bit_vector.append(0)
                last_limit = float_limit
            if not any(bit_vector): # score > 1
                bit_vector[-1] = 1
            all_apps[app_and_hash[0]] = {'vector': bit_vector, 'malicious': app_malicious_map[app_and_hash[0]]}
    with open('app_hash_vectors.json', 'w') as outfile:
        json.dump({'features': similarity_buckets, 'apps': all_apps}, outfile)
    print('{} of {} malicious apps and {} of {} benign apps had zero similarity found'.format(num_zero['malicious'], num_each['malicious'], num_zero['benign'], num_each['benign']))
    print('Wrote data on ' + str(len(all_apps)) + ' apps to a file.') 
Example #15
Source File: ssdeep_python.py    From Learning-Python-for-Forensics-Second-Edition with MIT License
def main(known_file, comparison, output_type):
    """
    The main function handles the main operations of the script
    :param known_file: path to known file
    :param comparison: path to look for similar files
    :param output_type: type of output to provide
    :return: None
    """

    # Check output formats
    if output_type not in OUTPUT_OPTS:
        logger.error(
            "Unsupported output format '{}' selected. Please "
            "use one of {}".format(
                output_type, ", ".join(OUTPUT_OPTS)))
        sys.exit(2)
    elif output_type == 'csv':
        # Special handling for CSV headers
        print('"similarity","known_file","known_hash",'
              '"comp_file","comp_hash"')

    # Check provided file paths
    known_file = os.path.abspath(known_file)
    comparison = os.path.abspath(comparison)

    # Generate ssdeep signature for known file
    if not os.path.exists(known_file):
        logger.error("Error - path {} not found".format(
            known_file))
        sys.exit(1)

    known_hash = ssdeep.hash_from_file(known_file)

    # Generate and test ssdeep signature for comparison file(s)
    if os.path.isdir(comparison):
        # Process files in folders
        for root, _, files in os.walk(comparison):
            for f in files:
                file_entry = os.path.join(root, f)
                comp_hash = ssdeep.hash_from_file(file_entry)
                comp_val = ssdeep.compare(known_hash, comp_hash)
                output(known_file, known_hash,
                       file_entry, comp_hash,
                       comp_val, output_type)

    elif os.path.isfile(comparison):
        # Process a single file
        comp_hash = ssdeep.hash_from_file(comparison)
        comp_val = ssdeep.compare(known_hash, comp_hash)
        output(known_file, known_hash, comparison, comp_hash,
               comp_val, output_type)
    else:
        logger.error("Error - path {} not found".format(
            comparison))
        sys.exit(1) 
Example #16
Source File: malfunction.py    From Malfunction with GNU Lesser General Public License v2.1
def process_sigs(cursor, sig_list, bin_list):
    """ Process the function signatures

    Go through every function and compare it to functions in every binary
    Get the highest score per function and add it to a score_list
    cursor - the database cursor
    sig_list - the list of function signatures for analysis
    bin_list - the list of binaries in the current database to compare to"""

    score_list = []

    maxval = 0
    for row in bin_list:
        cursor.execute("SELECT count(hash) FROM functions WHERE binaryid=?", (row[0],))
        maxval += int(cursor.fetchone()[0])
    maxval = maxval*len(sig_list)
    if progressbar:
        widgets = [" ", progressbar.Bar(marker="#"), " ", progressbar.Percentage(), " ", progressbar.ETA()]
        pbar = progressbar.ProgressBar(widgets=widgets,
                           maxval=maxval).start()
    else:
        pbar = None
    i = 0
    for row in bin_list:
        function_score_list = []

        for sig in sig_list:
            highest_score = 0
            cursor.execute("SELECT hash FROM functions WHERE binaryid=?",
                           (row[0], ))
            # h means hash, hash is a keyword in Python
            # so we can't use it

            for h in cursor.fetchall():
                strength = ssdeep.compare(sig, h[0])

                if strength > highest_score:
                    highest_score = strength

                i += 1
                if pbar:
                    pbar.update(i)
                elif i % 10000 == 0 or i == maxval:
                    print("%d / %d Done" % (i, maxval))

            function_score_list.append(highest_score)

        score_list.append(function_score_list)
    if pbar:
        pbar.finish()
    return score_list