Python ssdeep.compare() Examples

The following are 16 code examples of ssdeep.compare(), drawn from open-source projects. Each example lists the project, source file, and license it comes from, so you can follow it back to the original code. You may also want to check out the other available functions and classes of the ssdeep module.
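Before the project examples, here is a minimal, self-contained sketch of how ssdeep.hash() and ssdeep.compare() are typically used together. The sample strings and the score threshold are illustrative only and do not come from any of the projects below.

import ssdeep

# Two similar inputs: fuzzy hashing is designed to tolerate small edits.
text_a = "The quick brown fox jumps over the lazy dog. " * 20
text_b = "The quick brown fox leaps over the lazy dog. " * 20

hash_a = ssdeep.hash(text_a)  # a CTPH signature of the form "blocksize:chunk:double_chunk"
hash_b = ssdeep.hash(text_b)

# compare() takes two ssdeep signatures and returns an integer in [0, 100]:
# 0 means no measurable similarity, 100 means a near-identical match.
score = ssdeep.compare(hash_a, hash_b)
print(score)

Most of the examples below build on exactly this pattern: hash once, compare many times against stored signatures, and act on a score threshold.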
Example #1
Source File: game.py    From DueUtil with GNU General Public License v3.0
def get_spam_level(player, message_content):
    """
    Gets a spam level for a message using a
    fuzzy hash; a score > 50% means it's probably spam.
    """

    message_hash = ssdeep.hash(message_content)
    spam_level = 0
    spam_levels = [ssdeep.compare(message_hash, prior_hash) for prior_hash in player.last_message_hashes if
                   prior_hash is not None]
    if len(spam_levels) > 0:
        spam_level = max(spam_levels)
    player.last_message_hashes.append(message_hash)
    if spam_level > SPAM_TOLERANCE:
        player.spam_detections += 1
    return spam_level 
Example #2
Source File: apifuzz.py    From codex-backend with MIT License
def searchFuzzy(fuzz, limit, thresh):
    client = MongoClient(envget('metadata.host'), envget('metadata.port'))
    db = client[envget('db_metadata_name')]
    coll_meta = db["db_metadata_collection"]

    f1 = coll_meta.find({}, {"file_id": 1, "fuzzy_hash": 1}).limit(limit)
    l = []
    for f in f1:
        l.append(f)

    ret = {}
    for a in l:
        res = -1
        try:
            res = ssdeep.compare(a["fuzzy_hash"], fuzz)
        except InternalError:
            print(str(res) + "------" +
                  str(a["fuzzy_hash"]) + "-----" + str(a["file_id"]))
            continue
        if(res >= thresh):
            ret[a["file_id"]] = res

    return ret 
Example #3
Source File: SearchModule.py    From codex-backend with MIT License
def fuzz_search_fast(id, p, fuzz):
    block = int(fuzz.split(':')[0])
    lap = 500
    coll_meta = db[envget("db_metadata_collection")]

    f1 = coll_meta.find({}, {"file_id": 1, p: 1})
    l = []
    for f in f1:
        l.append(f)
    dic = {}
    for a in l:
        res = -1
        try:
            f_comp = a[p]
            block_comp = int(f_comp.split(':')[0])
            if(block_comp <= block + lap and block_comp >= block - lap):
                res = ssdeep.compare(f_comp, fuzz)
                if(res > 0):
                    dic[a["file_id"]] = res
        except Exception as e:
            logging.exception(
                "fuzz_search_fast(id=" + str(id) + ",p=" + str(p) + ",fuzz=" + str(fuzz))
            continue 
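Example #3 narrows the search by parsing the leading block size out of each stored signature (the part before the first ':') and only calling ssdeep.compare() on hashes whose block size falls inside a window around the query's. Below is a simplified, standalone sketch of the same idea; the function name, the candidate dictionary, and the default threshold are invented for illustration, and the filter uses the equal/double/half block-size rule that ssdeep itself requires for a non-zero score, rather than the fixed ±500 window used above.

import ssdeep

def prefiltered_matches(query_sig, candidate_sigs, threshold=50):
    """Compare query_sig only against candidates with a compatible block size.

    An ssdeep signature has the form "blocksize:chunk:double_chunk". A match
    is only possible when the block sizes are equal, double, or half of each
    other, so everything else can be skipped without calling compare().
    """
    query_block = int(query_sig.split(':', 1)[0])
    compatible = {query_block, query_block * 2, query_block // 2}

    matches = {}
    # candidate_sigs maps an identifier (e.g. a file id) to an ssdeep signature string
    for name, sig in candidate_sigs.items():
        if int(sig.split(':', 1)[0]) not in compatible:
            continue
        score = ssdeep.compare(query_sig, sig)
        if score >= threshold:
            matches[name] = score
    return matches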
Example #4
Source File: ssdeep_analytics.py    From multiscanner with Mozilla Public License 2.0
def main():
    parser = argparse.ArgumentParser(description='Script to interact with '
        'Multiscanner\'s Elasticsearch datastore to run analytics based on '
        'ssdeep hash.')
    group = parser.add_mutually_exclusive_group(required=True)
    parser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                        help='Increase output to stdout')
    group.add_argument('-c', '--compare', dest='compare', action='store_true',
        help='Run ssdeep.compare using a few optimizations based on ssdeep'
        ' hash structure.')
    group.add_argument('-g', '--group', dest='group', action='store_true',
        help='Returns group of samples based on ssdeep hash.')

    args = parser.parse_args()

    ssdeep_analytic = SSDeepAnalytic(debug=args.verbose)

    if args.compare:
        ssdeep_analytic.ssdeep_compare()
        print('[*] Success')
    elif args.group:
        pprint(ssdeep_analytic.ssdeep_group())
        print('[*] Success') 
Example #5
Source File: gitgot.py    From GitGot with GNU Lesser General Public License v3.0
def should_parse(repo, state, is_gist=False):
    owner_login = repo.owner.login if is_gist else repo.repository.owner.login
    if owner_login in state.bad_users:
        print(bcolors.FAIL + "Failed check: Ignore User" + bcolors.ENDC)
        return False
    if not is_gist and repo.repository.name in state.bad_repos:
        print(bcolors.FAIL + "Failed check: Ignore Repo" + bcolors.ENDC)
        return False
    if not is_gist and repo.name in state.bad_files:
        print(bcolors.FAIL + "Failed check: Ignore File" + bcolors.ENDC)
        return False

    # Fuzzy Hash Comparison
    try:
        if not is_gist:
            # Temporary fix for PyGithub until fixed upstream (PyGithub#1178)
            repo._url.value = repo._url.value.replace(
                repo._path.value,
                urllib.parse.quote(repo._path.value))

        candidate_sig = ssdeep.hash(repo.decoded_content)
        for sig in state.bad_signatures:
            similarity = ssdeep.compare(candidate_sig, sig)
            if similarity > SIMILARITY_THRESHOLD:
                print(
                    bcolors.FAIL +
                    "Failed check: Ignore Fuzzy Signature on Contents "
                    "({}% Similarity)".format(similarity) +
                    bcolors.ENDC)
                return False
    except github.UnknownObjectException:
        print(
            bcolors.FAIL +
            "API Error: File no longer exists on github.com" +
            bcolors.ENDC)
        return False
    return True 
Example #6
Source File: vectorization.py    From IntroductionToMachineLearningForSecurityPros with GNU General Public License v3.0
def vectorize(feature_set, c2_data):
    vector = np.zeros((len(feature_set),), dtype=np.float)
    for index, (offset, code, ssdeep_hash) in enumerate(feature_set):
        if offset not in c2_data:
            continue
        if c2_data[offset]["code"] == code:
            d = ssdeep.compare(c2_data[offset]["content_ssdeep"], ssdeep_hash)
            d = float(d) / float(100.0)
            vector[index] = d

    return vector 
Example #7
Source File: vectorization.py    From IntroductionToMachineLearningForSecurityPros with GNU General Public License v3.0
def vectorize_with_sparse_features(sparse_feature_set, feature_count, c2_data):
    vector = lil_matrix((1, feature_count), dtype=np.float)
    for index, (offset, code, ssdeep_hash) in sparse_feature_set:
        if offset not in c2_data:
            continue
        if c2_data[offset]["code"] == code:
            d = ssdeep.compare(c2_data[offset]["content_ssdeep"], ssdeep_hash)
            d = float(d) / float(100.0)
            vector[0, index] = d

    return vector 
Example #8
Source File: cfire.py    From Security-Research with BSD 3-Clause "New" or "Revised" License
def ssdeepcompare(target, IP):
    try:
        ss_target = requests.get('http://{}/'.format(target))
        ssdeep_target_fuzz = ssdeep.hash(ss_target.text)
        print target, ssdeep_target_fuzz
        content = requests.get('https://{}'.format(IP), verify=False, timeout = 5, headers = {'Host': target})
        ssdeep_fuzz = ssdeep.hash(content.text)
        print IP, ssdeep_fuzz
        print "ssdeep score for", IP, "is", ssdeep.compare(ssdeep_target_fuzz, ssdeep_fuzz)
    except(requests.exceptions.ConnectionError):
        print "cant connect to", IP 
Example #9
Source File: hash.py    From FACT_core with GNU General Public License v3.0
def get_ssdeep_comparison(first, second):
    return ssdeep.compare(first, second) 
Example #10
Source File: file_coverage.py    From FACT_core with GNU General Public License v3.0
def _find_similar_file_for(self, file_uid: str, parent_uid: str, comparison_fo: FileObject):
    hash_one = self.database.get_ssdeep_hash(file_uid)
    if hash_one:
        id1 = self._get_similar_file_id(file_uid, parent_uid)
        for potential_match in comparison_fo.files_included:
            id2 = self._get_similar_file_id(potential_match, comparison_fo.uid)
            hash_two = self.database.get_ssdeep_hash(potential_match)
            if hash_two:
                ssdeep_similarity = ssdeep.compare(hash_one, hash_two)
                if ssdeep_similarity > self.ssdeep_ignore_threshold:
                    yield (id1, id2), ssdeep_similarity
Example #11
Source File: vectorization.py    From IDPanel with MIT License
def vectorize(feature_set, c2_data):
    vector = np.zeros((len(feature_set),), dtype=np.float)
    for index, (offset, code, ssdeep_hash) in enumerate(feature_set):
        if offset not in c2_data:
            continue
        if c2_data[offset]["code"] == code:
            d = ssdeep.compare(c2_data[offset]["content_ssdeep"], ssdeep_hash)
            d = float(d) / float(100.0)
            vector[index] = d

    return vector 
Example #12
Source File: vectorization.py    From IDPanel with MIT License
def vectorize_with_sparse_features(sparse_feature_set, feature_count, c2_data):
    vector = lil_matrix((1, feature_count), dtype=np.float)
    for index, (offset, code, ssdeep_hash) in sparse_feature_set:
        if offset not in c2_data:
            continue
        if c2_data[offset]["code"] == code:
            d = ssdeep.compare(c2_data[offset]["content_ssdeep"], ssdeep_hash)
            d = float(d) / float(100.0)
            vector[0, index] = d

    return vector 
Example #13
Source File: ssdeep_querying.py    From ssdeep-elastic with MIT License
def get_matching_items_by_ssdeep(ssdeep_value, threshold_grade):
    """
    A function that finds matching items by ssdeep comparison with optimizations using ElasticSearch
    :param ssdeep_value: The ssdeep hash value of the item
    :param threshold_grade: The grade being used as a threshold, only items that pass this grade will be returned
    :return: A List of matching items (in this case, a list of sha256 hash values)
    """
    chunksize, chunk, double_chunk = ssdeep_value.split(':')
    chunksize = int(chunksize)

    es = elasticsearch.Elasticsearch(['localhost:9200'])

    query = {
        'query': {
            'bool': {
                'must': [
                    {
                        'terms': {
                            'chunksize': [chunksize, chunksize * 2, int(chunksize / 2)]
                        }
                    },
                    {
                        'bool': {
                            'should': [
                                {
                                    'match': {
                                        'chunk': {
                                            'query': chunk
                                        }
                                    }
                                },
                                {
                                    'match': {
                                        'double_chunk': {
                                            'query': double_chunk
                                        }
                                    }
                                }
                            ],
                            'minimum_should_match': 1
                        }
                    }
                ]
            }
        }
    }

    results = es.search('ssdeep-index', body=query)

    sha256_list_to_return = []

    for record in results['hits']['hits']:
        record_ssdeep = record['_source']['ssdeep']
        ssdeep_grade = ssdeep.compare(record_ssdeep, ssdeep_value)

        if ssdeep_grade >= threshold_grade:
            sha256_list_to_return.append(record['_source']['sha256'])

    return sha256_list_to_return 
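Example #13 only works if the index already stores each signature's three components in separate fields. The helper below is a hypothetical sketch (not part of ssdeep-elastic) of how such a document might be built before indexing; the field names mirror the query above, but the actual index mapping and analyzer setup are assumptions.

import ssdeep

def build_ssdeep_document(data, sha256):
    """Split an ssdeep signature into the fields the query in Example #13 expects.

    A signature has the form "chunksize:chunk:double_chunk", where double_chunk
    is the hash computed at twice the chunk size. Storing the parts separately
    lets the search engine prefilter candidates by chunksize (x, 2x, x/2) and by
    textual overlap of the chunk strings, so ssdeep.compare() only has to run on
    a short list of plausible matches instead of the whole collection.
    """
    signature = ssdeep.hash(data)
    chunksize, chunk, double_chunk = signature.split(':')
    return {
        'sha256': sha256,
        'ssdeep': signature,
        'chunksize': int(chunksize),
        'chunk': chunk,
        'double_chunk': double_chunk,
    }

Indexing one such document per sample, and keeping the full signature in the ssdeep field, is all the query above needs in order to recover candidates for the final ssdeep.compare() pass.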
Example #14
Source File: parse_ssdeep.py    From android-malware-analysis with GNU General Public License v3.0
def main():
    all_hashes = {'malicious': [], 'benign': []}
    app_malicious_map = {} # mapping from android app names to 1 or 0 for malware or goodware
    similarity_buckets = ['similarity_limit_0', 'similarity_limit_0.2', 'similarity_limit_0.4', 'similarity_limit_0.6', 'similarity_limit_0.8', 'similarity_limit_1.0']
    root_dir = os.getcwd()
    for i, directory in enumerate(['benign_apk', 'malicious_apk']):
        os.chdir(directory)
        with open(directory.split('_')[0] + '_apk_ssdeep.csv') as hashes:
            for j, line in enumerate(hashes):
                if j == 0: continue
                b64hash = line.split(',')[0]
                app_name = line.split(',')[-1].split('/')[-1][:-2]
                app_malicious_map[app_name] = [1,0] if i else [0,1]
                all_hashes['malicious' if i else 'benign'].append((app_name, b64hash))
        os.chdir(root_dir)
    all_apps = {} # mapping from each app to its similarity score and classification
    num_zero = {}
    num_each = {}
    for category in all_hashes:
        num_zero[category] = 0
        num_each[category] = 0
        for app_and_hash in all_hashes[category]:
            similarity_scores = []
            this_score = app_and_hash[1]
            for i in range(1000):
                other_score = random.choice(all_hashes[category])[1]
                similarity_scores.append(ssdeep.compare(this_score, other_score))
            score = numpy.mean(similarity_scores)
            num_each[category] += 1
            if score == 0: num_zero[category] += 1
            bit_vector = []
            last_limit = -0.01
            for limit in similarity_buckets:
                float_limit = float(limit.split('_')[-1])
                if score <= float_limit and score > last_limit:
                    bit_vector.append(1)
                else:
                    bit_vector.append(0)
                last_limit = float_limit
            if not any(bit_vector): # score > 1
                bit_vector[-1] = 1
            all_apps[app_and_hash[0]] = {'vector': bit_vector, 'malicious': app_malicious_map[app_and_hash[0]]}
    with open('app_hash_vectors.json', 'w') as outfile:
        json.dump({'features': similarity_buckets, 'apps': all_apps}, outfile)
    print('{} of {} malicious apps and {} of {} benign apps had zero similarity found'.format(num_zero['malicious'], num_each['malicious'], num_zero['benign'], num_each['benign']))
    print('Wrote data on ' + str(len(all_apps)) + ' apps to a file.') 
Example #15
Source File: ssdeep_python.py    From Learning-Python-for-Forensics-Second-Edition with MIT License
def main(known_file, comparison, output_type):
    """
    The main function handles the main operations of the script
    :param known_file: path to known file
    :param comparison: path to look for similar files
    :param output_type: type of output to provide
    :return: None
    """

    # Check output formats
    if output_type not in OUTPUT_OPTS:
        logger.error(
            "Unsupported output format '{}' selected. Please "
            "use one of {}".format(
                output_type, ", ".join(OUTPUT_OPTS)))
        sys.exit(2)
    elif output_type == 'csv':
        # Special handling for CSV headers
        print('"similarity","known_file","known_hash",'
              '"comp_file","comp_hash"')

    # Check provided file paths
    known_file = os.path.abspath(known_file)
    comparison = os.path.abspath(comparison)

    # Generate ssdeep signature for known file
    if not os.path.exists(known_file):
        logger.error("Error - path {} not found".format(
            known_file))
        sys.exit(1)

    known_hash = ssdeep.hash_from_file(known_file)

    # Generate and test ssdeep signature for comparison file(s)
    if os.path.isdir(comparison):
        # Process files in folders
        for root, _, files in os.walk(comparison):
            for f in files:
                file_entry = os.path.join(root, f)
                comp_hash = ssdeep.hash_from_file(file_entry)
                comp_val = ssdeep.compare(known_hash, comp_hash)
                output(known_file, known_hash,
                       file_entry, comp_hash,
                       comp_val, output_type)

    elif os.path.isfile(comparison):
        # Process a single file
        comp_hash = ssdeep.hash_from_file(comparison)
        comp_val = ssdeep.compare(known_hash, comp_hash)
        output(known_file, known_hash, comparison, comp_hash,
               comp_val, output_type)
    else:
        logger.error("Error - path {} not found".format(
            comparison))
        sys.exit(1) 
Example #16
Source File: malfunction.py    From Malfunction with GNU Lesser General Public License v2.1
def process_sigs(cursor, sig_list, bin_list):
    """ Process the function signatures

    Go through every function and compare it to functions in every binary
    Get the highest score per function and add it to a score_list
    cursor - the database cursor
    sig_list - the list of function signatures for analysis
    bin_list - the list of binaries in the current database to compare to"""

    score_list = []

    maxval = 0
    for row in bin_list:
        cursor.execute("SELECT count(hash) FROM functions WHERE binaryid=?", (row[0],))
        maxval += int(cursor.fetchone()[0])
    maxval = maxval*len(sig_list)
    if progressbar:
        widgets = [" ", progressbar.Bar(marker="#"), " ", progressbar.Percentage(), " ", progressbar.ETA()]
        pbar = progressbar.ProgressBar(widgets=widgets,
                           maxval=maxval).start()
    else:
        pbar = None
    i = 0
    for row in bin_list:
        function_score_list = []

        for sig in sig_list:
            highest_score = 0
            cursor.execute("SELECT hash FROM functions WHERE binaryid=?",
                           (row[0], ))
            # h means hash, hash is a keyword in Python
            # so we can't use it

            for h in cursor.fetchall():
                strength = ssdeep.compare(sig, h[0])

                if strength > highest_score:
                    highest_score = strength

                i += 1
                if pbar:
                    pbar.update(i)
                elif i % 10000 == 0 or i == maxval:
                    print("%d / %d Done" % (i, maxval))

            function_score_list.append(highest_score)

        score_list.append(function_score_list)
    if pbar:
        pbar.finish()
    return score_list