Python Examples of langdetect.detect

Source File: outsource_sayer.py From sketal with MIT License

6 votes

def get_lang(text):
    """Return the most probable language code for *text*, biased toward "ru".

    Every candidate reported by ``langdetect.detect_langs`` whose code is
    "ru" has its probability raised by 0.2 (in place) before candidates are
    compared. "ru" is also the answer when detection raises
    ``LangDetectException`` or yields no candidates.
    """
    try:
        candidates = langdetect.detect_langs(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return "ru"

    best = None
    for candidate in candidates:
        if candidate.lang == "ru":
            candidate.prob += 0.2

        # Strict "<" keeps the earliest candidate when two compare equal.
        if best is None or best < candidate:
            best = candidate

    return "ru" if best is None else best.lang

Source File: detect_language.py From freshonions-torscraper with GNU Affero General Public License v3.0

6 votes

def classify(text, debug=False):
    """Classify *text* by its top-ranked detected language.

    With ``debug`` set, return the ``(lang, prob)`` tuple of the top
    candidate unconditionally. Otherwise return the language code only when
    its probability is above 0.90, and ``None`` when it is not.
    """
    # identifier.set_languages(DETECT_LANGUAGES)
    try:
        candidates = detect_langs(text)
    except UnicodeDecodeError:
        # Retry with the input decoded from UTF-8.
        candidates = detect_langs(text.decode("utf-8"))

    top = candidates[0]

    if debug:
        return (top.lang, top.prob)

    return top.lang if top.prob > 0.90 else None

Source File: translatewiki.py From editquality with MIT License

5 votes

def process_normalized_lang_map(text):
    """Return detection probabilities summed per normalized language code.

    *text* is run through ``langdetect.detect_langs`` (``None``/empty is
    treated as ""). Every code in ``ALL_LANGS`` is mapped through
    ``COMMON_LANGUAGE_MAP`` (falling back to the code itself) and the
    detected probability, or 0.0 when the code was not detected, is added to
    that normalized code's total. A failed detection yields all-zero totals.
    """
    try:
        candidates = langdetect.detect_langs(text or "")
    except langdetect.lang_detect_exception.LangDetectException:
        candidates = []

    detected = {candidate.lang: candidate.prob for candidate in candidates}

    totals = defaultdict(lambda: 0.0)
    for code in ALL_LANGS:
        canonical = COMMON_LANGUAGE_MAP.get(code, code)
        totals[canonical] = totals[canonical] + detected.get(code, 0.0)

    return totals

Source File: textcat.py From OrangeAssassin with Apache License 2.0

5 votes

def check_language(self, msg, target=None):
    """Check the language of the message.

    Add the result to the metadata and trigger the
    rule if it is present in the config and the languages
    are not in the ok list.

    Languages are taken from ``langdetect.detect_langs(msg.text)`` and kept
    only when their probability exceeds ``textcat_acceptable_prob``. If
    detection fails, or more than ``textcat_max_languages`` languages
    remain, the rule does not trigger and the ``LANGUAGES`` tag is not set.

    :return True if the message language is unwanted and False
    otherwise
    """
    prob = self["textcat_acceptable_prob"]
    try:
        results = langdetect.detect_langs(msg.text)
    except langdetect.lang_detect_exception.LangDetectException as exc:
        # langdetect raises when the text has no usable features (for
        # example an empty body); treat that as "nothing unwanted found"
        # instead of letting the exception escape the rule.
        self.ctxt.log.debug("TextCat could not detect a language: %s", exc)
        return False
    self.ctxt.log.debug("TextCat results: %s", results)
    langs = [lang.lang for lang in results if lang.prob > prob]
    if len(langs) > self["textcat_max_languages"]:
        self.ctxt.log.debug("Too many languages.")
        return False
    msg.plugin_tags["LANGUAGES"] = " ".join(langs)
    ok_languages = self["ok_languages"]
    if "all" in ok_languages:
        # All good.
        return False
    # Unwanted as soon as one detected language is outside the ok list.
    return any(lang not in ok_languages for lang in langs)

Source File: product.py From impactstory-tng with MIT License

5 votes

def languages_with_examples(self):
    """Map each non-English language found in the posts to one example URL.

    Walks the "title" and "summary" of every post under
    ``self.altmetric_api_raw["posts"]``. A text counts when it has more
    than 7 space-separated words and its top detection has probability
    above 0.90 with a language other than "en". The language name comes
    from ``get_language_from_abbreviation``.

    :return: dict of language name -> ``post["url"]``; later posts overwrite
        earlier ones for the same language. A ``KeyError``,
        ``AttributeError`` or ``TypeError`` while walking the data stops the
        scan and returns whatever was collected so far.
    """
    resp = {}

    try:
        # .values() works on both Python 2 and 3. The previous .iteritems()
        # raised AttributeError on Python 3, which the handler below
        # swallowed, so the method silently always returned {}.
        for posts in self.altmetric_api_raw["posts"].values():
            for post in posts:
                for key in ("title", "summary"):
                    text = post[key]

                    # Short texts are never used, so skip detection for them.
                    if len(text.split(" ")) <= 7:
                        continue

                    try:
                        top_detection = langdetect.detect_langs(text)[0]
                    except langdetect.lang_detect_exception.LangDetectException:
                        continue

                    if top_detection.prob > 0.90 and top_detection.lang != "en":
                        language_name = get_language_from_abbreviation(top_detection.lang)

                        # overwrites.  that's ok, we just want one example
                        resp[language_name] = post["url"]

    except (KeyError, AttributeError, TypeError):
        pass

    return resp

Source File: app.py From threatconnect-playbooks with Apache License 2.0

5 votes

def run(self):
    """Run the App main logic.

    Reads the input text from the playbook, detects its language, and
    writes the top language code and its probability to the playbook
    outputs ``detectedLanguageCode`` and ``detectedLanguageProbability``.
    Also sets ``self.exit_message`` to a summary of the detection.
    """
    text = self.tcex.playbook.read(self.args.text)

    # Detect once and reuse the result. langdetect is non-deterministic
    # unless seeded, so two separate calls could report a language code and
    # a probability that belong to different runs.
    top_detection = detect_langs(text)[0]
    detected_language_code = top_detection.lang
    detected_language_probability = top_detection.prob

    self.tcex.playbook.create_output('detectedLanguageCode', detected_language_code, 'String')
    self.tcex.playbook.create_output('detectedLanguageProbability', detected_language_probability, 'String')
    self.exit_message = 'Detected the language as {} (with a probability of {})'.format(detected_language_code, detected_language_probability)

Source File: wl_detection.py From Wordless with GNU General Public License v3.0

4 votes

def detect_lang(main, file):
    """Detect the language of a text file.

    Reads ``file['path']`` using ``file['encoding']``, either whole or
    limited to the configured ``number_lines``, classifies the text with
    ``langid`` and converts the resulting code with
    ``wl_conversion.to_iso_639_3``. Three codes are adjusted first:

    * 'zh' becomes the best-ranked of 'zh-cn' / 'zh-tw' reported by
      ``langdetect`` (written with an underscore), defaulting to 'zh_cn';
    * 'no' becomes 'nb';
    * 'sr' becomes 'sr_cyrl'.

    :return: ``(lang, success)``. On any error ``lang`` is the configured
        default language and ``success`` is ``False``.
    """
    try:
        detection_settings = main.settings_custom['auto_detection']['detection_settings']

        with open(file['path'], 'r', encoding=file['encoding']) as f:
            if detection_settings['number_lines_no_limit']:
                text = f.read()
            else:
                max_lines = detection_settings['number_lines']
                # zip() stops once range() is exhausted, so at most
                # max_lines lines are taken from the file.
                text = ''.join(line for _, line in zip(range(max_lines), f))

        lang_code_639_1 = langid.classify(text)[0]

        # Chinese (Simplified) & Chinese (Traditional)
        if lang_code_639_1 == 'zh':
            lang_code_639_1 = 'zh_cn'

            candidates = sorted(langdetect.detect_langs(text), key=lambda item: -item.prob)
            for candidate in candidates:
                if candidate.lang in ['zh-cn', 'zh-tw']:
                    lang_code_639_1 = candidate.lang.replace('-', '_')

                    break
        # Norwegian Bokmål
        elif lang_code_639_1 == 'no':
            lang_code_639_1 = 'nb'
        # Serbian (Cyrillic)
        elif lang_code_639_1 == 'sr':
            lang_code_639_1 = 'sr_cyrl'

        lang = wl_conversion.to_iso_639_3(main, lang_code_639_1)

        success = True
    except Exception:
        # Any failure falls back to the default language. Exception, unlike
        # a bare except, lets KeyboardInterrupt and SystemExit propagate.
        lang = main.settings_custom['auto_detection']['default_settings']['default_lang']

        success = False

    return lang, success

Source File: encrypted_shared_preferences.py From scrounger with BSD 3-Clause "New" or "Revised" License

4 votes

def run(self):
    """Report shared-preferences files whose content looks like plain text.

    Starts the application, waits 5 seconds, lists the files under each
    data path's ``shared_prefs`` directory and runs language detection on
    each file's content. A file is reported when the top detection's
    probability exceeds ``self.min_percentage`` (a percentage, e.g. 75).

    :return: ``{"<name>_result": result}``, or a ``{"print": ...}`` message
        when the application is not installed.
    """
    result = {
        "title": "Application Does Not Encrypt Shared Preferences",
        "details": "",
        "severity": "Medium",
        "report": False
    }

    if not self.device.installed(self.identifier):
        return {"print": "Application not installed"}

    Log.info("Starting the application")
    self.device.start(self.identifier)
    sleep(5)

    Log.info("Finding files in application's data")
    target_paths = ["{}/shared_prefs".format(file_path) for file_path in
        self.device.data_paths(self.identifier)]

    listed_files = []
    for data_path in target_paths:
        listed_files.extend(self.device.find_files(data_path))

    Log.info("Analysing application's data")

    from langdetect.lang_detect_exception import LangDetectException

    # Convert the percentage to a probability. The previous
    # "0.{}".format(...) trick gave 0.5 for 5 and 0.1 for 100.
    min_probability = float(self.min_percentage) / 100

    report_files = []
    for filename in listed_files:
        if not filename:
            continue

        file_content = self.device.file_content(filename)

        try:
            lang = detect_langs(file_content)[0]
        except LangDetectException:
            # No detectable language features (e.g. empty content):
            # nothing readable to report for this file.
            Log.debug("{} language could not be detected".format(filename))
            continue

        Log.debug("{} language {}: {}".format(filename,
            lang.lang, lang.prob))

        if lang.prob > min_probability:
            report_files.append(filename)

    if report_files:
        result.update({
            "report": True,
            "details": "* Unencrypted Files:\n * {}".format("\n * ".join(
                report_files))
        })

    return {
        "{}_result".format(self.name()): result
    }

Source File: unencrypted_keychain_data.py From scrounger with BSD 3-Clause "New" or "Revised" License

4 votes

def run(self):
    """Report keychain entries of the application that look like plain text.

    Obtains the binary's entitlements through ``EModule``, picks the
    keychain id (the "keychain-access-groups" entitlement when present,
    otherwise ``self.identifier``), and collects the keychain entries whose
    entitlement group, account or service contains that id. An entry is
    reported when the top language detection of its stringified data has a
    probability above ``self.min_percentage`` (a percentage, e.g. 75).

    :return: ``{"<name>_result": result}``, or a ``{"print": ...}`` message
        when no entitlements could be obtained.
    """
    result = {
        "title": "Application Saves Unencrypted Data In Keychain",
        "details": "",
        "severity": "Low",
        "report": False
    }

    Log.info("Getting keychain's IDs")

    ent_module = EModule()
    ent_module.binary = self.binary
    ent_result = ent_module.run()
    entitlements = None
    for result_key in ent_result:
        if result_key.endswith("_entitlements"):
            entitlements = ent_result[result_key]

    if not entitlements:
        return {"print": "Couldn't get entitlements from the bianry."}

    keychain_id = self.identifier
    if "keychain-access-groups" in entitlements:
        keychain_id = entitlements["keychain-access-groups"]

    keychain_module = KeychainModule()
    keychain_module.device = self.device
    keychain_module.output = None
    keychain_result = keychain_module.run()
    keychain_data = keychain_result["keychain_data"]

    # Keep entries that mention the keychain id in any identifying field.
    identifying_fields = ("entitlement_group", "account", "service")
    data = [
        str(entry['keychain_data'])
        for entry in keychain_data
        if any(entry[field] and keychain_id in entry[field]
               for field in identifying_fields)
    ]

    from langdetect.lang_detect_exception import LangDetectException

    # Convert the percentage to a probability. The previous
    # "0.{}".format(...) trick gave 0.5 for 5 and 0.1 for 100.
    min_probability = float(self.min_percentage) / 100

    report_data = []
    for item in data:
        try:
            lang = detect_langs(item)[0]
        except LangDetectException:
            # No detectable language features (e.g. empty data): skip.
            continue

        if lang.prob > min_probability:
            report_data.append(item)

    if report_data:
        result.update({
            "report": True,
            "details": "The following data was found:\n* {}".format(
                "\n* ".join(report_data))
        })

    return {
        "{}_result".format(self.name()): result
    }

Python langdetect.detect_langs() Examples