Python langdetect.detect_langs() Examples

The following are 9 code examples of langdetect.detect_langs(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module langdetect, or try the search function.
Example #1
Source File: outsource_sayer.py    From sketal with MIT License 6 votes vote down vote up
def get_lang(text):
    """Return the code of the most likely language of ``text``, favouring Russian.

    Every candidate returned by ``langdetect.detect_langs`` is considered.
    The "ru" candidate, if present, has 0.2 added to its probability (the
    candidate object is modified in place) before candidates are compared.

    Returns "ru" when detection raises ``LangDetectException`` or when no
    candidate is produced.
    """
    try:
        candidates = langdetect.detect_langs(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return "ru"

    best = None
    for candidate in candidates:
        # Bias the choice toward Russian.
        if candidate.lang == "ru":
            candidate.prob += 0.2

        # Relies on the ordering defined by the candidate objects
        # (presumably by probability -- confirm against langdetect).
        if best is None or best < candidate:
            best = candidate

    return "ru" if best is None else best.lang
Example #2
Source File: detect_language.py    From freshonions-torscraper with GNU Affero General Public License v3.0 6 votes vote down vote up
def classify(text, debug = False):
    """Classify the language of ``text`` using the top ``detect_langs`` result.

    If detection raises ``UnicodeDecodeError``, it is retried on
    ``text.decode("utf-8")``.

    :param text: the text to classify
    :param debug: when true, return the raw ``(lang, prob)`` pair of the top
        candidate without applying the confidence cut-off
    :return: ``(lang, prob)`` in debug mode; otherwise the language code when
        its probability is above 0.90, else ``None``
    """
    # identifier.set_languages(DETECT_LANGUAGES)
    try:
        candidates = detect_langs(text)
    except UnicodeDecodeError:
        candidates = detect_langs(text.decode("utf-8"))

    best = candidates[0]
    confidence = best.prob
    code = best.lang

    if debug:
        return (code, confidence)

    # Only report a language when the detector is confident enough.
    return code if confidence > 0.90 else None
Example #3
Source File: translatewiki.py    From editquality with MIT License 5 votes vote down vote up
def process_normalized_lang_map(text):
    """Map each normalized language code to its detected probability.

    Languages are detected in ``text`` (an empty string is used when ``text``
    is falsy). A ``LangDetectException`` is treated as "nothing detected".
    Every language in ``ALL_LANGS`` is then translated through
    ``COMMON_LANGUAGE_MAP`` (codes without a mapping are kept as they are) and
    the probabilities of languages sharing a normalized code are summed.

    :return: a ``defaultdict`` keyed by normalized language code; languages
        from ``ALL_LANGS`` that were not detected contribute 0.0
    """
    try:
        detections = langdetect.detect_langs(text or "")
    except langdetect.lang_detect_exception.LangDetectException:
        detections = []

    detected_probs = {}
    for detection in detections:
        detected_probs[detection.lang] = detection.prob

    normalized_lang_map = defaultdict(lambda: 0.0)
    for code in ALL_LANGS:
        normalized_code = COMMON_LANGUAGE_MAP.get(code, code)
        normalized_lang_map[normalized_code] += detected_probs.get(code, 0.0)

    return normalized_lang_map
Example #4
Source File: textcat.py    From OrangeAssassin with Apache License 2.0 5 votes vote down vote up
def check_language(self, msg, target=None):
    """Check the language of the message.

    Detects the languages of ``msg.text`` and keeps those whose probability
    is above the ``textcat_acceptable_prob`` setting. Unless more languages
    than ``textcat_max_languages`` were kept, they are stored, space
    separated, in ``msg.plugin_tags["LANGUAGES"]``.

    :param msg: the message to inspect; its ``text`` is analysed and its
        ``plugin_tags`` may be updated
    :param target: not used by this method
    :return: True if the message language is unwanted (at least one kept
        language is missing from ``ok_languages``), False otherwise --
        including when too many languages were kept or ``ok_languages``
        contains "all"
    """
    min_prob = self["textcat_acceptable_prob"]
    detections = langdetect.detect_langs(msg.text)
    self.ctxt.log.debug("TextCat results: %s", detections)

    accepted = [found.lang for found in detections if found.prob > min_prob]
    if len(accepted) > self["textcat_max_languages"]:
        self.ctxt.log.debug("Too many languages.")
        return False

    msg.plugin_tags["LANGUAGES"] = " ".join(accepted)

    ok_languages = self["ok_languages"]
    if "all" in ok_languages:
        # Every language is acceptable.
        return False

    return any(code not in ok_languages for code in accepted)
Example #5
Source File: product.py    From impactstory-tng with MIT License 5 votes vote down vote up
def languages_with_examples(self):
    """Return one example URL per non-English language found in the posts.

    Walks the "title" and "summary" of every post in
    ``self.altmetric_api_raw["posts"]``. A text counts only when it has more
    than 7 space-separated words and the top detected language has a
    probability above 0.90. For each such non-English text, the language
    name (from ``get_language_from_abbreviation``) is mapped to the post's
    "url".

    :return: dict mapping language name to an example post URL. Best effort:
        a ``KeyError``, ``AttributeError`` or ``TypeError`` raised while
        walking the data stops the walk and whatever was collected so far
        is returned (possibly an empty dict).
    """
    resp = {}

    try:
        # .values() rather than the Python 2-only .iteritems(): on Python 3
        # the missing method raised AttributeError, which the handler below
        # swallowed, so the result was always empty. The source name (the
        # dict key) was never used.
        for posts in self.altmetric_api_raw["posts"].values():
            for post in posts:
                for key in ["title", "summary"]:
                    text = post[key]

                    # Short texts never qualify, so skip detection for them.
                    if len(text.split(" ")) <= 7:
                        continue

                    try:
                        top_detection = langdetect.detect_langs(text)[0]
                    except langdetect.lang_detect_exception.LangDetectException:
                        continue

                    if top_detection.prob > 0.90 and top_detection.lang != "en":
                        language_name = get_language_from_abbreviation(top_detection.lang)

                        # overwrites.  that's ok, we just want one example
                        resp[language_name] = post["url"]

    except (KeyError, AttributeError, TypeError):
        pass

    return resp
Example #6
Source File: app.py    From threatconnect-playbooks with Apache License 2.0 5 votes vote down vote up
def run(self):
    """Run the App main logic.

    Reads the input text from the playbook, detects its most probable
    language and writes the language code and probability to the
    ``detectedLanguageCode`` and ``detectedLanguageProbability`` outputs.
    Also sets ``self.exit_message`` to a summary of the detection.
    """
    text = self.tcex.playbook.read(self.args.text)

    # Detect once and read both values from the same result. langdetect is
    # non-deterministic unless seeded, so two separate calls could report a
    # language code and a probability that belong to different runs.
    top_match = detect_langs(text)[0]
    detected_language_code = top_match.lang
    detected_language_probability = top_match.prob

    self.tcex.playbook.create_output('detectedLanguageCode', detected_language_code, 'String')
    self.tcex.playbook.create_output('detectedLanguageProbability', detected_language_probability, 'String')
    self.exit_message = 'Detected the language as {} (with a probability of {})'.format(detected_language_code, detected_language_probability)
Example #7
Source File: wl_detection.py    From Wordless with GNU General Public License v3.0 4 votes vote down vote up
def detect_lang(main, file):
    """Detect the language of a text file.

    Reads the file at ``file['path']`` using ``file['encoding']`` -- either
    completely or only the first ``number_lines`` lines, depending on the
    auto-detection settings in ``main.settings_custom`` -- and classifies
    the text with ``langid``. Some codes are refined afterwards:

    * 'zh' becomes 'zh_cn' or 'zh_tw' according to the most probable Chinese
      variant reported by ``langdetect`` ('zh_cn' if neither is reported)
    * 'no' becomes 'nb'
    * 'sr' becomes 'sr_cyrl'

    The code is finally converted with ``wl_conversion.to_iso_639_3``.

    :param main: object exposing the ``settings_custom`` mapping
    :param file: mapping with the keys 'path' and 'encoding'
    :return: tuple ``(lang, success)``; when any step fails, ``lang`` is the
        configured default language and ``success`` is False
    """
    try:
        detection_settings = main.settings_custom['auto_detection']['detection_settings']

        with open(file['path'], 'r', encoding = file['encoding']) as f:
            if detection_settings['number_lines_no_limit']:
                text = f.read()
            else:
                max_lines = detection_settings['number_lines']
                sampled_lines = []

                for i, line in enumerate(f):
                    if i >= max_lines:
                        break

                    sampled_lines.append(line)

                # Join once instead of growing a string line by line.
                text = ''.join(sampled_lines)

        lang_code_639_1 = langid.classify(text)[0]

        # Chinese (Simplified) & Chinese (Traditional)
        if lang_code_639_1 == 'zh':
            lang_code_639_1 = 'zh_cn'

            for lang in sorted(langdetect.detect_langs(text), key = lambda item: -item.prob):
                if lang.lang in ['zh-cn', 'zh-tw']:
                    lang_code_639_1 = lang.lang.replace('-', '_')

                    break
        # Norwegian Bokmål
        elif lang_code_639_1 == 'no':
            lang_code_639_1 = 'nb'

        # Serbian (Cyrillic)
        elif lang_code_639_1 == 'sr':
            lang_code_639_1 = 'sr_cyrl'

        lang = wl_conversion.to_iso_639_3(main, lang_code_639_1)

        success = True
    # Any ordinary failure falls back to the default language, but
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    except Exception:
        lang = main.settings_custom['auto_detection']['default_settings']['default_lang']

        success = False

    return lang, success
Example #8
Source File: encrypted_shared_preferences.py    From scrounger with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def run(self):
    """Report shared-preferences files whose content looks like plain text.

    Starts the application on the device, collects the files found under
    each data path's ``shared_prefs`` directory and runs language detection
    on their content. A file is reported as unencrypted when the top
    detected language has a probability above ``self.min_percentage``.

    :return: ``{"print": ...}`` when the application is not installed,
        otherwise ``{"<name>_result": result}`` where ``result["report"]``
        is True and ``result["details"]`` lists the files if any file was
        flagged
    """
    result = {
        "title": "Application Does Not Encrypt Shared Preferences",
        "details": "",
        "severity": "Medium",
        "report": False
    }

    if not self.device.installed(self.identifier):
        return {"print": "Application not installed"}

    Log.info("Starting the application")
    self.device.start(self.identifier)
    # Wait before looking for files -- presumably to give the application
    # time to start and write its preferences.
    sleep(5)

    Log.info("Finding files in application's data")
    target_paths = ["{}/shared_prefs".format(file_path) for file_path in
        self.device.data_paths(self.identifier)]

    listed_files = []
    for data_path in target_paths:
        listed_files += self.device.find_files(data_path)

    Log.info("Analysing application's data")

    # min_percentage is treated as a percentage (e.g. 75 -> 0.75). Building
    # the value as "0.<n>" was wrong for single digits and for 100
    # (5 -> 0.5, 100 -> 0.1); dividing by 100 is identical for 10..99.
    # NOTE(review): confirm min_percentage is always an integer percentage.
    threshold = float(self.min_percentage) / 100

    report_files = []
    for filename in listed_files:
        if not filename:
            continue

        file_content = self.device.file_content(filename)

        lang = detect_langs(file_content)[0]
        Log.debug("{} language {}: {}".format(filename,
            lang.lang, lang.prob))

        if lang.prob > threshold:
            report_files.append(filename)

    if report_files:
        result.update({
            "report": True,
            "details": "* Unencrypted Files:\n * {}".format("\n * ".join(
                report_files))
        })

    return {
        "{}_result".format(self.name()): result
    }
Example #9
Source File: unencrypted_keychain_data.py    From scrounger with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def run(self):
    """Report keychain entries of the application that look like plain text.

    Obtains the binary's entitlements through ``EModule`` and the keychain
    contents through ``KeychainModule``. Entries are kept when the keychain
    id (the "keychain-access-groups" entitlement if present, otherwise
    ``self.identifier``) is contained in their entitlement group, account or
    service. Language detection is then run on each kept value, and values
    whose top detected language has a probability above
    ``self.min_percentage`` are reported.

    :return: ``{"print": ...}`` when no entitlements could be obtained,
        otherwise ``{"<name>_result": result}`` where ``result["report"]``
        is True and ``result["details"]`` lists the data if any value was
        flagged
    """
    result = {
        "title": "Application Saves Unencrypted Data In Keychain",
        "details": "",
        "severity": "Low",
        "report": False
    }

    Log.info("Getting keychain's IDs")

    ent_module = EModule()
    ent_module.binary = self.binary
    ent_result, entitlements = ent_module.run(), None
    for key in ent_result:
        if key.endswith("_entitlements"):
            entitlements = ent_result[key]

    if not entitlements:
        return {"print": "Couldn't get entitlements from the bianry."}

    keychain_id = self.identifier
    if "keychain-access-groups" in entitlements:
        keychain_id = entitlements["keychain-access-groups"]

    keychain_module = KeychainModule()
    keychain_module.device = self.device
    keychain_module.output = None
    keychain_result = keychain_module.run()
    keychain_data = keychain_result["keychain_data"]

    # Keep an entry when any of these fields is set and contains the id.
    matched_fields = ("entitlement_group", "account", "service")
    data = []
    for entry in keychain_data:
        if any(entry[field] and keychain_id in entry[field]
               for field in matched_fields):
            data.append(str(entry['keychain_data']))

    # min_percentage is treated as a percentage (e.g. 75 -> 0.75). Building
    # the value as "0.<n>" was wrong for single digits and for 100
    # (5 -> 0.5, 100 -> 0.1); dividing by 100 is identical for 10..99.
    # NOTE(review): confirm min_percentage is always an integer percentage.
    threshold = float(self.min_percentage) / 100

    report_data = []
    for item in data:
        lang = detect_langs(item)[0]
        if lang.prob > threshold:
            report_data.append(item)

    if report_data:
        result.update({
            "report": True,
            "details": "The following data was found:\n* {}".format(
                "\n* ".join(report_data))
        })

    return {
        "{}_result".format(self.name()): result
    }