Python langdetect.detect_langs() Examples
The following are 9 code examples of langdetect.detect_langs().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module langdetect, or try the search function.
Example #1
Source File: outsource_sayer.py From sketal with MIT License | 6 votes |
def get_lang(text):
    """Guess the language code of *text*, with a bias towards Russian.

    Every candidate returned by ``langdetect.detect_langs`` is considered.
    A candidate whose code is ``"ru"`` has 0.2 added to its probability
    before the candidates are compared with each other.

    Returns the ``lang`` attribute of the winning candidate, or ``"ru"``
    when detection raises ``LangDetectException`` or yields no candidates.
    """
    try:
        candidates = langdetect.detect_langs(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return "ru"

    best = None
    for candidate in candidates:
        if candidate.lang == "ru":
            candidate.prob += 0.2
        # Only replace the current winner when it compares lower than the
        # new candidate (uses the candidates' own "<" ordering).
        if best is None or best < candidate:
            best = candidate

    return "ru" if best is None else best.lang
Example #2
Source File: detect_language.py From freshonions-torscraper with GNU Affero General Public License v3.0 | 6 votes |
def classify(text, debug=False):
    """Return the most likely language of *text*.

    Only the first entry returned by ``detect_langs`` is used.  If the
    detection raises ``UnicodeDecodeError`` it is retried on
    ``text.decode("utf-8")``.

    :param text: the text to classify
    :param debug: when true, return ``(lang, prob)`` without applying the
        probability cut-off
    :return: the language code when its probability is above 0.90,
        otherwise ``None`` (or the ``(lang, prob)`` tuple in debug mode)
    """
    try:
        best = detect_langs(text)[0]
    except UnicodeDecodeError:
        best = detect_langs(text.decode("utf-8"))[0]

    code, confidence = best.lang, best.prob

    if debug:
        return (code, confidence)

    return code if confidence > 0.90 else None
Example #3
Source File: translatewiki.py From editquality with MIT License | 5 votes |
def process_normalized_lang_map(text):
    """Build a map from normalized language code to detection probability.

    The text (or ``""`` when it is falsy) is run through
    ``langdetect.detect_langs``; a ``LangDetectException`` is treated as
    "nothing detected".  Each code in ``ALL_LANGS`` is then translated
    through ``COMMON_LANGUAGE_MAP`` (falling back to the code itself) and
    the detected probabilities of all codes sharing a normalized code are
    summed.

    :return: a ``defaultdict`` whose missing keys read as ``0.0``
    """
    try:
        detections = langdetect.detect_langs(text or "")
    except langdetect.lang_detect_exception.LangDetectException:
        detections = []

    detected_probs = {}
    for detection in detections:
        detected_probs[detection.lang] = detection.prob

    totals = defaultdict(lambda: 0.0)
    for code in ALL_LANGS:
        normalized_code = COMMON_LANGUAGE_MAP.get(code, code)
        totals[normalized_code] += detected_probs.get(code, 0.0)

    return totals
Example #4
Source File: textcat.py From OrangeAssassin with Apache License 2.0 | 5 votes |
def check_language(self, msg, target=None):
    """Decide whether the message is written in an unwanted language.

    The languages detected in ``msg.text`` with a probability above the
    ``textcat_acceptable_prob`` setting are stored, space separated, in
    ``msg.plugin_tags["LANGUAGES"]``.  When more languages than
    ``textcat_max_languages`` pass that cut-off, the tag is not set and
    the rule does not trigger.

    :return: True if any detected language is missing from the
        ``ok_languages`` setting (and that setting does not contain
        ``"all"``), False otherwise
    """
    min_prob = self["textcat_acceptable_prob"]
    detections = langdetect.detect_langs(msg.text)
    self.ctxt.log.debug("TextCat results: %s", detections)

    langs = [found.lang for found in detections if found.prob > min_prob]
    if len(langs) > self["textcat_max_languages"]:
        self.ctxt.log.debug("Too many languages.")
        return False

    msg.plugin_tags["LANGUAGES"] = " ".join(langs)

    ok_languages = self["ok_languages"]
    if "all" in ok_languages:
        # Every language is acceptable.
        return False

    return any(code not in ok_languages for code in langs)
Example #5
Source File: product.py From impactstory-tng with MIT License | 5 votes |
def languages_with_examples(self):
    """Map each non-English language seen in the posts to one example URL.

    Walks every post under ``self.altmetric_api_raw["posts"]`` and looks at
    its ``"title"`` and ``"summary"`` fields.  A field counts when it has
    more than 7 space-separated words and its top detection has a
    probability above 0.90 with a language other than ``"en"``.

    :return: dict of language name (from ``get_language_from_abbreviation``)
        to the ``"url"`` of a post in that language; later posts overwrite
        earlier ones.  A ``KeyError``, ``AttributeError`` or ``TypeError``
        while walking the data stops the walk, and whatever was collected
        up to that point is returned.
    """
    resp = {}

    try:
        # .values() works on both Python 2 and Python 3 dicts.  The previous
        # .iteritems() call raised AttributeError on Python 3, which the
        # handler below swallowed, so the result was always empty there.
        for posts in self.altmetric_api_raw["posts"].values():
            for post in posts:
                for key in ["title", "summary"]:
                    num_words_in_post = len(post[key].split(" "))
                    if num_words_in_post <= 7:
                        # Too short to qualify; skip the costly detection.
                        continue

                    try:
                        top_detection = langdetect.detect_langs(post[key])[0]
                    except langdetect.lang_detect_exception.LangDetectException:
                        continue

                    if top_detection.prob > 0.90 and top_detection.lang != "en":
                        language_name = get_language_from_abbreviation(top_detection.lang)
                        # overwrites. that's ok, we just want one example
                        resp[language_name] = post["url"]
    except (KeyError, AttributeError, TypeError):
        pass

    return resp
Example #6
Source File: app.py From threatconnect-playbooks with Apache License 2.0 | 5 votes |
def run(self):
    """Run the App main logic.

    Reads the text named by ``self.args.text`` from the playbook, detects
    its language, writes the ``detectedLanguageCode`` and
    ``detectedLanguageProbability`` outputs and sets ``self.exit_message``.
    """
    text = self.tcex.playbook.read(self.args.text)

    # Detect exactly once.  langdetect is non-deterministic unless seeded,
    # so two separate calls could report a code and a probability that
    # belong to different detections.
    top_detection = detect_langs(text)[0]
    detected_language_code = top_detection.lang
    detected_language_probability = top_detection.prob

    self.tcex.playbook.create_output('detectedLanguageCode', detected_language_code, 'String')
    self.tcex.playbook.create_output('detectedLanguageProbability', detected_language_probability, 'String')

    self.exit_message = 'Detected the language as {} (with a probability of {})'.format(detected_language_code, detected_language_probability)
Example #7
Source File: wl_detection.py From Wordless with GNU General Public License v3.0 | 4 votes |
def detect_lang(main, file):
    """Detect the language of a text file.

    :param main: object whose ``settings_custom['auto_detection']`` holds
        the ``detection_settings`` (``number_lines_no_limit``,
        ``number_lines``) and ``default_settings`` (``default_lang``)
    :param file: mapping with the ``'path'`` and ``'encoding'`` of the file
    :return: ``(lang, success)`` - the result of
        ``wl_conversion.to_iso_639_3`` and ``True``, or the configured
        default language and ``False`` when anything goes wrong
    """
    try:
        detection_settings = main.settings_custom['auto_detection']['detection_settings']

        with open(file['path'], 'r', encoding = file['encoding']) as f:
            if detection_settings['number_lines_no_limit']:
                text = f.read()
            else:
                max_lines = detection_settings['number_lines']
                sampled_lines = []

                for i, line in enumerate(f):
                    if i >= max_lines:
                        break

                    sampled_lines.append(line)

                # Join once instead of growing a string line by line
                text = ''.join(sampled_lines)

        lang_code_639_1 = langid.classify(text)[0]

        # Chinese (Simplified) & Chinese (Traditional)
        if lang_code_639_1 == 'zh':
            lang_code_639_1 = 'zh_cn'

            # Take the most probable Chinese variant reported by langdetect
            for candidate in sorted(langdetect.detect_langs(text), key = lambda item: -item.prob):
                if candidate.lang in ['zh-cn', 'zh-tw']:
                    lang_code_639_1 = candidate.lang.replace('-', '_')

                    break
        # Norwegian Bokmål
        elif lang_code_639_1 == 'no':
            lang_code_639_1 = 'nb'
        # Serbian (Cyrillic)
        elif lang_code_639_1 == 'sr':
            lang_code_639_1 = 'sr_cyrl'

        lang = wl_conversion.to_iso_639_3(main, lang_code_639_1)

        success = True
    # Still best-effort, but KeyboardInterrupt / SystemExit now propagate
    # instead of being reported as a failed detection
    except Exception:
        lang = main.settings_custom['auto_detection']['default_settings']['default_lang']

        success = False

    return lang, success
Example #8
Source File: encrypted_shared_preferences.py From scrounger with BSD 3-Clause "New" or "Revised" License | 4 votes |
def run(self):
    """Report shared-preferences files that look like readable text.

    Starts the application, lists the files under every
    ``<data path>/shared_prefs`` directory and runs language detection on
    each file's content.  A file is reported as unencrypted when the top
    detection's probability is above ``self.min_percentage`` percent.

    :return: ``{"print": ...}`` when the application is not installed,
        otherwise ``{"<name>_result": result}`` where ``result["report"]``
        is True and ``result["details"]`` lists the files when any were
        flagged
    """
    result = {
        "title": "Application Does Not Encrypt Shared Preferences",
        "details": "",
        "severity": "Medium",
        "report": False
    }

    if not self.device.installed(self.identifier):
        return {"print": "Application not installed"}

    Log.info("Starting the application")
    self.device.start(self.identifier)
    sleep(5)

    Log.info("Finding files in application's data")
    target_paths = ["{}/shared_prefs".format(file_path)
        for file_path in self.device.data_paths(self.identifier)]

    listed_files = []
    for data_path in target_paths:
        listed_files += self.device.find_files(data_path)

    # Imported here because the file's import block only brings in
    # detect_langs.
    from langdetect.lang_detect_exception import LangDetectException

    # min_percentage is a percentage: 5 means 0.05 and 100 means 1.0.  The
    # former "0.{}" string formatting turned 5 into 0.5 and 100 into 0.1.
    min_probability = float(self.min_percentage) / 100

    Log.info("Analysing application's data")
    report_files = []
    for filename in listed_files:
        if not filename:
            continue

        file_content = self.device.file_content(filename)
        try:
            lang = detect_langs(file_content)[0]
        except LangDetectException:
            # No language features at all: not readable text, so nothing
            # to report, and the remaining files must still be analysed.
            Log.debug("{} has no detectable language".format(filename))
            continue

        Log.debug("{} language {}: {}".format(filename, lang.lang,
            lang.prob))

        if lang.prob > min_probability:
            report_files.append(filename)

    if report_files:
        result.update({
            "report": True,
            "details": "* Unencrypted Files:\n * {}".format("\n * ".join(
                report_files))
        })

    return {
        "{}_result".format(self.name()): result
    }
Example #9
Source File: unencrypted_keychain_data.py From scrounger with BSD 3-Clause "New" or "Revised" License | 4 votes |
def run(self):
    """Report keychain entries of the application that look like readable text.

    Obtains the binary's entitlements through ``EModule``, picks the
    keychain id (the ``keychain-access-groups`` entitlement when present,
    otherwise ``self.identifier``), dumps the keychain through
    ``KeychainModule`` and runs language detection on the data of every
    entry whose entitlement group, account or service contains that id.
    An entry is reported when the top detection's probability is above
    ``self.min_percentage`` percent.

    :return: ``{"print": ...}`` when no entitlements were found, otherwise
        ``{"<name>_result": result}`` where ``result["report"]`` is True
        and ``result["details"]`` lists the data when any was flagged
    """
    result = {
        "title": "Application Saves Unencrypted Data In Keychain",
        "details": "",
        "severity": "Low",
        "report": False
    }

    Log.info("Getting keychain's IDs")
    ent_module = EModule()
    ent_module.binary = self.binary
    ent_result, entitlements = ent_module.run(), None
    for key in ent_result:
        if key.endswith("_entitlements"):
            entitlements = ent_result[key]

    if not entitlements:
        return {"print": "Couldn't get entitlements from the bianry."}

    keychain_id = self.identifier
    if "keychain-access-groups" in entitlements:
        keychain_id = entitlements["keychain-access-groups"]

    keychain_module = KeychainModule()
    keychain_module.device = self.device
    keychain_module.output = None
    keychain_result = keychain_module.run()
    keychain_data = keychain_result["keychain_data"]

    data = []
    for key in keychain_data:
        if (key["entitlement_group"] and \
        keychain_id in key["entitlement_group"]) or (key["account"] and \
        keychain_id in key["account"]) or (key["service"] and \
        keychain_id in key["service"]):
            data += [str(key['keychain_data'])]

    # Imported here because the file's import block only brings in
    # detect_langs.
    from langdetect.lang_detect_exception import LangDetectException

    # min_percentage is a percentage: 5 means 0.05 and 100 means 1.0.  The
    # former "0.{}" string formatting turned 5 into 0.5 and 100 into 0.1.
    min_probability = float(self.min_percentage) / 100

    report_data = []
    for item in data:
        try:
            lang = detect_langs(item)[0]
        except LangDetectException:
            # No language features at all (e.g. empty or purely numeric
            # data): nothing to report, keep analysing the other entries.
            continue

        if lang.prob > min_probability:
            report_data.append(item)

    if report_data:
        result.update({
            "report": True,
            "details": "The following data was found:\n* {}".format(
                "\n* ".join(report_data))
        })

    return {
        "{}_result".format(self.name()): result
    }