Python regex.UNICODE Examples
The following are Python code examples showing how the regex.UNICODE flag (or its standard-library counterpart re.UNICODE) is used, collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the regex module.
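As a quick orientation (this snippet is not taken from any of the projects below), the UNICODE flag makes the shorthand classes \w, \d, \s and word boundaries operate on the full Unicode ranges instead of ASCII only. With the third-party regex module, as with re on Python 3 str patterns, this is usually already the default, so the flag mostly appears for explicitness or Python 2 compatibility:

import regex

# \w+ picks up accented letters and CJK characters once UNICODE semantics apply.
words = regex.compile(r'\w+', regex.UNICODE)
print(words.findall('naïve café 北京'))   # ['naïve', 'café', '北京']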
Example #1
Source File: bpe.py From ParlAI with MIT License | 6 votes |
def __init__(self, opt: Opt, shared: TShared = None):
    """
    Initialize the BPE module.

    :param opt: options
    :param shared: shared dictionary
    """
    super().__init__(opt, shared)
    if not SUBWORD_BPE_INSTALLED:
        raise RuntimeError(
            "Please run \"pip install 'git+https://github.com/rsennrich"
            "/subword-nmt.git#egg=subword-nmt'\""
        )
    if not opt.get('dict_file'):
        raise RuntimeError('--dict-file is mandatory.')

    self.splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)

    self.codecs = f"{opt['dict_file']}.codecs"
    if os.path.exists(self.codecs):
        self._load_from_codecs()
Example #2
Source File: regexp_tokenizer.py From FusionNet with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
           self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE,
           self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH,
           self.ELLIPSES, self.PUNCT, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
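The same DrQA-style tokenizer constructor reappears verbatim in Examples #3, #5 and #7. The class constants it references (self.DIGIT, self.ALPHA_NUM, self.NON_WS, ...) are not part of the excerpt, so the following stand-alone sketch uses hypothetical stand-ins for a few of them just to show how the compiled named-group pattern is typically consumed:

import regex

# Hypothetical stand-ins for the class constants referenced above.
ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
PUNCT = r'\p{P}'
NON_WS = r'[^\p{Z}\p{C}]'

tokenizer = regex.compile(
    '(?P<alphanum>%s)|(?P<punct>%s)|(?P<nonws>%s)' % (ALPHA_NUM, PUNCT, NON_WS),
    flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
)
for m in tokenizer.finditer('Hello, world!'):
    print(m.lastgroup, repr(m.group()))
# alphanum 'Hello'
# punct ','
# alphanum 'world'
# punct '!'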
Example #3
Source File: regexp_tokenizer.py From OpenQA with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
           self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE,
           self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH,
           self.ELLIPSES, self.PUNCT, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #4
Source File: feature_engineering.py From coling2018_fake-news-challenge with Apache License 2.0 | 6 votes |
def sdm_sim(headlines, bodies):
    def similarity(headline, body):
        clean_headline = clean(headline)
        clean_body = clean(body)
        fullClient = retinasdk.FullClient("e8bf8de0-fe52-11e6-b22d-93a4ae922ff1",
                                          apiServer="http://api.cortical.io/rest",
                                          retinaName="en_associative")
        RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
        clean_body = RE.sub(u'', clean_body)
        # clean_body = clean_body.encode('ascii', 'ignore')
        clean_body = clean_body.encode('utf8', 'ignore')
        clean_body = clean_body.decode('utf8', 'ignore')
        # print(clean_body)
        clean_body.replace("0x6e", " ")
        # newdata = clean_body[:start] + clean_body[end:]
        # clean_body = clean_body.translate(None, '0x6e')
        comp_with_stop_words = fullClient.compare(
            '[{"text": "' + clean_headline + '"}, {"text": "' + clean_body + '"}]')
        sim = comp_with_stop_words.cosineSimilarity

        features = []
        features.append(sim)
        return features

    x = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        x.append(similarity(headline, body))
    return x
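The long character class in this example strips CJK characters (Kangxi radicals, CJK Unified Ideographs, compatibility ideographs and related blocks) from the body before it is sent to the cortical.io API. Isolated from the API client, that step looks roughly like this:

import re

CJK_RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
print(CJK_RE.sub('', 'fake news 新闻 headline'))   # 'fake news  headline'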
Example #5
Source File: regexp_tokenizer.py From justcopy-backend with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
           self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE,
           self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH,
           self.ELLIPSES, self.PUNCT, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #6
Source File: util.py From urduhack with MIT License | 6 votes |
def remove_punctuation(text: str, marks=None) -> str:
    """
    Remove punctuation from ``text`` by removing all instances of ``marks``.

    Args:
        text (str): Urdu text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.
    Returns:
        str: returns a ``str`` object containing normalized text.
    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.
    Examples:
        >>> from urduhack.preprocessing import remove_punctuation
        >>> output = remove_punctuation("کر ؟ سکتی ہے۔")
        کر سکتی ہے
    """
    if marks:
        return re.sub('[{}]+'.format(re.escape(marks)), '', text, flags=re.UNICODE)

    return text.translate(PUNCTUATION_TRANSLATE_UNICODE)
Example #7
Source File: regexp_tokenizer.py From RCZoo with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
           self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE,
           self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH,
           self.ELLIPSES, self.PUNCT, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #8
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _set_splitters(self, settings=None):
    splitters = {
        'wordchars': set(),  # The ones that split string only if they are not surrounded by letters from both sides
        'capturing': set(),  # The ones that are not filtered out from tokens after split
    }
    splitters['capturing'] |= set(ALWAYS_KEEP_TOKENS)

    wordchars = self._get_wordchars(settings)
    skip = set(self.info.get('skip', [])) | splitters['capturing']
    for token in skip:
        if not re.match(r'^\W+$', token, re.UNICODE):
            continue
        if token in wordchars:
            splitters['wordchars'].add(token)

    self._splitters = splitters
Example #9
Source File: utils.py From dragonfly with GNU Lesser General Public License v3.0 | 6 votes |
def _phrase_to_regex(phrase):
    # Treat whitespace between words as meaning anything other than alphanumeric
    # characters.
    pattern = r"[^\w--_]+".join(regex.escape(word) for word in phrase.split())
    # Treat spaces at the beginning or end of the phrase as matching any
    # whitespace character. This makes it easy to select stuff like non-breaking
    # space, which occurs frequently in browsers.
    # TODO Support newlines. Note that these are frequently implemented as
    # separate text nodes in the accessibility tree, so the obvious
    # implementation would not work well.
    if phrase == " ":
        pattern = r"\s"
    else:
        if phrase.startswith(" "):
            pattern = r"\s" + pattern
        if phrase.endswith(" "):
            pattern = pattern + r"\s"
    # Only match at boundaries of alphanumeric sequences if the phrase ends
    # are alphanumeric.
    if regex.search(r"^[\w--_]", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = r"(?<![\w--_])" + pattern
    if regex.search(r"[\w--_]$", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = pattern + r"(?![\w--_])"
    return pattern
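The [\w--_] and [^\w--_] classes above use the regex module's VERSION1 set operations, where -- inside a character class means set difference, i.e. word characters minus the underscore. A minimal illustration of just that feature:

import regex

# In VERSION1 mode, [\w--_]+ matches runs of word characters excluding '_'.
runs = regex.compile(r'[\w--_]+', regex.VERSION1 | regex.UNICODE)
print(runs.findall('foo_bar baz'))   # ['foo', 'bar', 'baz']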
Example #10
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _generate_relative_translations(self, normalize=False):
    relative_translations = self.info.get('relative-type-regex', {})
    relative_dictionary = OrderedDict()
    for key, value in relative_translations.items():
        if normalize:
            value = list(map(normalize_unicode, value))
        pattern = '|'.join(sorted(value, key=len, reverse=True))
        pattern = DIGIT_GROUP_PATTERN.sub(r'?P<n>\d+', pattern)
        pattern = re.compile(
            r'^(?:{})$'.format(pattern), re.UNICODE | re.IGNORECASE)
        relative_dictionary[pattern] = key
    return relative_dictionary
Example #11
Source File: dictionary.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _construct_split_regex(self):
    known_words_group = "|".join(map(re.escape, self._get_sorted_words_from_cache()))
    if self._no_word_spacing:
        regex = r"^(.*?)({})(.*)$".format(known_words_group)
    else:
        regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(known_words_group)
    self._split_regex_cache.setdefault(
        self._settings.registry_key, {})[self.info['name']] = \
        re.compile(regex, re.UNICODE | re.IGNORECASE)
Example #12
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_wordchars_for_detection(self, settings):
    if self._wordchars_for_detection is None:
        wordchars = set()
        for word in self._get_dictionary(settings):
            if re.match(r'^[\W\d_]+$', word, re.UNICODE):
                continue
            for char in word:
                wordchars.add(char.lower())
        self._wordchars_for_detection = wordchars - {
            "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
            ":", "(", ")", "'", "q", "a", "m", "p", " "}
    return self._wordchars_for_detection
Example #13
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _set_wordchars(self, settings=None):
    wordchars = set()
    for word in self._get_dictionary(settings):
        if re.match(r'^[\W\d_]+$', word, re.UNICODE):
            continue
        for char in word:
            wordchars.add(char.lower())
    self._wordchars = wordchars - {" "} | {
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}
Example #14
Source File: utils.py From RCZoo with MIT License | 5 votes |
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression."""
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except BaseException:
        logger.warn('Regular expression failed to compile: %s' % pattern)
        return False
    return compiled.match(prediction) is not None
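A hypothetical call, assuming the function above and a configured logger. Note that re.match anchors at the start of the prediction, so the pattern has to match from the first character; the has_answer example at the end of this page compiles CuratedTrec answer patterns with similar flags:

regex_match_score('March 14, 1879', r'(14 march|march 14),? 1879')   # True
regex_match_score('born in Ulm', r'(14 march|march 14),? 1879')      # False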
Example #15
Source File: retrieval_drqa_eval.py From semanticRetrievalMRS with MIT License | 5 votes |
def regex_match(text, pattern):
    """Test if a regex pattern is contained within a text."""
    try:
        pattern = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE,
        )
    except BaseException:
        return False
    return pattern.search(text) is not None
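Unlike regex_match_score above, this variant uses search, so the pattern may occur anywhere in the text rather than only at the start. Hypothetical calls, assuming the function above:

regex_match('He was born on 14 March 1879 in Ulm.', r'14 march 1879')   # True
regex_match('He was born in Ulm.', r'14 march 1879')                    # False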
Example #16
Source File: grammar.py From estnltk with GNU General Public License v2.0 | 5 votes |
def __init__(self, *postags, **kwargs):
    super(Postags, self).__init__(kwargs.get('name'))
    self.__postags = postags
    self.__pattern = re.compile('\L<postags>', postags=postags,
                                flags=re.UNICODE | re.IGNORECASE)
Example #17
Source File: grammar.py From estnltk with GNU General Public License v2.0 | 5 votes |
def __init__(self, *lemmas, **kwargs):
    super(Lemmas, self).__init__(kwargs.get('name'))
    self.__lemmas = lemmas
    self.__pattern = re.compile('\L<lemmas>', lemmas=lemmas,
                                flags=re.UNICODE | re.IGNORECASE)
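Examples #16 and #17 rely on the regex module's named lists: \L<name> matches any literal string from the list passed as a keyword argument of the same name (here re is presumably regex imported under that alias, since the standard library supports neither \L nor extra keyword arguments to compile). A minimal stand-alone sketch:

import regex

lemmas = ['kass', 'koer', 'hobune']
# \L<words> expands to an alternation over the strings in the 'words' list.
pattern = regex.compile(r'\L<words>', words=lemmas,
                        flags=regex.UNICODE | regex.IGNORECASE)
print(bool(pattern.fullmatch('Koer')))   # True
print(bool(pattern.fullmatch('kala')))   # False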
Example #18
Source File: dictionary.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _construct_match_relative_regex(self):
    known_relative_strings_group = "|".join(self._get_sorted_relative_strings_from_cache())
    regex = "^({})$".format(known_relative_strings_group)
    self._match_relative_regex_cache.setdefault(
        self._settings.registry_key, {})[self.info['name']] = \
        re.compile(regex, re.UNICODE | re.IGNORECASE)
Example #19
Source File: utils.py From Multi-Step-Reasoning with Apache License 2.0 | 5 votes |
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression."""
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except BaseException:
        logger.warn('Regular expression failed to compile: %s' % pattern)
        return False
    return compiled.match(prediction) is not None
Example #20
Source File: simple_tokenizer.py From RCZoo with MIT License | 5 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
    """
    self._regexp = regex.compile(
        '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
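The ALPHA_NUM and NON_WS constants are not shown in the excerpt; in DrQA-derived tokenizers they are Unicode property classes roughly like the ones below (treat the exact values as an assumption, not a quote from this project). The result is a simple two-way split into alphanumeric runs and single non-whitespace characters:

import regex

# Assumed values in the style of the DrQA SimpleTokenizer.
ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
NON_WS = r'[^\p{Z}\p{C}]'

splitter = regex.compile('(%s)|(%s)' % (ALPHA_NUM, NON_WS),
                         flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE)
print([m.group() for m in splitter.finditer("Who wrote 'Hamlet'?")])
# ['Who', 'wrote', "'", 'Hamlet', "'", '?']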
Example #21
Source File: utils.py From MnemonicReader with BSD 3-Clause "New" or "Revised" License | 5 votes |
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression."""
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except BaseException:
        logger.warn('Regular expression failed to compile: %s' % pattern)
        return False
    return compiled.match(prediction) is not None
Example #22
Source File: simple_tokenizer.py From FusionNet with MIT License | 5 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
    """
    self._regexp = regex.compile(
        '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
Example #23
Source File: main.py From OpenQA with MIT License | 5 votes |
def has_answer(args, answer, t):
    global PROCESS_TOK
    text = []
    for i in range(len(t)):
        text.append(t[i].lower())
    res_list = []
    if (args.dataset == "CuratedTrec"):
        try:
            ans_regex = re.compile("(%s)" % answer[0],
                                   flags=re.IGNORECASE + re.UNICODE)
        except:
            return False, res_list
        paragraph = " ".join(text)
        answer_new = ans_regex.findall(paragraph)
        for a in answer_new:
            single_answer = normalize(a[0])
            single_answer = PROCESS_TOK.tokenize(single_answer)
            single_answer = single_answer.words(uncased=True)
            for i in range(0, len(text) - len(single_answer) + 1):
                if single_answer == text[i: i + len(single_answer)]:
                    res_list.append((i, i + len(single_answer) - 1))
    else:
        for a in answer:
            single_answer = " ".join(a).lower()
            single_answer = normalize(single_answer)
            single_answer = PROCESS_TOK.tokenize(single_answer)
            single_answer = single_answer.words(uncased=True)
            for i in range(0, len(text) - len(single_answer) + 1):
                if single_answer == text[i: i + len(single_answer)]:
                    res_list.append((i, i + len(single_answer) - 1))

    if (len(res_list) > 0):
        return True, res_list
    else:
        return False, res_list