Python regex.UNICODE Examples
The following are Python code examples showing how the regex.UNICODE flag (or its standard-library counterpart re.UNICODE) is used, collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the regex module.
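As a quick orientation (this snippet is not taken from any of the projects below), the UNICODE flag makes the shorthand classes \w, \d, \s and word boundaries operate on the full Unicode ranges instead of ASCII only. With the third-party regex module, as with re on Python 3 str patterns, this is usually already the default, so the flag mostly appears for explicitness or Python 2 compatibility:

import regex

# \w+ picks up accented letters and CJK characters once UNICODE semantics apply.
words = regex.compile(r'\w+', regex.UNICODE)
print(words.findall('naïve café 北京'))   # ['naïve', 'café', '北京']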
Example #1
Source File: bpe.py From ParlAI with MIT License | 6 votes |
def __init__(self, opt: Opt, shared: TShared = None):
    """
    Initialize the BPE module.

    :param opt: options
    :param shared: shared dictionary
    """
    super().__init__(opt, shared)
    if not SUBWORD_BPE_INSTALLED:
        raise RuntimeError(
            "Please run \"pip install 'git+https://github.com/rsennrich"
            "/subword-nmt.git#egg=subword-nmt'\""
        )
    if not opt.get('dict_file'):
        raise RuntimeError('--dict-file is mandatory.')

    self.splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)

    self.codecs = f"{opt['dict_file']}.codecs"
    if os.path.exists(self.codecs):
        self._load_from_codecs()
Example #2
Source File: regexp_tokenizer.py From FusionNet with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
           self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE,
           self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH,
           self.ELLIPSES, self.PUNCT, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
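The same DrQA-style tokenizer constructor reappears verbatim in Examples #3, #5 and #7. The class constants it references (self.DIGIT, self.ALPHA_NUM, self.NON_WS, ...) are not part of the excerpt, so the following stand-alone sketch uses hypothetical stand-ins for a few of them just to show how the compiled named-group pattern is typically consumed:

import regex

# Hypothetical stand-ins for the class constants referenced above.
ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
PUNCT = r'\p{P}'
NON_WS = r'[^\p{Z}\p{C}]'

tokenizer = regex.compile(
    '(?P<alphanum>%s)|(?P<punct>%s)|(?P<nonws>%s)' % (ALPHA_NUM, PUNCT, NON_WS),
    flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE,
)
for m in tokenizer.finditer('Hello, world!'):
    print(m.lastgroup, repr(m.group()))
# alphanum 'Hello'
# punct ','
# alphanum 'world'
# punct '!'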
Example #3
Source File: regexp_tokenizer.py From OpenQA with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
           self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE,
           self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH,
           self.ELLIPSES, self.PUNCT, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #4
Source File: feature_engineering.py From coling2018_fake-news-challenge with Apache License 2.0 | 6 votes |
def sdm_sim(headlines, bodies):
    def similarity(headline, body):
        clean_headline = clean(headline)
        clean_body = clean(body)
        fullClient = retinasdk.FullClient("e8bf8de0-fe52-11e6-b22d-93a4ae922ff1",
                                          apiServer="http://api.cortical.io/rest",
                                          retinaName="en_associative")
        RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
        clean_body = RE.sub(u'', clean_body)
        # clean_body = clean_body.encode('ascii', 'ignore')
        clean_body = clean_body.encode('utf8', 'ignore')
        clean_body = clean_body.decode('utf8', 'ignore')
        # print(clean_body)
        clean_body.replace("0x6e", " ")
        # newdata = clean_body[:start] + clean_body[end:]
        # clean_body = clean_body.translate(None, '0x6e')
        comp_with_stop_words = fullClient.compare(
            '[{"text": "' + clean_headline + '"}, {"text": "' + clean_body + '"}]')
        sim = comp_with_stop_words.cosineSimilarity

        features = []
        features.append(sim)
        return features

    x = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        x.append(similarity(headline, body))
    return x
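The long character class in this example strips CJK characters (Kangxi radicals, CJK Unified Ideographs, compatibility ideographs and related blocks) from the body before it is sent to the cortical.io API. Isolated from the API client, that step looks roughly like this:

import re

CJK_RE = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
print(CJK_RE.sub('', 'fake news 新闻 headline'))   # 'fake news  headline'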
Example #5
Source File: regexp_tokenizer.py From justcopy-backend with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
           self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE,
           self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH,
           self.ELLIPSES, self.PUNCT, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #6
Source File: util.py From urduhack with MIT License | 6 votes |
def remove_punctuation(text: str, marks=None) -> str:
    """
    Remove punctuation from ``text`` by removing all instances of ``marks``.

    Args:
        text (str): Urdu text
        marks (str): If specified, remove only the characters in this string,
            e.g. ``marks=',;:'`` removes commas, semi-colons, and colons.
            Otherwise, all punctuation marks are removed.
    Returns:
        str: returns a ``str`` object containing normalized text.
    Note:
        When ``marks=None``, Python's built-in :meth:`str.translate()` is
        used to remove punctuation; otherwise, a regular expression is used
        instead. The former's performance is about 5-10x faster.
    Examples:
        >>> from urduhack.preprocessing import remove_punctuation
        >>> output = remove_punctuation("کر ؟ سکتی ہے۔")
        کر سکتی ہے
    """
    if marks:
        return re.sub('[{}]+'.format(re.escape(marks)), '', text, flags=re.UNICODE)

    return text.translate(PUNCTUATION_TRANSLATE_UNICODE)
Example #7
Source File: regexp_tokenizer.py From RCZoo with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
           self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2, self.START_DQUOTE,
           self.END_DQUOTE, self.START_SQUOTE, self.END_SQUOTE, self.DASH,
           self.ELLIPSES, self.PUNCT, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #8
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _set_splitters(self, settings=None):
    splitters = {
        'wordchars': set(),  # The ones that split string only if they are not surrounded by letters from both sides
        'capturing': set(),  # The ones that are not filtered out from tokens after split
    }
    splitters['capturing'] |= set(ALWAYS_KEEP_TOKENS)

    wordchars = self._get_wordchars(settings)
    skip = set(self.info.get('skip', [])) | splitters['capturing']
    for token in skip:
        if not re.match(r'^\W+$', token, re.UNICODE):
            continue
        if token in wordchars:
            splitters['wordchars'].add(token)

    self._splitters = splitters
Example #9
Source File: utils.py From dragonfly with GNU Lesser General Public License v3.0 | 6 votes |
def _phrase_to_regex(phrase):
    # Treat whitespace between words as meaning anything other than alphanumeric
    # characters.
    pattern = r"[^\w--_]+".join(regex.escape(word) for word in phrase.split())
    # Treat spaces at the beginning or end of the phrase as matching any
    # whitespace character. This makes it easy to select stuff like non-breaking
    # space, which occurs frequently in browsers.
    # TODO Support newlines. Note that these are frequently implemented as
    # separate text nodes in the accessibility tree, so the obvious
    # implementation would not work well.
    if phrase == " ":
        pattern = r"\s"
    else:
        if phrase.startswith(" "):
            pattern = r"\s" + pattern
        if phrase.endswith(" "):
            pattern = pattern + r"\s"
    # Only match at boundaries of alphanumeric sequences if the phrase ends
    # are alphanumeric.
    if regex.search(r"^[\w--_]", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = r"(?<![\w--_])" + pattern
    if regex.search(r"[\w--_]$", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = pattern + r"(?![\w--_])"
    return pattern
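The [\w--_] and [^\w--_] classes above use the regex module's VERSION1 set operations, where -- inside a character class means set difference, i.e. word characters minus the underscore. A minimal illustration of just that feature:

import regex

# In VERSION1 mode, [\w--_]+ matches runs of word characters excluding '_'.
runs = regex.compile(r'[\w--_]+', regex.VERSION1 | regex.UNICODE)
print(runs.findall('foo_bar baz'))   # ['foo', 'bar', 'baz']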
Example #10
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _generate_relative_translations(self, normalize=False):
    relative_translations = self.info.get('relative-type-regex', {})
    relative_dictionary = OrderedDict()
    for key, value in relative_translations.items():
        if normalize:
            value = list(map(normalize_unicode, value))
        pattern = '|'.join(sorted(value, key=len, reverse=True))
        pattern = DIGIT_GROUP_PATTERN.sub(r'?P<n>\d+', pattern)
        pattern = re.compile(
            r'^(?:{})$'.format(pattern), re.UNICODE | re.IGNORECASE)
        relative_dictionary[pattern] = key
    return relative_dictionary
Example #11
Source File: dictionary.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _construct_split_regex(self):
    known_words_group = "|".join(map(re.escape, self._get_sorted_words_from_cache()))
    if self._no_word_spacing:
        regex = r"^(.*?)({})(.*)$".format(known_words_group)
    else:
        regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(known_words_group)
    self._split_regex_cache.setdefault(
        self._settings.registry_key, {})[self.info['name']] = \
        re.compile(regex, re.UNICODE | re.IGNORECASE)
Example #12
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_wordchars_for_detection(self, settings):
    if self._wordchars_for_detection is None:
        wordchars = set()
        for word in self._get_dictionary(settings):
            if re.match(r'^[\W\d_]+$', word, re.UNICODE):
                continue
            for char in word:
                wordchars.add(char.lower())
        self._wordchars_for_detection = wordchars - {
            "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
            ":", "(", ")", "'", "q", "a", "m", "p", " "}
    return self._wordchars_for_detection
Example #13
Source File: locale.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _set_wordchars(self, settings=None):
    wordchars = set()
    for word in self._get_dictionary(settings):
        if re.match(r'^[\W\d_]+$', word, re.UNICODE):
            continue
        for char in word:
            wordchars.add(char.lower())
    self._wordchars = wordchars - {" "} | {
        "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}
Example #14
Source File: utils.py From RCZoo with MIT License | 5 votes |
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression."""
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except BaseException:
        logger.warn('Regular expression failed to compile: %s' % pattern)
        return False
    return compiled.match(prediction) is not None
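A hypothetical call, assuming the function above and a configured logger. Note that re.match anchors at the start of the prediction, so the pattern has to match from the first character; the has_answer example at the end of this page compiles CuratedTrec answer patterns with similar flags:

regex_match_score('March 14, 1879', r'(14 march|march 14),? 1879')   # True
regex_match_score('born in Ulm', r'(14 march|march 14),? 1879')      # False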
Example #15
Source File: retrieval_drqa_eval.py From semanticRetrievalMRS with MIT License | 5 votes |
def regex_match(text, pattern):
    """Test if a regex pattern is contained within a text."""
    try:
        pattern = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE,
        )
    except BaseException:
        return False
    return pattern.search(text) is not None
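Unlike regex_match_score above, this variant uses search, so the pattern may occur anywhere in the text rather than only at the start. Hypothetical calls, assuming the function above:

regex_match('He was born on 14 March 1879 in Ulm.', r'14 march 1879')   # True
regex_match('He was born in Ulm.', r'14 march 1879')                    # False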
Example #16
Source File: grammar.py From estnltk with GNU General Public License v2.0 | 5 votes |
def __init__(self, *postags, **kwargs):
    super(Postags, self).__init__(kwargs.get('name'))
    self.__postags = postags
    self.__pattern = re.compile('\L<postags>', postags=postags,
                                flags=re.UNICODE | re.IGNORECASE)
Example #17
Source File: grammar.py From estnltk with GNU General Public License v2.0 | 5 votes |
def __init__(self, *lemmas, **kwargs):
    super(Lemmas, self).__init__(kwargs.get('name'))
    self.__lemmas = lemmas
    self.__pattern = re.compile('\L<lemmas>', lemmas=lemmas,
                                flags=re.UNICODE | re.IGNORECASE)
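Examples #16 and #17 rely on the regex module's named lists: \L<name> matches any literal string from the list passed as a keyword argument of the same name (here re is presumably regex imported under that alias, since the standard library supports neither \L nor extra keyword arguments to compile). A minimal stand-alone sketch:

import regex

lemmas = ['kass', 'koer', 'hobune']
# \L<words> expands to an alternation over the strings in the 'words' list.
pattern = regex.compile(r'\L<words>', words=lemmas,
                        flags=regex.UNICODE | regex.IGNORECASE)
print(bool(pattern.fullmatch('Koer')))   # True
print(bool(pattern.fullmatch('kala')))   # False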
Example #18
Source File: dictionary.py From dateparser with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _construct_match_relative_regex(self):
    known_relative_strings_group = "|".join(self._get_sorted_relative_strings_from_cache())
    regex = "^({})$".format(known_relative_strings_group)
    self._match_relative_regex_cache.setdefault(
        self._settings.registry_key, {})[self.info['name']] = \
        re.compile(regex, re.UNICODE | re.IGNORECASE)
Example #19
Source File: utils.py From Multi-Step-Reasoning with Apache License 2.0 | 5 votes |
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression."""
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except BaseException:
        logger.warn('Regular expression failed to compile: %s' % pattern)
        return False
    return compiled.match(prediction) is not None
Example #20
Source File: simple_tokenizer.py From RCZoo with MIT License | 5 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
    """
    self._regexp = regex.compile(
        '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
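The ALPHA_NUM and NON_WS constants are not shown in the excerpt; in DrQA-derived tokenizers they are Unicode property classes roughly like the ones below (treat the exact values as an assumption, not a quote from this project). The result is a simple two-way split into alphanumeric runs and single non-whitespace characters:

import regex

# Assumed values in the style of the DrQA SimpleTokenizer.
ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
NON_WS = r'[^\p{Z}\p{C}]'

splitter = regex.compile('(%s)|(%s)' % (ALPHA_NUM, NON_WS),
                         flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE)
print([m.group() for m in splitter.finditer("Who wrote 'Hamlet'?")])
# ['Who', 'wrote', "'", 'Hamlet', "'", '?']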
Example #21
Source File: utils.py From MnemonicReader with BSD 3-Clause "New" or "Revised" License | 5 votes |
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression."""
    try:
        compiled = re.compile(
            pattern,
            flags=re.IGNORECASE + re.UNICODE + re.MULTILINE
        )
    except BaseException:
        logger.warn('Regular expression failed to compile: %s' % pattern)
        return False
    return compiled.match(prediction) is not None
Example #22
Source File: simple_tokenizer.py From FusionNet with MIT License | 5 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
    """
    self._regexp = regex.compile(
        '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
Example #23
Source File: main.py From OpenQA with MIT License | 5 votes |
def has_answer(args, answer, t):
    global PROCESS_TOK
    text = []
    for i in range(len(t)):
        text.append(t[i].lower())
    res_list = []
    if (args.dataset == "CuratedTrec"):
        try:
            ans_regex = re.compile("(%s)" % answer[0],
                                   flags=re.IGNORECASE + re.UNICODE)
        except:
            return False, res_list
        paragraph = " ".join(text)
        answer_new = ans_regex.findall(paragraph)
        for a in answer_new:
            single_answer = normalize(a[0])
            single_answer = PROCESS_TOK.tokenize(single_answer)
            single_answer = single_answer.words(uncased=True)
            for i in range(0, len(text) - len(single_answer) + 1):
                if single_answer == text[i: i + len(single_answer)]:
                    res_list.append((i, i + len(single_answer) - 1))
    else:
        for a in answer:
            single_answer = " ".join(a).lower()
            single_answer = normalize(single_answer)
            single_answer = PROCESS_TOK.tokenize(single_answer)
            single_answer = single_answer.words(uncased=True)
            for i in range(0, len(text) - len(single_answer) + 1):
                if single_answer == text[i: i + len(single_answer)]:
                    res_list.append((i, i + len(single_answer) - 1))

    if (len(res_list) > 0):
        return True, res_list
    else:
        return False, res_list