Python regex.compile() Examples
The following are 30 code examples of regex.compile(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module regex, or try the search function.
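Note that regex is a third-party module (a near drop-in superset of the standard library's re) that adds features such as Unicode property classes (\p{...}), fuzzy matching, and variable-length lookbehind, all of which appear in the examples below. As a minimal orientation sketch (not taken from any of the listed projects), regex.compile() is used exactly like re.compile():

import regex

# \p{Greek} matches characters in the Greek script; the stdlib re module
# rejects \p{...} escapes, which is why these projects import regex.
pattern = regex.compile(r"\p{Greek}+")
print(pattern.findall("alpha α and βγ"))  # ['α', 'βγ']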
Example #1
Source File: barcodes.py From umis with MIT License | 6 votes |
def exact_barcode_filter(chunk, bc1, bc2, bc3, re_string=None):
    if not re_string:
        re_string = '(.*):CELL_(?P<CB>.*):UMI_(.*)\\n(.*)\\n\\+\\n(.*)\\n'
    parser_re = re.compile(re_string)
    kept = []
    for read in chunk:
        match = parser_re.search(read).groupdict()
        cb1 = match['CB']
        if bc3:
            cb1, cb2, cb3 = cb1.split("-")
        elif bc2:
            cb1, cb2 = cb1.split("-")
        if cb1 not in bc1:
            continue
        if bc2 and cb2 not in bc2:
            continue
        if bc3 and cb3 not in bc3:
            continue
        kept.append(read)
    return kept
Example #2
Source File: sentence_splitter.py From SoMaJo with GNU General Public License v3.0 | 6 votes |
def __init__(self, is_tuple=False, language="de_CMC"):
    """Create a SentenceSplitter object. If the tokenized paragraphs
    contain token classes or extra info, set is_tuple=True.
    """
    self.is_tuple = is_tuple
    # full stop, ellipsis, exclamation and question marks
    self.sentence_ending_punct = re.compile(r"^(?:\.+|…+\.*|[!?]+)$")
    self.opening_punct = re.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
    self.closing_punct = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$")
    # International quotes: «» “” ‹› ‘’
    # German quotes: »« „“ ›‹ ‚‘
    self.problematic_quotes = set(['"'])
    if language == "de" or language == "de_CMC":
        # German opening quotes [»›] have category Pf
        # German closing quotes [“‘«‹] have category Pi
        self.problematic_quotes = set(['"', "»", "«", "›", "‹", "“", "‘"])
    self.eos_abbreviations = utils.read_abbreviation_file("eos_abbreviations.txt")
Example #3
Source File: regexp_tokenizer.py From OpenQA with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
        (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
         self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
         self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
         self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
         self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #4
Source File: tokenization.py From GPT2sQA with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
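The pre-tokenization pattern compiled here (and reused in the other GPT-2-style tokenizers below) splits raw text into contractions, runs of letters, runs of digits, runs of other symbols, and whitespace. A rough illustration of its behaviour, as a standalone sketch with regex imported directly rather than via the project's own imports:

import regex

pat = regex.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
# \p{L} (any letter) and \p{N} (any digit) are regex-module features;
# stdlib re does not understand \p{...} escapes.
print(pat.findall("Hello world! It's 2024."))
# ['Hello', ' world', '!', ' It', "'s", ' 2024', '.']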
Example #5
Source File: regexp_tokenizer.py From justcopy-backend with MIT License | 6 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
        (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
         self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
         self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
         self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
         self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
Example #6
Source File: beta_to_unicode.py From cltk with MIT License | 6 votes |
def __init__(self, pattern1=None, pattern2=None, pattern3=None):
    if pattern1 is None:
        pattern1 = UPPER
    if pattern2 is None:
        pattern2 = LOWER
    if pattern3 is None:
        pattern3 = PUNCT
    self.pattern1 = \
        [(regex.compile(beta_regex, flags=regex.VERSION1), repl) for (beta_regex, repl) in pattern1]
    self.pattern2 = \
        [(regex.compile(beta_regex, flags=regex.VERSION1), repl) for (beta_regex, repl) in pattern2]
    self.pattern3 = \
        [(regex.compile(beta_regex, flags=regex.VERSION1), repl) for (beta_regex, repl) in pattern3]
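The regex.VERSION1 flag used above opts into the module's newer behaviour, which among other things enables set operations inside character classes. A small standalone sketch of that feature (unrelated to the cltk beta-code patterns themselves):

import regex

# With VERSION1, character classes support set operations such as
# difference (--): [[a-z]--[aeiou]] means "lowercase letters except vowels".
consonants = regex.compile(r"[[a-z]--[aeiou]]+", flags=regex.VERSION1)
print(consonants.findall("hello world"))  # ['h', 'll', 'w', 'rld']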
Example #7
Source File: sentence_splitter.py From SoMaJo with GNU General Public License v3.0 | 6 votes |
def __init__(self, is_tuple=False, language="de_CMC"):
    """Create a SentenceSplitter object. If the tokenized paragraphs
    contain token classes or extra info, set is_tuple=True.
    """
    self.is_tuple = is_tuple
    # full stop, ellipsis, exclamation and question marks
    self.sentence_ending_punct = re.compile(r"^(?:\.+|…+\.*|[!?]+)$")
    self.opening_punct = re.compile(r"^(?:['\"¿¡\p{Pi}\p{Ps}–—]|-{2,})$")
    self.closing_punct = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$")
    # International quotes: «» “” ‹› ‘’
    # German quotes: »« „“ ›‹ ‚‘
    self.problematic_quotes = set(['"'])
    if language == "de" or language == "de_CMC":
        # German opening quotes [»›] have category Pf
        # German closing quotes [“‘«‹] have category Pi
        self.problematic_quotes = set(['"', "»", "«", "›", "‹", "“", "‘"])
    self.eos_abbreviations = utils.read_abbreviation_file("eos_abbreviations.txt")
Example #8
Source File: phrase2vec.py From mat2vec with MIT License | 6 votes |
def exclude_words(phrasegrams, words):
    """Given a list of words, excludes those from the keys of the phrase dictionary."""
    new_phrasergrams = {}
    words_re_list = []
    for word in words:
        we = regex.escape(word)
        words_re_list.append("^" + we + "$|^" + we + "_|_" + we + "$|_" + we + "_")
    word_reg = regex.compile(r"" + "|".join(words_re_list))
    for gram in tqdm(phrasegrams):
        valid = True
        for sub_gram in gram:
            if word_reg.search(sub_gram.decode("unicode_escape", "ignore")) is not None:
                valid = False
                break
            if not valid:
                continue
        if valid:
            new_phrasergrams[gram] = phrasegrams[gram]
    return new_phrasergrams


# Generating word grams.
Example #9
Source File: tokenization_roberta.py From TextClassify with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
             cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
    super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
                                           sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                           mask_token=mask_token, **kwargs)

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #10
Source File: tokenization_gpt2.py From TextClassify with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
             bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
    super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)

    self.encoder = json.load(open(vocab_file))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #11
Source File: tokenization_gpt2.py From Bert-Chinese-Text-Classification-Pytorch with MIT License | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
Example #12
Source File: tokenization_gpt2.py From NLP_Toolkit with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
             bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
    super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
    self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
    self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    bpe_merges = [tuple(merge.split()) for merge in bpe_data]
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #13
Source File: hosters.py From bioconda-utils with MIT License | 6 votes |
async def get_versions(self, req, orig_version):
    exclude = set(self.exclude)
    vals = {key: val for key, val in self.vals.items() if key not in exclude}
    link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
    link_re = re.compile(link_pattern)
    result = []
    for url in self.releases_urls:
        parser = HrefParser(link_re)
        parser.feed(await req.get_text_from_url(url))
        for match in parser.get_matches():
            match["link"] = urljoin(url, match["href"])
            match["releases_url"] = url
            match["vals"] = vals
            result.append(match)
    return result
Example #14
Source File: umis.py From umis with MIT License | 6 votes |
def umi_histogram(fastq):
    ''' Counts the number of reads for each UMI

    Expects formatted fastq files.
    '''
    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    counter = collections.Counter()
    for read in read_fastq(fastq):
        match = parser_re.search(read).groupdict()
        counter[match['MB']] += 1

    for bc, count in counter.most_common():
        sys.stdout.write('{}\t{}\n'.format(bc, count))
Example #15
Source File: hosters.py From bioconda-utils with MIT License | 6 votes |
async def get_versions(self, req, orig_version):
    exclude = set(self.exclude)
    vals = {key: val for key, val in self.vals.items() if key not in exclude}
    link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
    link_re = re.compile(link_pattern)
    result = []
    for url in self.releases_urls:
        files = await req.get_ftp_listing(url)
        for fname in files:
            match = link_re.search(fname)
            if match:
                data = match.groupdict()
                data['fn'] = fname
                data['link'] = "ftp://" + vals['host'] + fname
                data['releases_url'] = url
                result.append(data)
    return result
Example #16
Source File: pygrok.py From pygrok with MIT License | 6 votes |
def _load_search_pattern(self):
    self.type_mapper = {}
    py_regex_pattern = self.pattern
    while True:
        # Finding all types specified in the groks
        m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
        for n in m:
            self.type_mapper[n[1]] = n[2]

        # replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type})
        # with regex and regex group name
        py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                                  lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
                                  py_regex_pattern)

        # replace %{pattern_name} with regex
        py_regex_pattern = re.sub(r'%{(\w+)}',
                                  lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
                                  py_regex_pattern)

        if re.search(r'%{\w+(:\w+)?}', py_regex_pattern) is None:
            break

    self.regex_obj = re.compile(py_regex_pattern)
Example #17
Source File: helpers.py From PyInquirer with MIT License | 6 votes |
def expect_regex(self, pattern):
    """Read until matches pattern or timeout."""
    # inspired by pexpect/pty_spawn and pexpect/expect.py expect_loop
    end_time = time.time() + self.timeout
    buf = ''
    prog = regex.compile(pattern)
    while (end_time - time.time()) > 0.0:
        # switch to nonblocking read
        reads, _, _ = select.select([self.fd], [], [], end_time - time.time())
        if len(reads) > 0:
            try:
                buf = remove_ansi_escape_sequences(buf + self.read())
            except EOFError:
                assert prog.match(buf) is not None, \
                    'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
            if prog.match(buf):
                return True
        else:
            # do not eat up CPU when waiting for the timeout to expire
            time.sleep(self.timeout / 10)
    assert prog.match(buf) is not None, \
        'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
Example #18
Source File: helpers.py From talon with Apache License 2.0 | 6 votes |
def contains_sender_names(sender):
    '''Return a function that searches for the sender's name or a part of it.

    >>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
    1
    >>> feature("BR, Sergey N.")
    1
    >>> feature("Sergey")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
    '''
    names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
                                        for e in extract_names(sender)]))
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
    return lambda s: 0
Example #19
Source File: helpers_test.py From talon with Apache License 2.0 | 5 votes |
def test_binary_regex_search():
    eq_(1, h.binary_regex_search(re.compile("12"))("12"))
    eq_(0, h.binary_regex_search(re.compile("12"))("34"))
Example #20
Source File: strings.py From ibis with Apache License 2.0 | 5 votes |
def execute_string_like_series_string(op, data, pattern, escape, **kwargs):
    new_pattern = re.compile(sql_like_to_regex(pattern, escape=escape))
    return data.map(
        lambda x, pattern=new_pattern: pattern.search(x) is not None
    )
Example #21
Source File: client.py From ibis with Apache License 2.0 | 5 votes |
def list_tables(self, like: str = None, database: str = None) -> list:
    """List all tables inside given or current database.

    Parameters
    ----------
    like : str, optional
    database : str, optional

    Returns
    -------
    list
    """
    _database = None

    if not self.db_name == database:
        _database = self.db_name
        self.set_database(database)

    tables = self.con.get_tables()

    if _database:
        self.set_database(_database)

    if like is None:
        return tables
    pattern = re.compile(like)
    return list(filter(lambda t: pattern.findall(t), tables))
Example #22
Source File: genrelease.py From ibis with Apache License 2.0 | 5 votes |
def iter_release_notes(repo, from_ref, to_ref, default_role):
    """Yield release notes from `from_ref` to `to_ref`."""
    pattern = re.compile(
        r'^(?:{})\s+#(\d+)\s+from'.format('|'.join(GITHUB_CLOSE_KEYWORDS)),
        flags=re.MULTILINE | re.IGNORECASE,
    )
    for commit in commits_between(
        repo, from_ref, to_ref, options=pygit2.GIT_SORT_TOPOLOGICAL
    ):
        message = commit.message.strip()
        subject, *lines = map(str.strip, message.splitlines())
        tag, *rest = subject.split(':', 1)
        tag = tag.lower()
        lineitem = ''.join(rest) or subject
        role = KEYWORD_MAP.get(tag, default_role)
        modifier = ' major' if role == 'bug' else ''

        try:
            issue_number, *_ = pattern.findall(message)
        except ValueError:
            issue_number = '-'

        yield "* :{role}:`{issue_number}{modifier}` {lineitem}".format(
            role=role,
            issue_number=issue_number,
            modifier=modifier,
            lineitem=lineitem.strip(),
        )
Example #23
Source File: encoder.py From GPT2 with MIT License | 5 votes |
def __init__(self, encoder, bpe_merges, errors='replace'):
    self.encoder = encoder
    self.decoder = {v: k for k, v in self.encoder.items()}
    self.errors = errors  # how to handle errors in decoding
    self.byte_encoder = bytes_to_unicode()
    self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
    self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
    self.cache = {}

    # Should have added re.IGNORECASE so BPE merges can happen for
    # capitalized versions of contractions
    self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
Example #24
Source File: spelling_checker.py From kaggle-HomeDepot with MIT License | 5 votes |
def autocorrect_query(self, query):
    # correct bigram
    bigram_corrector = {}
    words_bigram = self.get_valid_bigram_words(query.split(" "))
    for word in words_bigram:
        corrected_word = self.spelling_checker.correct(word)
        if word != corrected_word:
            bigram_corrector[word] = corrected_word
    for k, v in bigram_corrector.items():
        pattern = regex.compile(r"(?<=\W|^)%s(?=\W|$)" % k)
        query = regex.sub(pattern, v, query)

    # correct unigram
    corrected_query = []
    for word in query.split(" "):
        if len(word) < self.min_len:
            corrected_query.append(word)
        elif self.exclude_stopwords and word in config.STOP_WORDS:
            corrected_query.append(word)
        elif self.skip_digit and len(re.findall(re.compile(r"\d+"), word)):
            corrected_query.append(word)
        else:
            corrected_word = self.spelling_checker.correct(word)
            if len(corrected_word) < self.min_len:
                corrected_query.append(word)
            else:
                corrected_query.append(corrected_word)
    return " ".join(corrected_query)
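The bigram replacement above relies on a lookbehind whose alternatives have different widths, (?<=\W|^), which the standard re module rejects but regex accepts. A minimal sketch of that difference, independent of the kaggle-HomeDepot code:

import re
import regex

boundary_pattern = r"(?<=\W|^)cat(?=\W|$)"
try:
    re.compile(boundary_pattern)
except re.error as exc:
    print("stdlib re:", exc)  # look-behind requires fixed-width pattern
print(regex.sub(boundary_pattern, "dog", "cat, catalog"))  # dog, catalog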
Example #25
Source File: search.py From PyDev.Debugger with Eclipse Public License 1.0 | 5 votes |
def __init__(self, regexp, flags=0, maxLength=None):
    """
    @type  regexp: str
    @param regexp: Regular expression string.

    @type  flags: int
    @param flags: Regular expression flags.

    @type  maxLength: int
    @param maxLength: Maximum expected length of the strings matched by this
        regular expression.

        This value will be used to calculate the required buffer size when
        doing buffered searches.

        Ideally it should be an exact value, but in some cases it's not
        possible to calculate so an upper limit should be given instead.

        If that's not possible either, C{None} should be used. That will
        cause an exception to be raised if this pattern is used in a
        buffered search.
    """
    self.pattern = regexp
    self.flags = flags
    self.regexp = re.compile(regexp, flags)
    self.maxLength = maxLength
Example #26
Source File: postprocessing.py From ocr-process-service with MIT License | 5 votes |
def remove_non_numeric_date(text):
    regex = re.compile(r'[^\d\/]+')
    return regex.sub('', text)
Example #27
Source File: postprocessing.py From ocr-process-service with MIT License | 5 votes |
def remove_non_numeric_currency(text):
    regex = re.compile(r'[^\d\,\.]+')
    temp = regex.sub('', text)
    return temp.replace(',', ".")
Example #28
Source File: postprocessing.py From ocr-process-service with MIT License | 5 votes |
def remove_non_numeric(text):
    non_decimal = re.compile(r'[^\d]+')
    return non_decimal.sub('', text)
Example #29
Source File: postprocessing.py From ocr-process-service with MIT License | 5 votes |
def approximate_match(word_re, lines, fuzziness='e<=1'):
    logger = logging.getLogger(__name__)
    logger.debug('Looking for %s with fuzziness: %s' % (word_re, fuzziness))
    best_partial_matches = []
    search = re.compile(
        ur'(' + word_re + '){' + fuzziness + '}',
        flags=re.BESTMATCH | re.IGNORECASE).search
    for m in ifilter(None, imap(search, lines)):
        logger.debug('%s %s' % (m.span(), m[0]))
        best_partial_matches.append(m[0])
    return best_partial_matches
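The {e<=1} suffix is the regex module's fuzzy-matching syntax: the preceding group may match with at most one error (an insertion, deletion, or substitution), and the BESTMATCH flag asks for the match with the fewest errors rather than the first acceptable one. A self-contained sketch of the same idea, not taken from the OCR project:

import regex

# Allow at most one edit when looking for "invoice" in noisy OCR output;
# BESTMATCH prefers the candidate with the fewest errors.
m = regex.search(r"(?:invoice){e<=1}", "Invoce total: 42.00",
                 flags=regex.BESTMATCH | regex.IGNORECASE)
if m:
    print(m[0], m.fuzzy_counts)  # fuzzy_counts = (substitutions, insertions, deletions)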
Example #30
Source File: simple_tokenizer.py From FusionNet with MIT License | 5 votes |
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
    """
    self._regexp = regex.compile(
        '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()