Python regex.finditer() Examples
The following are code examples of regex.finditer(), taken from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module regex, or try the search function.
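As a minimal, self-contained sketch (not taken from any of the projects below): regex.finditer() behaves like re.finditer(), lazily yielding match objects in order of appearance, and the third-party regex module accepts the same call signature as the standard library plus a number of extensions used in the examples that follow.

import regex

# Iterate over non-overlapping matches; each match object carries the
# matched text and its span in the input string.
for m in regex.finditer(r"\w+at", "cat bat rat"):
    print(m.group(0), m.span())
# cat (0, 3)
# bat (4, 7)
# rat (8, 11)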
Example #1
Source File: autosum_arxiv.py From autosum with MIT License | 6 votes |
def get_arxiv_meta_archive(aid):
    title = ''
    authors = []
    jref = ''
    txt = ''
    with tarfile.open("./kddcup2003/hep-th-abs.tar.gz", "r:gz") as t:
        for m in t.getmembers():
            if m.name.find(aid) != -1:
                txt = t.extractfile(m).read()
                break
    for m in regex.finditer(r'Title:\s+(.*)(?=Author)', txt, regex.S):
        title = clean_line(m.group(1))
        break
    for m in regex.finditer(r'Authors?:\s+(.*)(?=Comment)', txt, regex.S):
        a = clean_line(m.group(1))
        authors = regex.split(r'(?:,\s*(?:and\s+)?|\s+and\s+)', a)
        break
    for m in regex.finditer(r'Journal-ref:\s+(.*?)(?=\\\\)', txt, regex.S):
        jref = clean_line(m.group(1))
        break
    return title, authors, jref
Example #2
Source File: functional_load.py From CorpusTools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def neutralize_with_all_envs(trans, env_filters):
    string = ''.join(trans.with_word_boundaries())
    length = len(string)
    for env_filter in env_filters:
        pattern = env_filter.generate_regular_expression()
        for match in re.finditer(pattern, string, overlapped=True):
            mid_index = match.start('MID')
            temp = ''
            for i in range(length):
                if i == mid_index:
                    s = '-'
                else:
                    s = string[i]
                temp += s
            string = temp
    return string

# This function is weirdly named. It should really be something like
# average_minpair_fl
# It has also been changed so as to have two "relativizer" options:
# one to words containing the relevant segments and one to all
# words in the corpus (though it basically does the calculation
# by calling the above minpair_fl() function).
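The example above passes overlapped=True, which is an extension of the regex module (the standard re module does not accept it). A small illustrative sketch, independent of the CorpusTools code:

import regex

print([m.group(0) for m in regex.finditer(r"\w\w", "abcd")])
# ['ab', 'cd']  -- default: non-overlapping matches
print([m.group(0) for m in regex.finditer(r"\w\w", "abcd", overlapped=True)])
# ['ab', 'bc', 'cd']  -- overlapped=True also reports matches that share characters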
Example #3
Source File: _panphon.py From panphon with MIT License | 6 votes |
def compile_regex_from_str(self, ft_str):
    """Given a string describing feature masks for a sequence of segments,
    return a regex matching the corresponding strings.

    Args:
        ft_str (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

    Returns:
        Pattern: regular expression pattern equivalent to `ft_str`
    """
    sequence = []
    for m in re.finditer(r'\[([^]]+)\]', ft_str):
        ft_mask = fts(m.group(1))
        segs = self.all_segs_matching_fts(ft_mask)
        sub_pat = '({})'.format('|'.join(segs))
        sequence.append(sub_pat)
    pattern = ''.join(sequence)
    regex = re.compile(pattern)
    return regex
Example #4
Source File: test_partialparse.py From ctparse with MIT License | 6 votes |
def test_partial_parse() -> None:
    match_a = regex.match("(?<R1>a)", "ab")
    match_b = next(regex.finditer("(?<R2>b)", "ab"))

    pp = PartialParse.from_regex_matches(
        (RegexMatch(1, match_a), RegexMatch(2, match_b))
    )

    assert len(pp.prod) == 2
    assert len(pp.rules) == 2
    assert isinstance(pp.score, float)

    def mock_rule(ts: datetime.datetime, a: Time) -> Time:
        return Time()

    pp2 = pp.apply_rule(
        datetime.datetime(day=1, month=1, year=2015), mock_rule, "mock_rule", (0, 1)
    )

    assert pp != pp2

    with pytest.raises(ValueError):
        PartialParse((), ())
Example #5
Source File: distance.py From panphon with MIT License | 6 votes |
def map_to_dogol_prime(self, s):
    """Map a string to Dogolpolsky' classes

    Args:
        s (unicode): IPA word

    Returns:
        (unicode): word with all segments collapsed to D' classes
    """
    segs = []
    for seg in self.fm.seg_regex.finditer(s):
        fts = self.fm.fts(seg.group(0))
        for mask, label in self.dogol_prime:
            if fts >= mask:
                segs.append(label)
                break
    return ''.join(segs)
Example #6
Source File: __init__.py From date-extractor with Apache License 2.0 | 5 votes |
def getFirstDateFromText(text, debug=False, default_hour=0, default_minute=0, default_second=0, return_precision=False):
    #print("starting getFirstDateFromText")
    global patterns
    for match in regex.finditer(patterns['date_compiled'], text):
        #print("\nmatch is", match.group(0))
        #print("\nmatch.index is", ([item for item in match.groupdict().items() if item[1]]))
        if not isDefinitelyNotDate(match.group(0)):
            match = dict((k, num(v)) for k, v in match.groupdict().items() if num(v))
            return datetime_from_dict(match, debug, default_hour, default_minute, default_second, return_precision)
    #print "finishing getFirstDateFromText"

# the date of a webpage, like a blog or article, will often be the first date mentioned
Example #7
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_emojis(self, node, token_class="emoticon"):
    boundaries = []
    for m in re.finditer(r"\X", node.value.text):
        if m.end() - m.start() > 1:
            if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}\uFE0F]", m.group()):
                boundaries.append((m.start(), m.end(), None))
        else:
            if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}]", m.group()):
                boundaries.append((m.start(), m.end(), None))
    self._split_on_boundaries(node, boundaries, token_class)
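The pattern r"\X" above matches one extended grapheme cluster per iteration, and \p{...} tests a Unicode property; both are regex-module extensions over the standard re module. A small sketch, independent of the SoMaJo code, of the grapheme-cluster behaviour:

import regex

s = "e\u0301!"  # 'e' plus a combining acute accent, then '!'
clusters = [m.group(0) for m in regex.finditer(r"\X", s)]
print(clusters)          # ['é', '!'] -- base letter and accent form one cluster
print(len(clusters[0]))  # 2 code points, but a single grapheme cluster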
Example #8
Source File: test_types.py From ctparse with MIT License | 5 votes |
def test_init(self):
    m = next(regex.finditer(r"(?P<R1>match me)", "xxx match me xxx"))
    r = RegexMatch(1, m)
    self.assertEqual(r.mstart, 4)
    self.assertEqual(r.mend, 12)
    self.assertEqual(len(r), 8)
    self.assertEqual(r._text, "match me")
    self.assertEqual(repr(r), "RegexMatch[4-12]{1:match me}")
    self.assertEqual(r.nb_str(), "RegexMatch[]{1:match me}")
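The assertions above suggest that RegexMatch exposes the span of the single named group R<id>; the underlying match object provides that span directly. A short sketch of just the regex part (the expected output is inferred from the assertions above):

import regex

m = next(regex.finditer(r"(?P<R1>match me)", "xxx match me xxx"))
print(m.span("R1"))   # (4, 12)
print(m.group("R1"))  # 'match me'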
Example #9
Source File: test_rule.py From ctparse with MIT License | 5 votes |
def test_regex_match(self):
    m = next(regex.finditer("(?P<R1>x)", "x"))
    r = RegexMatch(1, m)
    self.assertTrue(regex_match(1)(r))
    self.assertFalse(regex_match(1)(TestClassA()))
Example #10
Source File: distance.py From panphon with MIT License | 5 votes |
def ftstr2dict(ftstr):
    fts = {}
    for m in re.finditer(r'([-0+])(\w+)', ftstr):
        v, k = m.groups()
        fts[k] = {'-': -1, '0': 0, '+': 1}[v]
    return fts
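A quick usage sketch for ftstr2dict() as defined above; the input format is assumed from the pattern (a sign character followed by a feature name):

print(ftstr2dict("+son -syl 0cor"))
# {'son': 1, 'syl': -1, 'cor': 0}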
Example #11
Source File: _panphon.py From panphon with MIT License | 5 votes |
def segment_text(text, seg_regex=SEG_REGEX):
    """Return an iterator of segments in the text.

    Args:
        text (unicode): string of IPA Unicode text
        seg_regex (_regex.Pattern): compiled regex defining a segment (base + modifiers)

    Return:
        generator: segments in the input text
    """
    for m in seg_regex.finditer(text):
        yield m.group(0)
Example #12
Source File: _panphon.py From panphon with MIT License | 5 votes |
def fts(s):
    """Given string `s` with +/-[alphabetical sequence]s, return list of features.

    Args:
        s (str): string with segments of the sort "+son -syl 0cor"

    Return:
        list: list of (value, feature) tuples
    """
    return [m.groups() for m in FT_REGEX.finditer(s)]
Example #13
Source File: _panphon.py From panphon with MIT License | 5 votes |
def pat(p):
    """Given a string `p` with feature matrices (features grouped with square
    brackets into segments), return a list of sets of (value, feature) tuples.

    Args:
        p (str): list of feature matrices as strings

    Return:
        list: list of sets of (value, feature) tuples
    """
    pattern = []
    for matrix in [m.group(0) for m in MT_REGEX.finditer(p)]:
        segment = set([m.groups() for m in FT_REGEX.finditer(matrix)])
        pattern.append(segment)
    return pattern
Example #14
Source File: _panphon.py From panphon with MIT License | 5 votes |
def filter_string(self, word):
    """Return a string like the input but containing only legal IPA segments

    Args:
        word (unicode): input string to be filtered

    Returns:
        unicode: string identical to `word` but with invalid IPA segments absent
    """
    segs = [m.group(0) for m in self.seg_regex.finditer(word)]
    return ''.join(segs)
Example #15
Source File: segment.py From panphon with MIT License | 5 votes |
def __init__(self, names, features={}, ftstr='', weights=None):
    """Construct a `Segment` object

    Args:
        names (list): ordered list of feature names
        features (dict): name-value pairs for specified features
        ftstr (unicode): a string, each /(+|0|-)\w+/ sequence of which is
            interpreted as a feature specification
        weights (float): ordered list of feature weights/saliences
    """
    self.n2s = {-1: '-', 0: '0', 1: '+'}
    self.s2n = {k: v for (v, k) in self.n2s.items()}
    self.names = names
    """Set a feature specification"""
    self.data = {}
    for name in names:
        if name in features:
            self.data[name] = features[name]
        else:
            self.data[name] = 0
    for m in re.finditer(r'(\+|0|-)(\w+)', ftstr):
        v, k = m.groups()
        self.data[k] = self.s2n[v]
    if weights:
        self.weights = weights
    else:
        self.weights = [1 for _ in names]
Example #16
Source File: event_tagger.py From estnltk with GNU General Public License v2.0 | 5 votes |
def _match(self, text):
    matches = []
    if self.mapping:
        seq = self.map.keys()
    else:
        seq = self.regex_sequence
    for r in seq:
        for matchobj in re.finditer(r, text, overlapped=True):
            groups = matchobj.groupdict()
            result = {
                'start': matchobj.start(),
                'end': matchobj.end(),
                'regex': r,
                'groups': groups
            }
            if self.mapping:
                for k, v in self.map[r].items():
                    if k not in result.keys():
                        result[k] = v
            matches.append(result)
    return matches
Example #17
Source File: run_coqa.py From FlowDelta with MIT License | 5 votes |
def split_with_span(s):
    if s.split() == []:
        return [], []
    else:
        return zip(*[(m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r'\S+', s)])
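A usage sketch for split_with_span() above: it returns the whitespace-separated tokens together with their (start, inclusive end) character offsets, which can be unpacked into two parallel tuples:

tokens, spans = split_with_span("to be or")
print(tokens)  # ('to', 'be', 'or')
print(spans)   # ((0, 1), (3, 4), (6, 7))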
Example #18
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_left(self, regex, node):
    boundaries = []
    prev_end = 0
    for m in regex.finditer(node.value.text):
        boundaries.append((prev_end, m.start(), None))
        prev_end = m.start()
    self._split_on_boundaries(node, boundaries, token_class=None, lock_match=False)
Example #19
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_matches(self, regex, node, token_class="regular", repl=None, split_named_subgroups=True, delete_whitespace=False):
    boundaries = []
    split_groups = split_named_subgroups and len(regex.groupindex) > 0
    group_numbers = sorted(regex.groupindex.values())
    for m in regex.finditer(node.value.text):
        if split_groups:
            for g in group_numbers:
                if m.span(g) != (-1, -1):
                    boundaries.append((m.start(g), m.end(g), None))
        else:
            if repl is None:
                boundaries.append((m.start(), m.end(), None))
            else:
                boundaries.append((m.start(), m.end(), m.expand(repl)))
    self._split_on_boundaries(node, boundaries, token_class, delete_whitespace=delete_whitespace)
Example #20
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_set(self, regex, node, items, token_class="regular", ignore_case=False):
    boundaries = []
    for m in regex.finditer(node.value.text):
        instance = m.group(0)
        if ignore_case:
            instance = instance.lower()
        if instance in items:
            boundaries.append((m.start(), m.end(), None))
    self._split_on_boundaries(node, boundaries, token_class)
Example #21
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_abbreviations(self, token_dll, split_multipart_abbrevs=True):
    """Turn instances of abbreviations into tokens."""
    self._split_all_matches(self.single_letter_ellipsis, token_dll, "abbreviation")
    self._split_all_matches(self.and_cetera, token_dll, "abbreviation")
    self._split_all_matches(self.str_abbreviations, token_dll, "abbreviation")
    self._split_all_matches(self.nr_abbreviations, token_dll, "abbreviation")
    self._split_all_matches(self.single_token_abbreviation, token_dll, "abbreviation")
    self._split_all_matches(self.single_letter_abbreviation, token_dll, "abbreviation")
    self._split_all_matches(self.ps, token_dll, "abbreviation")
    for t in token_dll:
        if t.value.markup or t.value._locked:
            continue
        boundaries = []
        for m in self.abbreviation.finditer(t.value.text):
            instance = m.group(0)
            if split_multipart_abbrevs and self.multipart_abbreviation.fullmatch(instance):
                start, end = m.span(0)
                s = start
                for i, c in enumerate(instance, start=1):
                    if c == ".":
                        boundaries.append((s, start + i, None))
                        s = start + i
            else:
                boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(t, boundaries, "abbreviation")
Example #22
Source File: strtools.py From extratools with MIT License | 5 votes |
def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    for m in re.compile(
            r"\b(?:{})\b".format(r"|".join(
                # literal entities are escaped; spaces in them match any run of whitespace
                e if useregex else re.escape(e).replace(' ', r"\s+") for e in entities
            )),
            re.I if ignorecase else 0
    ).finditer(s):
        yield m.group(0)
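A usage sketch for extract() above, using single-word entities so only the word-boundary and case-insensitivity behaviour is exercised:

print(list(extract("Regex and re are both regex engines.", ["regex", "re"])))
# ['Regex', 're', 'regex']  -- ignorecase=True is the default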
Example #23
Source File: strtools.py From extratools with MIT License | 5 votes |
def __findeqtagpairspans(
        s: str, tag: str, useregex: bool = False
) -> Iterable[Tuple[Tuple[int, int], ...]]:
    for match in re.finditer(
            r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(tag if useregex else re.escape(tag)),
            s
    ):
        yield (match.span("__open"), match.span("__content"), match.span("__close"))
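A usage sketch for the module-private __findeqtagpairspans() above, with a symmetric delimiter; it yields the spans of the opening tag, the enclosed content, and the closing tag:

print(list(__findeqtagpairspans("a **b** c", "**")))
# [((2, 4), (4, 5), (5, 7))]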
Example #24
Source File: strtools.py From extratools with MIT License | 5 votes |
def __findtagpairspans(
        s: str, tag: str, closetag: Optional[str] = None, useregex: bool = False
) -> Iterable[Tuple[Tuple[int, int], ...]]:
    if closetag is None or tag == closetag:
        yield from __findeqtagpairspans(s, tag, useregex=useregex)
        return

    if not useregex:
        tag = re.escape(tag)
        closetag = re.escape(closetag)

    retags = re.compile(r"(?P<__open>{})|(?P<__close>{})".format(tag, closetag))

    startspans = []
    for match in retags.finditer(s):
        opengroup = match.group("__open")
        if opengroup:
            startspans.append(match.span())
            continue

        closegroup = match.group("__close")
        if closegroup and startspans:
            startspan = startspans.pop()
            endspan = match.span()
            yield (startspan, (startspan[1], endspan[0]), endspan)