Python regex.finditer() Examples
The following are code examples of regex.finditer(), taken from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module regex, or try the search function.
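As a minimal, self-contained sketch (not taken from any of the projects below): regex.finditer() behaves like re.finditer(), lazily yielding match objects in order of appearance, and the third-party regex module accepts the same call signature as the standard library plus a number of extensions used in the examples that follow.

import regex

# Iterate over non-overlapping matches; each match object carries the
# matched text and its span in the input string.
for m in regex.finditer(r"\w+at", "cat bat rat"):
    print(m.group(0), m.span())
# cat (0, 3)
# bat (4, 7)
# rat (8, 11)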
Example #1
Source File: autosum_arxiv.py From autosum with MIT License | 6 votes |
def get_arxiv_meta_archive(aid):
    title = ''
    authors = []
    jref = ''
    txt = ''
    with tarfile.open("./kddcup2003/hep-th-abs.tar.gz", "r:gz") as t:
        for m in t.getmembers():
            if m.name.find(aid) != -1:
                txt = t.extractfile(m).read()
                break
    for m in regex.finditer(r'Title:\s+(.*)(?=Author)', txt, regex.S):
        title = clean_line(m.group(1))
        break
    for m in regex.finditer(r'Authors?:\s+(.*)(?=Comment)', txt, regex.S):
        a = clean_line(m.group(1))
        authors = regex.split(r'(?:,\s*(?:and\s+)?|\s+and\s+)', a)
        break
    for m in regex.finditer(r'Journal-ref:\s+(.*?)(?=\\\\)', txt, regex.S):
        jref = clean_line(m.group(1))
        break
    return title, authors, jref
Example #2
Source File: functional_load.py From CorpusTools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def neutralize_with_all_envs(trans, env_filters):
    string = ''.join(trans.with_word_boundaries())
    length = len(string)
    for env_filter in env_filters:
        pattern = env_filter.generate_regular_expression()
        for match in re.finditer(pattern, string, overlapped=True):
            mid_index = match.start('MID')
            temp = ''
            for i in range(length):
                if i == mid_index:
                    s = '-'
                else:
                    s = string[i]
                temp += s
            string = temp
    return string

# This function is weirdly named. It should really be something like
# average_minpair_fl
# It has also been changed so as to have two "relativizer" options:
# one to words containing the relevant segments and one to all
# words in the corpus (though it basically does the calculation
# by calling the above minpair_fl() function).
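The example above passes overlapped=True, which is an extension of the regex module (the standard re module does not accept it). A small illustrative sketch, independent of the CorpusTools code:

import regex

print([m.group(0) for m in regex.finditer(r"\w\w", "abcd")])
# ['ab', 'cd']  -- default: non-overlapping matches
print([m.group(0) for m in regex.finditer(r"\w\w", "abcd", overlapped=True)])
# ['ab', 'bc', 'cd']  -- overlapped=True also reports matches that share characters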
Example #3
Source File: _panphon.py From panphon with MIT License | 6 votes |
def compile_regex_from_str(self, ft_str):
    """Given a string describing feature masks for a sequence of segments,
    return a regex matching the corresponding strings.

    Args:
        ft_str (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

    Returns:
        Pattern: regular expression pattern equivalent to `ft_str`
    """
    sequence = []
    for m in re.finditer(r'\[([^]]+)\]', ft_str):
        ft_mask = fts(m.group(1))
        segs = self.all_segs_matching_fts(ft_mask)
        sub_pat = '({})'.format('|'.join(segs))
        sequence.append(sub_pat)
    pattern = ''.join(sequence)
    regex = re.compile(pattern)
    return regex
Example #4
Source File: test_partialparse.py From ctparse with MIT License | 6 votes |
def test_partial_parse() -> None:
    match_a = regex.match("(?<R1>a)", "ab")
    match_b = next(regex.finditer("(?<R2>b)", "ab"))

    pp = PartialParse.from_regex_matches(
        (RegexMatch(1, match_a), RegexMatch(2, match_b))
    )

    assert len(pp.prod) == 2
    assert len(pp.rules) == 2
    assert isinstance(pp.score, float)

    def mock_rule(ts: datetime.datetime, a: Time) -> Time:
        return Time()

    pp2 = pp.apply_rule(
        datetime.datetime(day=1, month=1, year=2015), mock_rule, "mock_rule", (0, 1)
    )

    assert pp != pp2

    with pytest.raises(ValueError):
        PartialParse((), ())
Example #5
Source File: distance.py From panphon with MIT License | 6 votes |
def map_to_dogol_prime(self, s):
    """Map a string to Dogolpolsky' classes

    Args:
        s (unicode): IPA word

    Returns:
        (unicode): word with all segments collapsed to D' classes
    """
    segs = []
    for seg in self.fm.seg_regex.finditer(s):
        fts = self.fm.fts(seg.group(0))
        for mask, label in self.dogol_prime:
            if fts >= mask:
                segs.append(label)
                break
    return ''.join(segs)
Example #6
Source File: __init__.py From date-extractor with Apache License 2.0 | 5 votes |
def getFirstDateFromText(text, debug=False, default_hour=0, default_minute=0, default_second=0, return_precision=False):
    #print("starting getFirstDateFromText")
    global patterns
    for match in regex.finditer(patterns['date_compiled'], text):
        #print("\nmatch is", match.group(0))
        #print("\nmatch.index is", ([item for item in match.groupdict().items() if item[1]]))
        if not isDefinitelyNotDate(match.group(0)):
            match = dict((k, num(v)) for k, v in match.groupdict().items() if num(v))
            return datetime_from_dict(match, debug, default_hour, default_minute, default_second, return_precision)
    #print "finishing getFirstDateFromText"

# the date of a webpage, like a blog or article, will often be the first date mentioned
Example #7
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_emojis(self, node, token_class="emoticon"):
    boundaries = []
    for m in re.finditer(r"\X", node.value.text):
        if m.end() - m.start() > 1:
            if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}\uFE0F]", m.group()):
                boundaries.append((m.start(), m.end(), None))
        else:
            if re.search(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}]", m.group()):
                boundaries.append((m.start(), m.end(), None))
    self._split_on_boundaries(node, boundaries, token_class)
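The pattern r"\X" above matches one extended grapheme cluster per iteration, and \p{...} tests a Unicode property; both are regex-module extensions over the standard re module. A small sketch, independent of the SoMaJo code, of the grapheme-cluster behaviour:

import regex

s = "e\u0301!"  # 'e' plus a combining acute accent, then '!'
clusters = [m.group(0) for m in regex.finditer(r"\X", s)]
print(clusters)          # ['é', '!'] -- base letter and accent form one cluster
print(len(clusters[0]))  # 2 code points, but a single grapheme cluster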
Example #8
Source File: test_types.py From ctparse with MIT License | 5 votes |
def test_init(self):
    m = next(regex.finditer(r"(?P<R1>match me)", "xxx match me xxx"))
    r = RegexMatch(1, m)
    self.assertEqual(r.mstart, 4)
    self.assertEqual(r.mend, 12)
    self.assertEqual(len(r), 8)
    self.assertEqual(r._text, "match me")
    self.assertEqual(repr(r), "RegexMatch[4-12]{1:match me}")
    self.assertEqual(r.nb_str(), "RegexMatch[]{1:match me}")
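The assertions above suggest that RegexMatch exposes the span of the single named group R<id>; the underlying match object provides that span directly. A short sketch of just the regex part (the expected output is inferred from the assertions above):

import regex

m = next(regex.finditer(r"(?P<R1>match me)", "xxx match me xxx"))
print(m.span("R1"))   # (4, 12)
print(m.group("R1"))  # 'match me'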
Example #9
Source File: test_rule.py From ctparse with MIT License | 5 votes |
def test_regex_match(self):
    m = next(regex.finditer("(?P<R1>x)", "x"))
    r = RegexMatch(1, m)
    self.assertTrue(regex_match(1)(r))
    self.assertFalse(regex_match(1)(TestClassA()))
Example #10
Source File: distance.py From panphon with MIT License | 5 votes |
def ftstr2dict(ftstr):
    fts = {}
    for m in re.finditer(r'([-0+])(\w+)', ftstr):
        v, k = m.groups()
        fts[k] = {'-': -1, '0': 0, '+': 1}[v]
    return fts
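A quick usage sketch for ftstr2dict() as defined above; the input format is assumed from the pattern (a sign character followed by a feature name):

print(ftstr2dict("+son -syl 0cor"))
# {'son': 1, 'syl': -1, 'cor': 0}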
Example #11
Source File: _panphon.py From panphon with MIT License | 5 votes |
def segment_text(text, seg_regex=SEG_REGEX):
    """Return an iterator of segments in the text.

    Args:
        text (unicode): string of IPA Unicode text
        seg_regex (_regex.Pattern): compiled regex defining a segment (base + modifiers)

    Return:
        generator: segments in the input text
    """
    for m in seg_regex.finditer(text):
        yield m.group(0)
Example #12
Source File: _panphon.py From panphon with MIT License | 5 votes |
def fts(s):
    """Given string `s` with +/-[alphabetical sequence]s, return list of features.

    Args:
        s (str): string with segments of the sort "+son -syl 0cor"

    Return:
        list: list of (value, feature) tuples
    """
    return [m.groups() for m in FT_REGEX.finditer(s)]
Example #13
Source File: _panphon.py From panphon with MIT License | 5 votes |
def pat(p):
    """Given a string `p` with feature matrices (features grouped with square
    brackets into segments), return a list of sets of (value, feature) tuples.

    Args:
        p (str): list of feature matrices as strings

    Return:
        list: list of sets of (value, feature) tuples
    """
    pattern = []
    for matrix in [m.group(0) for m in MT_REGEX.finditer(p)]:
        segment = set([m.groups() for m in FT_REGEX.finditer(matrix)])
        pattern.append(segment)
    return pattern
Example #14
Source File: _panphon.py From panphon with MIT License | 5 votes |
def filter_string(self, word):
    """Return a string like the input but containing only legal IPA segments

    Args:
        word (unicode): input string to be filtered

    Returns:
        unicode: string identical to `word` but with invalid IPA segments absent
    """
    segs = [m.group(0) for m in self.seg_regex.finditer(word)]
    return ''.join(segs)
Example #15
Source File: segment.py From panphon with MIT License | 5 votes |
def __init__(self, names, features={}, ftstr='', weights=None):
    """Construct a `Segment` object

    Args:
        names (list): ordered list of feature names
        features (dict): name-value pairs for specified features
        ftstr (unicode): a string, each /(+|0|-)\w+/ sequence of which is
            interpreted as a feature specification
        weights (float): ordered list of feature weights/saliences
    """
    self.n2s = {-1: '-', 0: '0', 1: '+'}
    self.s2n = {k: v for (v, k) in self.n2s.items()}
    self.names = names
    """Set a feature specification"""
    self.data = {}
    for name in names:
        if name in features:
            self.data[name] = features[name]
        else:
            self.data[name] = 0
    for m in re.finditer(r'(\+|0|-)(\w+)', ftstr):
        v, k = m.groups()
        self.data[k] = self.s2n[v]
    if weights:
        self.weights = weights
    else:
        self.weights = [1 for _ in names]
Example #16
Source File: event_tagger.py From estnltk with GNU General Public License v2.0 | 5 votes |
def _match(self, text):
    matches = []
    if self.mapping:
        seq = self.map.keys()
    else:
        seq = self.regex_sequence
    for r in seq:
        for matchobj in re.finditer(r, text, overlapped=True):
            groups = matchobj.groupdict()
            result = {
                'start': matchobj.start(),
                'end': matchobj.end(),
                'regex': r,
                'groups': groups
            }
            if self.mapping:
                for k, v in self.map[r].items():
                    if k not in result.keys():
                        result[k] = v
            matches.append(result)
    return matches
Example #17
Source File: run_coqa.py From FlowDelta with MIT License | 5 votes |
def split_with_span(s):
    if s.split() == []:
        return [], []
    else:
        return zip(*[(m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r'\S+', s)])
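A usage sketch for split_with_span() above: it returns the whitespace-separated tokens together with their (start, inclusive end) character offsets, which can be unpacked into two parallel tuples:

tokens, spans = split_with_span("to be or")
print(tokens)  # ('to', 'be', 'or')
print(spans)   # ((0, 1), (3, 4), (6, 7))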
Example #18
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_left(self, regex, node):
    boundaries = []
    prev_end = 0
    for m in regex.finditer(node.value.text):
        boundaries.append((prev_end, m.start(), None))
        prev_end = m.start()
    self._split_on_boundaries(node, boundaries, token_class=None, lock_match=False)
Example #19
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_matches(self, regex, node, token_class="regular", repl=None, split_named_subgroups=True, delete_whitespace=False):
    boundaries = []
    split_groups = split_named_subgroups and len(regex.groupindex) > 0
    group_numbers = sorted(regex.groupindex.values())
    for m in regex.finditer(node.value.text):
        if split_groups:
            for g in group_numbers:
                if m.span(g) != (-1, -1):
                    boundaries.append((m.start(g), m.end(g), None))
        else:
            if repl is None:
                boundaries.append((m.start(), m.end(), None))
            else:
                boundaries.append((m.start(), m.end(), m.expand(repl)))
    self._split_on_boundaries(node, boundaries, token_class, delete_whitespace=delete_whitespace)
Example #20
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_set(self, regex, node, items, token_class="regular", ignore_case=False):
    boundaries = []
    for m in regex.finditer(node.value.text):
        instance = m.group(0)
        if ignore_case:
            instance = instance.lower()
        if instance in items:
            boundaries.append((m.start(), m.end(), None))
    self._split_on_boundaries(node, boundaries, token_class)
Example #21
Source File: tokenizer.py From SoMaJo with GNU General Public License v3.0 | 5 votes |
def _split_abbreviations(self, token_dll, split_multipart_abbrevs=True):
    """Turn instances of abbreviations into tokens."""
    self._split_all_matches(self.single_letter_ellipsis, token_dll, "abbreviation")
    self._split_all_matches(self.and_cetera, token_dll, "abbreviation")
    self._split_all_matches(self.str_abbreviations, token_dll, "abbreviation")
    self._split_all_matches(self.nr_abbreviations, token_dll, "abbreviation")
    self._split_all_matches(self.single_token_abbreviation, token_dll, "abbreviation")
    self._split_all_matches(self.single_letter_abbreviation, token_dll, "abbreviation")
    self._split_all_matches(self.ps, token_dll, "abbreviation")
    for t in token_dll:
        if t.value.markup or t.value._locked:
            continue
        boundaries = []
        for m in self.abbreviation.finditer(t.value.text):
            instance = m.group(0)
            if split_multipart_abbrevs and self.multipart_abbreviation.fullmatch(instance):
                start, end = m.span(0)
                s = start
                for i, c in enumerate(instance, start=1):
                    if c == ".":
                        boundaries.append((s, start + i, None))
                        s = start + i
            else:
                boundaries.append((m.start(), m.end(), None))
        self._split_on_boundaries(t, boundaries, "abbreviation")
Example #22
Source File: strtools.py From extratools with MIT License | 5 votes |
def extract(s: str, entities: Iterable[str], useregex=False, ignorecase=True) -> Iterable[str]:
    for m in re.compile(
            r"\b(?:{})\b".format(r"|".join(
                # literal entities are escaped; spaces in them match any run of whitespace
                e if useregex else re.escape(e).replace(' ', r"\s+") for e in entities
            )),
            re.I if ignorecase else 0
    ).finditer(s):
        yield m.group(0)
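A usage sketch for extract() above, using single-word entities so only the word-boundary and case-insensitivity behaviour is exercised:

print(list(extract("Regex and re are both regex engines.", ["regex", "re"])))
# ['Regex', 're', 'regex']  -- ignorecase=True is the default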
Example #23
Source File: strtools.py From extratools with MIT License | 5 votes |
def __findeqtagpairspans(
        s: str, tag: str, useregex: bool = False
) -> Iterable[Tuple[Tuple[int, int], ...]]:
    for match in re.finditer(
            r"(?P<__open>{})(?P<__content>.*?)(?P<__close>\1)".format(tag if useregex else re.escape(tag)),
            s
    ):
        yield (match.span("__open"), match.span("__content"), match.span("__close"))
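A usage sketch for the module-private __findeqtagpairspans() above, with a symmetric delimiter; it yields the spans of the opening tag, the enclosed content, and the closing tag:

print(list(__findeqtagpairspans("a **b** c", "**")))
# [((2, 4), (4, 5), (5, 7))]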
Example #24
Source File: strtools.py From extratools with MIT License | 5 votes |
def __findtagpairspans(
        s: str, tag: str, closetag: Optional[str] = None, useregex: bool = False
) -> Iterable[Tuple[Tuple[int, int], ...]]:
    if closetag is None or tag == closetag:
        yield from __findeqtagpairspans(s, tag, useregex=useregex)
        return

    if not useregex:
        tag = re.escape(tag)
        closetag = re.escape(closetag)

    retags = re.compile(r"(?P<__open>{})|(?P<__close>{})".format(tag, closetag))

    startspans = []
    for match in retags.finditer(s):
        opengroup = match.group("__open")
        if opengroup:
            startspans.append(match.span())
            continue

        closegroup = match.group("__close")
        if closegroup and startspans:
            startspan = startspans.pop()
            endspan = match.span()
            yield (startspan, (startspan[1], endspan[0]), endspan)