Python nltk.tokenize.WordPunctTokenizer() Examples
The following are 25 code examples of nltk.tokenize.WordPunctTokenizer(). The original project and source file for each example are noted above it. You may also want to check out all other available functions and classes of the nltk.tokenize module.
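As a quick, minimal illustration (not drawn from any of the projects below), WordPunctTokenizer splits text into alternating runs of alphanumeric and non-alphanumeric characters using the regular expression \w+|[^\w\s]+, so punctuation and apostrophes become separate tokens:

from nltk.tokenize import WordPunctTokenizer

# Punctuation is split off, and contractions are broken at the apostrophe.
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize("Good muffins cost $3.88 in New York. Don't they?"))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
#  '.', 'Don', "'", 't', 'they', '?']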
Example #1
Source File: reader.py From atap with Apache License 2.0
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)

    self._word_tokenizer = WordPunctTokenizer()
    self._sent_tokenizer = nltk.data.LazyLoader(
        'tokenizers/punkt/english.pickle')
Example #2
Source File: xmldocs.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in text nodes -- ie, tags are ignored. Like the xml() method,
    fileid can only specify one file.

    :return: the given file's text nodes as a list of words and punctuation symbols
    :rtype: list(str)
    """
    elt = self.xml(fileid)
    encoding = self.encoding(fileid)
    word_tokenizer = WordPunctTokenizer()
    iterator = elt.getiterator()
    out = []

    for node in iterator:
        text = node.text
        if text is not None:
            if isinstance(text, bytes):
                text = text.decode(encoding)
            toks = word_tokenizer.tokenize(text)
            out.extend(toks)

    return out
Example #3
Source File: WGGraph.py From AbTextSumm with Mozilla Public License 2.0
def getredundantComponents(sentences):
    window_size = 4
    introList = []
    midlist = []
    endlist = []

    for sent in sentences:
        words = WordPunctTokenizer().tokenize(sent)
        length_sent = len(words)

        f_point = (length_sent) // 3
        m_point = (length_sent) // 2
        index_span = window_size // 2

        intro = ' '.join(word for word in words[0:window_size])
        mid = ' '.join(word for word in words[m_point - index_span:m_point + index_span])
        end = ' '.join(word for word in words[-window_size:])

        introList.append(intro)
        midlist.append(mid)
        endlist.append(end)

    return introList, midlist, endlist
Example #4
Source File: sw_loader.py From texar with Apache License 2.0
def __getitem__(self, idx):
    idx, start, end = self.lst[idx]
    dialog = self.raw[idx][start:end]
    source, target = dialog[:-1], dialog[-1]
    spks, utts = list(zip(*[(speaker, WordPunctTokenizer().tokenize(uttr))
                            for speaker, uttr, _ in source]))
    spks = list(spks)
    while len(spks) < 10:
        spks.append(0)
    source = '|||'.join([' '.join(uttr) for uttr in utts])
    target_test = ' '.join(WordPunctTokenizer().tokenize(target[1]))
    return spks, source, target_test, target[0]
Example #5
Source File: xmldocs.py From luscan-devel with GNU General Public License v2.0
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in text nodes -- ie, tags are ignored. Like the xml() method,
    fileid can only specify one file.

    :return: the given file's text nodes as a list of words and punctuation symbols
    :rtype: list(str)
    """
    elt = self.xml(fileid)
    word_tokenizer = WordPunctTokenizer()
    iterator = elt.getiterator()
    out = []

    for node in iterator:
        text = node.text
        if text is not None:
            toks = word_tokenizer.tokenize(text)
            out.extend(toks)

    return out
Example #6
Source File: reader.py From atap with Apache License 2.0
def __init__(self, root, fileids=DOC_PATTERN,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='latin-1', **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._tags = TAGS
Example #7
Source File: reader.py From atap with Apache License 2.0
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)

    self._word_tokenizer = WordPunctTokenizer()
    self._sent_tokenizer = nltk.data.LazyLoader(
        'tokenizers/punkt/english.pickle')
Example #8
Source File: xmldocs.py From razzy-spinner with GNU General Public License v3.0
def words(self, fileid=None):
    """
    Returns all of the words and punctuation symbols in the specified file
    that were in text nodes -- ie, tags are ignored. Like the xml() method,
    fileid can only specify one file.

    :return: the given file's text nodes as a list of words and punctuation symbols
    :rtype: list(str)
    """
    elt = self.xml(fileid)
    encoding = self.encoding(fileid)
    word_tokenizer = WordPunctTokenizer()
    iterator = elt.getiterator()
    out = []

    for node in iterator:
        text = node.text
        if text is not None:
            if isinstance(text, bytes):
                text = text.decode(encoding)
            toks = word_tokenizer.tokenize(text)
            out.extend(toks)

    return out
Example #9
Source File: reader.py From atap with Apache License 2.0
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)

    self._word_tokenizer = WordPunctTokenizer()
    self._sent_tokenizer = nltk.data.LazyLoader(
        'tokenizers/punkt/english.pickle')
Example #10
Source File: parse_utils.py From deep-mlsa with Apache License 2.0
def __init__(self):
    self.tokenizers = {
        'en': TweetTokenizer(),
        'de': WordPunctTokenizer(),
        'it': WordPunctTokenizer(),
        'fr': WordPunctTokenizer(),
        'default': WordPunctTokenizer()
    }

    self.tokenizer = TweetTokenizer()
Example #11
Source File: document.py From magpie with MIT License
def compute_wordset(self):
    tokens = WordPunctTokenizer().tokenize(self.text)
    lowercase = [t.lower() for t in tokens]
    return set(lowercase) - {',', '.', '!', ';', ':', '-', '', None}
Example #12
Source File: sent_analysis.py From kryptoflow with GNU General Public License v3.0
def __init__(self):
    self._sent_analyzer = SIA()
    self._word_tokenizer = WordPunctTokenizer().tokenize
    self._sent_tokenizer = nltk.data.LazyLoader(
        'tokenizers/punkt/english.pickle'
    ).tokenize
    self._ids = []
Example #13
Source File: pipeline.py From RE-NLG-Dataset with MIT License
def __get_words_boundaries(self):
    """
    function to tokenize words in the document and return word boundaries
    of each sentence using a tokenizer.
    :return:
    """
    tokenizer = WordPunctTokenizer()
    words = list(tokenizer.span_tokenize(self.text))
    return words
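For comparison with tokenize(), span_tokenize() yields (start, end) character offsets into the original string rather than the token strings themselves. A minimal check (not part of the RE-NLG-Dataset code, shown only to illustrate the return value):

from nltk.tokenize import WordPunctTokenizer

# Offsets index into the original string: text[start:end] recovers each token.
text = "Hello, world!"
print(list(WordPunctTokenizer().span_tokenize(text)))
# [(0, 5), (5, 6), (7, 12), (12, 13)]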
Example #14
Source File: TfidfRetriever.py From SOQAL with MIT License
def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
    self.k = k  # number of documents to return
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.docs = docs
    self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None)
    if tfidf_matrix is None or vectorizer is None:
        self.tfidf_matrix = self.vectorizer.fit_transform(docs)
    else:
        self.vectorizer = vectorizer
        self.tfidf_matrix = tfidf_matrix
Example #15
Source File: TfidfRetriever.py From SOQAL with MIT License
def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
    self.k = k  # number of documents to return
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.docs = docs
    self.stopwords = stopwords.words('arabic')
    self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None,
                                      stop_words=self.stopwords)
    if tfidf_matrix is None or vectorizer is None:
        docs_stemmed = self.docs_stem()
        self.tfidf_matrix = self.vectorizer.fit_transform(docs_stemmed)
    else:
        self.vectorizer = vectorizer
        self.tfidf_matrix = tfidf_matrix
Example #16
Source File: slidingwindow_distance.py From SOQAL with MIT License
def __init__(self):
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
Example #17
Source File: tokenizer.py From cotk with Apache License 2.0
def __init__(self, method: str, special_tokens: List[str] = None):
    self.method = method
    self.special_tokens = special_tokens

    if method == "nltk":
        self._callable_tokenizer = WordPunctTokenizer().tokenize
    elif method == "space":
        self._callable_tokenizer = str.split
    else:
        raise ValueError('`method` is invalid value {}, should be "nltk" or "space"'.format(method))

    self._setting_hash = hashlib.sha256(dumps(["adapter", method, special_tokens])).hexdigest()
Example #18
Source File: _preprocessor.py From sumpy with Apache License 2.0
def build(self):
    if not hasattr(self, "_word_tokenizer"):
        self._word_tokenizer = None
    if self._word_tokenizer is None:
        self._word_tokenizer = WordPunctTokenizer().tokenize
Example #19
Source File: preprocessor.py From sumpy with Apache License 2.0
def build_word_tokenizer(self):
    """Return a function that splits a string into a sequence of words."""
    if self._word_tokenizer is not None:
        tokenize = self._word_tokenizer
    else:
        tokenize = WordPunctTokenizer().tokenize
    return tokenize
Example #20
Source File: preprocessor.py From sumpy with Apache License 2.0
def _build_word_tokenizer(self):
    """Return a function that splits a string into a sequence of words."""
    if self._word_tokenizer is not None:
        tokenize = self._word_tokenizer
    else:
        tokenize = WordPunctTokenizer().tokenize
    return tokenize
Example #21
Source File: tfidf_reader.py From SOQAL with MIT License
def __init__(self, P):
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.docs = self.get_answer_canditates(P)
    docs_stem = []
    for doc in self.docs:
        docs_stem.append(self.stem_string(doc))
    self.stopwords = stopwords.words('arabic')
    self.vectorizer = TfidfVectorizer(ngram_range=(1, 4), norm=None)  # , stop_words=self.stopwords)
    self.tfidf_matrix = self.vectorizer.fit_transform(docs_stem)
Example #22
Source File: fasttext_embedding.py From SOQAL with MIT License
def __init__(self, model_path):
    self.model_path = model_path
    print("loading fastText model ...")
    # self.model = pickle.load(open(self.model_path, "rb"))
    self.model = KeyedVectors.load_word2vec_format(self.model_path, encoding='utf-8',
                                                   unicode_errors='ignore')
    print("done loading fastText model")
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
    self.vocab = self.model.vocab
Example #23
Source File: preprocess_aclImdb_v1.py From lambda-deep-learning-demo with Apache License 2.0
def data_tockenize(text):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    return (" ".join(tokens)).strip()
Example #24
Source File: sw_loader.py From texar with Apache License 2.0
def __init__(self, jsonl_path, mode=None):
    self.mode = mode
    self.raw = []
    self.lst = []
    self.refs = []

    if mode == 'test':
        lst = json.load(open(jsonl_path, 'r'))
        for item in lst:
            context = item['context']
            dialog = []
            for utts in context:
                p = utts.find(':')
                dialog.append((
                    (utts[p - 1] == 'A') * 2 - 1,
                    utts[p + 2:-1], 0))
            if dialog[0][1][-1] == '>':
                dialog = dialog[1:]
            if len(dialog) == 0:
                continue

            responses = []
            for resp in item['responses']:
                responses.append(resp)

            spk = (item['speaker'] == 'A') * 2 - 1
            dialog.append((spk, responses[0], 0))
            responses = responses[1:]
            responses = [' '.join(WordPunctTokenizer().tokenize(resp))
                         for resp in responses]
            if len(responses) == 0:
                continue

            self.raw.append(dialog)
            self.lst.append((len(self.raw) - 1, 0, len(dialog)))
            self.refs.append(responses)
        return

    from collections import Counter
    self.ct = Counter()
    self.topics = []
    with open(jsonl_path, 'r') as f:
        for idx, item in enumerate(reader(f)):
            utts = item['utts']
            self.topics.append(item['topic'])
            self.raw.append([(int(speaker == 'A') * 2 - 1, sentence, _)
                             for speaker, sentence, _ in utts])
            lst = [(idx, start, start + wnd_sz)
                   for start in range(0, len(utts) - wnd_sz)] + \
                  [(idx, 0, end) for end in range(2, min(wnd_sz + 1, len(utts)))]
            self.lst += lst

    self.refs = [['none']] * len(self.lst)
Example #25
Source File: sw_loader.py From texar with Apache License 2.0
def generate_reference_for_test_dialog(dataset, data_root):
    vocab = {}
    with open(os.path.join(data_root, 'vocab.txt'), 'r') as f:
        p = f.read().splitlines()
        for i, x in enumerate(p):
            vocab[x] = i

    dts_train = dataset['train']
    dts_val = dataset['val']
    dts_test = dataset['test']
    vectorizer = TfidfVectorizer(tokenizer=WordPunctTokenizer().tokenize,
                                 vocabulary=vocab)

    saved = []
    meta = []
    data = []
    tidx = {}
    for i in range(len(dts_test)):
        topic, cct, source, target = dts_test.get(i)
        meta.append((topic, cct, target))
        data.append(source)
    for i in range(len(dts_train)):
        topic, cct, source, target = dts_train.get(i)
        saved.append((topic, cct, target))
        data.append(source)
        if topic not in tidx:
            tidx[topic] = []
        tidx[topic].append(i)

    result = vectorizer.fit_transform(data)
    x = result[:len(dts_test)]
    y = result[len(dts_test):]

    from tqdm import tqdm
    from sklearn.preprocessing import normalize
    y = normalize(y)
    x = normalize(x)

    dts_test.refs = []
    for i in tqdm(range(len(dts_test))):
        c = tidx[meta[i][0]]
        p = (y * x[i].T).toarray().reshape(-1)[c]
        d = p.argsort()
        cnt = 0
        refs = []
        for a in d[::-1]:
            if saved[a][1] == meta[i][1]:
                refs.append(' '.join(
                    WordPunctTokenizer().tokenize(saved[a][2][1])))
                cnt += 1
                if cnt == 10:
                    break
        dts_test.refs.append(refs)