Python nltk.stem.lancaster.LancasterStemmer() Examples
The following are 7 code examples of nltk.stem.lancaster.LancasterStemmer(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.stem.lancaster.
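Before looking at the project code, here is a minimal standalone sketch of the stemmer itself; the sample words are chosen only for illustration. Create a LancasterStemmer and call its stem() method on each word.

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()

# The Lancaster algorithm is aggressive; for instance "maximum" is reduced
# to "maxim" and "running" to "run".
for word in ["running", "maximum", "presumably"]:
    print(word, "->", stemmer.stem(word))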
Example #1
Source File: Data.py From NLU with MIT License
def __init__(self):

    ###############################################################
    #
    # Sets up all default requirements and placeholders
    # needed for the NLU engine to run.
    #
    # - Helpers: Useful global functions
    # - Logging: Logging class
    # - LancasterStemmer: Word stemmer
    #
    ###############################################################

    self.ignore = [',', '.', '!', '?']
    self.Helpers = Helpers()
    self._confs = self.Helpers.loadConfigs()
    self.LogFile = self.Helpers.setLogFile(self._confs["aiCore"]["Logs"] + "JumpWay/")
    self.LancasterStemmer = LancasterStemmer()
Example #2
Source File: Users.py From NLU with MIT License
def __init__(self, Logging, LogFile):

    self.LancasterStemmer = LancasterStemmer()
    self.Logging = Logging
    self.LogFile = LogFile
    self.ignore = ['?', '!']

    self.Logging.logMessage(
        self.LogFile,
        "Data",
        "INFO",
        "Data Helper Ready")
Example #3
Source File: matcher.py From text-matcher with GNU General Public License v3.0
def getTokens(self, removeStopwords=True):
    """ Tokenizes the text, breaking it up into words, removing punctuation. """
    tokenizer = nltk.RegexpTokenizer('[a-zA-Z]\w+\'?\w*')  # A custom regex tokenizer.
    spans = list(tokenizer.span_tokenize(self.text))
    # Take note of how many spans there are in the text
    self.length = spans[-1][-1]
    tokens = tokenizer.tokenize(self.text)
    tokens = [token.lower() for token in tokens]  # make them lowercase
    stemmer = LancasterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    if not removeStopwords:
        self.spans = spans
        return tokens
    tokenSpans = list(zip(tokens, spans))  # zip it up
    stopwords = nltk.corpus.stopwords.words('english')  # get stopwords
    tokenSpans = [token for token in tokenSpans if token[0] not in stopwords]  # remove stopwords from zip
    self.spans = [x[1] for x in tokenSpans]  # unzip; get spans
    return [x[0] for x in tokenSpans]  # unzip; get tokens
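The pipeline above (tokenize with a regex, lowercase, stem with Lancaster, then drop stopwords) can also be tried outside the matcher class. The following is a simplified, self-contained sketch rather than the project's own API; the function name and sample sentence are invented, and it assumes the NLTK stopwords corpus has been downloaded via nltk.download('stopwords').

import nltk
from nltk.stem.lancaster import LancasterStemmer

def stem_tokens(text, remove_stopwords=True):
    # Same regex idea as the matcher example: a letter followed by word characters.
    tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*")
    stemmer = LancasterStemmer()
    # Lowercase and stem every token.
    tokens = [stemmer.stem(token.lower()) for token in tokenizer.tokenize(text)]
    if remove_stopwords:
        # As in the example, stopwords are filtered after stemming.
        stop = set(nltk.corpus.stopwords.words('english'))
        tokens = [t for t in tokens if t not in stop]
    return tokens

print(stem_tokens("The matcher stems every remaining token with the Lancaster stemmer."))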
Example #4
Source File: Data.py From NLU with MIT License
def extract(self, data=None, splitIt=False):

    ###############################################################
    #
    # Extracts words from sentences, stripping out characters in
    # the ignore list above
    #
    # https://www.nltk.org/_modules/nltk/stem/lancaster.html
    # http://insightsbot.com/blog/R8fu5/bag-of-words-algorithm-in-python-introduction
    #
    ###############################################################

    return [self.LancasterStemmer.stem(word)
            for word in (data.split() if splitIt == True else data)
            if word not in self.ignore]
Example #5
Source File: Users.py From NLU with MIT License
def extract(self, data=None, lowerIt=True, splitIt=False, ignoreWords=False):

    if ignoreWords:
        return [self.LancasterStemmer.stem(word if lowerIt == False else word.lower())
                for word in (data.split() if splitIt == True else data)
                if word not in self.ignore]
    else:
        return [self.LancasterStemmer.stem(word if lowerIt == False else word.lower())
                for word in (data.split() if splitIt == True else data)]
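Examples #4 and #5 both reduce to a single list comprehension: optionally split the input, optionally lowercase each word, skip anything in the ignore list, and stem what remains. Below is a self-contained sketch of that pattern; the class name, ignore list and sample sentence are assumptions made for illustration, not part of the NLU project.

from nltk.stem.lancaster import LancasterStemmer

class WordExtractor:
    def __init__(self):
        self.stemmer = LancasterStemmer()
        self.ignore = [',', '.', '!', '?']

    def extract(self, data, lower_it=True, split_it=False):
        # Split only when asked to, mirroring the splitIt flag above.
        words = data.split() if split_it else data
        return [self.stemmer.stem(word.lower() if lower_it else word)
                for word in words if word not in self.ignore]

extractor = WordExtractor()
print(extractor.extract("Hello , how are you today ?", split_it=True))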
Example #6
Source File: Mitie.py From NLU with MIT License
def __init__(self):

    ###############################################################
    #
    # Sets up all default requirements
    #
    # - Helpers: Useful global functions
    # - LancasterStemmer: Word stemmer
    #
    ###############################################################

    self.Helpers = Helpers()
    self._confs = self.Helpers.loadConfigs()
    self.stemmer = LancasterStemmer()
Example #7
Source File: adversarial_squad.py From adversarial-squad with MIT License
def get_vocabularies(dataset, vocab_file, nearby_file):
    """Create map from example ID to (basic_words, nearby_words)."""
    with open(vocab_file) as f:
        basic_vocab = [line.strip() for line in f]
    with open(nearby_file) as f:
        nearby_words = json.load(f)
    stemmer = LancasterStemmer()
    vocabs = {}
    for a in dataset['data']:
        for p in a['paragraphs']:
            for q in p['qas']:
                q_words = [w.lower() for w in word_tokenize(q['question'])]
                if OPTS.mode == 'basic':
                    vocabs[q['id']] = (basic_vocab, [])
                elif OPTS.mode == 'add-question-words':
                    vocabs[q['id']] = (basic_vocab, q_words)
                elif OPTS.mode.endswith('-nearby'):
                    q_stems = [stemmer.stem(qw) for qw in q_words]
                    cur_vocab = [w for w in basic_vocab if w not in q_stems]
                    cur_nearby = []
                    for q_word, q_stem in zip(q_words, q_stems):
                        if q_word in nearby_words:
                            qw_nearby = []
                            for nearby_word in nearby_words[q_word]:
                                if len(qw_nearby) == OPTS.num_nearby:
                                    break
                                if nearby_word['word'] in PUNCTUATION:
                                    continue
                                nearby_stem = stemmer.stem(nearby_word['word'])
                                if nearby_stem != q_stem:
                                    qw_nearby.append(nearby_word['word'])
                            cur_nearby.extend(qw_nearby)
                    vocabs[q['id']] = (cur_vocab, cur_nearby)
    return vocabs
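The LancasterStemmer detail worth noting in this example is the stem comparison: a nearby word is only kept when its stem differs from the question word's stem, which filters out simple inflectional variants. Below is a minimal sketch of that check; the function name and sample words are assumptions for illustration only.

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()

def is_inflectional_variant(question_word, nearby_word):
    # True when both words reduce to the same Lancaster stem,
    # i.e. the nearby word adds nothing new to the vocabulary.
    return stemmer.stem(question_word) == stemmer.stem(nearby_word)

print(is_inflectional_variant("running", "runs"))    # both stem to "run" -> True
print(is_inflectional_variant("running", "walked"))  # different stems -> False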