Python nltk.download() Examples
The following are 30 code examples of nltk.download(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the nltk module, or try the search function.
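Most of the examples below follow the same basic pattern: check whether an NLTK resource is already installed, and call nltk.download() only when NLTK raises a LookupError. The snippet below is a minimal sketch of that pattern; the helper name ensure_punkt and the choice of the 'punkt' tokenizer are only illustrations, not code from any of the projects listed here.

import nltk

def ensure_punkt():
    """Download the 'punkt' tokenizer data only if it is missing, then load it."""
    try:
        # Raises LookupError if the resource is not on nltk.data.path
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        # quiet=True suppresses the interactive download output
        nltk.download('punkt', quiet=True)
    return nltk.data.load('tokenizers/punkt/english.pickle')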
Example #1
Source File: treebank_encoder.py From PyTorch-NLP with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, *args, **kwargs):
    if 'tokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')

    if 'detokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')

    try:
        import nltk

        # Required for moses
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')

        from nltk.tokenize.treebank import TreebankWordTokenizer
        from nltk.tokenize.treebank import TreebankWordDetokenizer
    except ImportError:
        print("Please install NLTK. "
              "See the docs at http://nltk.org for more information.")
        raise

    super().__init__(
        *args,
        tokenize=TreebankWordTokenizer().tokenize,
        detokenize=TreebankWordDetokenizer().detokenize,
        **kwargs)
Example #2
Source File: ngram_featurizer.py From metal with Apache License 2.0 | 6 votes |
def __init__(
    self,
    anonymize=True,
    trim_window=5,
    lowercase=True,
    drop_stopwords=True,
    stem=True,
    ngram_range=(1, 3),
    **vectorizer_kwargs,
):
    self.anonymize = anonymize
    self.lowercase = lowercase
    self.drop_stopwords = drop_stopwords
    if drop_stopwords:
        nltk.download("stopwords")
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
    self.trim_window = trim_window
    self.stem = stem
    if stem:
        self.porter = nltk.PorterStemmer()

    self.vectorizer = CountVectorizer(
        ngram_range=ngram_range, binary=True, **vectorizer_kwargs
    )
Example #3
Source File: res_sen2vec.py From resilient-community-apps with MIT License | 6 votes |
def __init__(self, w2v, sif, log=None):
    # A NLPWord2Vec to get the vec for a word
    self.word2vec = w2v
    # A ResSIF used to get word count
    self.sif = sif
    # util to pre-process data
    self.utils = WordSentenceUtils()
    self.log = log if log else logging.getLogger(__name__)
    self.sentence_vectors = []
    self.feature_size = 0
    # download nltk resource if necessary
    nltk.download('words', quiet=True)
    self.setofwords = set(nltk_words.words())
    # pca vector
    self.pca_u = []
Example #4
Source File: __init__.py From senpy with Apache License 2.0 | 6 votes |
def install_deps(*plugins):
    installed = False
    nltk_resources = set()
    requirements = []
    for info in plugins:
        requirements = info.get('requirements', [])
        if requirements:
            requirements += missing_requirements(requirements)
        nltk_resources |= set(info.get('nltk_resources', []))
    if requirements:
        logger.info('Installing requirements: ' + str(requirements))
        pip_args = [sys.executable, '-m', 'pip', 'install']
        for req in requirements:
            pip_args.append(req)
        process = subprocess.Popen(
            pip_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        _log_subprocess_output(process)
        exitcode = process.wait()
        installed = True
        if exitcode != 0:
            raise models.Error(
                "Dependencies not properly installed: {}".format(pip_args))
    installed |= download(list(nltk_resources))
    return installed
Example #5
Source File: install.py From metadoc with MIT License | 6 votes |
def install_nltk_sets():
    DATA_DIR = os.path.join(os.path.dirname(__file__), "extract/data")
    REQUIRED_CORPORA = [
        'brown',  # Required for FastNPExtractor
        'punkt',  # Required for WordTokenizer
        'wordnet',  # Required for lemmatization and Wordnet
        'maxent_ne_chunker',
        'stopwords',
        'words'
    ]
    for each in REQUIRED_CORPORA:
        print(('[+] Downloading corpus: "{0}"'.format(each)))
        nltk.download(each, download_dir=DATA_DIR)

    from metadoc.extract.pos import do_train
    print('[+] Training tagger now.')
    do_train()
    remove_zips(DATA_DIR)
    return
Example #6
Source File: tokenizer.py From prenlp with Apache License 2.0 | 6 votes |
def __init__(self):
    try:
        from nltk.tokenize.moses import MosesTokenizer
    except Exception as ex:
        import nltk
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')
        # retry the import once the required data has been downloaded
        from nltk.tokenize.moses import MosesTokenizer
    self.tokenizer = MosesTokenizer()
Example #7
Source File: utils.py From essaysense with MIT License | 6 votes |
def __init__(self, hyperparameters, lookup_table):
    """Constructor for initializing ASAP-AES datasets.

    Args:
        - hyperparameters: hyperparameters of the experiments.
        - lookup_table: word embedding lookup table, which should be
              a dict mapping words into their NumPy vector representation.
    """
    # This constructor tries to detect or download NLTK's tokenizer
    # automatically.
    try:
        self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        nltk.download("punkt")
        self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Also load hyperparameters and lookup table.
    self.lookup_table = lookup_table
    self.hp = hyperparameters
Example #8
Source File: utils.py From Fox-V3 with GNU Affero General Public License v3.0 | 6 votes |
def remove_stopwords(tokens, language):
    """
    Takes a language (i.e. 'english'), and a set of word tokens.
    Returns the tokenized text with any stopwords removed.
    Stop words are words like "is, the, a, ..."

    Be sure to download the required NLTK corpus before calling this function:
    - from chatterbot.utils import nltk_download_corpus
    - nltk_download_corpus('corpora/stopwords')
    """
    from nltk.corpus import stopwords

    # Get the stopwords for the specified language
    stop_words = stopwords.words(language)

    # Remove the stop words from the set of word tokens
    tokens = set(tokens) - set(stop_words)

    return tokens
Example #9
Source File: test_corpus.py From cltk with MIT License | 6 votes |
def setUpClass(self):
    try:
        corpus_importer = CorpusImporter("latin")
        corpus_importer.import_corpus("latin_models_cltk")
        corpus_importer.import_corpus("latin_text_latin_library")
    except:
        raise Exception("Failure to download test corpus")
    self.reader = get_corpus_reader(
        language="latin", corpus_name="latin_text_latin_library"
    )
    self.reader._fileids = ["pervig.txt"]
    # Need an additional instance because tests below change internals  # TODO: fix
    self.reader_2 = get_corpus_reader(
        language="latin", corpus_name="latin_text_latin_library"
    )
    self.reader_3 = get_corpus_reader(
        language="latin", corpus_name="latin_text_latin_library"
    )
    self.reader_4 = get_corpus_reader(
        language="latin", corpus_name="latin_text_latin_library"
    )
Example #10
Source File: NewsAutosummarize.py From Python-Scripts-Repo-on-Data-Science with GNU General Public License v3.0 | 6 votes |
def get_only_text_washingtonpost_url(url):
    # This function takes the URL as an argument and returns only
    # the raw text of the page. It works specifically for Washington Post
    # articles, because we know the structure of the pages.
    page = urllib.urlopen(url).read().decode('utf8')  # we download the URL
    # initialize a BeautifulSoup object with the page we downloaded
    soup = BeautifulSoup(page)
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    # the above gets everything between a pair of HTML tags
    # that look a certain way, e.g. <article> stuff </article>;
    # this format is specific to the Washington Post
    soup2 = BeautifulSoup(text)
    # find all the paragraph tags <p>
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text


#######################################################################
# TEST
#######################################################################
Example #11
Source File: word.py From flambe with MIT License | 6 votes |
def __init__(self, ngrams: Union[int, List[int]] = 1,
             exclude_stopwords: bool = False,
             stop_words: Optional[List] = None) -> None:
    """Initialize the NGramsTokenizer

    Parameters
    ----------
    ngrams : Union[int, List[int]], optional
        [description], by default 1
    exclude_stopwords: bool
        [description], by default False
    stop_words: Optional[List]
        [description], by default None

    """
    self.ngrams = ngrams
    self.exclude_stopwords = exclude_stopwords

    if self.exclude_stopwords:
        self.stop_words = stop_words
        if self.stop_words is None:
            nltk.download('stopwords', quiet=True)
            self.stop_words = stopwords.words('english')

    nltk.download('punkt', quiet=True)
Example #12
Source File: language_util.py From talk-generator with MIT License | 6 votes |
def print_corpus_download_warning():
    corpus_warning = """
    Hmm...
    ---------------------

    We had some trouble downloading the NLTK corpora.
    Try running the following from a command line; it should
    download the needed packages, but it might also tell you if
    there is another issue:

    $ python3 -m nltk.downloader punkt averaged_perceptron_tagger
    """
    logger.warning(corpus_warning)


# Helpers
Example #13
Source File: agents.py From ParlAI with MIT License | 6 votes |
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer.
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok
Example #14
Source File: wordnet.py From gobbli with Apache License 2.0 | 6 votes |
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
    try:
        from nltk.corpus import wordnet
        import nltk
    except ImportError:
        raise ImportError(
            "WordNet-based data augmentation requires nltk to be installed."
        )

    self.wn = wordnet

    try:
        import spacy
        from spacy.tokens import Token
    except ImportError:
        raise ImportError(
            "WordNet-based data augmentation requires spaCy and a language "
            "model to be installed (for part of speech tagging)."
        )

    if not skip_download_check:
        nltk.download("wordnet")

    self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
    Token.set_extension("replacement", default=None, force=True)
Example #15
Source File: download.py From textkit with MIT License | 6 votes |
def download():
    '''
    Install required libraries.
    Note this library will install nltk dependencies into your
    user directory.
    '''
    click.echo("Installing nltk packages into your user directories in " +
               "the following order of existence (first found):\n" +
               '\n'.join(nltk.data.path))

    extensions = [("taggers", "averaged_perceptron_tagger"),
                  ("corpora", "wordnet"),
                  ("tokenizers", "punkt")]
    missing = check_packages_exist(extensions)

    for ext_tuple in missing:
        nltk.download(ext_tuple[1])
Example #16
Source File: nlp.py From Quora with MIT License | 6 votes |
def lemmatize(tokens):
    """
    lemmatize tokens
    """
    try:
        wnl = nltk.WordNetLemmatizer()
    except LookupError:
        nltk.download('wordnet')
        wnl = nltk.WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in tokens]
Example #17
Source File: agents.py From neural_chat with MIT License | 6 votes |
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer.
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok
Example #18
Source File: nlp.py From twip with MIT License | 6 votes |
def nltk_download(name, ignore_errors=True):
    r"""Like nltk.download, but be quiet about it, and get a room (separate python process)

    Does some simple whitespace normalization on `name`, but doesn't yet do fuzzy matching.
    Caches the normalized names of packages already attempted, so they aren't re-tried.

    >>> nltk_download('nonexistent dataset name', ignore_errors=True)
    False
    >>> nltk_download('WordNet', ignore_errors=True)
    True
    >>> nltk_download('wordnet', ignore_errors=True)
    True
    """
    name = re.sub(r"[-\s=+']+", '_', name.lower())
    if name in nltk_download.done:
        return nltk_download.done[name]
    proc = subprocess.Popen(
        ["python", "-c", "import nltk; nltk.download('{}')".format(name)],
        stdout=subprocess.PIPE)
    msgs = [s for s in proc.communicate() if s is not None]
    if any(re.match(r'^\[nltk_data\]\s+Error', msg, flags=re.IGNORECASE) for msg in msgs):
        nltk_download.done[name] = False
        if ignore_errors:
            return nltk_download.done[name]
        raise ValueError('Unable to download the requested NLTK dataset: {}'.format('\n'.join(msgs)))
    nltk_download.done[name] = True
    return nltk_download.done[name]
Example #19
Source File: transformation_functions.py From sparklingml with Apache License 2.0 | 6 votes |
def get(self, lang):
    if lang not in self._spacys:
        import spacy
        # Hack to dynamically download languages on cluster machines,
        # you can remove if you have the models installed and just do:
        # cls._spacys[lang] = spacy.load(lang)
        try:
            old_exit = sys.exit
            sys.exit = None
            try:
                self._spacys[lang] = spacy.load(lang)
            except Exception:
                spacy.cli.download(lang)
                self._spacys[lang] = spacy.load(lang)
        except Exception as e:
            raise Exception(
                "Failed to find or download language {0}: {1}"
                .format(lang, e))
        finally:
            sys.exit = old_exit

    return self._spacys[lang]
Example #20
Source File: test_text_filters.py From pliers with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_token_removal_filter():
    stim = TextStim(text='this is not a very long sentence')
    filt = TokenRemovalFilter()
    assert filt.transform(stim).text == 'long sentence'

    filt2 = TokenRemovalFilter(tokens=['a', 'the', 'is'])
    assert filt2.transform(stim).text == 'this not very long sentence'

    stim2 = TextStim(text='More. is Real, sentence that\'ll work')
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    from nltk.corpus import stopwords
    tokens = set(stopwords.words('english')) | set(string.punctuation)
    filt3 = TokenRemovalFilter(tokens=tokens)
    assert filt3.transform(stim2).text == 'More Real sentence \'ll work'
Example #21
Source File: nlp.py From Quora with MIT License | 5 votes |
def remove_stopwords(tokens):
    """
    remove stopwords from tokens
    """
    try:
        stopwords = nltk.corpus.stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stopwords = nltk.corpus.stopwords.words('english')
    return [t for t in tokens if t.lower() not in stopwords]
Example #22
Source File: bidaf_model_runtime.py From botbuilder-python with MIT License | 5 votes |
def init_bidaf(bidaf_model_dir: str, download_ntlk_punkt: bool = False) -> bool:
    if os.path.isdir(bidaf_model_dir):
        print("bidaf model directory already present..", file=sys.stderr)
    else:
        print("Creating bidaf model directory..", file=sys.stderr)
        os.makedirs(bidaf_model_dir, exist_ok=True)

    # Download Punkt Sentence Tokenizer
    if download_ntlk_punkt:
        nltk.download("punkt", download_dir=bidaf_model_dir)
        nltk.download("punkt")

    # Download bidaf onnx model
    onnx_model_file = os.path.abspath(os.path.join(bidaf_model_dir, "bidaf.onnx"))
    print(f"Checking file {onnx_model_file}..", file=sys.stderr)
    if os.path.isfile(onnx_model_file):
        print("bidaf.onnx downloaded already!", file=sys.stderr)
    else:
        print("Downloading bidaf.onnx...", file=sys.stderr)
        response = requests.get(
            "https://onnxzoo.blob.core.windows.net/models/opset_9/bidaf/bidaf.onnx",
            stream=True,
        )
        with open(onnx_model_file, "wb") as f:
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, f)
    return True
Example #23
Source File: test_text_extractors.py From pliers with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_part_of_speech_extractor():
    import nltk
    nltk.download('tagsets')
    stim = ComplexTextStim(join(TEXT_DIR, 'complex_stim_with_header.txt'))
    result = merge_results(PartOfSpeechExtractor().transform(stim),
                           format='wide', extractor_names=False)
    assert result.shape == (4, 54)
    assert result['NN'].sum() == 1
    result = result.sort_values('onset')
    assert result['VBD'].iloc[3] == 1
Example #24
Source File: text.py From pliers with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, tokens=None, language='english'):
    self.language = language
    if tokens:
        self.tokens = set(tokens)
    else:
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        from nltk.corpus import stopwords
        self.tokens = set(stopwords.words(self.language))
    super().__init__()
Example #25
Source File: word_sentence_utils.py From resilient-community-apps with MIT License | 5 votes |
def __init__(self):
    nltk.download("wordnet", quiet=True)
    nltk.download("stopwords", quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    self.remove_list = ", . ; ? ~ ! * ) ( { } $ # @ < > ] [".split()
    self.lem = WordNetLemmatizer()
Example #26
Source File: hq_main.py From HackQ-Trivia with MIT License | 5 votes |
def download_nltk_resources():
    for resource in {"stopwords", "averaged_perceptron_tagger", "punkt"}:
        nltk.download(resource, quiet=True)
Example #27
Source File: sum.py From vidsum with GNU General Public License v3.0 | 5 votes |
def download_video_srt(subs):
    """Downloads the specified YouTube video's subtitles as a vtt/srt file.

    Args:
        subs(str): Full url of the YouTube video

    Returns:
        (movie_filename, subtitle_filename)

    The video will be downloaded as 1.mp4 and its subtitles as 1.(lang).srt.
    Both the video and its subtitles are saved to the same location as this
    script (sum.py).
    """
    ydl_opts = {
        'format': 'best',
        'outtmpl': '1.%(ext)s',
        'subtitlesformat': 'srt',
        'writeautomaticsub': True,
        # 'allsubtitles': True  # Get all subtitles
    }

    movie_filename = ""
    subtitle_filename = ""
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        # ydl.download([subs])
        result = ydl.extract_info("{}".format(subs), download=True)
        movie_filename = ydl.prepare_filename(result)
        subtitle_info = result.get("requested_subtitles")
        # dict_keys is not indexable in Python 3, so convert to a list first
        subtitle_language = list(subtitle_info.keys())[0]
        subtitle_ext = subtitle_info.get(subtitle_language).get("ext")
        subtitle_filename = movie_filename.replace(
            ".mp4", ".%s.%s" % (subtitle_language, subtitle_ext))
    return movie_filename, subtitle_filename
Example #28
Source File: piglatin.py From CloudBot with GNU General Public License v3.0 | 5 votes |
def load_nltk():
    nltk.download('cmudict')

    global pronunciations
    pronunciations = nltk.corpus.cmudict.dict()
Example #29
Source File: setup.py From rake-nltk with MIT License | 5 votes |
def _post_install():
    """Post installation nltk corpus downloads."""
    import nltk

    nltk.download("punkt")
    nltk.download("stopwords")
Example #30
Source File: downloadcorpus.py From chicago-justice with GNU General Public License v3.0 | 5 votes |
def handle(self, *args, **options):
    LOG.info('Downloading NLTK data')
    if options['download_dir']:
        dest = options['download_dir']
        nltk.download('punkt', download_dir=dest)
        nltk.download('wordnet', download_dir=dest)
    else:
        nltk.download('punkt')
        nltk.download('wordnet')