Python ftfy.fix_text() Examples
The following are 30 code examples of ftfy.fix_text(), drawn from open-source projects. You can go to the original project or source file by following the link above each example.
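Before the project examples, here is a minimal stand-alone sketch of what ftfy.fix_text() does: it repairs "mojibake", text whose bytes were decoded with the wrong encoding. The sample strings and the outputs noted in the comments are illustrative only and are not taken from any project listed on this page.

import ftfy

# UTF-8 text that was mis-decoded as Latin-1/Windows-1252 ("mojibake").
broken = "âœ” No problems"
print(ftfy.fix_text(broken))       # should print: ✔ No problems

# Curly punctuation mangled the same way is repaired as well.
print(ftfy.fix_text("doesnâ€™t"))  # should print: doesn’t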
Example #1
Source File: utils.py From comet-commonsense with Apache License 2.0 | 8 votes |
def encode(self, texts, verbose=True):
    texts_tokens = []
    if verbose:
        for text in tqdm(texts, ncols=80, leave=False):
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    else:
        for text in texts:
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    return texts_tokens
Example #2
Source File: tokenization_openai.py From Bert-Chinese-Text-Classification-Pytorch with MIT License | 7 votes |
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None

    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
Example #3
Source File: cnndm.py From lm-human-preferences with MIT License | 6 votes |
def cnndm_generator(mode, seed=0, shuffle=False, comm=None):
    # data originally from https://github.com/abisee/cnn-dailymail
    if mode == 'valid':
        mode = 'val'

    with open(gcs.download_file_cached(
            f'gs://lm-human-preferences/datasets/cnndm/url_lists/all_{mode}.txt', comm=comm)) as f:
        urls = [line.strip() for line in f]
    if shuffle:
        random.seed(seed)
        random.shuffle(urls)
    # if n_eval > 0:
    #     urls = urls[:n_eval]

    urls_dir = gcs.download_directory_cached(
        f'gs://lm-human-preferences/datasets/cnndm/cache_{mode}', comm=comm)

    for i, url in enumerate(urls):
        path = os.path.join(urls_dir, get_path_of_url(url))
        text = open(path).read()
        text = clean_up_start(text)
        text = ftfy.fix_text(text)

        text = re.sub(r"\n{3,}", "\n\n", text)
        text = text.split('@highlight')[0].strip()
        yield text
        # _, ref_sents = get_art_abs(path)
Example #4
Source File: autosumpdf.py From autosum with MIT License | 6 votes |
def search_citation(text, exp):
    '''Finds sentences around citations, where the regexp `exp` matches'''
    text = text.decode('utf-8')
    lines = text.split('\n')
    text = ' '.join(lines)
    text = ' '.join(text.split())
    text = ftfy.fix_text(text)
    logging.info("Search...'{0!s}'".format(exp))
    sentences = split_sentences(text)
    regex = re.compile(exp, flags=(re.I))
    founds = set()
    for sent in sentences:
        if regex.search(sent):
            founds.add(sent)
    return founds
Example #5
Source File: main.py From python-examples with MIT License | 6 votes |
def test(data):
    text, expected = data

    text2 = text.encode('cp437').decode('utf-8')
    text3 = unidecode(text2)
    text4 = unicodedata.normalize('NFC', text2)
    text5 = unidecode(text4)

    print('                                text:', text, '| len:', len(text))
    print('                            expected:', expected, ' | len:', len(expected))
    print('                    text == expected:', text == expected)
    print('-------------------------------------')
    print('text.encode("cp437").decode("utf-8"):', text2, ' | len:', len(text2), '| expected:', text2 == expected)
    print('                      unicode(text2):', text3, ' | len:', len(text3), '| expected:', text3 == expected)
    print('-------------------------------------')
    print(' unicodedata.normalize("NFC", text2):', text4, ' | len:', len(text4), '| expected:', text4 == expected)
    print('                      unicode(text4):', text5, ' | len:', len(text5), '| expected:', text5 == expected)
    print('-------------------------------------')
    print('                 ftfy.fix_text(text):', ftfy.fix_text(text))
    print('-------------------------------------')
Example #6
Source File: text_utils.py From openai-gpt-pytorch with MIT License | 6 votes |
def encode(self, texts, verbose=True):
    texts_tokens = []
    if verbose:
        for text in tqdm(texts, ncols=80, leave=False):
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    else:
        for text in texts:
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    return texts_tokens
Example #7
Source File: subtitle.py From bazarr with GNU General Public License v3.0 | 6 votes |
def get_modified_content(self, format="srt", debug=False):
    """
    :return: string
    """
    if not self.mods:
        return fix_text(self.content.decode(encoding=self._guessed_encoding), **ftfy_defaults).encode(
            encoding=self._guessed_encoding)

    submods = SubtitleModifications(debug=debug)
    if submods.load(content=self.text, language=self.language):
        logger.info("Applying mods: %s", self.mods)
        submods.modify(*self.mods)
        self.mods = submods.mods_used

        content = fix_text(self.pysubs2_to_unicode(submods.f, format=format), **ftfy_defaults)\
            .encode(encoding=self._guessed_encoding)
        submods.f = None
        del submods
        return content
    return None
Example #8
Source File: tokenization_openai.py From HPSG-Neural-Parser with MIT License | 6 votes |
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
Example #9
Source File: summarize.py From python-qutescript with BSD 2-Clause "Simplified" License | 6 votes |
def summarize_text(request):
    if request.html:
        parser = HtmlParser.from_file(file_path=request.html, url=request.url,
                                      tokenizer=Tokenizer(LANGUAGE))
    else:
        parser = PlaintextParser.from_file(file_path=request.html, tokenizer=Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = [fix_text(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)]
    html = generate_html(sentences, fix_text(request.title)).render()
    request.send_html(html)
Example #10
Source File: tokenization_openai.py From PPLM with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None

    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
Example #11
Source File: tokenization_openai.py From KagNet with MIT License | 6 votes |
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None

    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    self.set_special_tokens(special_tokens)
Example #12
Source File: tokenization_openai.py From bert_on_stilts with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None

    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    self.set_special_tokens(special_tokens)
Example #13
Source File: tokenization_openai.py From CCF-BDCI-Sentiment-Analysis-Baseline with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

    try:
        import ftfy
        from spacy.lang.en import English
        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
Example #14
Source File: text_utils.py From models with MIT License | 6 votes |
def encode(self, texts, verbose=True):
    texts_tokens = []
    if verbose:
        for text in tqdm(texts, ncols=80, leave=False):
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    else:
        for text in texts:
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend(
                    [self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    return texts_tokens
Example #15
Source File: tokenization_openai.py From TextClassify with Apache License 2.0 | 6 votes |
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
    super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

    try:
        import ftfy
        from spacy.lang.en import English
        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
Example #16
Source File: tldr.py From lm-human-preferences with MIT License | 6 votes |
def tldr_generator(mode, seed=0, shuffle=False, comm=None):
    random.seed(seed)

    if mode == 'test':
        mode = 'valid'  # validation set serves as training set, since we don't have access..
    assert mode in ['train', 'valid']

    with open(gcs.download_file_cached(
            f'gs://lm-human-preferences/tldr/{mode}-subset.json', comm=comm)) as f:
        datas = json.load(f)

    if shuffle:
        random.seed(seed)
        random.shuffle(datas)

    for data in datas:
        text = data['content']
        text = ftfy.fix_text(text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = text.strip()
        yield text
Example #17
Source File: tokenization_openai.py From squash-generation with MIT License | 6 votes |
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True,
                                  never_split=special_tokens if special_tokens is not None else [])
        self.fix_text = None

    self.max_len = max_len if max_len is not None else int(1e12)
    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
    self.special_tokens = {}
    self.special_tokens_decoder = {}
    self.set_special_tokens(special_tokens)
Example #18
Source File: text_utils.py From finetune-transformer-lm with MIT License | 6 votes |
def encode(self, texts, verbose=True):
    texts_tokens = []
    if verbose:
        for text in tqdm(texts, ncols=80, leave=False):
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    else:
        for text in texts:
            text = self.nlp(text_standardize(ftfy.fix_text(text)))
            text_tokens = []
            for token in text:
                text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
            texts_tokens.append(text_tokens)
    return texts_tokens
Example #19
Source File: tokenization_xlm.py From HPSG-Neural-Parser with MIT License | 5 votes |
def _tokenize(self, text):
    """ Tokenize a string. """
    split_tokens = []
    if self.fix_text is None:
        # Using BERT's BasicTokenizer
        text = self.nlp.tokenize(text)
        for token in text:
            split_tokens.extend([t for t in self.bpe(token).split(' ')])
    else:
        # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
        text = self.nlp(text_standardize(self.fix_text(text)))
        for token in text:
            split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
    return split_tokens
Example #20
Source File: pre_process.py From gpt-2-tensorflow2.0 with MIT License | 5 votes |
def process_text(text_files):
    print("Pre-processing the text data.....")
    file_writer = open(PROCESS_DATA_PATH, "w")
    for file_name in tqdm.tqdm(text_files):
        fr = open(file_name, 'r')
        file_writer.writelines([fix_text(line, normalization='NFKC') for line in fr.readlines()])
        fr.close()
    file_writer.close()
Example #21
Source File: EnglishTextPreprocessor.py From NeuronBlocks with MIT License | 5 votes |
def preprocess(self, string):
    if self.__unicode_fix:
        string = ftfy.fix_text(string)
    if self.__DBC2SBC:
        string = self.DBC2SBC(string)
    return string
Example #22
Source File: tokenization_xlm.py From TextClassify with Apache License 2.0 | 5 votes |
def _tokenize(self, text):
    """ Tokenize a string. """
    split_tokens = []
    if self.fix_text is None:
        # Using BERT's BasicTokenizer
        text = self.nlp.tokenize(text)
        for token in text:
            split_tokens.extend([t for t in self.bpe(token).split(' ')])
    else:
        # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
        text = self.nlp(text_standardize(self.fix_text(text)))
        for token in text:
            split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
    return split_tokens
Example #23
Source File: tokenization_openai.py From PPLM with Apache License 2.0 | 5 votes |
def set_special_tokens(self, special_tokens):
    """ Add a list of additional tokens to the encoder.
        The additional tokens are indexed starting from the last index of the
        current vocabulary in the order of the `special_tokens` list.
    """
    if not special_tokens:
        self.special_tokens = {}
        self.special_tokens_decoder = {}
        return
    self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
    self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
    if self.fix_text is None:
        # Using BERT's BasicTokenizer: we can update the tokenizer
        self.nlp.never_split = special_tokens
    logger.info("Special tokens {}".format(self.special_tokens))
Example #24
Source File: tokenization_openai.py From PPLM with Apache License 2.0 | 5 votes |
def tokenize(self, text):
    """ Tokenize a string. """
    split_tokens = []
    if self.fix_text is None:
        # Using BERT's BasicTokenizer
        text = self.nlp.tokenize(text)
        for token in text:
            split_tokens.extend([t for t in self.bpe(token).split(' ')])
    else:
        # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
        text = self.nlp(text_standardize(self.fix_text(text)))
        for token in text:
            split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
    return split_tokens
Example #25
Source File: upgradedCleaner.py From EchoBurst with MIT License | 5 votes |
def jsonCleaner(jsonl, filter, writeList, count):
    """Reads and cleans the JSONL input files."""
    for jsonFile in jsonl:
        try:
            # loads the line as a dictionary
            d = json.loads(jsonFile, encoding="utf-8")
            body = d["body"]
            # deletes URLs
            body = re.sub(r'https?:\/\/.*[\r\n]*', '', body, flags=re.MULTILINE)
            # fixes any unicode encoding errors
            body = ftfy.fix_text(body)
            # deletes chosen punctuation
            body = body.translate(filter)
            body = body.lower()
            # eliminates empty, deleted or removed comments
            if re.search('[a-z]', body) \
                    and body != " removed " \
                    and body != " deleted " \
                    and "your submission has been automatically removed" not in body \
                    and d["author"] != "AutoModerator":
                # this line can be changed to include subreddits or other pieces of data
                # in addition to the comment body
                writeList.append("{}\n".format(body))
                count += 1
                # every 500 000th line it will spit out the 10 most recent comments
                if count == 500000:
                    print(writeList[-10:])
                    count = 0
        # handles invalid dictionaries
        except ValueError:
            print("Decoding Error")
Example #26
Source File: upgradedCleaner.py From EchoBurst with MIT License | 5 votes |
def textCleaner(text, filter, writeList, count):
    for line in text:
        # deletes URLs
        line = re.sub(r'https?:\/\/.*[\r\n]*', '', line, flags=re.MULTILINE)
        # fixes unicode encoding errors
        line = ftfy.fix_text(line)
        # deletes chosen punctuation
        line = line.translate(filter)
        line = line.lower()
        if re.search('[a-z]', line):
            writeList.append(line + "\n")
            count += 1
            if count == 500000:
                print(writeList[-10:])
                count = 0
Example #27
Source File: tokenization_xlm.py From TextClassify with Apache License 2.0 | 5 votes |
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
             sep_token="</s>", pad_token="<pad>", cls_token="</s>",
             mask_token="<special1>",
             additional_special_tokens=["<special0>", "<special1>", "<special2>",
                                        "<special3>", "<special4>", "<special5>",
                                        "<special6>", "<special7>", "<special8>",
                                        "<special9>"],
             **kwargs):
    super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
                                       sep_token=sep_token, pad_token=pad_token,
                                       cls_token=cls_token, mask_token=mask_token,
                                       additional_special_tokens=additional_special_tokens,
                                       **kwargs)
    try:
        import ftfy
        from spacy.lang.en import English
        _nlp = English()
        self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
    merges = [tuple(merge.split()[:2]) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
Example #28
Source File: tokenization_xlm.py From HPSG-Neural-Parser with MIT License | 5 votes |
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
             sep_token="</s>", pad_token="<pad>", cls_token="</s>",
             mask_token="<special1>",
             additional_special_tokens=["<special0>", "<special1>", "<special2>",
                                        "<special3>", "<special4>", "<special5>",
                                        "<special6>", "<special7>", "<special8>",
                                        "<special9>"],
             **kwargs):
    super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
                                       sep_token=sep_token, pad_token=pad_token,
                                       cls_token=cls_token, mask_token=mask_token,
                                       additional_special_tokens=additional_special_tokens,
                                       **kwargs)
    try:
        import ftfy
        import spacy
        self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
        self.fix_text = ftfy.fix_text
    except ImportError:
        logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
        self.nlp = BasicTokenizer(do_lower_case=True)
        self.fix_text = None

    self.encoder = json.load(open(vocab_file, encoding="utf-8"))
    self.decoder = {v: k for k, v in self.encoder.items()}
    merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
    merges = [tuple(merge.split()[:2]) for merge in merges]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
Example #29
Source File: tokenization_openai.py From KagNet with MIT License | 5 votes |
def tokenize(self, text):
    """ Tokenize a string. """
    split_tokens = []
    if self.fix_text is None:
        # Using BERT's BasicTokenizer
        text = self.nlp.tokenize(text)
        for token in text:
            split_tokens.extend([t for t in self.bpe(token).split(' ')])
    else:
        # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
        text = self.nlp(text_standardize(self.fix_text(text)))
        for token in text:
            split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
    return split_tokens
Example #30
Source File: plugin.py From limnoria-plugins with Do What The F*ck You Want To Public License | 5 votes |
def normalize(self, q):
    q = BeautifulSoup(q)
    q = fix_text(q.text).replace(r"\'", "'").replace(r"\"", '"')
    q = re.sub("([.!?])([A-Z(])(?![.'])", r"\g<1> \g<2>", q)
    q = re.sub("([,;:)])([a-zA-Z(])", r"\g<1> \g<2>", q)
    q = " ".join(q.split())
    return q