Python string.punctuation Examples

The following are 30 code examples of string.punctuation (a module attribute, not a function). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module string, or try the search function.
Example #1
Source File: utils.py    From MnemonicReader with BSD 3-Clause "New" or "Revised" License 8 votes vote down vote up
def normalize_answer(s):
    """Normalize answer text for comparison.

    Lowercases the text, strips all ASCII punctuation, removes the
    articles a/an/the, and collapses runs of whitespace to single spaces.
    """
    text = s.lower()
    text = ''.join(ch for ch in text if ch not in set(string.punctuation))
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())
Example #2
Source File: Encryption.py    From vault with MIT License 8 votes vote down vote up
def gen_salt(self, set_=True):
        """
            Generate a random salt of 8 to 12 characters drawn from
            letters, digits and punctuation, returned as bytes.
            When `set_` is True the salt is also stored on the instance.
        """

        alphabet = string.ascii_letters + string.punctuation + string.digits
        length = randint(8, 12)
        salt = "".join(choice(alphabet) for _ in range(length)).encode()

        # Store on the instance when requested
        if set_:
            self.set_salt(salt)

        return salt
Example #3
Source File: strings_utils.py    From locality-sensitive-hashing with MIT License 6 votes vote down vote up
def normalize(str):
    """
        Normalizes the string: makes it all lower case and removes punctuation.
        NOTE: Python 2 code -- relies on `unicode` and two-argument
        `str.translate`/`string.maketrans`, which do not exist in Python 3.
        :param str: string to be normalized
        :return: normalized string; if str is None or empty the original
                 value is returned unchanged
    """

    if not str:
        return str
    if isinstance(str, unicode):
        not_letters_or_digits = u'!"#%\'()*+,-./:;<=>?@[\]^_`{|}~'
        translate_to = u''
        translate_table = dict((ord(char), translate_to) for char in not_letters_or_digits)
        # Bug fix: this branch previously skipped the lower-casing that the
        # docstring promises (only the byte-string branch called .lower()).
        return str.lower().translate(translate_table)
    else:
        return str.lower().translate(string.maketrans("", ""), string.punctuation)
Example #4
Source File: utils_classical.py    From interpret-text with MIT License 6 votes vote down vote up
def __init__(
        self,
        parser,
        stop_words=spacy.lang.en.stop_words.STOP_WORDS,
        punctuations=string.punctuation,
    ):
        """Construct a BOWTokenizer.

        Arguments:
            parser {spacy.lang.en.English - by default} -- Any parser object
                supporting a parser(sentence) call.

        Keyword Arguments:
            stop_words {iterable over str} -- Stop words to strip out.
            (default: {spacy.lang.en.stop_words.STOP_WORDS})
            punctuations {iterable over str} -- Punctuation marks to strip
            out. (default: {string.punctuation})
        """
        # Keep references for later use during tokenization.
        self.stop_words = stop_words
        self.punctuations = punctuations
        self.parser = parser
Example #5
Source File: evaluate-v1.1.py    From MnemonicReader with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def normalize_answer(s):
    """Canonicalize an answer string for SQuAD-style comparison:
    lowercase, drop punctuation, drop a/an/the, squeeze whitespace."""
    lowered = s.lower()
    no_punct = ''.join(c for c in lowered if c not in set(string.punctuation))
    no_articles = re.sub(r'\b(a|an|the)\b', ' ', no_punct)
    return ' '.join(no_articles.split())
Example #6
Source File: my_utils.py    From ICDAR-2019-SROIE with MIT License 6 votes vote down vote up
def random_string(n):
    """Return a random padding string of length n.

    With probability 0.5 the pad is all spaces; otherwise it is drawn from
    progressively richer pools (digits, uppercase letters, punctuation)
    mixed with whitespace.
    """
    if not n:
        return ""

    whitespace = " \t\n"
    roll = random.random()
    if roll > 0.5:
        return " " * n
    if roll > 0.3:
        pool = digits + whitespace
    elif roll > 0.2:
        pool = ascii_uppercase + whitespace
    elif roll > 0.1:
        pool = ascii_uppercase + digits + whitespace
    else:
        pool = ascii_uppercase + digits + punctuation + whitespace
    return "".join(random.choices(pool, k=n))
Example #7
Source File: generic_utils.py    From BAMnet with Apache License 2.0 6 votes vote down vote up
def normalize_answer(s):
    """Normalize text: lowercase, then replace punctuation (re_punc) and
    articles (re_art) with spaces, and collapse extra whitespace."""
    text = s.lower()
    text = re_punc.sub(' ', text)  # punctuation becomes spaces
    text = re_art.sub(' ', text)   # articles become spaces
    return ' '.join(text.split())
Example #8
Source File: preprocessing.py    From Image-Caption-Generator with MIT License 6 votes vote down vote up
def clean_captions(captions):
    """Clean every caption in-place: lowercase, strip punctuation,
    drop one-letter tokens and tokens containing digits."""
    # One translation table removes all punctuation in a single pass
    strip_punct = str.maketrans('', '', string.punctuation)
    for caption_list in captions.values():
        for i, caption in enumerate(caption_list):
            tokens = [t.lower().translate(strip_punct) for t in caption.split()]
            # Keep only purely alphabetic tokens longer than one character
            tokens = [t for t in tokens if len(t) > 1 and t.isalpha()]
            caption_list[i] = ' '.join(tokens)
Example #9
Source File: data_structures.py    From edm with GNU General Public License v2.0 6 votes vote down vote up
def tokenize_sentence(sentence):
    """
    Split a sentence into lowercase words with punctuation removed.

    :param sentence : the sentence to tokenize (str)
    :return         : list of words
    """

    # Drop non-ASCII (and NUL/DEL) characters up front to avoid issues
    # with unrecognised characters
    ascii_chars = [c for c in sentence if 0 < ord(c) < 127]
    cleaned = "".join(ascii_chars).encode("ascii", errors="ignore").decode()

    # str.translate with a 3-arg maketrans table (Python 3 only)
    without_punct = cleaned.translate(str.maketrans("", "", string.punctuation))

    return without_punct.lower().split()
Example #10
Source File: tokenize.py    From timefhuman with Apache License 2.0 6 votes vote down vote up
def get_character_type(character):
    """
    >>> get_character_type('a')
    'alpha'
    >>> get_character_type('1')
    'numeric'
    >>> get_character_type('.')
    'punctuation'
    >>> get_character_type(' ')
    """
    # Checked in order: some characters (e.g. CJK numerals) are both
    # alphabetic and numeric, and must be reported as 'alpha'.
    classifiers = (
        ('alpha', character.isalpha()),
        ('numeric', character.isnumeric()),
        ('punctuation', character in string.punctuation),
    )
    for label, matched in classifiers:
        if matched:
            return label
    return None
Example #11
Source File: base.py    From wanggeService with MIT License 6 votes vote down vote up
def getRandomStr(types='letter', length=8):
        """ Produce a random string of `length` distinct characters.

        :param types: kind of string to produce --
        types in ['letter', 'ascii']: letters only
        types in ['digit', 'num']: digits only
        anything else: a mix of letters and digits

        :param length: number of characters to return
        :return: random string of the requested length and type

        todo string.punctuation

        """
        import random
        import string
        # Pick the character pool, then sample `length` unique characters.
        if types in ['letter', 'ascii']:
            pool = string.ascii_letters
        elif types in ['digit', 'num']:
            pool = string.digits
        else:
            pool = string.ascii_letters + string.digits
        return ''.join(random.sample(pool, length))
Example #12
Source File: squad_data.py    From mipsqa with Apache License 2.0 6 votes vote down vote up
def _normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace.

  Directly copied from official SQuAD eval script, SHOULD NOT BE MODIFIED.

  Args:
    s: Input text.
  Returns:
    Normalized text.
  """
  text = s.lower()
  text = ''.join(ch for ch in text if ch not in set(string.punctuation))
  text = re.sub(r'\b(a|an|the)\b', ' ', text)
  return ' '.join(text.split())
Example #13
Source File: evaluation.py    From cdQA with Apache License 2.0 6 votes vote down vote up
def normalize_answer(s):
    """Lowercase, strip punctuation and articles, and squeeze whitespace."""

    punct = set(string.punctuation)
    stripped = "".join(ch for ch in s.lower() if ch not in punct)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", stripped)
    return " ".join(without_articles.split())
Example #14
Source File: requestchecker.py    From galaxy-sdk-python with Apache License 2.0 6 votes vote down vote up
def isJavaIdentifierPart(c):
    """Return True if `c` may appear in a Java identifier (an approximation
    of java.lang.Character.isJavaIdentifierPart).

    Accepts ASCII letters, digits and punctuation, plus currency symbols
    ('Sc'), non-spacing marks ('Mn') and letter numbers ('Nl').
    NOTE: Python 2 code -- uses the `unicode` builtin.
    """
    if c in string.ascii_letters:
        return True
    if c in string.digits:
        return True
    if c in string.punctuation:
        return True
    # Bug fix: the original tested category 'N1' (digit one), a code that
    # unicodedata.category never returns, so that branch was dead; the
    # intended code is 'Nl' (letter number), matching Java's LETTER_NUMBER.
    return category(unicode(c)) in ('Sc', 'Mn', 'Nl')
Example #15
Source File: test_validate.py    From borgmatic with GNU General Public License v3.0 6 votes vote down vote up
def test_parse_configuration_passes_through_quoted_punctuation():
    """Punctuation quoted in the YAML config must survive parsing verbatim."""
    escaped_punctuation = string.punctuation.replace('\\', r'\\').replace('"', r'\"')

    config_text = '''
        location:
            source_directories:
                - /home

            repositories:
                - "{}.borg"
        '''.format(escaped_punctuation)
    mock_config_and_schema(config_text)

    result = module.parse_configuration('config.yaml', 'schema.yaml')

    expected = {
        'location': {
            'source_directories': ['/home'],
            'repositories': ['{}.borg'.format(string.punctuation)],
        }
    }
    assert result == expected
Example #16
Source File: evaluate-v1.1.py    From pytorch_pretrained_BERT with Apache License 2.0 6 votes vote down vote up
def normalize_answer(s):
    """SQuAD answer normalization: lowercase, remove punctuation and the
    articles a/an/the, and collapse whitespace."""
    result = s.lower()
    result = ''.join(ch for ch in result if ch not in set(string.punctuation))
    result = re.sub(r'\b(a|an|the)\b', ' ', result)
    return ' '.join(result.split())
Example #17
Source File: utils.py    From justcopy-backend with MIT License 6 votes vote down vote up
def normalize_answer(s):
    """Return `s` lowercased, punctuation-free, article-free, and with
    whitespace collapsed to single spaces."""
    punct = set(string.punctuation)
    no_punct = ''.join(c for c in s.lower() if c not in punct)
    no_articles = re.sub(r'\b(a|an|the)\b', ' ', no_punct)
    return ' '.join(no_articles.split())
Example #18
Source File: titleParseNew.py    From ReadableWebProxy with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_preceeding_text(self, prefix_arr):
    """Scan `prefix_arr` from the end, accumulating trailing items that are
    pure punctuation/whitespace into a single string.

    Returns a 3-tuple (remaining_prefix, boundary_item, intermediate):
      - if a TokenBase instance is reached, (prefix up to and including it,
        None, accumulated text);
      - if a non-punctuation string is reached, (prefix before it, that
        string, accumulated text);
      - if the array is exhausted, ([], None, accumulated text).

    Fix: dropped the `consumed` counter, which was incremented but never
    read or returned.
    """
    intermediate = ""
    separators = string.punctuation + string.whitespace
    for idx in range(len(prefix_arr) - 1, -1, -1):
        item = prefix_arr[idx]
        if isinstance(item, TokenBase):
            return prefix_arr[:idx + 1], None, intermediate
        if all(char in separators for char in item):
            # Pure separator text: prepend and keep scanning left.
            intermediate = item + intermediate
        else:
            return prefix_arr[:idx], item, intermediate
    return [], None, intermediate
Example #19
Source File: utils.py    From OpenQA with MIT License 6 votes vote down vote up
def normalize_answer(s):
    """Canonical answer form: lowercase, no punctuation, no articles,
    single-spaced."""
    text = s.lower()
    text = ''.join(character for character in text
                   if character not in set(string.punctuation))
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())
Example #20
Source File: indic_tokenize.py    From indic_nlp_library with MIT License 6 votes vote down vote up
def trivial_tokenize_urdu(text):
    """Tokenize an Urdu string on punctuation boundaries.

    Splits on the punctuation set matched by `triv_tokenizer_urdu_pat`,
    which also covers Urdu/Arabic-script punctuation identified from the
    Unicode database.

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens
    """
    # Treat tabs as spaces, then pad every punctuation match with spaces.
    spaced = triv_tokenizer_urdu_pat.sub(r' \1 ', text.replace('\t', ' '))
    # Collapse repeated spaces and split into tokens.
    collapsed = re.sub(r'[ ]+', ' ', spaced)
    return collapsed.strip(' ').split(' ')
Example #21
Source File: evaluate.py    From cs224n-win18-squad with Apache License 2.0 6 votes vote down vote up
def normalize_answer(s):
    """Normalize for answer comparison: lowercase, strip punctuation and
    the articles a/an/the, collapse whitespace."""
    no_punct = ''.join(ch for ch in s.lower()
                       if ch not in set(string.punctuation))
    no_articles = re.sub(r'\b(a|an|the)\b', ' ', no_punct)
    return ' '.join(no_articles.split())
Example #22
Source File: eval.py    From BERT-for-Chinese-Question-Answering with Apache License 2.0 6 votes vote down vote up
def normalize_answer(s):
    """Normalize answer text: lowercase, strip both ASCII and Chinese
    punctuation (zh.punctuation), drop a/an/the, collapse whitespace."""
    excluded = set(string.punctuation + zh.punctuation)
    text = ''.join(ch for ch in s.lower() if ch not in excluded)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())
Example #23
Source File: train_predict_trees_batch2.py    From wsdm19cup with MIT License 5 votes vote down vote up
def preprocess(x):
        """Coerce `x` to a lowercase string and pad punctuation and special
        marks with surrounding spaces so they become separate tokens."""
        text = str(x).lower()
        pattern = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
        return pattern.sub(r' \1 ', text)
Example #24
Source File: googlecalendar.py    From lrrbot with Apache License 2.0 5 votes vote down vote up
def process_description(description):
    """Condense a calendar event description to a single line.

    A two-line description is treated as "game\\nshow description"
    (issue #270): the game is appended after the description, unless the
    game is '-'. Any other number of lines is joined with '; '.
    """
    # Filter on the raw line (before stripping), as the original did.
    lines = [ln.strip() for ln in description.splitlines() if len(ln) > 0]

    if len(lines) != 2:
        return "; ".join(lines)

    game, show_description = lines
    if game == '-':
        return show_description
    # Ensure the description ends with punctuation before appending.
    if show_description[-1] not in string.punctuation:
        show_description += '.'
    return "%s Game: %s" % (show_description, game)
Example #25
Source File: mapper.py    From Data_Analytics_with_Hadoop with MIT License 5 votes vote down vote up
def exclude(self, token):
        """Return True when `token` must be excluded from trigrams, i.e. it
        is a stopword or a punctuation mark."""
        is_stopword = token in self.stopwords
        is_punctuation = token in string.punctuation
        return is_stopword or is_punctuation
Example #26
Source File: train_predict_trees_batch3.py    From wsdm19cup with MIT License 5 votes vote down vote up
def __preprocess__(self,x):
        """Lowercase `x` (coerced to str) and surround punctuation and
        special marks with spaces."""
        text = str(x).lower()
        pattern = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
        return pattern.sub(r' \1 ', text)
Example #27
Source File: train_predict_trees_batch3.py    From wsdm19cup with MIT License 5 votes vote down vote up
def preprocess(x):
        """Stringify and lowercase `x`, then space-pad punctuation and
        special characters for tokenization."""
        lowered = str(x).lower()
        tokenizer_re = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
        return tokenizer_re.sub(r' \1 ', lowered)
Example #28
Source File: train_predict_trees_batch1.py    From wsdm19cup with MIT License 5 votes vote down vote up
def preprocess(x):
        """Lowercase the input (after str() coercion) and isolate punctuation
        and special marks by padding them with spaces."""
        lowered = str(x).lower()
        punct_re = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
        return punct_re.sub(r' \1 ', lowered)
Example #29
Source File: train_predict_trees_batch2.py    From wsdm19cup with MIT License 5 votes vote down vote up
def __preprocess__(self,x):
        """Coerce `x` to a lowercase string, then pad punctuation and special
        marks with spaces so they tokenize separately."""
        lowered = str(x).lower()
        splitter = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
        return splitter.sub(r' \1 ', lowered)
Example #30
Source File: CUB_preprocess_token.py    From show-adapt-and-tell with MIT License 5 votes vote down vote up
def clean_words(data):
    # NOTE(review): Python 2 code (print statements) -- will not run under
    # Python 3 as written.
    """Build a vocabulary (word -> index) and frequency table from captions.

    Captions longer than max_w (30) words are skipped; kept captions have
    punctuation stripped from each word and are lower-cased. The resulting
    tables are saved via np.savez('cleaned_words', ...) and also returned
    as (dict, freq).
    """
    dict = {}  # word -> integer index; NOTE: shadows the builtin `dict`
    freq = {}  # word -> occurrence count
    # start with 1
    idx = 1
    sentence_count = 0  # captions kept (<= max_w words)
    eliminate = 0  # captions skipped for being too long
    max_w = 30  # maximum words allowed per caption
    for k in tqdm(range(len(data['caption']))):
        sen = data['caption'][k]
        filename = data['file_name'][k]  # NOTE(review): read but never used
        # skip the no image description
        words = re.split(' ', sen)
        # pop the last u'.'
        n = len(words)
        if n <= max_w:
            sentence_count += 1
            for word in words:
                # strip every punctuation character from the token
                for p in string.punctuation:
                    if p in word:
                        word = word.replace(p,'')
                word = word.lower()
                # first occurrence: assign the next index; otherwise bump count
                if word not in dict.keys():
                    dict[word] = idx
                    idx += 1
                    freq[word] = 1
                else:
                    freq[word] += 1
        else:
            eliminate += 1
    print 'Threshold(max_words) =', max_w
    print 'Eliminate =', eliminate 
    print 'Total sentence_count =', sentence_count
    print 'Number of different words =', len(dict.keys())
    print 'Saving....'
    np.savez('cleaned_words', dict=dict, freq=freq)
    return dict, freq