Python collections.Counter() Examples
The following are 30 code examples of collections.Counter().
Each example comes from an open-source project; the source file, project, and license are noted above the snippet. You may also want to check out the other classes and functions available in the collections module.
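Before the project examples, here is a minimal standalone sketch of the core Counter operations that the snippets below rely on (counting an iterable, update, most_common, subtract, and arithmetic); it uses only the standard library:

from collections import Counter

# Count hashable items directly from any iterable.
words = "the quick brown fox jumps over the lazy dog the fox".split()
counts = Counter(words)

# The two most common tokens, as (token, count) pairs.
print(counts.most_common(2))        # [('the', 3), ('fox', 2)]

# Incremental counting with update().
counts.update(["dog", "dog"])
print(counts["dog"])                # 3

# subtract() allows counts to go negative; the - operator keeps positives only.
delta = Counter(a=3, b=1)
delta.subtract(Counter(a=1, b=2))
print(delta)                                  # Counter({'a': 2, 'b': -1})
print(Counter(a=3, b=1) - Counter(a=1, b=2))  # Counter({'a': 2})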
Example #1
Source File: dtree.py From decisiontrees with MIT License

def value_counts(self, subset, attr, value, base=False):
    """
    Get the number of occurrences per value of the dependent variable
    when the given attribute is equal to the given value.

    FIXME: Can attr/value be eliminated??

    Args:
        subset: the subset to act upon.
        attr: the attribute of the value.
        value: the value with which to track counts.
        base: whether or not to calculate values based on the dependent
            value (default False).

    Returns:
        A Counter instance detailing the number of occurrences per
        dependent variable.
    """
    counts = Counter()
    for row in subset:
        if row[attr] == value or base:
            counts[row[self.dependent]] += 1
    return counts
Example #2
Source File: errorcounter.py From DOTA_models with Apache License 2.0

def CountErrors(ocr_text, truth_text):
    """Counts the drops and adds between 2 bags of iterables.

    A simple bag-of-objects count returns the number of dropped and added
    elements, regardless of order, from anything that is iterable, e.g.
    a pair of strings gives character errors, and a pair of word lists
    gives word errors.

    Args:
        ocr_text: OCR text iterable (e.g. string for chars, word list for words).
        truth_text: Truth text iterable.

    Returns:
        ErrorCounts named tuple.
    """
    counts = collections.Counter(truth_text)
    counts.subtract(ocr_text)
    drops = sum(c for c in counts.values() if c > 0)
    adds = sum(-c for c in counts.values() if c < 0)
    return ErrorCounts(drops, adds, len(truth_text), len(ocr_text))
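The drop/add computation above relies on Counter.subtract() keeping zero and negative counts. A small standalone illustration of the same bag-difference idea (not taken from the DOTA_models project):

import collections

truth = "the cat sat on the mat"
ocr = "the cat sat on the hat"

counts = collections.Counter(truth)  # character counts of the truth text
counts.subtract(ocr)                 # subtract OCR character counts in place

drops = sum(c for c in counts.values() if c > 0)   # chars missing from the OCR output
adds = sum(-c for c in counts.values() if c < 0)   # chars the OCR output added

print(drops, adds)  # 1 1  ('m' was dropped, 'h' was added)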
Example #3
Source File: ggtnn_graph_parse.py From gated-graph-transformer-network with MIT License

def get_buckets(stories, max_ignore_unbatched=100, max_pad_amount=25):
    sentencecounts = [len(sents_graphs) for (sents_graphs, query, answer) in stories]
    countpairs = sorted(collections.Counter(sentencecounts).items())

    buckets = []
    smallest_left_val = 0
    num_unbatched = max_ignore_unbatched
    for val, ct in countpairs:
        num_unbatched += ct
        if val - smallest_left_val > max_pad_amount or num_unbatched > max_ignore_unbatched:
            buckets.append(val)
            smallest_left_val = val
            num_unbatched = 0
    if buckets[-1] != countpairs[-1][0]:
        buckets.append(countpairs[-1][0])

    return buckets
Example #4
Source File: data.py From Neural-LP with MIT License

def _count_batch(self, samples, batch_size):
    # Note: zip(*samples)[0] assumes Python 2, where zip() returns a list.
    relations = zip(*samples)[0]
    relations_counts = Counter(relations)
    num_batches = [ceil(1. * x / batch_size) for x in relations_counts.values()]
    return int(sum(num_batches))
Example #5
Source File: builddataset.py From slot-filling with MIT License

def build_vocab(data, min_count=1):
    count = [("<UNK>", -1), ("<PAD>", -1)]
    words = []
    for sentence, _ in data:
        words.extend(sentence)
    counter = Counter(words)
    counter_list = counter.most_common()
    for word, c in counter_list:
        if c >= min_count:
            count.append((word, c))
    word2idx = dict()
    for word, _ in count:
        word2idx[word] = len(word2idx)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    return word2idx, idx2word
Example #6
Source File: data_process.py From nlp-tensorflow with MIT License

def build_character(sentences):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for sentence in sentences:
        tokens = list(sentence)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<GO>'] = 1
    vocab['<UNK>'] = 2
    vocab_idx = 3
    for key, value in word_counter.most_common(len(word_counter)):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())
    return vocab, reverse_vocab, vocab_size
Example #7
Source File: forests.py From trees with Apache License 2.0

def build_tree(train, features, levels=5, numfeatures=100):
    'Train a decision tree based on labeled data and features'
    if levels == 0:
        C1 = Counter([b for _, b in train])
        Leaf = (None, C1)
        return Leaf
    else:
        try:
            X = (split(train, F) for F in random.sample(features, numfeatures))
            H, L1, L2, F = max(X)
            M1 = build_tree(L1, features, levels - 1, numfeatures)
            M2 = build_tree(L2, features, levels - 1, numfeatures)
            Branch = (F, M1, M2)
            return Branch
        except:
            return build_tree(train, features, levels=0)
Example #8
Source File: data_process.py From nlp-tensorflow with MIT License

def build_vocab(sentences):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for sentence in sentences:
        tokens = tokenizer(sentence)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<GO>'] = 1
    vocab['<UNK>'] = 2
    vocab_idx = 3
    for key, value in word_counter.most_common(len(word_counter)):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())
    return vocab, reverse_vocab, vocab_size
Example #9
Source File: strainsimulationwrapper.py From CAMISIM with Apache License 2.0

def _get_genome_amounts(self, probability, max_genome_amount):
    """
    Get amounts of genomes by original genome

    @param probability: Proportion of simulated original genomes
    @type probability: int | long | float
    @param max_genome_amount: Total number of genomes
    @type max_genome_amount: int | long

    @return: List of integers representing amount of strains
    @rtype: list[int]
    """
    assert isinstance(probability, (int, long, float))
    assert 0 <= probability <= 1
    assert isinstance(max_genome_amount, (int, long))

    genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
    diverence = Counter(genome_amounts)[1] / float(len(genome_amounts))
    if max_genome_amount >= 10:
        while abs(diverence - probability) > 0.05:
            # print "need: {}, gotten: {}".format(probability, diverence)
            genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
            diverence = Counter(genome_amounts)[1] / float(len(genome_amounts))
    return genome_amounts
Example #10
Source File: subject_verb_agreement.py From fine-lm with MIT License

def _build_vocab(examples, example_field, vocab_dir, vocab_name):
    """Build a vocabulary from examples.

    Args:
        examples: a dict containing all the examples.
        example_field: field of example from which the vocabulary is built.
        vocab_dir: directory where to save the vocabulary.
        vocab_name: vocab file name.

    Returns:
        text encoder.
    """
    vocab_path = os.path.join(vocab_dir, vocab_name)
    if not tf.gfile.Exists(vocab_path):
        data = []
        for e in examples:
            data.extend(e[example_field].split())
        counter = collections.Counter(data)
        count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*count_pairs))
        encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
        encoder.store_to_file(vocab_path)
    else:
        encoder = text_encoder.TokenTextEncoder(vocab_path)
    return encoder
Example #11
Source File: tokenizer.py From fine-lm with MIT License

def corpus_token_counts(text_filepattern, corpus_max_lines,
                        split_on_newlines=True):
    """Read the corpus and compute a dictionary of token counts.

    Args:
        text_filepattern: A pattern matching one or more files.
        corpus_max_lines: An integer; maximum total lines to read.
        split_on_newlines: A boolean. If true, then split files by lines and
            strip leading and trailing whitespace from each line. Otherwise,
            treat each file as a single string.

    Returns:
        a dictionary mapping token to count.
    """
    counts = collections.Counter()
    for doc in _read_filepattern(
            text_filepattern,
            max_lines=corpus_max_lines,
            split_on_newlines=split_on_newlines):
        counts.update(encode(_native_to_unicode(doc)))
    return counts
Example #12
Source File: w2v_utils.py From deep-learning-note with MIT License

def build_vocab(words, vocab_size, visual_fld):
    """Build vocabulary of VOCAB_SIZE most frequent words and write it to
    visualization/vocab.tsv
    """
    utils.safe_mkdir(visual_fld)
    file = open(os.path.join(visual_fld, 'vocab.tsv'), 'w')

    dictionary = dict()
    count = [('UNK', -1)]
    index = 0
    count.extend(Counter(words).most_common(vocab_size - 1))

    for word, _ in count:
        dictionary[word] = index
        index += 1
        file.write(word + '\n')

    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    file.close()
    return dictionary, index_dictionary
Example #13
Source File: ptb.py From fine-lm with MIT License

def _build_vocab(filename, vocab_path, vocab_size):
    """Reads a file to build a vocabulary of `vocab_size` most common words.

    The vocabulary is sorted by occurrence count and has one word per line.
    Originally from:
    https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/reader.py

    Args:
        filename: file to read list of words from.
        vocab_path: path where to save the vocabulary.
        vocab_size: size of the vocabulary to generate.
    """
    data = _read_words(filename)
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    words = words[:vocab_size]
    with open(vocab_path, "w") as f:
        f.write("\n".join(words))
Example #14
Source File: text_encoder_test.py From fine-lm with MIT License

def test_save_and_reload_no_single_quotes(self):
    corpus = "the quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    # Deliberately exclude some required encoding chars from the alphabet
    # and token list, making some strings unencodable.
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 10)

    filename = os.path.join(self.test_temp_dir, "out.voc")
    encoder.store_to_file(filename, add_single_quotes=False)
    new_encoder = text_encoder.SubwordTextEncoder(filename)

    self.assertEqual(encoder._alphabet, new_encoder._alphabet)
    self.assertEqual(encoder.all_subtoken_strings,
                     new_encoder.all_subtoken_strings)
    self.assertEqual(encoder._subtoken_string_to_id,
                     new_encoder._subtoken_string_to_id)
    self.assertEqual(encoder._max_subtoken_len,
                     new_encoder._max_subtoken_len)
Example #15
Source File: dtree.py From decisiontrees with MIT License

def attr_counts(self, subset, attr):
    """
    Get the number of occurrences per value of the given attribute.

    Args:
        subset: the subset to act upon.
        attr: the selected attribute.

    Returns:
        A Counter instance detailing the number of occurrences per
        attribute value.
    """
    counts = Counter()
    for row in subset:
        counts[row[attr]] += 1
    return counts
Example #16
Source File: bleu_hook.py From fine-lm with MIT License

def _get_ngrams(segment, max_order):
    """Extracts all n-grams up to a given maximum order from an input segment.

    Args:
        segment: text segment from which n-grams will be extracted.
        max_order: maximum length in tokens of the n-grams returned by this
            method.

    Returns:
        The Counter containing all n-grams up to max_order in segment
        with a count of how many times each n-gram occurred.
    """
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i + order])
            ngram_counts[ngram] += 1
    return ngram_counts
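For reference, a quick call to the _get_ngrams helper above on a short, hypothetical token list shows what the returned Counter looks like (keys are tuples of tokens):

tokens = "to be or not to be".split()
bigram_counts = _get_ngrams(tokens, 2)

print(bigram_counts[("to",)])        # 2
print(bigram_counts[("to", "be")])   # 2
print(bigram_counts[("not", "to")])  # 1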
Example #17
Source File: babi_qa.py From fine-lm with MIT License

def _build_vocab(generator, vocab_dir, vocab_name):
    """Build a vocabulary from examples.

    Args:
        generator: text generator for creating vocab.
        vocab_dir: directory where to save the vocabulary.
        vocab_name: vocab file name.

    Returns:
        text encoder.
    """
    vocab_path = os.path.join(vocab_dir, vocab_name)
    if not tf.gfile.Exists(vocab_path):
        data = []
        for line in generator:
            data.extend(line.split())
        counter = collections.Counter(data)
        count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*count_pairs))
        encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
        encoder.store_to_file(vocab_path)
    else:
        encoder = text_encoder.TokenTextEncoder(vocab_path)
    return encoder
Example #18
Source File: wikitext103.py From fine-lm with MIT License

def _build_vocab(filename, vocab_dir, vocab_name):
    """Reads a file to build a vocabulary.

    Args:
        filename: file to read list of words from.
        vocab_dir: directory where to save the vocabulary.
        vocab_name: vocab file name.

    Returns:
        text encoder.
    """
    vocab_path = os.path.join(vocab_dir, vocab_name)
    if not tf.gfile.Exists(vocab_path):
        with tf.gfile.GFile(filename, "r") as f:
            data = f.read().split()
        counter = collections.Counter(data)
        count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*count_pairs))
        encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
        encoder.store_to_file(vocab_path)
    else:
        encoder = text_encoder.TokenTextEncoder(vocab_path)
    return encoder
Example #19
Source File: test_contrib_text.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0

def test_indices_to_tokens():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1,
                                  unknown_token='<unknown>', reserved_tokens=None)

    i1 = vocab.to_tokens(1)
    assert i1 == 'c'

    i2 = vocab.to_tokens([1])
    assert i2 == ['c']

    i3 = vocab.to_tokens([0, 0])
    assert i3 == ['<unknown>', '<unknown>']

    i4 = vocab.to_tokens([3, 0, 3, 2])
    assert i4 == ['a', '<unknown>', 'a', 'b']

    assertRaises(ValueError, vocab.to_tokens, 100)
Example #20
Source File: test_contrib_text.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0

def test_tokens_to_indices():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1,
                                  unknown_token='<unk>', reserved_tokens=None)

    i1 = vocab.to_indices('c')
    assert i1 == 1

    i2 = vocab.to_indices(['c'])
    assert i2 == [1]

    i3 = vocab.to_indices(['<unk>', 'non-exist'])
    assert i3 == [0, 0]

    i4 = vocab.to_indices(['a', 'non-exist', 'a', 'b'])
    assert i4 == [3, 0, 3, 2]
Example #21
Source File: text_encoder_test.py From fine-lm with MIT License

def test_custom_reserved_tokens(self):
    """Test that we can pass custom reserved tokens to SubwordTextEncoder."""
    corpus = "The quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    start_symbol = "<S>"
    end_symbol = "<E>"
    reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol, end_symbol]
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        10, token_counts, 2, 10, reserved_tokens=reserved_tokens)

    # Make sure that reserved tokens appear in the right places.
    self.assertEqual(encoder.decode([2]), start_symbol)
    self.assertEqual(encoder.decode([3]), end_symbol)

    # Make sure that we haven't messed up the ability to reconstruct.
    reconstructed_corpus = encoder.decode(encoder.encode(corpus))
    self.assertEqual(corpus, reconstructed_corpus)
Example #22
Source File: text_encoder_test.py From fine-lm with MIT License

def test_encodable_when_not_in_alphabet(self):
    corpus = "the quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 10)
    original = "This has UPPER CASE letters that are out of alphabet"

    # Early versions could have an infinite loop when breaking into subtokens
    # if there were any out-of-alphabet characters in the encoded string.
    encoded = encoder.encode(original)
    decoded = encoder.decode(encoded)

    self.assertEqual(original, decoded)
    encoded_str = "".join(encoder.all_subtoken_strings[i] for i in encoded)
    self.assertIn("\\84;", encoded_str)
Example #23
Source File: text_encoder_test.py From fine-lm with MIT License

def test_reserved_token_chars_not_in_alphabet(self):
    corpus = "dog"
    token_counts = collections.Counter(corpus.split(" "))
    encoder1 = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 100)

    filename = os.path.join(self.test_temp_dir, "out.voc")
    encoder1.store_to_file(filename)
    encoder2 = text_encoder.SubwordTextEncoder(filename=filename)

    self.assertEqual(encoder1._alphabet, encoder2._alphabet)

    for t in text_encoder.RESERVED_TOKENS:
        for c in t:
            # Verify that encoders can encode all reserved token chars.
            encoder1.encode(c)
            encoder2.encode(c)
Example #24
Source File: stt_bi_graphemes_util.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0

def generate_bi_graphemes_dictionary(label_list):
    freqs = Counter()
    for label in label_list:
        label = label.split(' ')
        for i in label:
            for pair in split_every(2, i):
                if len(pair) == 2:
                    freqs[pair] += 1

    with open('resources/unicodemap_en_baidu_bi_graphemes.csv', 'w') as bigram_label:
        bigramwriter = csv.writer(bigram_label, delimiter=',')
        baidu_labels = list('\' abcdefghijklmnopqrstuvwxyz')
        for index, key in enumerate(baidu_labels):
            bigramwriter.writerow((key, index + 1))
        for index, key in enumerate(freqs.keys()):
            bigramwriter.writerow((key, index + len(baidu_labels) + 1))
Example #25
Source File: text_encoder_test.py From fine-lm with MIT License

def test_encode_decode(self):
    corpus = (
        "This is a corpus of text that provides a bunch of tokens from which "
        "to build a vocabulary. It will be used when strings are encoded "
        "with a TextEncoder subclass. The encoder was coded by a coder.")
    token_counts = collections.Counter(corpus.split(" "))
    alphabet = set(corpus) - {" "}

    original = "This is a coded sentence encoded by the SubwordTextEncoder."
    token_counts.update(original.split(" "))

    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 10)

    # Encoding should be reversible.
    encoded = encoder.encode(original)
    decoded = encoder.decode(encoded)
    self.assertEqual(original, decoded)

    # The substrings coded and coder are frequent enough in the corpus that
    # they should appear in the vocabulary even though they are substrings
    # of other included strings.
    subtoken_strings = {encoder.all_subtoken_strings[i] for i in encoded}
    self.assertIn("encoded_", subtoken_strings)
    self.assertIn("coded_", subtoken_strings)
    self.assertIn("TextEncoder", encoder.all_subtoken_strings)
    self.assertIn("coder", encoder.all_subtoken_strings)

    # Every character in the corpus should be in the encoder's alphabet and
    # its subtoken vocabulary.
    self.assertTrue(alphabet.issubset(encoder._alphabet))
    for a in alphabet:
        self.assertIn(a, encoder.all_subtoken_strings)
Example #26
Source File: to_string.py From QCElemental with BSD 3-Clause "New" or "Revised" License

def formula_generator(elem):
    """Return simple chemical formula from element list `elem`.

    >>> formula_generator(['C', 'Ca', 'O', 'O', 'Ag'])
    AgCCaO2

    """
    counted = collections.Counter(elem)
    return "".join((el if cnt == 1 else (el + str(cnt)))
                   for el, cnt in sorted(counted.items()))
Example #27
Source File: data_process.py From nlp-tensorflow with MIT License

def build_vocab(lines, max_vocab=None):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for line in lines:
        tokens = tokenizer(line)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    vocab_idx = 2
    if max_vocab is None or max_vocab > len(word_counter):
        max_vocab = len(word_counter)
    for key, value in word_counter.most_common(max_vocab):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())
    return vocab, reverse_vocab, vocab_size
Example #28
Source File: data_process.py From nlp-tensorflow with MIT License

def build_vocab_pos(lines, max_vocab=None):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for line in lines:
        tokens = pos_extractor(line)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    vocab_idx = 2
    if max_vocab is None or max_vocab > len(word_counter):
        max_vocab = len(word_counter)
    for key, value in word_counter.most_common(max_vocab):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())
    return vocab, reverse_vocab, vocab_size
Example #29
Source File: data_process.py From nlp-tensorflow with MIT License

def build_vocab_morphs(lines, max_vocab=None):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for line in lines:
        tokens = morphs_extractor(line)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    vocab_idx = 2
    if max_vocab is None or max_vocab > len(word_counter):
        max_vocab = len(word_counter)
    for key, value in word_counter.most_common(max_vocab):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())
    return vocab, reverse_vocab, vocab_size
Example #30
Source File: malware.py From trees with Apache License 2.0

def feature_importance_test(self, data, D, L, N, name):
    max_ACC, TP, FP = [], [], []
    L = defaultdict(list)
    for _ in range(30):
        feature_labels, feature_set, training_data, test_data = selectsubsets(
            data,
            features=self.features,
            training=self.training,
            testing=self.testing,
            fraction_negative=self.bias)

        (training_set, training_records, training_labels) = training_data
        (test_set, test_records, test_labels) = test_data

        F = Forest(trees=self.trees, numfeatures=self.proposed_features)
        R = Record(training_labels, training_records)
        F.train(R)

        features = []
        for t in F.root:
            features += [X for X, _ in list(get_features(t))]

        c = Counter(features)
        items = sorted(c.items(), key=lambda x: x[1], reverse=True)
        for l, v in items:
            L[feature_labels[l]].append(v)

        # V = [v for _, v in items]
        # L = [l for l, _ in items]

    return L
    # return max_ACC, TP, FP