edu.stanford.nlp.stats.ClassicCounter Java Examples
The following examples show how to use
edu.stanford.nlp.stats.ClassicCounter.
Each example comes from an open-source project; the original source file and its license are noted above the code.
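Before the project examples, here is a minimal, self-contained sketch of the basic ClassicCounter API (the tokens and counts are invented purely for illustration). It uses only methods that also appear in the examples below: incrementCount, setCount, getCount, totalCount, keySet, and the Counters utility class.

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;

public class ClassicCounterSketch {
  public static void main(String[] args) {
    // A ClassicCounter maps keys to double counts and returns 0.0 for unseen keys.
    Counter<String> counts = new ClassicCounter<String>();
    for (String token : "the cat sat on the mat".split(" ")) {
      counts.incrementCount(token);            // add 1.0 to this token's count
    }
    counts.incrementCount("mat", 2.0);          // increment by an arbitrary amount
    counts.setCount("dog", 5.0);                // set a count directly

    System.out.println(counts.getCount("the"));     // 2.0
    System.out.println(counts.getCount("unseen"));  // 0.0 (default return value)
    System.out.println(counts.totalCount());        // sum of all counts

    // The Counters utility class is used throughout the examples below,
    // e.g. to keep only the k most frequent keys in place.
    Counters.retainTop(counts, 3);
    System.out.println(counts.keySet());
  }
}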
Example #1
Source File: DocumentFrequencyCounter.java From wiseowl with MIT License

/**
 * Get an IDF map for the given document string.
 *
 * @param document
 * @return
 */
private static Counter<String> getIDFMapForDocument(String document) {
  // Clean up -- remove some Gigaword patterns that slow things down
  // / don't help anything
  document = headingSeparator.matcher(document).replaceAll("");

  DocumentPreprocessor preprocessor = new DocumentPreprocessor(new StringReader(document));
  preprocessor.setTokenizerFactory(tokenizerFactory);

  Counter<String> idfMap = new ClassicCounter<String>();
  for (List<HasWord> sentence : preprocessor) {
    if (sentence.size() > MAX_SENTENCE_LENGTH)
      continue;

    List<TaggedWord> tagged = tagger.tagSentence(sentence);

    for (TaggedWord w : tagged) {
      if (w.tag().startsWith("n"))
        idfMap.incrementCount(w.word());
    }
  }

  return idfMap;
}
Example #2
Source File: ConvertWeights.java From phrasal with GNU General Public License v3.0

@SuppressWarnings("unchecked")
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s old_wts%n", ConvertWeights.class.getName());
    System.exit(-1);
  }
  String filename = args[0];
  Counter<String> oldWeights = IOTools.deserialize(filename, ClassicCounter.class,
      SerializationMode.DEFAULT);
  Path oldFilename = Paths.get(filename + ".old");
  try {
    Files.move(Paths.get(filename), oldFilename);
  } catch (IOException e) {
    e.printStackTrace();
    System.exit(-1);
  }
  IOTools.writeWeights(filename, oldWeights);
  System.out.printf("Converted %s to new format (old file moved to %s)%n",
      filename, oldFilename.toString());
}
Example #3
Source File: DependencyBnBPreorderer.java From phrasal with GNU General Public License v3.0

private static Set<String> getMostFrequentTokens(LineNumberReader reader, int k) throws IOException {
  Counter<String> tokenCounts = new ClassicCounter<String>();
  String line;
  while ((line = reader.readLine()) != null) {
    String[] tokens = line.split("\\s+");
    for (String t : tokens) {
      tokenCounts.incrementCount(t);
    }
  }

  Set<String> mostFrequentTokens = new HashSet<>(k);
  Counters.retainTop(tokenCounts, k);
  mostFrequentTokens.addAll(tokenCounts.keySet());
  tokenCounts = null;
  return mostFrequentTokens;
}
Example #4
Source File: MetricUtils.java From phrasal with GNU General Public License v3.0

/**
 * Calculates the "informativeness" of each ngram, which is used by the NIST
 * metric. In Matlab notation, the informativeness of the ngram w_1:n is
 * defined as -log2(count(w_1:n)/count(w_1:n-1)).
 *
 * @param ngramCounts
 *          ngram counts according to references
 * @param totWords
 *          total number of words, which is used to compute the
 *          informativeness of unigrams.
 */
static public <TK> Counter<Sequence<TK>> getNGramInfo(
    Counter<Sequence<TK>> ngramCounts, int totWords) {
  Counter<Sequence<TK>> ngramInfo = new ClassicCounter<Sequence<TK>>();

  for (Sequence<TK> ngram : ngramCounts.keySet()) {
    double num = ngramCounts.getCount(ngram);
    double denom = totWords;
    if (ngram.size() > 1) {
      Sequence<TK> ngramPrefix = ngram.subsequence(0, ngram.size() - 1);
      denom = ngramCounts.getCount(ngramPrefix);
    }
    double inf = -Math.log(num / denom) / LOG2;
    ngramInfo.setCount(ngram, inf);
    // System.err.printf("ngram info: %s %.3f\n", ngram.toString(), inf);
  }
  return ngramInfo;
}
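As a quick sanity check of the formula above, here is a minimal, self-contained sketch (plain Java, with toy counts chosen only for illustration) showing how the base-2 logarithm falls out of Math.log via the same change-of-base constant LOG2 used in the method:

public class NGramInfoSketch {
  private static final double LOG2 = Math.log(2.0); // change-of-base constant, as above

  public static void main(String[] args) {
    // Hypothetical reference counts: "the cat" occurs 4 times, its prefix "the" occurs 32 times.
    double bigramCount = 4.0;
    double unigramPrefixCount = 32.0;

    // info("the cat") = -log2(count("the cat") / count("the")) = -log2(4/32) = 3 bits
    double info = -Math.log(bigramCount / unigramPrefixCount) / LOG2;
    System.out.printf("informativeness = %.1f bits%n", info); // prints 3.0
  }
}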
Example #5
Source File: MetricUtils.java From phrasal with GNU General Public License v3.0

/**
 * Compute maximum n-gram counts from one or more sequences.
 *
 * @param sequences - The list of sequences.
 * @param seqWeights - Optional per-sequence weights (may be null).
 * @param maxOrder - The n-gram order.
 */
static public <TK> Counter<Sequence<TK>> getMaxNGramCounts(
    List<Sequence<TK>> sequences, double[] seqWeights, int maxOrder) {
  Counter<Sequence<TK>> maxCounts = new ClassicCounter<Sequence<TK>>();
  maxCounts.setDefaultReturnValue(0.0);
  if (seqWeights != null && seqWeights.length != sequences.size()) {
    throw new RuntimeException("Improper weight vector for sequences.");
  }

  int seqId = 0;
  for (Sequence<TK> sequence : sequences) {
    Counter<Sequence<TK>> counts = getNGramCounts(sequence, maxOrder);
    for (Sequence<TK> ngram : counts.keySet()) {
      double weight = seqWeights == null ? 1.0 : seqWeights[seqId];
      double countValue = weight * counts.getCount(ngram);
      double currentMax = maxCounts.getCount(ngram);
      maxCounts.setCount(ngram, Math.max(countValue, currentMax));
    }
    ++seqId;
  }
  return maxCounts;
}
Example #6
Source File: OptimizerUtils.java From phrasal with GNU General Public License v3.0

public static Set<String> featureWhiteList(FlatNBestList nbest, int minSegmentCount) {
  List<List<ScoredFeaturizedTranslation<IString, String>>> nbestlists = nbest.nbestLists();
  Counter<String> featureSegmentCounts = new ClassicCounter<String>();
  for (List<ScoredFeaturizedTranslation<IString, String>> nbestlist : nbestlists) {
    Set<String> segmentFeatureSet = new HashSet<String>();
    for (ScoredFeaturizedTranslation<IString, String> trans : nbestlist) {
      for (FeatureValue<String> feature : trans.features) {
        segmentFeatureSet.add(feature.name);
      }
    }
    for (String featureName : segmentFeatureSet) {
      featureSegmentCounts.incrementCount(featureName);
    }
  }
  return Counters.keysAbove(featureSegmentCounts, minSegmentCount - 1);
}
Example #7
Source File: OnlineTuner.java From phrasal with GNU General Public License v3.0

public ProcessorInput(List<Sequence<IString>> input,
    List<List<Sequence<IString>>> references,
    Counter<String> weights, int[] translationIds, int inputId,
    TranslationModel<IString, String> localTM,
    boolean createForcedAlignment, boolean additionalPrefixDecoding) {
  this.source = input;
  this.translationIds = translationIds;
  this.references = references;
  this.inputId = inputId;
  // Copy here for thread safety. DO NOT change this unless you know
  // what you're doing....
  this.weights = new ClassicCounter<String>(weights);
  this.localTM = localTM;
  this.createForcedAlignment = createForcedAlignment;
  this.additionalPrefixDecoding = additionalPrefixDecoding;
}
Example #8
Source File: KBPStatisticalExtractor.java From InformationExtraction with GNU General Public License v3.0

public static Counter<String> features(KBPInput input) {
  // Ensure RegexNER Tags!
  input.sentence.regexner(DefaultPaths.DEFAULT_KBP_REGEXNER_CASED, false);
  input.sentence.regexner(DefaultPaths.DEFAULT_KBP_REGEXNER_CASELESS, true);

  // Get useful variables
  ClassicCounter<String> feats = new ClassicCounter<>();
  if (Span.overlaps(input.subjectSpan, input.objectSpan)
      || input.subjectSpan.size() == 0 || input.objectSpan.size() == 0) {
    return new ClassicCounter<>();
  }

  // Actually featurize
  denseFeatures(input, input.sentence, feats);
  surfaceFeatures(input, input.sentence, feats);
  dependencyFeatures(input, input.sentence, feats);
  relationSpecificFeatures(input, input.sentence, feats);

  return feats;
}
Example #9
Source File: KBPStatisticalExtractor.java From InformationExtraction with GNU General Public License v3.0

public static Counter<String> features(KBPInput input) {
  // Ensure RegexNER Tags!
  input.sentence.regexner(IntelConfig.Regex_NER_caseless, false);
  input.sentence.regexner(IntelConfig.Regex_NER_cased, true);

  // Get useful variables
  ClassicCounter<String> feats = new ClassicCounter<>();
  if (Span.overlaps(input.subjectSpan, input.objectSpan)
      || input.subjectSpan.size() == 0 || input.objectSpan.size() == 0) {
    return new ClassicCounter<>();
  }

  // Actually featurize
  denseFeatures(input, input.sentence, feats);
  surfaceFeatures(input, input.sentence, feats);
  dependencyFeatures(input, input.sentence, feats);
  relationSpecificFeatures(input, input.sentence, feats);

  return feats;
}
Example #10
Source File: MERT.java From phrasal with GNU General Public License v3.0

static Counter<String> randomWts(Set<String> keySet) {
  Counter<String> randpt = new ClassicCounter<String>();
  for (String f : keySet) {
    randpt.setCount(f, globalRandom.nextDouble());
  }
  System.err.printf("random Wts: %s%n", randpt);
  return randpt;
}
Example #11
Source File: ComputeBitextIDF.java From phrasal with GNU General Public License v3.0

/**
 * @param args
 */
public static void main(String[] args) {
  if (args.length > 0) {
    System.err.printf("Usage: java %s < files > idf-file%n", ComputeBitextIDF.class.getName());
    System.exit(-1);
  }
  Counter<String> documentsPerTerm = new ClassicCounter<String>(1000000);
  LineNumberReader reader = new LineNumberReader(new InputStreamReader(System.in));
  double nDocuments = 0.0;
  try {
    for (String line; (line = reader.readLine()) != null;) {
      String[] tokens = line.trim().split("\\s+");
      Set<String> seen = new HashSet<String>(tokens.length);
      for (String token : tokens) {
        if ( ! seen.contains(token)) {
          seen.add(token);
          documentsPerTerm.incrementCount(token);
        }
      }
    }
    nDocuments = reader.getLineNumber();
    reader.close();
  } catch (IOException e) {
    e.printStackTrace();
  }

  // Output the idfs
  System.err.printf("Bitext contains %d sentences and %d word types%n",
      (int) nDocuments, documentsPerTerm.keySet().size());
  for (String wordType : documentsPerTerm.keySet()) {
    double count = documentsPerTerm.getCount(wordType);
    System.out.printf("%s\t%f%n", wordType, Math.log(nDocuments / count));
  }
  System.out.printf("%s\t%f%n", UNK_TOKEN, Math.log(nDocuments / 1.0));
}
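The IDF value written above is the natural-log ratio of the number of sentences to a term's document frequency. A minimal sketch with toy numbers (the counts are hypothetical, not taken from the example) illustrates the computation:

public class IdfSketch {
  public static void main(String[] args) {
    double nDocuments = 1000.0;  // hypothetical: 1000 bitext sentences
    double docFrequency = 50.0;  // hypothetical: the term occurs in 50 of them

    // Same formula as above: idf = ln(N / df)
    double idf = Math.log(nDocuments / docFrequency);  // ln(20) ~ 3.0
    System.out.printf("idf = %.4f%n", idf);

    // Unseen words fall back to df = 1, the most informative value emitted
    double unkIdf = Math.log(nDocuments / 1.0);         // ln(1000) ~ 6.9
    System.out.printf("unk idf = %.4f%n", unkIdf);
  }
}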
Example #12
Source File: TargetFunctionWordInsertion.java From phrasal with GNU General Public License v3.0

private Set<IString> loadCountsFile(String filename) {
  Counter<IString> counter = new ClassicCounter<IString>();
  LineNumberReader reader = IOTools.getReaderFromFile(filename);
  try {
    for (String line; (line = reader.readLine()) != null;) {
      String[] fields = line.trim().split("\\s+");
      if (fields.length == 2) {
        String wordType = fields[0];
        if ( ! (TokenUtils.isNumericOrPunctuationOrSymbols(wordType) ||
                wordType.equals(TokenUtils.START_TOKEN.toString()) ||
                wordType.equals(TokenUtils.END_TOKEN.toString()))) {
          counter.setCount(new IString(wordType), Double.valueOf(fields[1]));
        }
      } else {
        System.err.printf("%s: Discarding line %s%n", this.getClass().getName(), line);
      }
    }
    reader.close();
    Set<IString> set = new HashSet<>(Counters.topKeys(counter, rankCutoff));
    for (IString word : set) {
      System.err.printf(" %s%n", word);
    }
    return set;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Example #13
Source File: FeatureValues.java From phrasal with GNU General Public License v3.0

/**
 * Convert a collection of feature values to a counter.
 *
 * @param featureValues
 * @return
 */
public static <T> Counter<T> toCounter(Collection<FeatureValue<T>> featureValues) {
  Counter<T> counter = new ClassicCounter<T>();
  for (FeatureValue<T> fv : featureValues) {
    counter.incrementCount(fv.name, fv.value);
  }
  return counter;
}
Example #14
Source File: IOTools.java From phrasal with GNU General Public License v3.0

/**
 * Read weights from a plain text file.
 *
 * @param filename
 * @return
 * @throws IOException
 */
public static Counter<String> readWeightsPlain(String filename) throws IOException {
  LineNumberReader reader = new LineNumberReader(new FileReader(filename));
  Counter<String> wts = new ClassicCounter<String>();
  for (String line; (line = reader.readLine()) != null;) {
    String[] input = line.split(" ");
    if (input.length != 2) {
      reader.close();
      throw new IOException("Illegal input in weight file " + filename + ": " + line);
    }
    wts.setCount(input[0], Double.parseDouble(input[1]));
  }
  reader.close();
  return wts;
}
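The loop above implies a simple file format: one feature per line, with the feature name and its weight separated by a single space. Here is a minimal, self-contained sketch (the file name and feature names are hypothetical) that writes such a file and reads it back into a ClassicCounter, mirroring readWeightsPlain():

import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintWriter;

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class WeightsPlainSketch {
  public static void main(String[] args) throws IOException {
    // Write a tiny weight file in the "name<space>value" format the reader expects.
    try (PrintWriter pw = new PrintWriter("example.wts", "UTF-8")) {
      pw.println("LM 0.35");
      pw.println("WordPenalty -0.12");
    }

    // Read it back, following the same parsing logic as the method above.
    Counter<String> wts = new ClassicCounter<String>();
    try (LineNumberReader reader = new LineNumberReader(new FileReader("example.wts"))) {
      for (String line; (line = reader.readLine()) != null; ) {
        String[] fields = line.split(" ");
        if (fields.length != 2) {
          throw new IOException("Illegal input: " + line);
        }
        wts.setCount(fields[0], Double.parseDouble(fields[1]));
      }
    }
    System.out.println(wts.getCount("LM")); // 0.35
  }
}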
Example #15
Source File: IOTools.java From phrasal with GNU General Public License v3.0

/**
 * Read weights from a file. Supports both binary and text formats.
 *
 * TODO(spenceg) Replace ClassicCounter with our own SparseVector implementation.
 *
 * @param filename
 * @param featureIndex
 * @return a counter of weights
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public static Counter<String> readWeights(String filename, Index<String> featureIndex) {
  Counter<String> wts = (Counter<String>) deserialize(filename, ClassicCounter.class,
      SerializationMode.BIN_GZ);
  if (wts == null) wts = new ClassicCounter<>();
  if (featureIndex != null) {
    for (String key : wts.keySet()) {
      featureIndex.addToIndex(key);
    }
  }
  return wts;
}
Example #16
Source File: Summarizer.java From wiseowl with MIT License

private static Counter<String> getTermFrequencies(List<CoreMap> sentences) {
  Counter<String> ret = new ClassicCounter<String>();

  for (CoreMap sentence : sentences)
    for (CoreLabel cl : sentence.get(CoreAnnotations.TokensAnnotation.class))
      ret.incrementCount(cl.get(CoreAnnotations.TextAnnotation.class));

  return ret;
}
Example #17
Source File: DocumentFrequencyCounter.java From wiseowl with MIT License

public static void main(String[] args) throws InterruptedException, ExecutionException, IOException {
  ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
  List<Future<Counter<String>>> futures = new ArrayList<Future<Counter<String>>>();
  for (String filePath : args)
    futures.add(pool.submit(new FileIDFBuilder(new File(filePath))));

  int finished = 0;
  Counter<String> overall = new ClassicCounter<String>();

  for (Future<Counter<String>> future : futures) {
    System.err.printf("%s: Polling future #%d / %d%n", dateFormat.format(new Date()),
        finished + 1, args.length);
    Counter<String> result = future.get();
    finished++;
    System.err.printf("%s: Finished future #%d / %d%n", dateFormat.format(new Date()),
        finished, args.length);

    System.err.printf("\tMerging counter.. ");
    overall.addAll(result);
    System.err.printf("done.%n");
  }
  pool.shutdown();

  System.err.printf("\n%s: Saving to '%s'.. ", dateFormat.format(new Date()), OUT_FILE);
  ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(OUT_FILE));
  oos.writeObject(overall);
  System.err.printf("done.%n");
}
Example #18
Source File: MosesCompoundSplitter.java From phrasal with GNU General Public License v3.0

private void loadModel(String modelFileName) throws IOException {
  System.err.println("Loading MosesCompoundSplitter from " + modelFileName);
  LineNumberReader reader = new LineNumberReader(new FileReader(modelFileName));
  lcModel = new ClassicCounter<String>();
  trueCase = new HashMap<>();
  double totalCount = 0.0;
  if (useUnigramProbs) probs = new ClassicCounter<String>();

  int minCnt = Math.min(MAX_COUNT, MIN_COUNT);
  for (String line; (line = reader.readLine()) != null;) {
    String[] input = line.split("\t");
    if (input.length != 3) {
      reader.close();
      throw new IOException("Illegal input in model file, line "
          + reader.getLineNumber() + ": " + line);
    }
    long cnt = Long.parseLong(input[2]);
    totalCount += cnt;
    String tc = input[1];
    if (cnt < minCnt || tc.length() < MIN_SIZE + 1)
      continue; // these will never be used for splitting anyway
    String lc = tc.toLowerCase();
    // use the most frequent casing
    if (lcModel.getCount(lc) < cnt) {
      lcModel.setCount(lc, cnt);
      trueCase.put(lc, tc);
      //System.err.println("adding: " + input[1] + " ::: " + input[2]);
    }
  }
  totalCount = Math.log(totalCount);

  if (useUnigramProbs) {
    for (Entry<String, Double> e : lcModel.entrySet()) {
      probs.setCount(e.getKey(), Math.log(e.getValue()) - totalCount);
    }
  }
  reader.close();
}
Example #19
Source File: RepetitionRate.java From phrasal with GNU General Public License v3.0

RepetitionRateIncrementalMetric() {
  ngrams = new ClassicCounter<Sequence<TK>>();
  corpus = new ArrayList<Sequence<TK>>();

  for (int i = 0; i < maxNgramOrder; ++i) {
    totalNonSingletonNgrams.add(0);
    totalNgrams.add(0);
    windowNonSingletonNgrams.add(0);
    windowNgrams.add(0);
  }
}
Example #20
Source File: KBPStatisticalExtractor.java From InformationExtraction with GNU General Public License v3.0

@SuppressWarnings("UnusedParameters")
private static void denseFeatures(KBPInput input, Sentence sentence, ClassicCounter<String> feats) {
  boolean subjBeforeObj = input.subjectSpan.isBefore(input.objectSpan);

  // Type signature
  indicator(feats, "type_signature", input.subjectType + "," + input.objectType);

  // Relative position
  indicator(feats, "subj_before_obj", subjBeforeObj ? "y" : "n");
}
Example #21
Source File: MetricUtils.java From phrasal with GNU General Public License v3.0

/**
 * Count all n-grams of order 1..maxOrder in the given sequence.
 *
 * @param <TK>
 */
static public <TK> Counter<Sequence<TK>> getNGramCounts(Sequence<TK> sequence, int maxOrder) {
  Counter<Sequence<TK>> counts = new ClassicCounter<>();
  int sz = sequence.size();
  for (int i = 0; i < sz; i++) {
    int jMax = Math.min(sz, i + maxOrder);
    for (int j = i + 1; j <= jMax; j++) {
      Sequence<TK> ngram = sequence.subsequence(i, j);
      counts.incrementCount(ngram);
    }
  }
  return counts;
}
Example #22
Source File: NISTMetric.java From phrasal with GNU General Public License v3.0

@Override
public IncrementalEvaluationMetric<TK, FV> replace(int index,
    ScoredFeaturizedTranslation<TK, FV> trans) {
  if (index > sequences.size()) {
    throw new IndexOutOfBoundsException(String.format("Index: %d >= %d", index, sequences.size()));
  }
  Counter<Sequence<TK>> candidateCounts = (trans == null ? new ClassicCounter<Sequence<TK>>()
      : MetricUtils.getNGramCounts(trans.translation, order));
  MetricUtils.clipCounts(candidateCounts, maxReferenceCounts.get(index));
  if (sequences.get(index) != null) {
    Counter<Sequence<TK>> oldCandidateCounts = MetricUtils.getNGramCounts(
        sequences.get(index), order);
    MetricUtils.clipCounts(oldCandidateCounts, maxReferenceCounts.get(index));
    decCounts(oldCandidateCounts, sequences.get(index));
    c -= sequences.get(index).size();
    r -= averageReferenceLength(index);
  }
  sequences.set(index, (trans == null ? null : trans.translation));
  if (trans != null) {
    incCounts(candidateCounts, trans.translation);
    c += sequences.get(index).size();
    r += averageReferenceLength(index);
  }
  return this;
}
Example #23
Source File: NISTMetric.java From phrasal with GNU General Public License v3.0

private void initNgramWeights(List<List<Sequence<TK>>> referencesList) {
  int len = 0;
  Counter<Sequence<TK>> allNgrams = new ClassicCounter<Sequence<TK>>();
  for (List<Sequence<TK>> references : referencesList) {
    for (Sequence<TK> reference : references) {
      len += reference.size();
      Counter<Sequence<TK>> altCounts = MetricUtils.getNGramCounts(reference, order);
      addToCounts(allNgrams, altCounts);
    }
  }
  ngramInfo = MetricUtils.getNGramInfo(allNgrams, len);
}
Example #24
Source File: MERT.java From phrasal with GNU General Public License v3.0

public static Counter<String> summarizedAllFeaturesVector(
    List<ScoredFeaturizedTranslation<IString, String>> trans) {
  Counter<String> sumValues = new ClassicCounter<String>();

  for (ScoredFeaturizedTranslation<IString, String> tran : trans) {
    for (FeatureValue<String> fValue : tran.features) {
      sumValues.incrementCount(fValue.name, fValue.value);
    }
  }
  return sumValues;
}
Example #25
Source File: OneSidedObjectiveFunction.java From phrasal with GNU General Public License v3.0

/**
 * Constructor.
 *
 * @param input
 */
public OneSidedObjectiveFunction(ClustererState input) {
  // Setup delta data structures
  this.inputState = input;
  localWordToClass = new HashMap<>(input.vocabularySubset.size());
  deltaClassCount = new ClassicCounter<Integer>(input.numClasses);
  deltaClassHistoryCount = new TwoDimensionalCounter<Integer, NgramHistory>();
  for (IString word : input.vocabularySubset) {
    int classId = input.wordToClass.get(word);
    localWordToClass.put(word, classId);
  }
  this.objValue = input.currentObjectiveValue;
}
Example #26
Source File: OptimizerUtils.java From phrasal with GNU General Public License v3.0

public static <T> Counter<T> featureValueCollectionToCounter(Collection<FeatureValue<T>> c) {
  Counter<T> counter = new ClassicCounter<T>();

  for (FeatureValue<T> fv : c) {
    counter.incrementCount(fv.name, fv.value);
  }

  return counter;
}
Example #27
Source File: OptimizerUtils.java From phrasal with GNU General Public License v3.0

public static Counter<String> getWeightCounterFromArray(String[] weightNames, double[] wtsArr) {
  Counter<String> wts = new ClassicCounter<String>();
  for (int i = 0; i < weightNames.length; i++) {
    wts.setCount(weightNames[i], wtsArr[i]);
  }
  return wts;
}
Example #28
Source File: AdaGradFOBOSUpdater.java From phrasal with GNU General Public License v3.0

public AdaGradFOBOSUpdater(double initialRate, int expectedNumFeatures, double lambda,
    Norm norm, Counter<String> customL1, Set<String> fixedFeatures) {
  this.rate = initialRate;
  this.lambda = lambda;
  this.norm = norm;
  this.customL1 = customL1;
  this.fixedFeatures = fixedFeatures;

  sumGradSquare = new ClassicCounter<String>(expectedNumFeatures);
}
Example #29
Source File: AbstractOnlineOptimizer.java From phrasal with GNU General Public License v3.0

@Override
public Counter<String> getBatchGradient(Counter<String> weights,
    List<Sequence<IString>> sources, int[] sourceIds,
    List<List<RichTranslation<IString, String>>> translations,
    List<List<Sequence<IString>>> references,
    double[] referenceWeights,
    SentenceLevelMetric<IString, String> scoreMetric) {
  Counter<String> batchGradient = new ClassicCounter<String>();

  for (int i = 0; i < sourceIds.length; i++) {
    if (translations.get(i).size() > 0) {
      // Skip decoder failures.
      Counter<String> unregularizedGradient = getUnregularizedGradient(weights,
          sources.get(i), sourceIds[i], translations.get(i), references.get(i),
          referenceWeights, scoreMetric);
      batchGradient.addAll(unregularizedGradient);
    }
  }

  // Add L2 regularization directly into the derivative
  if (this.l2Regularization) {
    final Set<String> features = new HashSet<String>(weights.keySet());
    features.addAll(weights.keySet());
    final double dataFraction = sourceIds.length / (double) tuneSetSize;
    final double scaledInvSigmaSquared = dataFraction / (2 * sigmaSq);
    for (String key : features) {
      double x = weights.getCount(key);
      batchGradient.incrementCount(key, x * scaledInvSigmaSquared);
    }
  }

  return batchGradient;
}