edu.stanford.nlp.stats.ClassicCounter Java Examples
The following examples show how to use
edu.stanford.nlp.stats.ClassicCounter.
Each example comes from an open-source project; the original source file and its license are noted above the code.
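Before the project examples, here is a minimal, self-contained sketch of the basic ClassicCounter API (the tokens and counts are invented purely for illustration). It uses only methods that also appear in the examples below: incrementCount, setCount, getCount, totalCount, keySet, and the Counters utility class.

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;

public class ClassicCounterSketch {
  public static void main(String[] args) {
    // A ClassicCounter maps keys to double counts and returns 0.0 for unseen keys.
    Counter<String> counts = new ClassicCounter<String>();
    for (String token : "the cat sat on the mat".split(" ")) {
      counts.incrementCount(token);            // add 1.0 to this token's count
    }
    counts.incrementCount("mat", 2.0);          // increment by an arbitrary amount
    counts.setCount("dog", 5.0);                // set a count directly

    System.out.println(counts.getCount("the"));     // 2.0
    System.out.println(counts.getCount("unseen"));  // 0.0 (default return value)
    System.out.println(counts.totalCount());        // sum of all counts

    // The Counters utility class is used throughout the examples below,
    // e.g. to keep only the k most frequent keys in place.
    Counters.retainTop(counts, 3);
    System.out.println(counts.keySet());
  }
}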
Example #1
Source File: DocumentFrequencyCounter.java From wiseowl with MIT License

/**
 * Get an IDF map for the given document string.
 *
 * @param document
 * @return
 */
private static Counter<String> getIDFMapForDocument(String document) {
  // Clean up -- remove some Gigaword patterns that slow things down
  // / don't help anything
  document = headingSeparator.matcher(document).replaceAll("");

  DocumentPreprocessor preprocessor = new DocumentPreprocessor(new StringReader(document));
  preprocessor.setTokenizerFactory(tokenizerFactory);

  Counter<String> idfMap = new ClassicCounter<String>();
  for (List<HasWord> sentence : preprocessor) {
    if (sentence.size() > MAX_SENTENCE_LENGTH)
      continue;

    List<TaggedWord> tagged = tagger.tagSentence(sentence);

    for (TaggedWord w : tagged) {
      if (w.tag().startsWith("n"))
        idfMap.incrementCount(w.word());
    }
  }

  return idfMap;
}
Example #2
Source File: ConvertWeights.java From phrasal with GNU General Public License v3.0

@SuppressWarnings("unchecked")
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s old_wts%n", ConvertWeights.class.getName());
    System.exit(-1);
  }
  String filename = args[0];
  Counter<String> oldWeights = IOTools.deserialize(filename, ClassicCounter.class,
      SerializationMode.DEFAULT);
  Path oldFilename = Paths.get(filename + ".old");
  try {
    Files.move(Paths.get(filename), oldFilename);
  } catch (IOException e) {
    e.printStackTrace();
    System.exit(-1);
  }
  IOTools.writeWeights(filename, oldWeights);
  System.out.printf("Converted %s to new format (old file moved to %s)%n",
      filename, oldFilename.toString());
}
Example #3
Source File: DependencyBnBPreorderer.java From phrasal with GNU General Public License v3.0

private static Set<String> getMostFrequentTokens(LineNumberReader reader, int k) throws IOException {
  Counter<String> tokenCounts = new ClassicCounter<String>();
  String line;
  while ((line = reader.readLine()) != null) {
    String[] tokens = line.split("\\s+");
    for (String t : tokens) {
      tokenCounts.incrementCount(t);
    }
  }

  Set<String> mostFrequentTokens = new HashSet<>(k);
  Counters.retainTop(tokenCounts, k);
  mostFrequentTokens.addAll(tokenCounts.keySet());
  tokenCounts = null;
  return mostFrequentTokens;
}
Example #4
Source File: MetricUtils.java From phrasal with GNU General Public License v3.0

/**
 * Calculates the "informativeness" of each ngram, which is used by the NIST
 * metric. In Matlab notation, the informativeness of the ngram w_1:n is
 * defined as -log2(count(w_1:n)/count(w_1:n-1)).
 *
 * @param ngramCounts
 *          ngram counts according to references
 * @param totWords
 *          total number of words, which is used to compute the
 *          informativeness of unigrams.
 */
static public <TK> Counter<Sequence<TK>> getNGramInfo(
    Counter<Sequence<TK>> ngramCounts, int totWords) {
  Counter<Sequence<TK>> ngramInfo = new ClassicCounter<Sequence<TK>>();

  for (Sequence<TK> ngram : ngramCounts.keySet()) {
    double num = ngramCounts.getCount(ngram);
    double denom = totWords;
    if (ngram.size() > 1) {
      Sequence<TK> ngramPrefix = ngram.subsequence(0, ngram.size() - 1);
      denom = ngramCounts.getCount(ngramPrefix);
    }
    double inf = -Math.log(num / denom) / LOG2;
    ngramInfo.setCount(ngram, inf);
    // System.err.printf("ngram info: %s %.3f\n", ngram.toString(), inf);
  }
  return ngramInfo;
}
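As a quick sanity check of the formula above, here is a minimal, self-contained sketch (plain Java, with toy counts chosen only for illustration) showing how the base-2 logarithm falls out of Math.log via the same change-of-base constant LOG2 used in the method:

public class NGramInfoSketch {
  private static final double LOG2 = Math.log(2.0); // change-of-base constant, as above

  public static void main(String[] args) {
    // Hypothetical reference counts: "the cat" occurs 4 times, its prefix "the" occurs 32 times.
    double bigramCount = 4.0;
    double unigramPrefixCount = 32.0;

    // info("the cat") = -log2(count("the cat") / count("the")) = -log2(4/32) = 3 bits
    double info = -Math.log(bigramCount / unigramPrefixCount) / LOG2;
    System.out.printf("informativeness = %.1f bits%n", info); // prints 3.0
  }
}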
Example #5
Source File: MetricUtils.java From phrasal with GNU General Public License v3.0

/**
 * Compute maximum n-gram counts from one or more sequences.
 *
 * @param sequences - The list of sequences.
 * @param seqWeights - Optional per-sequence weights (may be null).
 * @param maxOrder - The n-gram order.
 */
static public <TK> Counter<Sequence<TK>> getMaxNGramCounts(
    List<Sequence<TK>> sequences, double[] seqWeights, int maxOrder) {
  Counter<Sequence<TK>> maxCounts = new ClassicCounter<Sequence<TK>>();
  maxCounts.setDefaultReturnValue(0.0);
  if (seqWeights != null && seqWeights.length != sequences.size()) {
    throw new RuntimeException("Improper weight vector for sequences.");
  }

  int seqId = 0;
  for (Sequence<TK> sequence : sequences) {
    Counter<Sequence<TK>> counts = getNGramCounts(sequence, maxOrder);
    for (Sequence<TK> ngram : counts.keySet()) {
      double weight = seqWeights == null ? 1.0 : seqWeights[seqId];
      double countValue = weight * counts.getCount(ngram);
      double currentMax = maxCounts.getCount(ngram);
      maxCounts.setCount(ngram, Math.max(countValue, currentMax));
    }
    ++seqId;
  }
  return maxCounts;
}
Example #6
Source File: OptimizerUtils.java From phrasal with GNU General Public License v3.0

public static Set<String> featureWhiteList(FlatNBestList nbest, int minSegmentCount) {
  List<List<ScoredFeaturizedTranslation<IString, String>>> nbestlists = nbest.nbestLists();
  Counter<String> featureSegmentCounts = new ClassicCounter<String>();
  for (List<ScoredFeaturizedTranslation<IString, String>> nbestlist : nbestlists) {
    Set<String> segmentFeatureSet = new HashSet<String>();
    for (ScoredFeaturizedTranslation<IString, String> trans : nbestlist) {
      for (FeatureValue<String> feature : trans.features) {
        segmentFeatureSet.add(feature.name);
      }
    }
    for (String featureName : segmentFeatureSet) {
      featureSegmentCounts.incrementCount(featureName);
    }
  }
  return Counters.keysAbove(featureSegmentCounts, minSegmentCount - 1);
}
Example #7
Source File: OnlineTuner.java From phrasal with GNU General Public License v3.0

public ProcessorInput(List<Sequence<IString>> input,
    List<List<Sequence<IString>>> references,
    Counter<String> weights, int[] translationIds, int inputId,
    TranslationModel<IString, String> localTM,
    boolean createForcedAlignment, boolean additionalPrefixDecoding) {
  this.source = input;
  this.translationIds = translationIds;
  this.references = references;
  this.inputId = inputId;
  // Copy here for thread safety. DO NOT change this unless you know
  // what you're doing....
  this.weights = new ClassicCounter<String>(weights);
  this.localTM = localTM;
  this.createForcedAlignment = createForcedAlignment;
  this.additionalPrefixDecoding = additionalPrefixDecoding;
}
Example #8
Source File: KBPStatisticalExtractor.java From InformationExtraction with GNU General Public License v3.0

public static Counter<String> features(KBPInput input) {
  // Ensure RegexNER Tags!
  input.sentence.regexner(DefaultPaths.DEFAULT_KBP_REGEXNER_CASED, false);
  input.sentence.regexner(DefaultPaths.DEFAULT_KBP_REGEXNER_CASELESS, true);

  // Get useful variables
  ClassicCounter<String> feats = new ClassicCounter<>();
  if (Span.overlaps(input.subjectSpan, input.objectSpan)
      || input.subjectSpan.size() == 0 || input.objectSpan.size() == 0) {
    return new ClassicCounter<>();
  }

  // Actually featurize
  denseFeatures(input, input.sentence, feats);
  surfaceFeatures(input, input.sentence, feats);
  dependencyFeatures(input, input.sentence, feats);
  relationSpecificFeatures(input, input.sentence, feats);

  return feats;
}
Example #9
Source File: KBPStatisticalExtractor.java From InformationExtraction with GNU General Public License v3.0

public static Counter<String> features(KBPInput input) {
  // Ensure RegexNER Tags!
  input.sentence.regexner(IntelConfig.Regex_NER_caseless, false);
  input.sentence.regexner(IntelConfig.Regex_NER_cased, true);

  // Get useful variables
  ClassicCounter<String> feats = new ClassicCounter<>();
  if (Span.overlaps(input.subjectSpan, input.objectSpan)
      || input.subjectSpan.size() == 0 || input.objectSpan.size() == 0) {
    return new ClassicCounter<>();
  }

  // Actually featurize
  denseFeatures(input, input.sentence, feats);
  surfaceFeatures(input, input.sentence, feats);
  dependencyFeatures(input, input.sentence, feats);
  relationSpecificFeatures(input, input.sentence, feats);

  return feats;
}
Example #10
Source File: MERT.java From phrasal with GNU General Public License v3.0

static Counter<String> randomWts(Set<String> keySet) {
  Counter<String> randpt = new ClassicCounter<String>();
  for (String f : keySet) {
    randpt.setCount(f, globalRandom.nextDouble());
  }
  System.err.printf("random Wts: %s%n", randpt);
  return randpt;
}
Example #11
Source File: ComputeBitextIDF.java From phrasal with GNU General Public License v3.0

/**
 * @param args
 */
public static void main(String[] args) {
  if (args.length > 0) {
    System.err.printf("Usage: java %s < files > idf-file%n", ComputeBitextIDF.class.getName());
    System.exit(-1);
  }
  Counter<String> documentsPerTerm = new ClassicCounter<String>(1000000);
  LineNumberReader reader = new LineNumberReader(new InputStreamReader(System.in));
  double nDocuments = 0.0;
  try {
    for (String line; (line = reader.readLine()) != null;) {
      String[] tokens = line.trim().split("\\s+");
      Set<String> seen = new HashSet<String>(tokens.length);
      for (String token : tokens) {
        if ( ! seen.contains(token)) {
          seen.add(token);
          documentsPerTerm.incrementCount(token);
        }
      }
    }
    nDocuments = reader.getLineNumber();
    reader.close();
  } catch (IOException e) {
    e.printStackTrace();
  }

  // Output the idfs
  System.err.printf("Bitext contains %d sentences and %d word types%n",
      (int) nDocuments, documentsPerTerm.keySet().size());
  for (String wordType : documentsPerTerm.keySet()) {
    double count = documentsPerTerm.getCount(wordType);
    System.out.printf("%s\t%f%n", wordType, Math.log(nDocuments / count));
  }
  System.out.printf("%s\t%f%n", UNK_TOKEN, Math.log(nDocuments / 1.0));
}
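The IDF value written above is the natural-log ratio of the number of sentences to a term's document frequency. A minimal sketch with toy numbers (the counts are hypothetical, not taken from the example) illustrates the computation:

public class IdfSketch {
  public static void main(String[] args) {
    double nDocuments = 1000.0;  // hypothetical: 1000 bitext sentences
    double docFrequency = 50.0;  // hypothetical: the term occurs in 50 of them

    // Same formula as above: idf = ln(N / df)
    double idf = Math.log(nDocuments / docFrequency);  // ln(20) ~ 3.0
    System.out.printf("idf = %.4f%n", idf);

    // Unseen words fall back to df = 1, the most informative value emitted
    double unkIdf = Math.log(nDocuments / 1.0);         // ln(1000) ~ 6.9
    System.out.printf("unk idf = %.4f%n", unkIdf);
  }
}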
Example #12
Source File: TargetFunctionWordInsertion.java From phrasal with GNU General Public License v3.0

private Set<IString> loadCountsFile(String filename) {
  Counter<IString> counter = new ClassicCounter<IString>();
  LineNumberReader reader = IOTools.getReaderFromFile(filename);
  try {
    for (String line; (line = reader.readLine()) != null;) {
      String[] fields = line.trim().split("\\s+");
      if (fields.length == 2) {
        String wordType = fields[0];
        if ( ! (TokenUtils.isNumericOrPunctuationOrSymbols(wordType) ||
                wordType.equals(TokenUtils.START_TOKEN.toString()) ||
                wordType.equals(TokenUtils.END_TOKEN.toString()))) {
          counter.setCount(new IString(wordType), Double.valueOf(fields[1]));
        }
      } else {
        System.err.printf("%s: Discarding line %s%n", this.getClass().getName(), line);
      }
    }
    reader.close();
    Set<IString> set = new HashSet<>(Counters.topKeys(counter, rankCutoff));
    for (IString word : set) {
      System.err.printf(" %s%n", word);
    }
    return set;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Example #13
Source File: FeatureValues.java From phrasal with GNU General Public License v3.0

/**
 * Convert a collection of feature values to a counter.
 *
 * @param featureValues
 * @return
 */
public static <T> Counter<T> toCounter(Collection<FeatureValue<T>> featureValues) {
  Counter<T> counter = new ClassicCounter<T>();
  for (FeatureValue<T> fv : featureValues) {
    counter.incrementCount(fv.name, fv.value);
  }
  return counter;
}
Example #14
Source File: IOTools.java From phrasal with GNU General Public License v3.0

/**
 * Read weights from a plain text file.
 *
 * @param filename
 * @return
 * @throws IOException
 */
public static Counter<String> readWeightsPlain(String filename) throws IOException {
  LineNumberReader reader = new LineNumberReader(new FileReader(filename));
  Counter<String> wts = new ClassicCounter<String>();
  for (String line; (line = reader.readLine()) != null;) {
    String[] input = line.split(" ");
    if (input.length != 2) {
      reader.close();
      throw new IOException("Illegal input in weight file " + filename + ": " + line);
    }
    wts.setCount(input[0], Double.parseDouble(input[1]));
  }
  reader.close();
  return wts;
}
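The loop above implies a simple file format: one feature per line, with the feature name and its weight separated by a single space. Here is a minimal, self-contained sketch (the file name and feature names are hypothetical) that writes such a file and reads it back into a ClassicCounter, mirroring readWeightsPlain():

import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintWriter;

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class WeightsPlainSketch {
  public static void main(String[] args) throws IOException {
    // Write a tiny weight file in the "name<space>value" format the reader expects.
    try (PrintWriter pw = new PrintWriter("example.wts", "UTF-8")) {
      pw.println("LM 0.35");
      pw.println("WordPenalty -0.12");
    }

    // Read it back, following the same parsing logic as the method above.
    Counter<String> wts = new ClassicCounter<String>();
    try (LineNumberReader reader = new LineNumberReader(new FileReader("example.wts"))) {
      for (String line; (line = reader.readLine()) != null; ) {
        String[] fields = line.split(" ");
        if (fields.length != 2) {
          throw new IOException("Illegal input: " + line);
        }
        wts.setCount(fields[0], Double.parseDouble(fields[1]));
      }
    }
    System.out.println(wts.getCount("LM")); // 0.35
  }
}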
Example #15
Source File: IOTools.java From phrasal with GNU General Public License v3.0

/**
 * Read weights from a file. Supports both binary and text formats.
 *
 * TODO(spenceg) Replace ClassicCounter with our own SparseVector implementation.
 *
 * @param filename
 * @param featureIndex
 * @return a counter of weights
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public static Counter<String> readWeights(String filename, Index<String> featureIndex) {
  Counter<String> wts = (Counter<String>) deserialize(filename, ClassicCounter.class,
      SerializationMode.BIN_GZ);
  if (wts == null) wts = new ClassicCounter<>();
  if (featureIndex != null) {
    for (String key : wts.keySet()) {
      featureIndex.addToIndex(key);
    }
  }
  return wts;
}
Example #16
Source File: Summarizer.java From wiseowl with MIT License

private static Counter<String> getTermFrequencies(List<CoreMap> sentences) {
  Counter<String> ret = new ClassicCounter<String>();

  for (CoreMap sentence : sentences)
    for (CoreLabel cl : sentence.get(CoreAnnotations.TokensAnnotation.class))
      ret.incrementCount(cl.get(CoreAnnotations.TextAnnotation.class));

  return ret;
}
Example #17
Source File: DocumentFrequencyCounter.java From wiseowl with MIT License

public static void main(String[] args) throws InterruptedException, ExecutionException, IOException {
  ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
  List<Future<Counter<String>>> futures = new ArrayList<Future<Counter<String>>>();
  for (String filePath : args)
    futures.add(pool.submit(new FileIDFBuilder(new File(filePath))));

  int finished = 0;
  Counter<String> overall = new ClassicCounter<String>();

  for (Future<Counter<String>> future : futures) {
    System.err.printf("%s: Polling future #%d / %d%n", dateFormat.format(new Date()),
        finished + 1, args.length);
    Counter<String> result = future.get();
    finished++;
    System.err.printf("%s: Finished future #%d / %d%n", dateFormat.format(new Date()),
        finished, args.length);

    System.err.printf("\tMerging counter.. ");
    overall.addAll(result);
    System.err.printf("done.%n");
  }
  pool.shutdown();

  System.err.printf("\n%s: Saving to '%s'.. ", dateFormat.format(new Date()), OUT_FILE);
  ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(OUT_FILE));
  oos.writeObject(overall);
  System.err.printf("done.%n");
}
Example #18
Source File: MosesCompoundSplitter.java From phrasal with GNU General Public License v3.0

private void loadModel(String modelFileName) throws IOException {
  System.err.println("Loading MosesCompoundSplitter from " + modelFileName);
  LineNumberReader reader = new LineNumberReader(new FileReader(modelFileName));
  lcModel = new ClassicCounter<String>();
  trueCase = new HashMap<>();
  double totalCount = 0.0;
  if (useUnigramProbs) probs = new ClassicCounter<String>();

  int minCnt = Math.min(MAX_COUNT, MIN_COUNT);
  for (String line; (line = reader.readLine()) != null;) {
    String[] input = line.split("\t");
    if (input.length != 3) {
      reader.close();
      throw new IOException("Illegal input in model file, line "
          + reader.getLineNumber() + ": " + line);
    }
    long cnt = Long.parseLong(input[2]);
    totalCount += cnt;
    String tc = input[1];
    if (cnt < minCnt || tc.length() < MIN_SIZE + 1)
      continue; // these will never be used for splitting anyway
    String lc = tc.toLowerCase();
    // use the most frequent casing
    if (lcModel.getCount(lc) < cnt) {
      lcModel.setCount(lc, cnt);
      trueCase.put(lc, tc);
      //System.err.println("adding: " + input[1] + " ::: " + input[2]);
    }
  }
  totalCount = Math.log(totalCount);

  if (useUnigramProbs) {
    for (Entry<String, Double> e : lcModel.entrySet()) {
      probs.setCount(e.getKey(), Math.log(e.getValue()) - totalCount);
    }
  }
  reader.close();
}
Example #19
Source File: RepetitionRate.java From phrasal with GNU General Public License v3.0

RepetitionRateIncrementalMetric() {
  ngrams = new ClassicCounter<Sequence<TK>>();
  corpus = new ArrayList<Sequence<TK>>();

  for (int i = 0; i < maxNgramOrder; ++i) {
    totalNonSingletonNgrams.add(0);
    totalNgrams.add(0);
    windowNonSingletonNgrams.add(0);
    windowNgrams.add(0);
  }
}
Example #20
Source File: KBPStatisticalExtractor.java From InformationExtraction with GNU General Public License v3.0

@SuppressWarnings("UnusedParameters")
private static void denseFeatures(KBPInput input, Sentence sentence, ClassicCounter<String> feats) {
  boolean subjBeforeObj = input.subjectSpan.isBefore(input.objectSpan);

  // Type signature
  indicator(feats, "type_signature", input.subjectType + "," + input.objectType);

  // Relative position
  indicator(feats, "subj_before_obj", subjBeforeObj ? "y" : "n");
}
Example #21
Source File: MetricUtils.java From phrasal with GNU General Public License v3.0

/**
 * Count all n-grams of order 1..maxOrder in the given sequence.
 *
 * @param <TK>
 */
static public <TK> Counter<Sequence<TK>> getNGramCounts(Sequence<TK> sequence, int maxOrder) {
  Counter<Sequence<TK>> counts = new ClassicCounter<>();
  int sz = sequence.size();
  for (int i = 0; i < sz; i++) {
    int jMax = Math.min(sz, i + maxOrder);
    for (int j = i + 1; j <= jMax; j++) {
      Sequence<TK> ngram = sequence.subsequence(i, j);
      counts.incrementCount(ngram);
    }
  }
  return counts;
}
Example #22
Source File: NISTMetric.java From phrasal with GNU General Public License v3.0

@Override
public IncrementalEvaluationMetric<TK, FV> replace(int index,
    ScoredFeaturizedTranslation<TK, FV> trans) {
  if (index > sequences.size()) {
    throw new IndexOutOfBoundsException(String.format("Index: %d >= %d", index, sequences.size()));
  }
  Counter<Sequence<TK>> candidateCounts = (trans == null ? new ClassicCounter<Sequence<TK>>()
      : MetricUtils.getNGramCounts(trans.translation, order));
  MetricUtils.clipCounts(candidateCounts, maxReferenceCounts.get(index));
  if (sequences.get(index) != null) {
    Counter<Sequence<TK>> oldCandidateCounts = MetricUtils.getNGramCounts(
        sequences.get(index), order);
    MetricUtils.clipCounts(oldCandidateCounts, maxReferenceCounts.get(index));
    decCounts(oldCandidateCounts, sequences.get(index));
    c -= sequences.get(index).size();
    r -= averageReferenceLength(index);
  }
  sequences.set(index, (trans == null ? null : trans.translation));
  if (trans != null) {
    incCounts(candidateCounts, trans.translation);
    c += sequences.get(index).size();
    r += averageReferenceLength(index);
  }
  return this;
}
Example #23
Source File: NISTMetric.java From phrasal with GNU General Public License v3.0

private void initNgramWeights(List<List<Sequence<TK>>> referencesList) {
  int len = 0;
  Counter<Sequence<TK>> allNgrams = new ClassicCounter<Sequence<TK>>();
  for (List<Sequence<TK>> references : referencesList) {
    for (Sequence<TK> reference : references) {
      len += reference.size();
      Counter<Sequence<TK>> altCounts = MetricUtils.getNGramCounts(reference, order);
      addToCounts(allNgrams, altCounts);
    }
  }
  ngramInfo = MetricUtils.getNGramInfo(allNgrams, len);
}
Example #24
Source File: MERT.java From phrasal with GNU General Public License v3.0

public static Counter<String> summarizedAllFeaturesVector(
    List<ScoredFeaturizedTranslation<IString, String>> trans) {
  Counter<String> sumValues = new ClassicCounter<String>();

  for (ScoredFeaturizedTranslation<IString, String> tran : trans) {
    for (FeatureValue<String> fValue : tran.features) {
      sumValues.incrementCount(fValue.name, fValue.value);
    }
  }
  return sumValues;
}
Example #25
Source File: OneSidedObjectiveFunction.java From phrasal with GNU General Public License v3.0

/**
 * Constructor.
 *
 * @param input
 */
public OneSidedObjectiveFunction(ClustererState input) {
  // Setup delta data structures
  this.inputState = input;
  localWordToClass = new HashMap<>(input.vocabularySubset.size());
  deltaClassCount = new ClassicCounter<Integer>(input.numClasses);
  deltaClassHistoryCount = new TwoDimensionalCounter<Integer, NgramHistory>();
  for (IString word : input.vocabularySubset) {
    int classId = input.wordToClass.get(word);
    localWordToClass.put(word, classId);
  }
  this.objValue = input.currentObjectiveValue;
}
Example #26
Source File: OptimizerUtils.java From phrasal with GNU General Public License v3.0

public static <T> Counter<T> featureValueCollectionToCounter(Collection<FeatureValue<T>> c) {
  Counter<T> counter = new ClassicCounter<T>();

  for (FeatureValue<T> fv : c) {
    counter.incrementCount(fv.name, fv.value);
  }

  return counter;
}
Example #27
Source File: OptimizerUtils.java From phrasal with GNU General Public License v3.0

public static Counter<String> getWeightCounterFromArray(String[] weightNames, double[] wtsArr) {
  Counter<String> wts = new ClassicCounter<String>();
  for (int i = 0; i < weightNames.length; i++) {
    wts.setCount(weightNames[i], wtsArr[i]);
  }
  return wts;
}
Example #28
Source File: AdaGradFOBOSUpdater.java From phrasal with GNU General Public License v3.0

public AdaGradFOBOSUpdater(double initialRate, int expectedNumFeatures, double lambda,
    Norm norm, Counter<String> customL1, Set<String> fixedFeatures) {
  this.rate = initialRate;
  this.lambda = lambda;
  this.norm = norm;
  this.customL1 = customL1;
  this.fixedFeatures = fixedFeatures;

  sumGradSquare = new ClassicCounter<String>(expectedNumFeatures);
}
Example #29
Source File: AbstractOnlineOptimizer.java From phrasal with GNU General Public License v3.0

@Override
public Counter<String> getBatchGradient(Counter<String> weights,
    List<Sequence<IString>> sources, int[] sourceIds,
    List<List<RichTranslation<IString, String>>> translations,
    List<List<Sequence<IString>>> references,
    double[] referenceWeights,
    SentenceLevelMetric<IString, String> scoreMetric) {
  Counter<String> batchGradient = new ClassicCounter<String>();

  for (int i = 0; i < sourceIds.length; i++) {
    if (translations.get(i).size() > 0) {
      // Skip decoder failures.
      Counter<String> unregularizedGradient = getUnregularizedGradient(weights,
          sources.get(i), sourceIds[i], translations.get(i), references.get(i),
          referenceWeights, scoreMetric);
      batchGradient.addAll(unregularizedGradient);
    }
  }

  // Add L2 regularization directly into the derivative
  if (this.l2Regularization) {
    final Set<String> features = new HashSet<String>(weights.keySet());
    features.addAll(weights.keySet());
    final double dataFraction = sourceIds.length / (double) tuneSetSize;
    final double scaledInvSigmaSquared = dataFraction / (2 * sigmaSq);
    for (String key : features) {
      double x = weights.getCount(key);
      batchGradient.incrementCount(key, x * scaledInvSigmaSquared);
    }
  }

  return batchGradient;
}