edu.stanford.nlp.stats.Counter Java Examples

Source File: ConvertWeights.java From phrasal with GNU General Public License v3.0

6 votes

/**
 * Command-line entry point. Deserializes the weights file named by the single
 * argument, moves that file aside to {@code <name>.old}, and then writes the
 * weights back under the original name with {@code IOTools.writeWeights}.
 * Exits with status -1 on a usage error or when the move fails.
 *
 * @param args exactly one element: the path of the weights file to convert
 */
@SuppressWarnings("unchecked")
public static void main(String[] args) {
  if (args.length != 1) {
    System.err.printf("Usage: java %s old_wts%n", ConvertWeights.class.getName());
    System.exit(-1);
  }

  final String weightsFile = args[0];

  // Load first, so the weights are in memory before the file is moved.
  final Counter<String> loaded =
      IOTools.deserialize(weightsFile, ClassicCounter.class, SerializationMode.DEFAULT);

  final Path backup = Paths.get(weightsFile + ".old");
  try {
    Files.move(Paths.get(weightsFile), backup);
  } catch (IOException moveFailure) {
    moveFailure.printStackTrace();
    System.exit(-1);
  }

  IOTools.writeWeights(weightsFile, loaded);
  System.out.printf("Converted %s to new format (old file moved to %s)%n", weightsFile, backup);
}

Source File: MakeWordClasses.java From phrasal with GNU General Public License v3.0

6 votes

/**
 * Merges a partial update into the current state: adds the class-count and
 * class-history-count deltas in place, then applies the word-to-class
 * assignments carried by the update.
 *
 * @param result the partial update to fold in
 * @return the number of words whose class assignment actually changed
 */
private int updateCountsWith(PartialStateUpdate result) {
  // Fold the count deltas into the running totals.
  Counters.addInPlace(classCount, result.deltaClassCount);
  for (Integer classId : result.deltaClassHistoryCount.firstKeySet()) {
    Counters.addInPlace(this.classHistoryCount.getCounter(classId),
        result.deltaClassHistoryCount.getCounter(classId));
  }

  // Apply the new assignments, counting only the ones that differ.
  int changed = 0;
  for (Map.Entry<IString, Integer> entry : result.wordToClass.entrySet()) {
    final IString word = entry.getKey();
    final int previous = wordToClass.get(word);
    final int proposed = entry.getValue();
    if (previous == proposed) {
      continue;
    }
    wordToClass.put(word, entry.getValue());
    ++changed;
  }
  return changed;
}

Source File: OnlineTuner.java From phrasal with GNU General Public License v3.0

6 votes

/**
 * Reads extra feature weights from a plain-text file and copies into the
 * weight accumulator every feature that is not already present there.
 * Features already in the accumulator are left untouched. Each decision is
 * reported on stderr. An {@link IOException} is logged, not rethrown.
 *
 * @param additionalFeatureWeightsFile path of the plain-text weights file
 */
private void addAdditionalFeatureWeights(String additionalFeatureWeightsFile) {
  try {
    Counter<String> extraWeights = IOTools.readWeightsPlain(additionalFeatureWeightsFile);
    System.err.println("read weights: ");
    for (Entry<String,Double> feature : extraWeights.entrySet()) {
      final String name = feature.getKey();
      if (wtsAccumulator.containsKey(name)) {
        System.err.println("skipping feature: " + name);
        continue;
      }
      wtsAccumulator.setCount(name, feature.getValue());
      System.err.println("setting feature: " + name + " = " + feature.getValue());
    }
  } catch (IOException e) {
    e.printStackTrace();
    logger.fatal("Could not load additional weights from : {}", additionalFeatureWeightsFile);
  }
}

Source File: KBPStatisticalExtractor.java From InformationExtraction with GNU General Public License v3.0

6 votes

/**
 * Classifies the given input and reports the chosen relation together with
 * its probability. Classifier scores are exponentiated and normalized; any
 * top-scoring relation that does not type check against the input's subject
 * and object types is discarded (and the remaining scores renormalized) until
 * a type-checking relation or {@code NO_RELATION} comes out on top.
 *
 * @param input The input to classify.
 * @return A pair with the relation we classified into, along with its confidence.
 */
public Pair<String,Double> classify(KBPInput input) {
  RVFDatum<String, String> datum = new RVFDatum<>(features(input));
  Counter<String> scores = classifier.scoresOf(datum);
  Counters.expInPlace(scores);
  Counters.normalize(scores);

  String best = Counters.argmax(scores);
  // Walk down the ranking until the candidate type checks.
  // NO_RELATION is always an option somewhere in there, so the loop is safe.
  while (!NO_RELATION.equals(best)) {
    boolean objectTypeOk = edu.stanford.nlp.ie.KBPRelationExtractor.RelationType.fromString(best).get()
        .validNamedEntityLabels.contains(input.objectType);
    if (objectTypeOk && RelationType.fromString(best).get().entityType == input.subjectType) {
      break;
    }
    scores.remove(best);
    Counters.normalize(scores);
    best = Counters.argmax(scores);
  }
  return Pair.makePair(best, scores.getCount(best));
}

Source File: KBPStatisticalExtractor.java From InformationExtraction with GNU General Public License v3.0

6 votes

/**
 * Produces the feature counter for one input. RegexNER annotation (cased,
 * then caseless) is always run on the sentence first. If the subject and
 * object spans overlap, or either span is empty, an empty counter is
 * returned; otherwise the dense, surface, dependency and relation-specific
 * featurizers are applied in that order.
 *
 * @param input the input to featurize
 * @return the features of the input; empty for overlapping or empty spans
 */
public static Counter<String> features(KBPInput input) {
  // Ensure RegexNER Tags!
  input.sentence.regexner(DefaultPaths.DEFAULT_KBP_REGEXNER_CASED, false);
  input.sentence.regexner(DefaultPaths.DEFAULT_KBP_REGEXNER_CASELESS, true);

  // Nothing to featurize when the mention spans overlap or are empty.
  boolean degenerate = Span.overlaps(input.subjectSpan, input.objectSpan)
      || input.subjectSpan.size() == 0
      || input.objectSpan.size() == 0;
  if (degenerate) {
    return new ClassicCounter<>();
  }

  // Actually featurize
  ClassicCounter<String> collected = new ClassicCounter<>();
  denseFeatures(input, input.sentence, collected);
  surfaceFeatures(input, input.sentence, collected);
  dependencyFeatures(input, input.sentence, collected);
  relationSpecificFeatures(input, input.sentence, collected);
  return collected;
}

Source File: OptimizerUtils.java From phrasal with GNU General Public License v3.0

6 votes

/**
 * Collects the names of features that occur in enough n-best lists. A
 * feature is counted once per n-best list (segment), no matter how many
 * translations in that list carry it. The result is
 * {@code Counters.keysAbove} applied with threshold
 * {@code minSegmentCount - 1}.
 *
 * @param nbest the n-best lists to scan
 * @param minSegmentCount segment-count threshold (passed on minus one)
 * @return the feature names selected by {@code Counters.keysAbove}
 */
public static Set<String> featureWhiteList(FlatNBestList nbest, int minSegmentCount) {
  Counter<String> segmentsPerFeature = new ClassicCounter<String>();
  for (List<ScoredFeaturizedTranslation<IString, String>> segment : nbest.nbestLists()) {
    // De-duplicate within the segment so each feature counts at most once here.
    Set<String> seenInSegment = new HashSet<String>();
    for (ScoredFeaturizedTranslation<IString, String> translation : segment) {
      for (FeatureValue<String> fv : translation.features) {
        seenInSegment.add(fv.name);
      }
    }
    for (String name : seenInSegment) {
      segmentsPerFeature.incrementCount(name);
    }
  }
  return Counters.keysAbove(segmentsPerFeature, minSegmentCount - 1);
}

Source File: DependencyBnBPreorderer.java From phrasal with GNU General Public License v3.0

6 votes

/**
 * Reads whitespace-separated tokens from {@code reader} until end of input
 * and returns the tokens kept by {@code Counters.retainTop(counts, k)}.
 *
 * <p>Empty strings are not counted. {@code String.split("\\s+")} yields an
 * empty string for a blank line and in front of leading whitespace; counting
 * it would let a non-token take one of the {@code k} slots.
 *
 * @param reader source of the text; read to exhaustion, not closed here
 * @param k number of top entries to retain
 * @return a new set holding the retained tokens
 * @throws IOException if reading from {@code reader} fails
 */
private static Set<String> getMostFrequentTokens(LineNumberReader reader, int k) throws IOException {
  Counter<String> tokenCounts = new ClassicCounter<String>();

  String line;
  while ((line = reader.readLine()) != null) {
    for (String token : line.split("\\s+")) {
      // Skip the empty artifact split() produces for blank lines / leading whitespace.
      if (!token.isEmpty()) {
        tokenCounts.incrementCount(token);
      }
    }
  }

  Counters.retainTop(tokenCounts, k);
  // Copy the keys so the result does not depend on the counter.
  return new HashSet<>(tokenCounts.keySet());
}

Source File: MetricUtils.java From phrasal with GNU General Public License v3.0

6 votes

/**
 * Computes the "informativeness" of every n-gram, as used by the NIST
 * metric. For an n-gram w_1:n (Matlab notation) it is
 * -log2(count(w_1:n)/count(w_1:n-1)); for unigrams the denominator is
 * {@code totWords} instead of a prefix count.
 *
 * @param ngramCounts
 *          ngram counts according to references
 * @param totWords
 *          total number of words, which is used to compute the
 *          informativeness of unigrams.
 * @return a new counter mapping each n-gram in {@code ngramCounts} to its
 *         informativeness
 */
static public <TK> Counter<Sequence<TK>> getNGramInfo(
    Counter<Sequence<TK>> ngramCounts, int totWords) {
  Counter<Sequence<TK>> informativeness = new ClassicCounter<Sequence<TK>>();

  for (Sequence<TK> ngram : ngramCounts.keySet()) {
    final int length = ngram.size();
    final double count = ngramCounts.getCount(ngram);
    // Unigrams are normalized by the corpus size, longer n-grams by their prefix count.
    final double context = (length > 1)
        ? ngramCounts.getCount(ngram.subsequence(0, length - 1))
        : totWords;
    informativeness.setCount(ngram, -Math.log(count / context) / LOG2);
  }
  return informativeness;
}

Source File: ScorerFactory.java From phrasal with GNU General Public License v3.0

6 votes

/**
 * Instantiates the scorer registered under the given name.
 *
 * @param scorerName one of {@code UNIFORM_SCORER}, {@code DENSE_SCORER},
 *          {@code SPARSE_SCORER}
 * @param config passed to the dense and sparse scorer constructors
 * @param featureIndex passed to the dense and sparse scorer constructors
 * @return the newly created scorer
 * @throws IOException declared by this method's signature
 * @throws RuntimeException if {@code scorerName} matches no known scorer
 */
public static Scorer<String> factory(String scorerName, Counter<String> config, Index<String> featureIndex)
    throws IOException {
  switch (scorerName) {
    case UNIFORM_SCORER:
      return new UniformScorer<String>();
    case DENSE_SCORER:
      return new DenseScorer(config, featureIndex);
    case SPARSE_SCORER:
      return new SparseScorer(config, featureIndex);
    default:
      throw new RuntimeException(String.format("Unknown scorer \"%s\"", scorerName));
  }
}

Source File: PairwiseRankingOptimizerSGD.java From phrasal with GNU General Public License v3.0

6 votes

/**
 * True online learning: computes the gradient from a single source sentence.
 * The n-best list is sampled into a dataset, the gradient is computed from
 * that dataset, and a warning is logged when the sampled dataset is empty.
 *
 * @return the gradient computed from the sampled dataset
 */
@Override
public Counter<String> getGradient(Counter<String> weights, Sequence<IString> source, int sourceId,
    List<RichTranslation<IString, String>> translations, List<Sequence<IString>> references,
    double[] referenceWeights, SentenceLevelMetric<IString, String> scoreMetric) {
  Objects.requireNonNull(weights);
  Objects.requireNonNull(scoreMetric);
  assert sourceId >= 0;
  assert !translations.isEmpty() : "No translations for source id: " + sourceId;
  assert !references.isEmpty();

  // Draw training samples from the n-best list, then differentiate.
  final List<Datum> samples =
      sampleNbestList(sourceId, source, scoreMetric, translations, references);
  final Counter<String> gradient = computeGradient(samples, weights, 1);
  if (samples.isEmpty()) {
    logger.warn("Null gradient for sourceId: {}", sourceId);
  }

  if (VERBOSE) {
    System.err.printf("True online gradient");
    displayGradient(gradient);
  }
  return gradient;
}

Source File: DocumentFrequencyCounter.java From wiseowl with MIT License

6 votes

/**
 * Builds a count map over all documents in the given XML input. For every
 * {@code TAG_DOCUMENT} element, the counts obtained from its
 * {@code TAG_TEXT} child are added to the result, and the special key
 * {@code "__all__"} is incremented once per document.
 *
 * @param file reader over the XML content
 * @return the accumulated counter, including the {@code "__all__"} entry
 */
private static Counter<String> getIDFMapForFile(Reader file)
  throws SAXException, IOException, TransformerException {

  Document xml = XMLUtils.getXmlParser().parse(new ReaderInputStream(file));
  NodeList docNodes = xml.getDocumentElement().getElementsByTagName(TAG_DOCUMENT);

  Counter<String> idfMap = new ClassicCounter<String>();
  for (int i = 0; i < docNodes.getLength(); i++) {
    Element doc = (Element) docNodes.item(i);
    NodeList texts = doc.getElementsByTagName(TAG_TEXT);
    assert texts.getLength() == 1;

    String body = getFullTextContent((Element) texts.item(0));
    idfMap.addAll(getIDFMapForDocument(body));

    // One tick of the magic document counter per document
    idfMap.incrementCount("__all__");
  }

  return idfMap;
}

Source File: DocumentFrequencyCounter.java From wiseowl with MIT License

6 votes

/**
 * Builds a count map for one document string. Matches of
 * {@code headingSeparator} are stripped first (per the original note: some
 * Gigaword patterns that slow things down and do not help). The text is then
 * split into sentences; sentences longer than {@code MAX_SENTENCE_LENGTH}
 * are skipped, the rest are tagged, and every word whose tag starts with
 * {@code "n"} is counted.
 *
 * @param document the raw document text
 * @return counter over the words whose tag starts with {@code "n"}
 */
private static Counter<String> getIDFMapForDocument(String document) {
  final String cleaned = headingSeparator.matcher(document).replaceAll("");

  DocumentPreprocessor sentences = new DocumentPreprocessor(new StringReader(cleaned));
  sentences.setTokenizerFactory(tokenizerFactory);

  Counter<String> idfMap = new ClassicCounter<String>();
  for (List<HasWord> sentence : sentences) {
    if (sentence.size() <= MAX_SENTENCE_LENGTH) {
      for (TaggedWord token : tagger.tagSentence(sentence)) {
        if (token.tag().startsWith("n")) {
          idfMap.incrementCount(token.word());
        }
      }
    }
  }
  return idfMap;
}

Source File: MERT.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Creates a weight vector with one value drawn from
 * {@code globalRandom.nextDouble()} per feature name, and prints the result
 * to stderr.
 *
 * @param keySet the feature names to assign weights to
 * @return a new counter with one random weight per name
 */
static Counter<String> randomWts(Set<String> keySet) {
  final Counter<String> point = new ClassicCounter<String>();
  for (String feature : keySet) {
    final double weight = globalRandom.nextDouble();
    point.setCount(feature, weight);
  }
  System.err.printf("random Wts: %s%n", point);
  return point;
}

Source File: OverrideBinwts.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Command-line entry point: reads weights from the first argument, applies
 * the plain-text overrides from the second, and writes the result to the
 * third. An override value of 0 removes the feature; any other value
 * replaces its weight. The "setting feature" line is printed for every
 * override, including removals. Exits with -1 on a usage error or when the
 * overrides file cannot be read.
 *
 * @param args input weights file, overrides file, output weights file
 */
public static void main(String[] args) {
  if (args.length != 3) {
    usage();
    System.exit(-1);
  }

  final String input = args[0];
  final String overrides = args[1];
  final String output = args[2];

  System.err.println("reading weights from " + input);
  Counter<String> weights = IOTools.readWeights(input);

  try {
    Counter<String> overridesW = IOTools.readWeightsPlain(overrides);
    System.err.println("read weights from  " + overrides + ":");
    for (Entry<String,Double> override : overridesW.entrySet()) {
      final String feature = override.getKey();
      final Double value = override.getValue();
      if (value == 0) {
        weights.remove(feature);
      } else {
        weights.setCount(feature, value);
      }
      System.err.println("setting feature: " + feature + " = " + value);
    }
  } catch (IOException e) {
    e.printStackTrace();
    System.exit(-1);
  }

  System.err.println("writing weights to " + output);
  IOTools.writeWeights(output, weights);
}

Source File: BLEUMetric.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Creates a BLEU metric from the given reference lists.
 *
 * @param referencesList the references; its size determines the capacity of
 *          {@code maxReferenceCounts} and the number of rows in
 *          {@code refLengths} (presumably one entry per segment — confirm
 *          against {@code init})
 * @param order stored as this metric's order (presumably the maximum n-gram
 *          order — confirm)
 */
public BLEUMetric(List<List<Sequence<TK>>> referencesList, int order) {
  this.order = order;
  maxReferenceCounts = new ArrayList<Counter<Sequence<TK>>>(
      referencesList.size());
  // Only the outer dimension is allocated here; the rows are left null.
  refLengths = new int[referencesList.size()][];
  multiplier = 1;
  // init runs after the containers above have been allocated.
  init(referencesList);
  // Smoothing is switched on only when referencesList has exactly one entry.
  smooth = referencesList.size() == 1;
}

Source File: MERT.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Sums feature values across a list of translations.
 *
 * @param trans the translations whose features are summed
 * @return a new counter mapping each feature name to the total of its
 *         values over all given translations
 */
public static Counter<String> summarizedAllFeaturesVector(
    List<ScoredFeaturizedTranslation<IString, String>> trans) {
  final Counter<String> totals = new ClassicCounter<String>();
  for (ScoredFeaturizedTranslation<IString, String> translation : trans) {
    for (FeatureValue<String> feature : translation.features) {
      final String name = feature.name;
      totals.incrementCount(name, feature.value);
    }
  }
  return totals;
}

Source File: AdaGradFOBOSUpdater.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Dispatches the weight update to the routine matching the configured norm:
 * {@code updateL1} for {@code Norm.LASSO}, {@code updateElitistLasso} for
 * {@code Norm.aeLASSO}. The {@code endOfEpoch} flag is not used here.
 *
 * @throws UnsupportedOperationException for any other norm
 */
@Override
public void update(Counter<String> weights,
    Counter<String> gradient, int timeStep, boolean endOfEpoch) {
  if (norm == Norm.LASSO) {
    updateL1(weights, gradient, timeStep);
    return;
  }
  if (norm == Norm.aeLASSO) {
    updateElitistLasso(weights, gradient, timeStep);
    return;
  }
  throw new UnsupportedOperationException("norm type " + norm + " cannot be recognized in AdaGradFOBOSUpdater");
}

Source File: OptimizerUtils.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Pairs each weight name with the value at the same position in
 * {@code wtsArr}.
 *
 * @param weightNames the feature names; its length drives the iteration
 * @param wtsArr the values, read at indices {@code 0..weightNames.length-1}
 * @return a new counter mapping each name to its value
 */
public static Counter<String> getWeightCounterFromArray(String[] weightNames,
    double[] wtsArr) {
  final Counter<String> weights = new ClassicCounter<String>();
  int position = 0;
  for (String name : weightNames) {
    weights.setCount(name, wtsArr[position]);
    position++;
  }
  return weights;
}

Source File: PairwiseRankingOptimizer.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Trains a logistic classifier on samples obtained from {@code getSamples}
 * and converts the classifier's weights into decoder weights.
 *
 * <p>The initial weights are used only to derive the random seed for
 * sampling. The first invocation (while {@code updatedBestOnce} is false)
 * also evaluates the new weights and force-updates the best weights held by
 * {@code MERT}.
 *
 * @param initialWts starting weights; copied, not modified
 * @return the decoder weights derived from the trained classifier
 * @throws RuntimeException if the classifier produces a weight name that
 *         starts with neither {@code "1 / "} nor {@code "0 / "}
 */
@Override
public Counter<String> optimize(Counter<String> initialWts) {
  // Work on a copy so that normalization leaves the caller's counter alone.
  Counter<String> wts = new ClassicCounter<String>(initialWts);
  Counters.normalize(wts);
  // Seed = |max normalized weight| * Long.MAX_VALUE, evaluated in log space.
  double seedSeed = Math.abs(Counters.max(wts));
  long seed = (long)Math.exp(Math.log(seedSeed) + Math.log(Long.MAX_VALUE));
  System.err.printf("PRO thread using random seed: %d\n", seed);
  RVFDataset<String, String> proSamples = getSamples(new Random(seed));
  LogPrior lprior = new LogPrior();
  lprior.setSigma(l2sigma);
  LogisticClassifierFactory<String,String> lcf = new LogisticClassifierFactory<String,String>();
  LogisticClassifier<String, String> lc = lcf.trainClassifier(proSamples, lprior, false);
  Counter<String> decoderWeights = new ClassicCounter<String>(); 
  Counter<String> lcWeights = lc.weightsAsCounter();
  // Classifier weight names carry a "1 / " or "0 / " prefix: "1 / " weights
  // are added as-is, "0 / " weights are added negated, and the prefix is
  // stripped to obtain the decoder feature name.
  for (String key : lcWeights.keySet()) {
    double mul;
    if (key.startsWith("1 / ")) {
      mul = 1.0;
    } else if (key.startsWith("0 / ")) {
      mul = -1.0;
    } else {
      throw new RuntimeException("Unparsable weight name produced by logistic classifier: "+key);
    }
    String decoderKey = key.replaceFirst("^[10] / ", "");
    decoderWeights.incrementCount(decoderKey, mul*lcWeights.getCount(key));
  }

  // Guarded by the MERT.bestWts monitor: the forced update runs at most once
  // per value of the updatedBestOnce flag.
  synchronized (MERT.bestWts) {
    if (!updatedBestOnce) {
      System.err.println("Force updating weights (once)");
      double metricEval = MERT.evalAtPoint(nbest, decoderWeights, emetric);
      MERT.updateBest(decoderWeights, metricEval, true);
      updatedBestOnce = true;
    }
  }
  return decoderWeights;
}

Source File: AdaGradFOBOSUpdater.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Creates an updater with the given configuration.
 *
 * @param initialRate stored as the updater's rate
 * @param expectedNumFeatures passed to the constructor of the
 *          squared-gradient accumulator
 * @param lambda stored as the updater's lambda
 * @param norm selects the update routine used by this updater
 * @param customL1 stored as-is (no copy)
 * @param fixedFeatures stored as-is (no copy)
 */
public AdaGradFOBOSUpdater(double initialRate, int expectedNumFeatures, double lambda, Norm norm, Counter<String> customL1, Set<String> fixedFeatures) {
  // Accumulator state
  this.sumGradSquare = new ClassicCounter<String>(expectedNumFeatures);

  // Configuration
  this.norm = norm;
  this.rate = initialRate;
  this.lambda = lambda;
  this.customL1 = customL1;
  this.fixedFeatures = fixedFeatures;
}

Source File: IOTools.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Reads weights from a file by deserializing a {@code ClassicCounter} with
 * {@code SerializationMode.BIN_GZ}. If deserialization yields {@code null},
 * an empty counter is returned. When a feature index is supplied, every
 * weight name is added to it.
 *
 * NOTE(review): earlier documentation said both binary and text formats are
 * supported; whether {@code deserialize} falls back to text is not visible
 * here — confirm.
 *
 * TODO(spenceg) Replace ClassicCounter with our own SparseVector implementation.
 *
 * @param filename path of the weights file
 * @param featureIndex index to register the weight names in; may be null
 * @return a counter of weights, never null
 */
@SuppressWarnings("unchecked")
public static Counter<String> readWeights(String filename,
    Index<String> featureIndex) {
  final Counter<String> loaded =
      (Counter<String>) deserialize(filename, ClassicCounter.class, SerializationMode.BIN_GZ);
  final Counter<String> weights = (loaded == null) ? new ClassicCounter<String>() : loaded;

  if (featureIndex == null) {
    return weights;
  }
  for (String name : weights.keySet()) {
    featureIndex.addToIndex(name);
  }
  return weights;
}

Source File: CoverageChecker.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Counts all n-grams of length 1 through {@code order} in a line. The line
 * is split on single whitespace characters; each n-gram is rendered with
 * {@code Sentence.listToString} and counted only if {@code limitSet} is
 * null or contains it.
 *
 * @param line the text to extract n-grams from
 * @param ngramCounts counter that receives the increments
 * @param limitSet n-grams allowed to be counted, or null for no restriction
 * @param order maximum n-gram length
 */
static public void countNgrams(String line, Counter<String> ngramCounts, Set<String> limitSet, int order) {
  final String[] toks = line.split("\\s");
  for (int start = 0; start < toks.length; start++) {
    // Grow the window from 1 token up to `order`, stopping at the end of the line.
    for (int len = 1; len <= order && start + len <= toks.length; len++) {
      final String[] window = Arrays.copyOfRange(toks, start, start + len);
      final String ngram = Sentence.listToString(Arrays.asList(window));
      final boolean allowed = (limitSet == null) || limitSet.contains(ngram);
      if (allowed) {
        ngramCounts.incrementCount(ngram);
      }
    }
  }
}

Source File: NISTMetric.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Adjusts the running possible-match and match totals for one sequence,
 * scaled by {@code mul}.
 *
 * @param clippedCounts counts handed to {@code localMatchCounts}
 * @param sequence the sequence whose size drives the possible-match counts
 * @param mul multiplier applied to every contribution
 */
private void incCounts(Counter<Sequence<TK>> clippedCounts,
    Sequence<TK> sequence, int mul) {
  final int length = sequence.size();
  for (int n = 0; n < order; n++) {
    possibleMatchCounts[n] += mul * possibleMatchCounts(n, length);
  }

  final double[] matched = localMatchCounts(clippedCounts);
  for (int n = 0; n < order; n++) {
    matchCounts[n] += mul * matched[n];
  }
}

Source File: RandomNBestPoint.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Repeatedly line-searches from the current weights along the summed feature
 * vector of a randomly drawn set of translations. The loop ends after
 * {@code MERT.NO_PROGRESS_LIMIT} consecutive iterations whose weight change
 * (SSD) is below {@code MERT.NO_PROGRESS_SSD}.
 *
 * @param initialWts the starting weights
 * @return the weights from the last line search
 */
@Override
public Counter<String> optimize(Counter<String> initialWts) {
  Counter<String> wts = initialWts;
  int noProgress = 0;

  while (noProgress < MERT.NO_PROGRESS_LIMIT) {
    // Pick the random translations that define the search direction.
    final List<ScoredFeaturizedTranslation<IString, String>> rTrans = better
        ? mert.randomBetterTranslations(nbest, wts, emetric)
        : mert.randomTranslations(nbest);
    final Counter<String> dir = MERT.summarizedAllFeaturesVector(rTrans);

    System.err.printf("Random n-best point score: %.5f\n",
        emetric.score(rTrans));

    final Counter<String> newWts = mert.lineSearch(nbest, wts, dir, emetric);
    final double eval = MERT.evalAtPoint(nbest, newWts, emetric);
    final double ssd = MERT.wtSsd(wts, newWts);

    // Any real movement resets the stall counter.
    noProgress = (ssd < MERT.NO_PROGRESS_SSD) ? noProgress + 1 : 0;
    System.err.printf("Eval: %.5f SSD: %e (no progress: %d)\n", eval, ssd,
        noProgress);
    wts = newWts;
  }
  return wts;
}

Source File: AbstractOnlineOptimizer.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Single-example gradient, implemented by wrapping each argument in a
 * one-element batch and delegating to {@code getBatchGradient}.
 *
 * @return whatever {@code getBatchGradient} returns for the one-element batch
 */
@Override
public Counter<String> getGradient(Counter<String> weights,
    Sequence<IString> source, int sourceId,
    List<RichTranslation<IString, String>> translations,
    List<Sequence<IString>> references, double[] referenceWeights,
    SentenceLevelMetric<IString, String> scoreMetric) {
  final List<Sequence<IString>> sources = Arrays.asList(source);
  final int[] sourceIds = {sourceId};
  final List<List<RichTranslation<IString, String>>> nbestLists = Arrays.asList(translations);
  final List<List<Sequence<IString>>> referenceSets = Arrays.asList(references);
  return getBatchGradient(weights, sources, sourceIds, nbestLists, referenceSets,
      referenceWeights, scoreMetric);
}

Source File: MIRA1BestHopeFearOptimizer.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Not supported by this optimizer: always throws.
 *
 * @throws UnsupportedOperationException always, because 1-best MIRA does not
 *         support mini-batch learning
 */
@Override
public Counter<String> getBatchGradient(Counter<String> weights,
    List<Sequence<IString>> sources, int[] sourceIds,
    List<List<RichTranslation<IString, String>>> translations,
    List<List<Sequence<IString>>> references,
    double[] referenceWeights, SentenceLevelMetric<IString, String> scoreMetric) {
  throw new UnsupportedOperationException("1-best MIRA does not support mini-batch learning");
}

Source File: DownhillSimplexOptimizer.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Converts a dense vector into a weight counter by pairing each entry of
 * {@code weightNames} with the value at the same index in {@code x}.
 *
 * @param x the values, read at indices {@code 0..weightNames.length-1}
 * @return a new counter mapping each weight name to its value
 */
private Counter<String> vectorToWeights(double[] x) {
  final Counter<String> weights = new ClassicCounter<String>();
  int position = 0;
  for (String name : weightNames) {
    weights.setCount(name, x[position]);
    position++;
  }
  return weights;
}

Source File: SequenceOptimizer.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Runs each optimizer in {@code opts} in turn, feeding the weights from one
 * step into the next. An optimizer is re-run until the objective changes by
 * at most {@code MIN_OBJECTIVE_CHANGE}, looping is disabled, or the
 * objective gets worse. A result that worsens the objective is discarded
 * with a warning.
 *
 * @param initialWts the starting weights
 * @return the last accepted weights
 */
@Override
public Counter<String> optimize(Counter<String> initialWts) {
  Counter<String> wts = initialWts;
  for (BatchOptimizer opt : opts) {
    boolean done;
    do {
      final Counter<String> candidate = opt.optimize(wts);
      final double wtSsd = MERT.wtSsd(candidate, wts);

      final double oldE = MERT.evalAtPoint(nbest, wts, emetric);
      final double newE = MERT.evalAtPoint(nbest, candidate, emetric);

      final boolean worse = oldE > newE;
      final boolean converged = Math.abs(oldE - newE) <= MIN_OBJECTIVE_CHANGE;
      done = converged || !loop || worse;

      System.err.printf(
          "seq optimizer: %s -> %s (%s) ssd: %f done: %s opt: %s\n", oldE,
          newE, newE - oldE, wtSsd, done, opt.toString());

      if (!worse) {
        wts = candidate;
      } else {
        System.err.printf("WARNING: negative objective change!");
      }
    } while (!done);
  }
  return wts;
}

Source File: AdaGradFastFOBOSUpdater.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Creates an updater with fresh accumulator state.
 *
 * @param initialRate stored as the updater's rate
 * @param expectedNumFeatures passed to the constructors of the
 *          squared-gradient and last-updated counters
 * @param L1lambda stored as the updater's L1 lambda
 * @param customL1 stored as-is (no copy)
 * @param fixedFeatures stored as-is (no copy)
 */
public AdaGradFastFOBOSUpdater(double initialRate, int expectedNumFeatures, double L1lambda, 
    Counter<String> customL1, Set<String> fixedFeatures) {
  // Configuration
  this.rate = initialRate;
  this.L1lambda = L1lambda;
  this.customL1 = customL1;
  this.fixedFeatures = fixedFeatures;

  // Accumulator state
  this.sumGradSquare = new ClassicCounter<>(expectedNumFeatures);
  this.lastUpdated = new ClassicCounter<>(expectedNumFeatures);
}

Source File: AdaGradFastFOBOSUpdater.java From phrasal with GNU General Public License v3.0

5 votes

/**
 * Bundles the given values into a state object. All references are stored
 * as-is, without copying.
 *
 * @param h stored as the gradient history
 * @param r stored as the custom regularization counter
 * @param f stored as the set of fixed features
 * @param u stored as the last-update counter
 * @param t stored as the time step
 */
public AdaGradFastFOBOSState(Counter<String> h, Counter<String> r, Set<String> f, Counter<String> u, int t) {
  this.timeStep = t;
  this.lastUp = u;
  this.fixedFeatures = f;
  this.customReg = r;
  this.gradHistory = h;
}

edu.stanford.nlp.stats.Counter Java Examples