cc.mallet.types.InstanceList Java Examples

The following examples show how to use cc.mallet.types.InstanceList. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: MaxEntClassifierTrainerTest.java From baleen with Apache License 2.0

6 votes

/**
 * Loads the serialized classifier from {@code modelPath}, checks that it knows the "pos" and
 * "neg" labels, and checks that one positive and one negative sentence receive the matching
 * best label.
 */
@Test
public void testTaskProducesValidModelFile() throws Exception {
  File serializedModel = modelPath.toFile();
  assertTrue(serializedModel.exists());

  Classifier model = new FileObject<Classifier>(serializedModel.getPath()).object();
  assertTrue(model.getLabelAlphabet().contains("pos"));
  assertTrue(model.getLabelAlphabet().contains("neg"));

  // Feed both sentences through the pipe stored with the classifier.
  InstanceList samples = new InstanceList(model.getInstancePipe());
  samples.addThruPipe(new Instance("I love this amazing awesome classifier.", null, null, null));
  samples.addThruPipe(new Instance("I can't stand this horrible test.", null, null, null));

  String firstLabel = model.classify(samples.get(0)).getLabeling().getBestLabel().toString();
  assertEquals("pos", firstLabel);

  String secondLabel = model.classify(samples.get(1)).getLabeling().getBestLabel().toString();
  assertEquals("neg", secondLabel);
}

Example #2

Source File: EngineMBMalletSeq.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

6 votes

/**
 * Trains a transducer on the given instances.
 *
 * <p>The options string is parsed for {@code iterations} and {@code verbose}. An iteration
 * count of 0 is replaced by {@link Integer#MAX_VALUE}. An {@code OptimizationException}
 * raised during training is reported on stderr and otherwise ignored, so the transducer
 * in its current state is still returned.
 *
 * @param trainingData instances to train on
 * @param options option string handed to the trainer factory and to the option parser
 * @return the transducer held by the trainer after training
 */
private Transducer trainModel(InstanceList trainingData, String options) {
  // TODO: check field shadowing!
  TransducerTrainer sequenceTrainer = createTrainer(trainingData, info, options);
  Parms parsedOptions = new Parms(options, "i:iterations:i", "V:verbose:b");
  boolean printModel = (boolean) parsedOptions.getValueOrElse("verbose", false);
  int requestedIterations = (int) parsedOptions.getValueOrElse("iterations", 0);
  // Zero stands for "no explicit limit".
  int maxIterations = (requestedIterations == 0) ? Integer.MAX_VALUE : requestedIterations;

  try {
    sequenceTrainer.train(trainingData, maxIterations);
  } catch (OptimizationException ex) {
    System.err.println("Encountered an OptimizationException during training (CONTINUING!): "+ex.getMessage());
    ex.printStackTrace(System.err);
    System.err.println("We ignore this exception and try to use the model so far ...");
  }

  if (printModel) {
    sequenceTrainer.getTransducer().print();
  }
  return sequenceTrainer.getTransducer();
}

Example #3

Source File: MalletClassifierTrainerTest.java From baleen with Apache License 2.0

6 votes

/**
 * Checks that the model file at {@code modelPath} exists, that the loaded classifier knows the
 * "pos" and "neg" labels, and that both sample sentences are assigned one of those two labels.
 */
private void validateModel() {
  File serializedModel = modelPath.toFile();
  assertTrue(serializedModel.exists());

  Classifier model = new FileObject<Classifier>(serializedModel.getPath()).object();
  assertTrue(model.getLabelAlphabet().contains("pos"));
  assertTrue(model.getLabelAlphabet().contains("neg"));

  // Feed both sentences through the pipe stored with the classifier.
  InstanceList samples = new InstanceList(model.getInstancePipe());
  samples.addThruPipe(new Instance("I love this amazing awesome classifier.", "", null, null));
  samples.addThruPipe(new Instance("I can't stand this horrible test.", "", null, null));

  // Only membership in the label set is checked here, not which label was chosen.
  ImmutableSet<String> knownLabels = ImmutableSet.of("pos", "neg");

  String firstLabel = model.classify(samples.get(0)).getLabeling().getBestLabel().toString();
  assertTrue(knownLabels.contains(firstLabel));

  String secondLabel = model.classify(samples.get(1)).getLabeling().getBestLabel().toString();
  assertTrue(knownLabels.contains(secondLabel));
}

Example #4

Source File: MalletClassifierTrainer.java From baleen with Apache License 2.0

6 votes

/**
 * Pipes the documents read from Mongo into an instance list, optionally splits off a testing
 * share, and hands both lists to {@code processTrainerDefinitions}.
 *
 * <p>When {@code forTesting} is not greater than zero, all instances are used for training and
 * the testing list passed on is {@code null}.
 */
@Override
protected void execute(JobSettings settings) throws AnalysisEngineProcessException {
  InstanceList allInstances = new InstanceList(new ClassifierPipe(stopwords));
  allInstances.addThruPipe(getDocumentsFromMongo());

  // Default: everything is training data and there is no testing list.
  InstanceList training = allInstances;
  InstanceList testing = null;

  if (forTesting > 0.0) {
    InstanceList[] parts = allInstances.split(new double[] {1 - forTesting, forTesting});
    training = parts[0];
    testing = parts[1];
  }

  processTrainerDefinitions(training, testing);
}

Example #5

Source File: TopicModelTrainer.java From baleen with Apache License 2.0

6 votes

/**
 * For every instance, determines the topic index selected by {@code MaximumIndex} over the
 * model's topic probabilities and stores that topic's words and number on the matching Mongo
 * document.
 *
 * <p>The instance name is cast to {@code String} and used as the {@code ObjectId} of the
 * document to update.
 *
 * @param instances instances in the same order the model indexes its documents
 * @param topicWords lookup of the words belonging to a topic index
 * @param model model providing per-document topic probabilities
 */
private void writeTopicAssignmentsToMongo(
    InstanceList instances, TopicWords topicWords, ParallelTopicModel model) {
  int documentCount = instances.size();
  for (int docIndex = 0; docIndex < documentCount; docIndex++) {
    double[] probabilities = model.getTopicProbabilities(docIndex);
    int bestTopic = new MaximumIndex(probabilities).find();
    Instance instance = instances.get(docIndex);

    List<String> keywords = topicWords.forTopic(bestTopic);

    documentsCollection.findOneAndUpdate(
        Filters.eq(new ObjectId((String) instance.getName())),
        Updates.set(
            TOPIC_FIELD,
            new Document()
                .append(KEYWORDS_FIELD, keywords.toString())
                .append(TOPIC_NUMBER_FIELD, bestTopic)));
  }
}

Example #6

Source File: BrainRegionAnnotator.java From bluima with Apache License 2.0

6 votes

/**
 * Adds order-1 states to the CRF based on the training set and makes the start state returned
 * by {@code addOrderNStates} the only state with a non-impossible initial weight.
 *
 * @param _crf the CRF to configure
 * @param trainingSet instances handed to {@code addOrderNStates}
 */
private static void configure(CRF _crf, InstanceList trainingSet) {
    // Alternatives tried earlier and kept for reference:
    //   crf.addStatesForLabelsConnectedAsIn(trainingSet) with a CRFTrainerByLabelLikelihood
    //   and setGaussianPriorVariance(1d);
    //   crf.addOrderNStates(trainingSet, orders, null, null, null, null, true).

    final int[] orders = { 1 };
    final Pattern forbidden = Pattern.compile("\\s");
    final Pattern allowed = Pattern.compile(".*");

    final String startName = _crf.addOrderNStates(trainingSet, orders, null,
            Jcas2TokenSequence.TARGET_O, forbidden, allowed, true);

    // First rule out every state as an initial state ...
    int stateIndex = 0;
    while (stateIndex < _crf.numStates()) {
        _crf.getState(stateIndex).setInitialWeight(Transducer.IMPOSSIBLE_WEIGHT);
        stateIndex++;
    }
    // ... then re-enable the start state only.
    _crf.getState(startName).setInitialWeight(0.0);
}

Example #7

Source File: TopicModel.java From baleen with Apache License 2.0

6 votes

/**
 * Infers a topic distribution for the document text, picks the topic index selected by
 * {@code MaximumIndex}, and adds a {@code Metadata} annotation whose value is that topic's
 * word list.
 */
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
  // A one-element instance list is only used to push the text through the pipe.
  InstanceList piped = new InstanceList(pipe);
  piped.addThruPipe(new Instance(jCas.getDocumentText(), null, "from jcas", null));

  double[] distribution =
      model.getInferencer().getSampledDistribution(piped.get(0), iterations, thining, burnIn);

  List<String> topWords = topicWords.forTopic(new MaximumIndex(distribution).find());

  Metadata annotation = new Metadata(jCas);
  annotation.setKey(metadataKey);
  annotation.setValue(topWords.toString());
  addToJCasIndex(annotation);
}

Example #8

Source File: BrainRegionAnnotator.java From bluima with Apache License 2.0

5 votes

/**
 * Initializes the annotator according to {@code modeStr}.
 *
 * <p>In inference mode the serialized CRF is read from {@code modelFile}. In every other mode
 * an empty training instance list is created with the brain-region pipes; in training mode
 * {@code modelFile} must additionally be non-null.
 *
 * @throws ResourceInitializationException wrapping any failure raised during setup
 */
@Override
public void initialize(UimaContext context)
        throws ResourceInitializationException {
    super.initialize(context);
    try {
        mode = Mode.valueOf(modeStr);
        LOG.debug("Running in {} mode", mode);

        if (mode.equals(infer)) {
            // load model for inference
            checkArgument(new File(modelFile).exists(),
                    "required for inference: no modelFile at " + modelFile);
            // NOTE(review): this is native Java deserialization; modelFile is assumed to
            // come from a trusted source -- confirm.
            // try-with-resources closes both streams even if reading the model fails.
            try (FileInputStream in = new FileInputStream(modelFile);
                    ObjectInputStream s = new ObjectInputStream(in)) {
                inferenceCrf = (CRF) s.readObject();
            }
            checkArgument(inferenceCrf != null);
        } else {
            // create empty instanceList, init pipes
            trainingInstanceList = new InstanceList(//
                    new SerialPipes(BrainRegionPipes.getPipes()));
            if (mode.equals(train))
                checkNotNull(modelFile, "missing model output file");
        }

    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
}

Example #9

Source File: MalletCalculator.java From TagRec with GNU Affero General Public License v3.0

5 votes

/**
 * Rebuilds {@code instances} from {@code maps}: each map becomes one instance whose data is a
 * list in which every key (as a string) is repeated as many times as its mapped value.
 */
private void initializeDataStructures() {
	this.instances = new InstanceList(new StringList2FeatureSequence());
	for (Map<Integer, Integer> tagCounts : this.maps) {
		List<String> tags = new ArrayList<String>();
		for (Map.Entry<Integer, Integer> tagCount : tagCounts.entrySet()) {
			// Repeat the tag once per counted occurrence.
			int remaining = tagCount.getValue();
			while (remaining > 0) {
				tags.add(tagCount.getKey().toString());
				remaining--;
			}
		}
		Instance inst = new Instance(tags, null, null, null);
		inst.setData(tags);
		this.instances.addThruPipe(inst);
	}
}

Example #10

Source File: MalletCalculatorTweet.java From TagRec with GNU Affero General Public License v3.0

5 votes

/**
 * Rebuilds {@code instances} from {@code maps}: each map becomes one instance whose data is a
 * list in which every key (as a string) is repeated as many times as its mapped value.
 */
private void initializeDataStructures() {
    this.instances = new InstanceList(new StringList2FeatureSequence());
    for (Map<Integer, Integer> tagCounts : this.maps) {
        List<String> tags = new ArrayList<String>();
        for (Map.Entry<Integer, Integer> tagCount : tagCounts.entrySet()) {
            // Repeat the tag once per counted occurrence.
            int remaining = tagCount.getValue();
            while (remaining > 0) {
                tags.add(tagCount.getKey().toString());
                remaining--;
            }
        }
        Instance inst = new Instance(tags, null, null, null);
        inst.setData(tags);
        this.instances.addThruPipe(inst);
    }
}

Example #11

Source File: ReferencesClassifierTrainer.java From bluima with Apache License 2.0

5 votes

/**
 * Randomly splits the instances with proportions 0.9 / 0.1 / 0.0, trains a MaxEnt classifier
 * on the {@code TRAINING} part and returns a trial of that classifier on the {@code TESTING}
 * part.
 *
 * @param instances the instances to split
 * @return the trial of the freshly trained classifier on the testing part
 */
public static Trial testTrainSplit(InstanceList instances) {
    InstanceList[] partitions = instances.split(new Randoms(),
            new double[] { 0.9, 0.1, 0.0 });

    @SuppressWarnings("rawtypes")
    ClassifierTrainer maxEntTrainer = new MaxEntTrainer();
    Classifier trained = maxEntTrainer.train(partitions[TRAINING]);

    return new Trial(trained, partitions[TESTING]);
}

Example #12

Source File: LDA.java From topic-detection with Apache License 2.0

5 votes

/**
 * Builds and estimates an LDA topic model for the given document corpus.
 * @param texts the documents, one string per document
 * @param numTopics the number of topics the model should have
 * @param numIterations the number of iterations to run during estimation
 * @return the estimated topic model
 * @throws IOException if an I/O error occurs while building or estimating the model
 */
private ParallelTopicModel createLDAModel(List<String> texts, int numTopics, int numIterations) throws IOException
{
	InstanceList corpus = createInstanceList(texts);

	ParallelTopicModel lda = new ParallelTopicModel(numTopics);
	lda.addInstances(corpus);
	lda.setNumIterations(numIterations);
	lda.estimate();

	return lda;
}

Example #13

Source File: BrainRegionPipesTest.java From bluima with Apache License 2.0

5 votes

/**
 * Pipes the tokenized text through the brain-region pipes and asserts, position by position,
 * that each feature vector contains the features expected for it.
 *
 * <p>Feature vectors beyond the number of supplied expectation lists are not checked.
 *
 * @param txt the text to tokenize and pipe
 * @param features expected feature names, one list per feature vector in sequence order
 */
private void pipe(String txt, List<String>... features) throws Exception {
    // it might not have all the aes, though...
    JCas jCas = getOpenNlpTokenizedTestCas(txt);

    SerialPipes pipeline = new SerialPipes(BrainRegionPipes.getPipes());
    InstanceList il = new InstanceList(pipeline);
    il.addThruPipe(new Instance(jCas, null, 1, jCas));

    FeatureVectorSequence vectors = (FeatureVectorSequence) il.iterator().next()
            .getData();

    java.util.Iterator<List<String>> expectedIt = asList(features)
            .iterator();
    for (Iterator vectorIt = vectors.iterator(); vectorIt.hasNext();) {
        FeatureVector vector = vectorIt.next();

        // No expectations left: keep walking the sequence without checks.
        if (!expectedIt.hasNext()) {
            continue;
        }
        for (String expectedFeature : expectedIt.next()) {
            assertTrue("could not find expected feature '"
                    + expectedFeature + "', FeatureVector = \n"
                    + vector,
                    vector.contains(expectedFeature));
        }
    }
}

Example #14

Source File: MyMultiSegmentationEvaluator.java From bluima with Apache License 2.0

5 votes

public MyMultiSegmentationEvaluator(InstanceList[] il, String[] ild,
        Object[] sst, Object[] sct, boolean printMissclassified) {
    super(il, ild, sst, sct);
    if (il.length > 1) {
        throw new IllegalArgumentException(
                "Only works when evaluating one instancelist");
    }
    this.printMissclassified = printMissclassified;
}

Example #15

Source File: MultiSegmentationEvaluator.java From bluima with Apache License 2.0

5 votes

/**
 * Creates an evaluator over the given instance lists.
 *
 * @param instanceLists instance lists passed to the superclass
 * @param instanceListDescriptions descriptions passed to the superclass
 * @param segmentStartTags tags stored as segment-start tags
 * @param segmentContinueTags tags stored as segment-continue tags; asserted to have the same
 *            length as {@code segmentStartTags}
 */
public MultiSegmentationEvaluator(InstanceList[] instanceLists,
        String[] instanceListDescriptions, Object[] segmentStartTags,
        Object[] segmentContinueTags) {
    super(instanceLists, instanceListDescriptions);

    this.segmentContinueTags = segmentContinueTags;
    this.segmentStartTags = segmentStartTags;

    // Both arrays are indexed in parallel, so their lengths must match.
    assert (segmentStartTags.length == segmentContinueTags.length);
}

Example #16

Source File: LenientMultiSegmentationEvaluator.java From bluima with Apache License 2.0

5 votes

/**
 * Creates a lenient evaluator; all arguments except the last are handed to the superclass
 * constructor unchanged.
 *
 * @param instanceLists
 *            instance lists passed to the superclass
 * @param instanceListDescriptions
 *            descriptions passed to the superclass
 * @param segmentStartTags
 *            segment-start tags passed to the superclass
 * @param segmentContinueTags
 *            segment-continue tags passed to the superclass
 * @param printMissclassified
 *            whether to output a detailed report of TP, FT and the sentence
 *            to sys.out
 */
public LenientMultiSegmentationEvaluator(
        InstanceList[] instanceLists,
        String[] instanceListDescriptions,
        Object[] segmentStartTags,
        Object[] segmentContinueTags,
        boolean printMissclassified) {
    super(instanceLists,
            instanceListDescriptions,
            segmentStartTags,
            segmentContinueTags);
    this.printMissclassified = printMissclassified;
}

Example #17

Source File: LDA.java From topic-detection with Apache License 2.0

5 votes

/**
 * Creates a list of Mallet instances from a list of documents.
 * <p>
 * Every document is sent through a serial pipeline made of, in this order,
 * CharSequence2TokenSequence, TokenSequenceLowercase, TokenSequenceRemoveStopwords
 * and TokenSequence2FeatureSequence.
 * @param texts a list of documents
 * @return a list of Mallet instances, one per piped document
 * @throws IOException declared by the signature; no I/O is visible in this method itself
 */
private InstanceList createInstanceList(List<String> texts) throws IOException
{
	ArrayList<Pipe> stages = new ArrayList<Pipe>();
	stages.add(new CharSequence2TokenSequence());
	stages.add(new TokenSequenceLowercase());
	stages.add(new TokenSequenceRemoveStopwords());
	stages.add(new TokenSequence2FeatureSequence());

	SerialPipes pipeline = new SerialPipes(stages);
	InstanceList corpus = new InstanceList(pipeline);
	corpus.addThruPipe(new ArrayIterator(texts));
	return corpus;
}

Example #18

Source File: EngineMBPythonNetworksBase.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Determines the learning "mode" and its accompanying number for the given corpus
 * representation.
 *
 * <p>The returned entry is one of:
 * <ul>
 *   <li>{@code ("regr", 0)} when the pipe has no target alphabet (regression, 0 is a dummy);
 *   <li>{@code ("classind", -1)} when there is a target alphabet but no instances yet, as
 *       happens at application time where the values are currently not needed;
 *   <li>{@code ("classcosts", n)} when the first instance's target is a
 *       {@code NominalTargetWithCosts} with {@code n} costs;
 *   <li>{@code ("classind", n)} otherwise, with {@code n} the target alphabet size.
 * </ul>
 *
 * @param crm the corpus representation to inspect
 * @return the mode name paired with the number described above
 */
protected AbstractMap.SimpleEntry<String,Integer> findOutMode(CorpusRepresentationMalletTarget crm)  {
  InstanceList instances = crm.getRepresentationMallet();
  Alphabet targetAlphabet = crm.getPipe().getTargetAlphabet();

  // No target alphabet: regression, with 0 as a dummy value.
  if (targetAlphabet == null) {
    return new AbstractMap.SimpleEntry<>("regr", 0);
  }

  // For training there should be a first instance, but for application there are none yet.
  // Dummy values are used then; should they ever be needed, they must be stored in the pipe.
  if (instances == null || instances.isEmpty()) {
    return new AbstractMap.SimpleEntry<>("classind", -1);
  }

  Object firstTarget = instances.get(0).getTarget();
  if (firstTarget instanceof NominalTargetWithCosts) {
    int costCount = ((NominalTargetWithCosts) firstTarget).getCosts().length;
    return new AbstractMap.SimpleEntry<>("classcosts", costCount);
  }
  return new AbstractMap.SimpleEntry<>("classind", targetAlphabet.size());
}

Example #19

Source File: MalletClassifierTrainer.java From baleen with Apache License 2.0

5 votes

/**
 * Builds one result row: the given name, the training and testing sizes, the trial accuracy,
 * and then F1, precision and recall for every label of the classifier's label alphabet.
 *
 * @param training training instances; only their count is used
 * @param testing testing instances; only their count is used
 * @param e value placed in the first cell
 * @param classifier classifier whose label alphabet defines the per-label cells
 * @param trial trial providing accuracy, F1, precision and recall
 * @return the row cells as strings
 */
private List<String> createRow(
    InstanceList training, InstanceList testing, String e, Classifier classifier, Trial trial) {
  List<String> cells = new ArrayList<>();
  cells.add(e);
  cells.add(Integer.toString(training.size()));
  cells.add(Integer.toString(testing.size()));
  cells.add(Double.toString(trial.getAccuracy()));

  String[] labels = (String[]) classifier.getLabelAlphabet().toArray(new String[0]);
  for (int index = 0; index < labels.length; index++) {
    String label = labels[index];
    cells.add(Double.toString(trial.getF1(label)));
    cells.add(Double.toString(trial.getPrecision(label)));
    cells.add(Double.toString(trial.getRecall(label)));
  }
  return cells;
}

Example #20

Source File: MalletClassifier.java From baleen with Apache License 2.0

5 votes

/**
 * Classifies the document text with {@code classifierModel} and adds a {@code Metadata}
 * annotation whose value is the best label of the classification.
 */
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
  // A one-element instance list is only used to push the text through the model's pipe.
  InstanceList piped = new InstanceList(classifierModel.getInstancePipe());
  piped.addThruPipe(new Instance(jCas.getDocumentText(), "", "from jcas", null));

  Classification result = classifierModel.classify(piped.get(0));
  String bestLabel = result.getLabeling().getBestLabel().toString();

  Metadata annotation = new Metadata(jCas);
  annotation.setKey(metadataKey);
  annotation.setValue(bestLabel);
  addToJCasIndex(annotation);
}

Example #21

Source File: CorpusExporterMRTarget.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Exports the meta information and then saves the Mallet instance list of the corpus
 * representation to {@code data.mallettarget.ser} inside {@code dataDirFile}.
 */
@Override
public void export() {
  exportMeta();
  InstanceList malletInstances =
      ((CorpusRepresentationMallet) corpusRepresentation).getRepresentationMallet();
  File outputFile = new File(dataDirFile, "data.mallettarget.ser");
  malletInstances.save(outputFile);
}

Example #22

Source File: CorpusExporterMRSeq.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Exports the meta information and then saves the Mallet instance list of the corpus
 * representation to {@code data.malletseq.ser} inside {@code dataDirFile}.
 */
@Override
public void export() {
  exportMeta();
  InstanceList malletInstances =
      ((CorpusRepresentationMallet) corpusRepresentation).getRepresentationMallet();
  File outputFile = new File(dataDirFile, "data.malletseq.ser");
  malletInstances.save(outputFile);
}

Example #23

Source File: FVStatsMeanVarAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Builds the statistics from an instance list: the data of every instance is cast to a
 * feature vector and added, then {@code finish()} is called once.
 * @param instances instances whose data objects are feature vectors
 */
public FVStatsMeanVarAll(InstanceList instances) {
  for (Instance current : instances) {
    addFeatureVector((FeatureVector) current.getData());
  }
  finish();
}

Example #24

Source File: CorpusRepresentationLibSVM.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Converts a Mallet corpus representation into a libsvm problem.
 *
 * <p>Each instance becomes one row. A {@code Label} target is stored as its index and a
 * {@code Double} target is stored as is. The instance data is cast to a sparse vector whose
 * indices are shifted by one for libsvm.
 *
 * @param crm mallet representation
 * @return libsvm representation
 * @throws GateRuntimeException if a target is neither a {@code Label} nor a {@code Double}
 */
public static svm_problem getFromMallet(CorpusRepresentationMallet crm) {
  InstanceList instances = crm.getRepresentationMallet();
  svm_problem problem = new svm_problem();
  int rowCount = instances.size();
  problem.l = rowCount;
  problem.y = new double[rowCount];
  problem.x = new svm_node[rowCount][];

  for (int row = 0; row < rowCount; row++) {
    Instance instance = instances.get(row);

    // Target: a label is converted to its index, a double is used directly.
    Object target = instance.getTarget();
    double y;
    if (target instanceof Label) {
      y = ((Label) target).getIndex();
    } else if (target instanceof Double) {
      y = (Double) target;
    } else {
      throw new GateRuntimeException("Odd target in mallet instance, cannot convert to LIBSVM: " + target);
    }
    problem.y[row] = y;

    // Features: one node per stored sparse-vector entry.
    SparseVector vector = (SparseVector) instance.getData();
    int[] indices = vector.getIndices();
    double[] values = vector.getValues();
    svm_node[] nodes = new svm_node[indices.length];
    for (int col = 0; col < nodes.length; col++) {
      svm_node node = new svm_node();
      node.index = indices[col] + 1; // NOTE: LibSVM location indices have to start with 1
      node.value = values[col];
      nodes[col] = node;
    }
    problem.x[row] = nodes;
  }
  return problem;
}

Example #25

Source File: EngineMBMalletSeq.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Trains the sequence model on the Mallet representation of the corpus, stores it in
 * {@code model}, and then updates and saves the model info and the feature info to the data
 * directory.
 *
 * @param dataDirectory directory the info files are saved to
 * @param instanceType not used by this method
 * @param options training options; also recorded as the algorithm parameters
 */
@Override
public void trainModel(File dataDirectory, String instanceType, String options) {
  model = trainModel(corpusRepresentation.getRepresentationMallet(), options);
  updateInfo();

  // Record when and how the model was trained before persisting the info.
  info.modelWhenTrained = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
  info.algorithmParameters = options;
  info.save(dataDirectory);

  featureInfo.save(dataDirectory);
}

Example #26

Source File: MaxEntClassifierTrainer.java From baleen with Apache License 2.0

4 votes

/**
 * Trains a MaxEnt classifier with generalized-expectation constraints and persists the result.
 *
 * <p>Steps, in order: pipe the Mongo documents into instances, map features to labels, derive
 * target distributions with {@code FeatureConstraintUtil.setTargetsUsingHeuristic}, register
 * them as constraints with weight 1, clear every instance target, train, classify the same
 * instances, write the classifications to Mongo, and write the classifier to
 * {@code modelFile}.
 */
@Override
protected void execute(JobSettings settings) throws AnalysisEngineProcessException {
  InstanceList instances =
      new InstanceList(new MaxEntClassifierPipe(labelsAndFeatures.keySet(), stopwords));
  instances.addThruPipe(getDocumentsFromMongoWithRandonLabelAssignement());

  Alphabet labelAlphabet = instances.getTargetAlphabet();
  HashMap<Integer, ArrayList<Integer>> labelsByFeature =
      mapFeaturesToLabels(instances.getDataAlphabet(), labelAlphabet);

  int numLabels = labelAlphabet.size();
  HashMap<Integer, double[]> targetDistributions =
      FeatureConstraintUtil.setTargetsUsingHeuristic(labelsByFeature, numLabels, 0.9);

  MaxEntKLFLGEConstraints klConstraints =
      new MaxEntKLFLGEConstraints(instances.getDataAlphabet().size(), numLabels, false);
  targetDistributions.forEach(
      (feature, distribution) -> klConstraints.addConstraint(feature, distribution, 1));

  ArrayList<MaxEntGEConstraint> constraints = new ArrayList<>();
  constraints.add(klConstraints);

  // Create a classifier trainer, and use it to create a classifier
  MaxEntGETrainer geTrainer = new MaxEntGETrainer(constraints);
  geTrainer.setMaxIterations(numIterations);
  geTrainer.setGaussianPriorVariance(variance);

  // Targets are removed from every instance before training.
  instances.forEach(
      instance -> {
        instance.unLock();
        instance.setTarget(null);
        instance.lock();
      });

  Classifier classifier = geTrainer.train(instances);
  List<Classification> predictions = classifier.classify(instances);

  writeClassificationToMongo(predictions);
  new ObjectFile(classifier, modelFile).write();
}

Example #27

Source File: ReferencesClassifierTrainer.java From bluima with Apache License 2.0

4 votes

/**
 * Command-line driver for the references classifier.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>pipe all corpus files into an instance list;
 *   <li>run {@code trials} random train/test splits and print precision, recall and F1 of the
 *       {@code TESTING} label plus the mean F1;
 *   <li>train a classifier on all instances, classify a sample text and print the model;
 *   <li>serialize the classifier to {@code target/classifier_<millis>.model}, reporting a
 *       failed save on stderr and continuing;
 *   <li>for each gold label ("I", "O"), train again and print a histogram of the labels
 *       predicted for the files under {@code ../annots1/<label>/}.
 * </ol>
 *
 * @param args ignored
 */
public static void main(String[] args) {

    // pipe instances
    InstanceList instanceList = new InstanceList(
            new SerialPipes(getPipes()));
    FileIterator iterator = new FileIterator(new File[] { CORPUS },
            new TxtFilter(), LAST_DIRECTORY);
    instanceList.addThruPipe(iterator);

    // ////////////////////////////////////////////////////////////////
    // cross-validate
    System.out.println("trial\tprec\trecall\tF-score");
    double f1s = 0;
    for (int i = 0; i < trials; i++) {
        Trial trial = testTrainSplit(instanceList);
        System.out.println(join(new Object[] {//
                i, trial.getPrecision(TESTING), trial.getRecall(TESTING),
                        trial.getF1(TESTING) }, "\t"));
        f1s += trial.getF1(TESTING);
    }
    System.out.println("mean F1 = " + (f1s / (trials + 0d)));

    // ////////////////////////////////////////////////////////////////
    // train
    ClassifierTrainer trainer = new MaxEntTrainer();
    Classifier c = trainer.train(instanceList);

    String txt = "in the entorhinal cortex of the rat\n"
            + "II: phase relations between unit discharges and theta field potentials.\n"
            + "J. Comp. Neurol. 67, 502–509.\n"
            + "Alonso, A., and Klink, R. (1993).\n"
            + "Differential electroresponsiveness of\n"
            + "stellate and pyramidal-like cells of\n"
            + "medial entorhinal cortex layer II.\n"
            + "J. Neurophysiol. 70, 128–143.\n"
            + "Alonso, A., and Köhler, C. (1984).\n"
            + "A study of the reciprocal connections between the septum and the\n"
            + "entorhinal area using anterograde\n"
            + "and retrograde axonal transport\n"
            + "methods in the rat brain. J. Comp.\n"
            + "Neurol. 225, 327–343.\n"
            + "Alonso, A., and Llinás, R. (1989).\n"
            + "Subthreshold sodium-dependent\n"
            + "theta-like rhythmicity in stellate\n"
            + "cells of entorhinal cortex layer II.\n"
            + "Nature 342, 175–177.\n"
            + "Amaral, D. G., and Kurz, J. (1985).\n"
            + "An analysis of the origins of\n" + "";
    Classification classification = c.classify(c.getInstancePipe()
            .instanceFrom(new Instance(txt, null, null, null)));
    System.out.println("LABELL " + classification.getLabeling());
    c.print();

    // Saving the model is best effort: a failure must not abort the evaluation below, but
    // it is reported instead of being silently dropped. try-with-resources closes both
    // streams even when writing fails.
    String modelPath = "target/classifier_" + currentTimeMillis() + ".model";
    try (FileOutputStream fileOut = new FileOutputStream(modelPath);
            ObjectOutputStream oos = new ObjectOutputStream(fileOut)) {
        oos.writeObject(c);
    } catch (Exception e) {
        System.err.println("Could not save classifier to " + modelPath);
        e.printStackTrace();
    }

    // //////////////////////////////////////////////////////////////////
    // train test
    for (String goldLabel : new String[] { "I", "O" }) {
        ClassifierTrainer trainer2 = new MaxEntTrainer();
        Classifier c2 = trainer2.train(instanceList);

        FileIterator iteratorI = new FileIterator(new File[] { new File(
                CORPUS, "../annots1/" + goldLabel + "/") },
                new TxtFilter(), LAST_DIRECTORY);
        Iterator<Instance> instancesI = c2.getInstancePipe()
                .newIteratorFrom(iteratorI);

        // Count how often each label is predicted for the files of this gold label.
        Histogram<String> h = new Histogram<String>();
        while (instancesI.hasNext()) {
            Instance inst = instancesI.next();
            Labeling labeling = c2.classify(inst).getLabeling();
            Label bestLabel = labeling.getBestLabel();
            h.add(bestLabel.toString());

            // if (!bestLabel.toString().equals(goldLabel)) {
            // LOG.debug(
            // "\n\n\nMISSCLASSIFIED as {} but gold:{} :: "
            // + inst.getSource(), bestLabel, goldLabel);
            // }
        }
        System.out.println("\nlabel " + goldLabel + "\n" + h);
    }
}

Example #28

Source File: BrainRegionAnnotator.java From bluima with Apache License 2.0

4 votes

/**
 * Trains the CRF on the training set with a threaded label-likelihood trainer (Gaussian prior
 * variance 1) and evaluates it on the testing set with both the strict
 * {@code MyMultiSegmentationEvaluator} and the {@code LenientMultiSegmentationEvaluator}.
 *
 * <p>{@code Jcas2TokenSequence.TARGET_I} is used as both the segment-start tag and the
 * segment-continue tag.
 *
 * @param iterationId fold number, used for logging only
 * @param crf the CRF to train
 * @param trainingSet instances to train on
 * @param testingSet instances to evaluate on
 * @param threads number of threads handed to the trainer
 * @return a fold built from the strict evaluator
 */
private static Fold evaluate(int iterationId, CRF crf,
        InstanceList trainingSet, InstanceList testingSet, int threads) {

    // TODO 1 see if it works (better) with simpler setup

    CRFTrainerByThreadedLabelLikelihood trainer = new CRFTrainerByThreadedLabelLikelihood(
            crf, threads);
    // CRFTrainerByLabelLikelihood trainer = new
    // CRFTrainerByLabelLikelihood(crf);
    trainer.setGaussianPriorVariance(1);

    String[] tags = new String[] { Jcas2TokenSequence.TARGET_I };
    String[] continueTags = tags;

    trainer.train(trainingSet);

    MyMultiSegmentationEvaluator eval = new MyMultiSegmentationEvaluator(
            new InstanceList[] { testingSet },//
            new String[] { "TTesting" }, tags, continueTags,
            PRINT_MISSCLASSIFIED);
    eval.evaluate(trainer); // eval at end of training

//        MultiSegmentationEvaluator evalOrig = new MultiSegmentationEvaluator(
//                new InstanceList[] { testingSet },//
//                new String[] { "TTesting" }, tags, continueTags);
//        evalOrig.evaluate(trainer); // eval at end of training

    LenientMultiSegmentationEvaluator evalLenient = new LenientMultiSegmentationEvaluator(
            new InstanceList[] { testingSet },//
            new String[] { "TTesting" }, tags, continueTags,
            PRINT_MISSCLASSIFIED);
    evalLenient.evaluate(trainer);

    // Pass eval as a parameter instead of concatenating it into the format string: a "{}"
    // inside its text can then no longer shift the placeholders, and toString() is only
    // evaluated when INFO is enabled.
    LOG.info("FOLD {} --> {} lenient: {}", iterationId, eval, evalLenient);
    return new Fold(eval);

    // TODO trainer.trainWithFeatureInduction

    // TODO
    // if ( runner.isInduceFeatures() ) {
    // // Number of maximizer iterations between each call to the Feature
    // Inducer. (10 in simpletagger and TUI)
    // int numIterationsBetweenFeatureInductions = 10;
    //
    // // Maximum number of rounds of feature induction. (20 in
    // simpleTagger, 99 in TUI)
    // int numFeatureInductions = 20;
    //
    // // Maximum number of features to induce at each round of induction.
    // (500 in simpletagger, 200 in TUI)
    // int numFeaturesPerFeatureInduction = 300;
    // // splits = new double[] {.1, .2, .5, .7}
    //
    // crft.trainWithFeatureInduction( training, null, testing, eval,
    // iterations,
    // numIterationsBetweenFeatureInductions, numFeatureInductions,
    // numFeaturesPerFeatureInduction, 0.5,
    // false, null );
    // } else {
    // // before
    // converged = crft.train( training ); // , iterations );
    // }
}

Example #29

Source File: CorpusRepresentationLibSVM.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Returns the {@code instances} of the {@code crm} field.
 *
 * @return the instance list held by {@code crm}
 */
@Override
public InstanceList getRepresentationMallet() {
  InstanceList malletInstances = crm.instances;
  return malletInstances;
}

Example #30

Source File: MultiSegmentationEvaluator.java From bluima with Apache License 2.0

4 votes

/**
 * Evaluates the trainer's transducer on the given instances and prints token accuracy and
 * per-tag segment precision, recall and F1 to {@code System.err}.
 *
 * <p>Counters are kept per entry of {@code segmentStartTags}, plus one extra "OVERALL" slot at
 * index {@code segmentStartTags.length} that aggregates all tags. A segment counts as correct
 * when truth and prediction start the same tag at the same position and stop continuing it at
 * the same position.
 *
 * <p>Precision is reported as 1 when nothing was predicted for a tag, and recall as 1 when
 * there is no true segment for it.
 *
 * @param tt trainer whose transducer is evaluated
 * @param data instances whose data and target are cast to {@code Sequence}
 * @param description label prefixed to every printed result line
 */
public void evaluateInstanceList(TransducerTrainer tt, InstanceList data,
        String description) {
    Transducer model = tt.getTransducer();
    int numCorrectTokens, totalTokens;
    int[] numTrueSegments, numPredictedSegments, numCorrectSegments;
    // Index of the extra slot that aggregates the counts over all tags.
    int allIndex = segmentStartTags.length;
    numTrueSegments = new int[allIndex + 1];
    numPredictedSegments = new int[allIndex + 1];
    numCorrectSegments = new int[allIndex + 1];

    totalTokens = numCorrectTokens = 0;
    for (int n = 0; n < numTrueSegments.length; n++)
        numTrueSegments[n] = numPredictedSegments[n] = numCorrectSegments[n] = 0;
    for (int i = 0; i < data.size(); i++) {
        Instance instance = data.get(i);
        Sequence input = (Sequence) instance.getData();
        // String tokens = null;
        // if (instance.getSource() != null)
        // tokens = (String) instance.getSource().toString();
        Sequence trueOutput = (Sequence) instance.getTarget();
        assert (input.size() == trueOutput.size());
        Sequence predOutput = model.transduce(input);
        assert (predOutput.size() == trueOutput.size());
        int trueStart, predStart; // -1 for non-start, otherwise index into
                                  // segmentStartTag
        for (int j = 0; j < trueOutput.size(); j++) {
            totalTokens++;
            if (trueOutput.get(j).equals(predOutput.get(j)))
                numCorrectTokens++;
            trueStart = predStart = -1;
            // Count true segment starts
            // (stops at the first matching start tag)
            for (int n = 0; n < segmentStartTags.length; n++) {
                if (segmentStartTags[n].equals(trueOutput.get(j))) {
                    numTrueSegments[n]++;
                    numTrueSegments[allIndex]++;
                    trueStart = n;
                    break;
                }
            }
            // Count predicted segment starts
            // NOTE(review): unlike the loop above there is no break here, so every matching
            // start tag is counted and the last match wins; this only differs from the loop
            // above if segmentStartTags contains equal entries -- confirm this is intended.
            for (int n = 0; n < segmentStartTags.length; n++) {
                if (segmentStartTags[n].equals(predOutput.get(j))) {
                    numPredictedSegments[n]++;
                    numPredictedSegments[allIndex]++;
                    predStart = n;
                }
            }
            if (trueStart != -1 && trueStart == predStart) {
                // Truth and Prediction both agree that the same segment
                // tag-type is starting now
                int m;
                boolean trueContinue = false;
                boolean predContinue = false;
                // Walk forward until at least one of the two sequences stops continuing.
                for (m = j + 1; m < trueOutput.size(); m++) {
                    trueContinue = segmentContinueTags[predStart]
                            .equals(trueOutput.get(m));
                    predContinue = segmentContinueTags[predStart]
                            .equals(predOutput.get(m));
                    if (!trueContinue || !predContinue) {
                        if (trueContinue == predContinue) {
                            // They agree about a segment is ending somehow
                            numCorrectSegments[predStart]++;
                            numCorrectSegments[allIndex]++;
                        }
                        break;
                    }
                }
                // for the case of the end of the sequence
                // (m equals the size only when the loop above ended without a break)
                if (m == trueOutput.size()) {
                    if (trueContinue == predContinue) {
                        numCorrectSegments[predStart]++;
                        numCorrectSegments[allIndex]++;
                    }
                }
            }
        }
    }
    DecimalFormat f = new DecimalFormat("0.####");
    // NOTE(review): with no tokens at all this is 0.0 / 0, i.e. NaN.
    System.err.println(description + " tokenaccuracy="
            + f.format(((double) numCorrectTokens) / totalTokens));
    // One report per start tag, followed by the aggregated "OVERALL" slot.
    for (int n = 0; n < numCorrectSegments.length; n++) {
        System.err.println((n < allIndex ? segmentStartTags[n].toString()
                : "OVERALL") + ' ');
        double precision = numPredictedSegments[n] == 0 ? 1
                : ((double) numCorrectSegments[n])
                        / numPredictedSegments[n];
        double recall = numTrueSegments[n] == 0 ? 1
                : ((double) numCorrectSegments[n]) / numTrueSegments[n];
        double f1 = recall + precision == 0.0 ? 0.0
                : (2.0 * recall * precision) / (recall + precision);
        System.err.println(" " + description + " segments true="
                + numTrueSegments[n] + " pred=" + numPredictedSegments[n]
                + " correct=" + numCorrectSegments[n] + " misses="
                + (numTrueSegments[n] - numCorrectSegments[n]) + " alarms="
                + (numPredictedSegments[n] - numCorrectSegments[n]));
        System.err.println(" " + description + " precision="
                + f.format(precision) + " recall=" + f.format(recall)
                + " f1=" + f.format(f1));
    }

}