cc.mallet.types.Instance Java Examples

The following examples show how to use cc.mallet.types.Instance. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CorpusRepresentationLibSVM.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Convert the data of a Mallet instance into an array of LibSVM nodes.
 *
 * The instance data is cast to a SparseVector without a prior type check.
 *
 * @param malletInstance the Mallet instance to convert
 * @return one node per stored location, with every feature index shifted up by one
 */
public static svm_node[] libSVMInstanceIndepFromMalletInstance(
        cc.mallet.types.Instance malletInstance) {

  // TODO: the data is not checked to be a sparse vector before the cast; the original
  // note expects this to hold except for instances coming from MalletSeq.
  SparseVector vector = (SparseVector) malletInstance.getData();
  int[] featureIndices = vector.getIndices();
  double[] featureValues = vector.getValues();
  svm_node[] nodes = new svm_node[featureIndices.length];
  for (int pos = 0; pos < featureIndices.length; pos++) {
    svm_node current = new svm_node();
    current.index = featureIndices[pos] + 1;   // NOTE: LibSVM locations have to start with 1
    current.value = featureValues[pos];
    nodes[pos] = current;
  }
  return nodes;
}
 
Example #2
Source File: RemoveStopwordsTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
@Test
public void testStopwordsAreRemoved() {
  // Two of the four tokens are configured as stopwords, the other two must survive.
  String firstStopword = "stop";
  String secondStopword = "word";
  String firstKept = "white";
  String secondKept = "list";

  ImmutableList<Token> tokens =
      ImmutableList.of(
          new Token(firstStopword),
          new Token(secondStopword),
          new Token(firstKept),
          new Token(secondKept));
  Instance input = new Instance(new TokenSequence(tokens), null, null, null);

  RemoveStopwords remover = new RemoveStopwords(ImmutableList.of(firstStopword, secondStopword));
  TokenSequence remaining = (TokenSequence) remover.pipe(input).getData();

  assertEquals(2, remaining.size());
  assertEquals(
      ImmutableSet.of(firstKept, secondKept),
      remaining.stream().map(Token::getText).collect(Collectors.toSet()));
}
 
Example #3
Source File: MalletClassifierTrainerTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Check that the model file exists, that the loaded classifier knows the labels "pos" and
 * "neg", and that classifying two sample sentences yields one of those labels each.
 */
private void validateModel() {
  File savedModel = modelPath.toFile();
  assertTrue(savedModel.exists());

  Classifier trained = new FileObject<Classifier>(savedModel.getPath()).object();
  assertTrue(trained.getLabelAlphabet().contains("pos"));
  assertTrue(trained.getLabelAlphabet().contains("neg"));

  InstanceList samples = new InstanceList(trained.getInstancePipe());
  samples.addThruPipe(new Instance("I love this amazing awesome classifier.", "", null, null));
  samples.addThruPipe(new Instance("I can't stand this horrible test.", "", null, null));

  // Only membership in the label set is checked here, not which label is predicted.
  ImmutableSet<String> knownLabels = ImmutableSet.of("pos", "neg");
  for (int i = 0; i < 2; i++) {
    String predicted =
        trained.classify(samples.get(i)).getLabeling().getBestLabel().toString();
    assertTrue(knownLabels.contains(predicted));
  }
}
 
Example #4
Source File: MaxEntClassifierTrainerTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
@Test
public void testTaskProducesValidModelFile() throws Exception {

  File savedModel = modelPath.toFile();
  assertTrue(savedModel.exists());

  Classifier trained = new FileObject<Classifier>(savedModel.getPath()).object();
  assertTrue(trained.getLabelAlphabet().contains("pos"));
  assertTrue(trained.getLabelAlphabet().contains("neg"));

  InstanceList samples = new InstanceList(trained.getInstancePipe());
  samples.addThruPipe(new Instance("I love this amazing awesome classifier.", null, null, null));
  samples.addThruPipe(new Instance("I can't stand this horrible test.", null, null, null));

  // The first sentence is expected to be classified as "pos", the second as "neg".
  String[] expectedLabels = {"pos", "neg"};
  for (int i = 0; i < expectedLabels.length; i++) {
    assertEquals(
        expectedLabels[i],
        trained.classify(samples.get(i)).getLabeling().getBestLabel().toString());
  }
}
 
Example #5
Source File: FeatureExtractionMalletSparse.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Extract the numeric target value and set it as the target of the instance.
 *
 * The value is read from the feature map of the instance annotation. A Number is used
 * through its double value; any other object is converted to a String and parsed as a double.
 *
 * @param inst instance that receives the target
 * @param targetFeature name of the feature holding the target value
 * @param instanceAnnotation instance annotation the feature is read from
 * @param inputAS input annotation set, used here only to obtain the document for error messages
 * @throws GateRuntimeException if the feature has no value or the value cannot be parsed
 *         as a double
 */
public static void extractNumericTarget(Instance inst, String targetFeature, Annotation instanceAnnotation, AnnotationSet inputAS) {
  Document doc = inputAS.getDocument();
  Object obj = instanceAnnotation.getFeatures().get(targetFeature);
  // A missing target cannot be handled here, so report where it happened.
  if (obj == null) {
    throw new GateRuntimeException("No target value for feature " + targetFeature
            + " for instance at offset " + gate.Utils.start(instanceAnnotation) + " in document " + doc.getName());
  }
  double value;
  if (obj instanceof Number) {
    value = ((Number) obj).doubleValue();
  } else {
    try {
      value = Double.parseDouble(obj.toString());
    } catch (NumberFormatException ex) {
      // Pass the parse failure on as the cause so the offending text is not lost.
      throw new GateRuntimeException("Could not convert target value to a double for feature " + targetFeature
              + " for instance at offset " + gate.Utils.start(instanceAnnotation) + " in document " + doc.getName(), ex);
    }
  }
  inst.setTarget(value);
}
 
Example #6
Source File: FeatureExtractionMalletSparse.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Extract a feature by dispatching to the helper that matches the attribute's type.
 *
 * @param inst instance
 * @param att attribute
 * @param inputAS input annotation set
 * @param instanceAnnotation instance annotation
 * @throws GateRuntimeException if the attribute is of none of the handled types
 */
public static void extractFeature(
        Instance inst,
        FeatureSpecAttribute att,
        AnnotationSet inputAS,
        Annotation instanceAnnotation) {
  // The type checks are done in the same order as before: list, simple attribute, ngram.
  if (att instanceof FeatureSpecAttributeList) {
    extractFeatureHelper(inst, (FeatureSpecAttributeList) att, inputAS, instanceAnnotation);
    return;
  }
  if (att instanceof FeatureSpecSimpleAttribute) {
    extractFeatureHelper(inst, (FeatureSpecSimpleAttribute) att, inputAS, instanceAnnotation);
    return;
  }
  if (att instanceof FeatureSpecNgram) {
    extractFeatureHelper(inst, (FeatureSpecNgram) att, inputAS, instanceAnnotation);
    return;
  }
  throw new GateRuntimeException("Attempt to call extractFeature with type " + att.getClass());
}
 
Example #7
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
public void extractSimple2() {
  String spec = "<ROOT>"
          + "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE></ATTRIBUTE>"
          + "</ROOT>";
  List<FeatureSpecAttribute> attributes = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  // Set up the document: one instance annotation and two "theType" annotations
  // that both span offsets 0 to 5.
  Annotation instAnn = addAnn(doc, "", 0, 10, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1","f1v1"))
          .getFeatures().put("feature2", "valOfFeature2");
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1","f1v2"))
          .getFeatures().put("feature2", "valOfFeature2B");

  // We do not allow more than one overlapping annotation of the given type for ATTRIBUTE
  FeatureExtractionMalletSparse.extractFeature(inst, attributes.get(0), doc.getAnnotations(), instAnn);
}
 
Example #8
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
@Test
public void extractSimple3() {
  String spec = "<ROOT>"
          + "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE></ATTRIBUTE>"
          + "</ROOT>";
  List<FeatureSpecAttribute> attributes = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  // Set up the document: one instance annotation and a single "theType" annotation.
  Annotation instAnn = addAnn(doc, "", 0, 10, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1","f1v1"))
          .getFeatures().put("feature2", "valOfFeature2");

  // We do not allow more than one overlapping annotation of the given type for ATTRIBUTE
  FeatureExtractionMalletSparse.extractFeature(inst, attributes.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+attributes.get(0)+" (overlapping) FV="+inst.getData());
}
 
Example #9
Source File: MalletClassifierTrainer.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Iterate over all documents of the collection as Mallet instances.
 *
 * The content field becomes the data, the hex string of the "_id" the name. The label is
 * looked up on the document first and then on its metadata; "UNKNOWN" is used if neither
 * lookup yields one.
 *
 * @return iterator over the created instances
 */
private Iterator<Instance> getDocumentsFromMongo() {
  FindIterable<Document> allDocuments = documentsCollection.find();
  return FluentIterable.from(new MongoIterable(allDocuments))
      .transform(
          mongoDoc -> {
            String id = mongoDoc.getObjectId("_id").toHexString();
            String text = mongoDoc.getString(contentField);
            Optional<String> ownLabel = getLabel(mongoDoc);
            // Only consult the metadata when the document itself carries no label.
            Optional<String> label =
                ownLabel.isPresent()
                    ? ownLabel
                    : getLabel((Document) mongoDoc.get(Mongo.FIELD_METADATA));
            return new Instance(text, label.orElse("UNKNOWN"), id, null);
          })
      .iterator();
}
 
Example #10
Source File: CorpusRepresentationMalletTarget.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Extract the independent features for a single instance annotation.
 *
 * A new instance is created whose data is an AugmentableFeatureVector over the data
 * alphabet of the pipe. Each attribute of the featureInfo object is then extracted into
 * that instance, and finally the data is replaced by the result of toFeatureVector().
 *
 * NOTE: this method is static so that it can be used in the CorpusRepresentationMalletSeq class too.
 *
 * @param instanceAnnotation instance annotation
 * @param inputAS input annotation set
 * @param featureInfo feature info instance
 * @param pipe mallet pipe
 * @return the new instance; target, name and source are left null
 */
static Instance extractIndependentFeaturesHelper(
        Annotation instanceAnnotation,
        AnnotationSet inputAS,
        FeatureInfo featureInfo,
        Pipe pipe) {

  AugmentableFeatureVector growable = new AugmentableFeatureVector(pipe.getDataAlphabet());
  // Instance constructor arguments: data, target, name, source
  Instance result = new Instance(growable, null, null, null);
  for (FeatureSpecAttribute spec : featureInfo.getAttributes()) {
    FeatureExtractionMalletSparse.extractFeature(result, spec, inputAS, instanceAnnotation);
  }
  // TODO: the AugmentableFeatureVector is destructively replaced by a FeatureVector here;
  // it is not clear if this is beneficial, the assumption is that it is.
  AugmentableFeatureVector filled = (AugmentableFeatureVector) result.getData();
  result.setData(filled.toFeatureVector());
  return result;
}
 
Example #11
Source File: TopicModelTrainer.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * For every instance, find the topic with the highest probability and store its keywords
 * and topic number on the document whose id equals the instance name.
 *
 * @param instances instances in the order the model knows them
 * @param topicWords lookup of the words for a topic
 * @param model model providing the topic probabilities per document index
 */
private void writeTopicAssignmentsToMongo(
    InstanceList instances, TopicWords topicWords, ParallelTopicModel model) {
  int documentCount = instances.size();
  for (int docIndex = 0; docIndex < documentCount; docIndex++) {
    int bestTopic = new MaximumIndex(model.getTopicProbabilities(docIndex)).find();
    Instance current = instances.get(docIndex);
    List<String> keywords = topicWords.forTopic(bestTopic);

    documentsCollection.findOneAndUpdate(
        Filters.eq(new ObjectId((String) current.getName())),
        Updates.set(
            TOPIC_FIELD,
            new Document()
                .append(KEYWORDS_FIELD, keywords.toString())
                .append(TOPIC_NUMBER_FIELD, bestTopic)));
  }
}
 
Example #12
Source File: PipeScaleMeanVarAll.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Rescale the values of the instance's feature vector in place.
 *
 * For every stored location whose feature index is flagged in {@code normalize}, the value
 * is replaced by {@code (value - mean) / sqrt(variance)} using the statistics stored for
 * that feature index.
 *
 * @param carrier instance whose data must be a FeatureVector
 * @return the same instance that was passed in
 * @throws IllegalArgumentException if the data is not a FeatureVector (including null)
 * @throws GateRuntimeException if the statistics arrays do not match the data alphabet size
 */
@Override
public Instance pipe(Instance carrier) {
  Object data = carrier.getData();
  if (!(data instanceof FeatureVector)) {
    // Null-safe so that missing data is reported through the intended exception too.
    throw new IllegalArgumentException("Data must be of type FeatureVector not "
            + (data == null ? null : data.getClass()) + " we got " + data);
  }

  if (this.means.length != this.getDataAlphabet().size()
          || this.variances.length != this.getDataAlphabet().size()) {
    throw new GateRuntimeException("Size mismatch, alphabet="+getDataAlphabet().size()+", stats="+means.length);
  }

  FeatureVector fv = (FeatureVector) data;
  int[] indices = fv.getIndices();
  double[] values = fv.getValues();
  for (int i = 0; i < indices.length; i++) {
    int index = indices[i];
    if(normalize[index]) {
      double value = values[i];
      double mean = means[index];
      double variance = variances[index];
      // NOTE(review): a variance of 0 makes this division yield NaN or an infinity -
      // confirm the statistics never contain 0 for a normalized feature.
      double newvalue = (value - mean) / Math.sqrt(variance);
      fv.setValue(index, newvalue);
    }
  }
  return carrier;
}
 
Example #13
Source File: TopicModel.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Infer the most probable topic for the document text and add its words as metadata.
 *
 * @param jCas the JCas whose document text is processed
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {

  // Send the document text through the pipe as the only instance of a fresh list.
  InstanceList singleDocument = new InstanceList(pipe);
  Instance fromJCas = new Instance(jCas.getDocumentText(), null, "from jcas", null);
  singleDocument.addThruPipe(fromJCas);

  double[] distribution =
      model
          .getInferencer()
          .getSampledDistribution(singleDocument.get(0), iterations, thining, burnIn);

  int bestTopic = new MaximumIndex(distribution).find();
  List<String> words = topicWords.forTopic(bestTopic);

  Metadata topicMetadata = new Metadata(jCas);
  topicMetadata.setKey(metadataKey);
  topicMetadata.setValue(words.toString());
  addToJCasIndex(topicMetadata);
}
 
Example #14
Source File: TopicModelTrainer.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Iterate over all documents of the collection as Mallet instances.
 *
 * The content field becomes the data and the hex string of the "_id" the name; target and
 * source are left null.
 *
 * @return iterator over the created instances
 */
private Iterator<Instance> getDocumentsFromMongo() {
  FindIterable<Document> allDocuments = documentsCollection.find();
  return FluentIterable.from(new MongoIterable(allDocuments))
      .transform(
          mongoDoc ->
              new Instance(
                  mongoDoc.getString(contentField),
                  null,
                  mongoDoc.getObjectId("_id").toHexString(),
                  null))
      .iterator();
}
 
Example #15
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
@Test
public void extractList1() {
  String spec = "<ROOT>"
          + "<ATTRIBUTELIST><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><DATATYPE>nominal</DATATYPE><FROM>-1</FROM><TO>1</TO></ATTRIBUTELIST>"
          + "</ROOT>";
  List<FeatureSpecAttribute> attributes = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  // Set up the document: the instance annotation at 10-11 and ten adjacent "theType"
  // annotations of length 2 starting at offset 0, carrying tok1 ... tok10.
  Annotation instAnn = addAnn(doc, "", 10, 11, "instanceType", gate.Utils.featureMap());
  for (int n = 1; n <= 10; n++) {
    int start = 2 * (n - 1);
    addAnn(doc,"",start,start + 2,"theType",gate.Utils.featureMap("theFeature","tok" + n));
  }

  FeatureExtractionMalletSparse.extractFeature(inst, attributes.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+attributes.get(0)+" (list -1to1) FV="+inst.getData());
  System.err.println("Alphabet L1="+inst.getAlphabet());

  // Feature names expected for list positions -1, 0 and 1.
  String[] expectedNames = {
    "theType┆theFeature╬L-1═tok5",
    "theType┆theFeature╬L0═tok6",
    "theType┆theFeature╬L1═tok7"
  };
  assertEquals(3,inst.getAlphabet().size());
  for (String name : expectedNames) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  FeatureVector fv = (FeatureVector)inst.getData();
  assertEquals(3,fv.numLocations());
  for (String name : expectedNames) {
    assertEquals(1.0,fv.value(name),EPS);
  }
}
 
Example #16
Source File: FeatureExtractionMalletSparse.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Tell whether the instance is flagged as having a missing value.
 *
 * @param inst instance
 * @return false if the property PROP_HAVE_MV is not set, otherwise its Boolean value
 */
public static boolean instanceHasMV(Instance inst) {
  Object flag = inst.getProperty(PROP_HAVE_MV);
  return flag != null && (Boolean) flag;
}
 
Example #17
Source File: EngineMBMalletClass.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Classify every instance annotation and return one model application per annotation.
 *
 * Growth of the corpus representation is stopped while the instances are classified and
 * started again afterwards, also when classification fails with an exception. The best
 * label is additionally stored in the feature "gate.LF.target" of each instance annotation.
 *
 * @param instanceAS instance annotations, processed in document order
 * @param inputAS input annotation set used for feature extraction
 * @param sequenceAS not used by this implementation
 * @param parms not used by this implementation
 * @return the model applications in document order of the instance annotations
 * @throws GateRuntimeException if the corpus representation is not a
 *         CorpusRepresentationMalletTarget
 */
@Override
public List<ModelApplication> applyModel(
        AnnotationSet instanceAS, AnnotationSet inputAS, AnnotationSet sequenceAS, String parms) {
  // NOTE: the crm should be of type CorpusRepresentationMalletTarget for this to work!
  if(!(corpusRepresentation instanceof CorpusRepresentationMalletTarget)) {
    throw new GateRuntimeException("Cannot perform classification with data from "+corpusRepresentation.getClass());
  }
  CorpusRepresentationMalletTarget data = (CorpusRepresentationMalletTarget)corpusRepresentation;
  List<ModelApplication> gcs = new ArrayList<>();
  data.stopGrowth();
  try {
    LFPipe pipe = (LFPipe)data.getRepresentationMallet().getPipe();
    Classifier classifier = (Classifier)model;
    // iterate over the instance annotations and create mallet instances
    for(Annotation instAnn : instanceAS.inDocumentOrder()) {
      Instance inst = data.extractIndependentFeatures(instAnn, inputAS);
      inst = pipe.instanceFrom(inst);
      Classification classification = classifier.classify(inst);
      Labeling labeling = classification.getLabeling();
      LabelVector labelvec = labeling.toLabelVector();
      List<String> classes = new ArrayList<>(labelvec.numLocations());
      List<Double> confidences = new ArrayList<>(labelvec.numLocations());
      for(int i=0; i<labelvec.numLocations(); i++) {
        classes.add(labelvec.getLabelAtRank(i).toString());
        confidences.add(labelvec.getValueAtRank(i));
      }
      ModelApplication gc = new ModelApplication(instAnn, labeling.getBestLabel().toString(),
              labeling.getBestValue(), classes, confidences);
      // now save the class in our special class feature on the instance as well
      instAnn.getFeatures().put("gate.LF.target",labeling.getBestLabel().toString());
      gcs.add(gc);
    }
  } finally {
    // Always undo stopGrowth(), otherwise a failure would leave growth switched off.
    data.startGrowth();
  }
  return gcs;
}
 
Example #18
Source File: MalletClassifier.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Classify the document text and add the best label as metadata.
 *
 * @param jCas the JCas whose document text is classified
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {

  // Send the document text through the classifier's pipe as the only instance.
  InstanceList singleDocument = new InstanceList(classifierModel.getInstancePipe());
  Instance fromJCas = new Instance(jCas.getDocumentText(), "", "from jcas", null);
  singleDocument.addThruPipe(fromJCas);

  Classification result = classifierModel.classify(singleDocument.get(0));

  Metadata labelMetadata = new Metadata(jCas);
  labelMetadata.setKey(metadataKey);
  labelMetadata.setValue(result.getLabeling().getBestLabel().toString());
  addToJCasIndex(labelMetadata);
}
 
Example #19
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
@Test
public void extractSimpleList2() {
  String spec = "<ROOT>"
          + "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE><LISTSEP>:</LISTSEP></ATTRIBUTE>"
          + "</ROOT>";
  List<FeatureSpecAttribute> attributes = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  // Set up the document: two instance annotations, each covering one "theType"
  // annotation whose feature1 holds a colon-separated list of values.
  Annotation firstInstance = addAnn(doc, "", 0, 10, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1","lval1:lval2:lval3"));
  Annotation secondInstance = addAnn(doc, "", 11, 20, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 12, 15, "theType", gate.Utils.featureMap("feature1","lval1:lval4:lval5"));

  // First instance: every list element is expected as its own feature with value 1.0.
  String[] expectedFirst = {
    "theType┆feature1╬A═lval1",
    "theType┆feature1╬A═lval2",
    "theType┆feature1╬A═lval3"
  };
  FeatureExtractionMalletSparse.extractFeature(inst, attributes.get(0), doc.getAnnotations(), firstInstance);
  FeatureVector fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList2a: "+fv.toString(true));
  for (String name : expectedFirst) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(3,fv.numLocations());
  for (String name : expectedFirst) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // Second instance: a fresh instance is created from the alphabet of the first one.
  String[] expectedSecond = {
    "theType┆feature1╬A═lval1",
    "theType┆feature1╬A═lval4",
    "theType┆feature1╬A═lval5"
  };
  inst = newInstance(inst.getAlphabet());
  FeatureExtractionMalletSparse.extractFeature(inst, attributes.get(0), doc.getAnnotations(), secondInstance);
  fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList2b: "+fv.toString(true));
  for (String name : expectedSecond) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(3,fv.numLocations());
  for (String name : expectedSecond) {
    assertEquals(1.0,fv.value(name),EPS);
  }

}
 
Example #20
Source File: MaxEntClassifierTrainer.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Store the best label of every classification on the document whose id equals the name
 * of the classified instance.
 *
 * @param classify the classifications to write
 */
private void writeClassificationToMongo(List<Classification> classify) {
  for (Classification result : classify) {
    Instance classified = result.getInstance();
    documentsCollection.findOneAndUpdate(
        Filters.eq(new ObjectId((String) classified.getName())),
        Updates.set(CLASSIFICATION_FIELD, result.getLabeling().getBestLabel().toString()));
  }
}
 
Example #21
Source File: CorpusExporterMRJsonTarget.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Convert instance to string.
 *
 * The result is a bracketed list that starts with the feature vector representation and,
 * if the instance has a non-null target, continues with ", " and the target representation.
 *
 * @param inst instance
 * @param targetAlphabet target alphabet
 * @param attrs attributes
 * @param nrFeatures number of features
 * @param asString represent as quoted string
 * @param filterMV filter missing values
 * @return string representation, or null if filterMV is set and the instance property
 *         PROP_IGNORE_HAS_MV equals true
 */
public String instance2String(
        Instance inst,
        LabelAlphabet targetAlphabet,
        Attributes attrs,
        int nrFeatures,
        boolean asString,
        boolean filterMV) {
  StringBuilder out = new StringBuilder("[");  // outermost list
  FeatureVector features = (FeatureVector)inst.getData();
  Object target = inst.getTarget();
  if (filterMV
          && Boolean.TRUE.equals(inst.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV))) {
    return null;
  }
  out.append(featureVector2String(features, nrFeatures, attrs, asString));
  // for now the target is only written when it is not null, this may change in the future
  if (target != null) {
    out.append(", ").append(target2String(target, targetAlphabet, asString));
  }
  return out.append("]").toString();  // close outer list
}
 
Example #22
Source File: LDAModelEstimator.java    From RankSys with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Iterate over one instance per user index.
 *
 * For each preference of a user, {@code v1} is added to the user's feature sequence as
 * many times as {@code v2} truncated to an int says.
 *
 * @return iterator over the created instances
 */
@Override
public Iterator<Instance> iterator() {
    return preferences.getAllUidx()
            .mapToObj(preferences::getUidxPreferences)
            .map(userPreferences -> {
                FeatureSequence items = new FeatureSequence(alphabet);
                userPreferences.forEach(pref -> {
                    int repetitions = (int) pref.v2;
                    for (int k = 0; k < repetitions; k++) {
                        items.add(pref.v1);
                    }
                });

                return new Instance(items, null, null, null);
            })
            .iterator();
}
 
Example #23
Source File: MaxEntClassifierTrainer.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Iterate over all documents of the collection as Mallet instances without a target.
 *
 * The content field becomes the data and the hex string of the "_id" the name. No label
 * is assigned in this method; target and source are null.
 *
 * @return iterator over the created instances
 */
private Iterator<Instance> getDocumentsFromMongoWithRandonLabelAssignement() {
  FindIterable<Document> allDocuments = documentsCollection.find();
  FluentIterable<Instance> unlabelled =
      FluentIterable.from(new MongoIterable(allDocuments))
          .transform(
              mongoDoc ->
                  new Instance(
                      mongoDoc.getString(contentField),
                      null,
                      mongoDoc.getObjectId("_id").toHexString(),
                      null));
  return unlabelled.iterator();
}
 
Example #24
Source File: RemoveStopwords.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Replace the token sequence of the instance by a new one without the stopwords.
 *
 * @param carrier instance whose data is cast to a TokenSequence
 * @return the same instance, now holding the filtered sequence
 */
@Override
public Instance pipe(Instance carrier) {
  TokenSequence original = (TokenSequence) carrier.getData();
  TokenSequence kept = new TokenSequence();
  for (Token token : original) {
    if (stopwords.contains(token.getText())) {
      continue;
    }
    kept.add(token);
  }
  carrier.setData(kept);
  return carrier;
}
 
Example #25
Source File: FVStatsMeanVarAll.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Create the statistics from all instances of a list.
 *
 * The data of every instance is cast to a FeatureVector and added, then finish() is called.
 *
 * @param instances instances
 */
public FVStatsMeanVarAll(InstanceList instances) {
  for (Instance current : instances) {
    addFeatureVector((FeatureVector) current.getData());
  }
  finish();
}
 
Example #26
Source File: FeatureExtractionMalletSparse.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Get the flag indicating if the instance should be ignored because of a missing value.
 *
 * @param inst instance
 * @return false if the property PROP_IGNORE_HAS_MV is not set, otherwise its Boolean value
 */
public static boolean ignoreInstanceWithMV(Instance inst) {
  Object flag = inst.getProperty(PROP_IGNORE_HAS_MV);
  return flag != null && (Boolean) flag;
}
 
Example #27
Source File: MalletCalculator.java    From TagRec with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Build the instance list from the maps.
 *
 * For every map a list of strings is created in which each key, converted to a string,
 * is repeated as often as its value says. That list becomes the data of a new instance
 * which is added to the instance list through the StringList2FeatureSequence pipe.
 */
private void initializeDataStructures() {
	this.instances = new InstanceList(new StringList2FeatureSequence());
	for (Map<Integer, Integer> map : this.maps) {
		List<String> tags = new ArrayList<String>();
		for (Map.Entry<Integer, Integer> entry : map.entrySet()) {
			String tag = entry.getKey().toString();
			int count = entry.getValue();
			for (int i = 0; i < count; i++) {
				tags.add(tag);
			}
		}
		// The constructor already stores the list as data, no extra setData call is needed.
		this.instances.addThruPipe(new Instance(tags, null, null, null));
	}
}
 
Example #28
Source File: MalletCalculatorTweet.java    From TagRec with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Build the instance list from the maps.
 *
 * For every map a list of strings is created in which each key, converted to a string,
 * is repeated as often as its value says. That list becomes the data of a new instance
 * which is added to the instance list through the StringList2FeatureSequence pipe.
 */
private void initializeDataStructures() {
    this.instances = new InstanceList(new StringList2FeatureSequence());
    for (Map<Integer, Integer> map : this.maps) {
        List<String> tags = new ArrayList<String>();
        for (Map.Entry<Integer, Integer> entry : map.entrySet()) {
            String tag = entry.getKey().toString();
            int count = entry.getValue();
            for (int i = 0; i < count; i++) {
                tags.add(tag);
            }
        }
        // The constructor already stores the list as data, no extra setData call is needed.
        this.instances.addThruPipe(new Instance(tags, null, null, null));
    }
}
 
Example #29
Source File: CorpusRepresentationLibSVM.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Create libsvm representation from Mallet.
 *
 * Every Mallet instance becomes one row of the problem: a Label target is stored as its
 * index, a Double target is stored as is, and the sparse data becomes an array of nodes
 * whose indices are shifted up by one.
 *
 * @param crm mallet representation
 * @return libsvm representation
 * @throws GateRuntimeException if a target is neither a Label nor a Double
 */
public static svm_problem getFromMallet(CorpusRepresentationMallet crm) {
  InstanceList malletInstances = crm.getRepresentationMallet();
  int rowCount = malletInstances.size();
  svm_problem problem = new svm_problem();
  problem.l = rowCount;
  problem.y = new double[rowCount];
  problem.x = new svm_node[rowCount][];

  for (int row = 0; row < rowCount; row++) {
    Instance current = malletInstances.get(row);

    // Target: a label is converted to its index, a double is used directly
    Object target = current.getTarget();
    if (target instanceof Label) {
      problem.y[row] = ((Label) target).getIndex();
    } else if (target instanceof Double) {
      problem.y[row] = (Double) target;
    } else {
      throw new GateRuntimeException("Odd target in mallet instance, cannot convert to LIBSVM: " + target);
    }

    // Features
    SparseVector vector = (SparseVector) current.getData();
    int[] featureIndices = vector.getIndices();
    double[] featureValues = vector.getValues();
    svm_node[] nodes = new svm_node[featureIndices.length];
    for (int col = 0; col < featureIndices.length; col++) {
      svm_node node = new svm_node();
      node.index = featureIndices[col] + 1; // NOTE: LibSVM location indices have to start with 1
      node.value = featureValues[col];
      nodes[col] = node;
    }
    problem.x[row] = nodes;
  }
  return problem;
}
 
Example #30
Source File: CorpusRepresentationMalletTarget.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Add instances.
 *
 * One instance is created per annotation of instancesAS, in document order. How the target
 * is set depends on the parameters: if classAS is non-null the class is extracted as for
 * sequence tagging; otherwise a NOMINAL target type extracts a class target and a NUMERIC
 * target type a numeric target from the feature targetFeatureName. For any other target
 * type no target extraction is called. The parameter sequenceAS must always be null for
 * this corpus representation. If nameFeatureName is non-null, a name is extracted for the
 * instance from the annotation and the document. If instanceWeightFeature is non-null and
 * not empty, the property "instanceWeight" is set from that feature of the annotation.
 * Instances for which ignoreInstanceWithMV returns true are not added.
 *
 * @param instancesAS instance annotation set
 * @param sequenceAS sequence annotation set, must be null
 * @param inputAS input annotation set
 * @param classAS class annotation set
 * @param targetFeatureName target feature name
 * @param targetType type of target
 * @param instanceWeightFeature feature for the instance weight or null
 * @param nameFeatureName feature for the instance name or null
 * @param seqEncoder sequence encoder instance
 * @throws GateRuntimeException if sequenceAS is not null
 */
@Override
public void add(AnnotationSet instancesAS, AnnotationSet sequenceAS, AnnotationSet inputAS, AnnotationSet classAS, String targetFeatureName, TargetType targetType, String instanceWeightFeature, String nameFeatureName, SeqEncoder seqEncoder) {
  if(sequenceAS != null) {
    throw new GateRuntimeException("LF invalid call to CorpusRepresentationMallet.add: sequenceAS must be null "+
            " for document "+inputAS.getDocument().getName());
  }
  // If the instanceWeightFeature is not specified we do not set any weight at all.
  boolean useInstanceWeight = instanceWeightFeature != null && !instanceWeightFeature.isEmpty();
  for (Annotation current : instancesAS.inDocumentOrder()) {
    Instance created = extractIndependentFeaturesHelper(current, inputAS, featureInfo, pipe);
    if (classAS != null) {
      // extract the target as required for sequence tagging
      FeatureExtractionMalletSparse.extractClassForSeqTagging(created, pipe.getTargetAlphabet(), classAS, current, seqEncoder);
    } else if (targetType == TargetType.NOMINAL) {
      FeatureExtractionMalletSparse.extractClassTarget(created, pipe.getTargetAlphabet(), targetFeatureName, current, inputAS);
    } else if (targetType == TargetType.NUMERIC) {
      FeatureExtractionMalletSparse.extractNumericTarget(created, targetFeatureName, current, inputAS);
    }
    // if a nameFeature is specified, add the name information to the instance
    if (nameFeatureName != null) {
      FeatureExtractionMalletSparse.extractName(created, current, inputAS.getDocument());
    }
    if (useInstanceWeight) {
      // Either the feature value can be converted to a double or 1.0 is used.
      double weight = LFUtils.anyToDoubleOrElse(current.getFeatures().get(instanceWeightFeature), 1.0);
      created.setProperty("instanceWeight", weight);
    }
    if (FeatureExtractionMalletSparse.ignoreInstanceWithMV(created)) {
      continue;
    }
    synchronized(this) { // we can synchronize on this because this is a singleton
      instances.add(created);
    }
  }
}