Java Code Examples for cc.mallet.types.Instance#getData()

The following examples show how to use cc.mallet.types.Instance#getData(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: PipeScaleMeanVarAll.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Standardizes the feature vector of an instance in place.
 *
 * <p>Every stored value whose feature index is flagged in {@code normalize} is replaced by
 * {@code (value - mean) / sqrt(variance)}, using the per-feature statistics held by this pipe.
 * Features whose variance is not strictly positive (zero, negative or NaN) are left unchanged,
 * because dividing by such a standard deviation would write NaN or an infinite value into the
 * vector.
 *
 * @param carrier instance whose data must be a {@link FeatureVector}
 * @return the same instance; its feature vector has been modified in place
 * @throws IllegalArgumentException if the instance data is null or not a {@link FeatureVector}
 * @throws GateRuntimeException if the statistics arrays do not have the size of the data alphabet
 */
@Override
public Instance pipe(Instance carrier) {
  Object data = carrier.getData();
  if (!(data instanceof FeatureVector)) {
    // Null-safe: missing data is reported as an argument error instead of an NPE.
    throw new IllegalArgumentException("Data must be of type FeatureVector not "
            + (data == null ? null : data.getClass()) + " we got " + data);
  }

  int alphabetSize = getDataAlphabet().size();
  if (means.length != alphabetSize || variances.length != alphabetSize) {
    throw new GateRuntimeException(
            "Size mismatch, alphabet=" + alphabetSize + ", stats=" + means.length);
  }

  FeatureVector fv = (FeatureVector) data;
  int[] indices = fv.getIndices();
  double[] values = fv.getValues();
  for (int i = 0; i < indices.length; i++) {
    int index = indices[i];
    if (!normalize[index]) {
      continue;
    }
    double variance = variances[index];
    // Written as a negated comparison so that a NaN variance is skipped as well.
    if (!(variance > 0.0)) {
      continue;
    }
    double newvalue = (values[i] - means[index]) / Math.sqrt(variance);
    fv.setValue(index, newvalue);
  }
  return carrier;
}
 
Example 2
Source File: RemoveStopwordsTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds a four-token sequence through {@code RemoveStopwords} configured with two of the tokens
 * as stopwords and checks that exactly the other two tokens remain.
 */
@Test
public void testStopwordsAreRemoved() {
  String firstStopword = "stop";
  String secondStopword = "word";
  String firstKept = "white";
  String secondKept = "list";

  ImmutableList<Token> tokens =
      ImmutableList.of(
          new Token(firstStopword),
          new Token(secondStopword),
          new Token(firstKept),
          new Token(secondKept));
  Instance input = new Instance(new TokenSequence(tokens), null, null, null);

  RemoveStopwords pipeUnderTest =
      new RemoveStopwords(ImmutableList.of(firstStopword, secondStopword));
  TokenSequence remaining = (TokenSequence) pipeUnderTest.pipe(input).getData();

  assertEquals(2, remaining.size());
  assertEquals(
      ImmutableSet.of(firstKept, secondKept),
      remaining.stream().map(Token::getText).collect(Collectors.toSet()));
}
 
Example 3
Source File: PipeScaleMinMaxAll.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Rescales the feature vector of an instance in place to the range given by the per-feature
 * minimum and maximum held by this pipe.
 *
 * <p>A stored value is replaced by {@code (value - min) / (max - min)} only when its feature
 * index is flagged in {@code normalize} and {@code max - min} is strictly positive; all other
 * values are left as they are.
 *
 * @param carrier instance whose data must be a {@link FeatureVector}
 * @return the same instance; its feature vector has been modified in place
 * @throws IllegalArgumentException if the instance data is not a {@link FeatureVector}
 * @throws GateRuntimeException if the min/max arrays do not have the size of the data alphabet
 */
@Override
public Instance pipe(Instance carrier) {
  Object payload = carrier.getData();
  if (!(payload instanceof FeatureVector)) {
    System.out.println(payload.getClass());
    throw new IllegalArgumentException("Data must be of type FeatureVector not " + payload.getClass() + " we got " + payload);
  }

  int alphabetSize = getDataAlphabet().size();
  boolean sizesMatch = min.length == alphabetSize && max.length == alphabetSize;
  if (!sizesMatch) {
    throw new GateRuntimeException("Size mismatch, alphabet="+alphabetSize+", stats="+min.length);
  }

  FeatureVector vector = (FeatureVector) payload;
  int[] featureIndices = vector.getIndices();
  double[] stored = vector.getValues();
  for (int loc = 0; loc < featureIndices.length; loc++) {
    int featureIndex = featureIndices[loc];
    double lower = min[featureIndex];
    double range = max[featureIndex] - lower;
    if (!normalize[featureIndex] || !(range > 0.0)) {
      continue;
    }
    // NOTE: this could in theory cause an overflow error but we ignore this here!
    vector.setValue(featureIndex, (stored[loc] - lower) / range);
  }
  return carrier;
}
 
Example 4
Source File: CorpusRepresentationLibSVM.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Converts a Mallet corpus representation into a LibSVM problem.
 *
 * <p>Each Mallet instance becomes one row: the target is stored in {@code y} (the label index
 * for a {@link Label}, the numeric value for a {@link Double}) and the sparse data vector is
 * copied into {@code x}, with every feature index shifted by one.
 *
 * @param crm mallet representation
 * @return libsvm representation
 * @throws GateRuntimeException if a target is neither a {@link Label} nor a {@link Double}
 */
public static svm_problem getFromMallet(CorpusRepresentationMallet crm) {
  InstanceList malletInstances = crm.getRepresentationMallet();
  int rowCount = malletInstances.size();

  svm_problem problem = new svm_problem();
  problem.l = rowCount;
  problem.y = new double[rowCount];
  problem.x = new svm_node[rowCount][];

  for (int row = 0; row < rowCount; row++) {
    Instance current = malletInstances.get(row);

    // Target: a label is converted to its index, a double is used directly.
    Object target = current.getTarget();
    if (target instanceof Label) {
      problem.y[row] = ((Label) target).getIndex();
    } else if (target instanceof Double) {
      problem.y[row] = (Double) target;
    } else {
      throw new GateRuntimeException("Odd target in mallet instance, cannot convert to LIBSVM: " + target);
    }

    // Features: one svm_node per stored location of the sparse vector.
    SparseVector vector = (SparseVector) current.getData();
    int[] featureIndices = vector.getIndices();
    double[] featureValues = vector.getValues();
    svm_node[] nodes = new svm_node[featureIndices.length];
    for (int loc = 0; loc < nodes.length; loc++) {
      svm_node node = new svm_node();
      // NOTE: LibSVM location indices have to start with 1
      node.index = featureIndices[loc] + 1;
      node.value = featureValues[loc];
      nodes[loc] = node;
    }
    problem.x[row] = nodes;
  }
  return problem;
}
 
Example 5
Source File: FVStatsMeanVarAll.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Builds the statistics from all instances of a list: the data of every instance is added as a
 * {@link FeatureVector} and the statistics are finished afterwards.
 *
 * @param instances instances whose data are feature vectors
 */
public FVStatsMeanVarAll(InstanceList instances) {
  int total = instances.size();
  for (int pos = 0; pos < total; pos++) {
    addFeatureVector((FeatureVector) instances.get(pos).getData());
  }
  finish();
}
 
Example 6
Source File: CorpusExporterMRJsonTarget.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Renders an instance as a bracketed list: the feature vector first and, when the instance has
 * a non-null target, the target after a comma.
 *
 * @param inst instance whose data is a {@link FeatureVector}
 * @param targetAlphabet target alphabet, passed on when the target is rendered
 * @param attrs attributes, passed on when the feature vector is rendered
 * @param nrFeatures number of features, passed on when the feature vector is rendered
 * @param asString represent as quoted string
 * @param filterMV if true, instances flagged with
 *     {@code FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV} are skipped
 * @return string representation, or {@code null} if the instance was filtered out
 */
public String instance2String(
        Instance inst,
        LabelAlphabet targetAlphabet,
        Attributes attrs,
        int nrFeatures,
        boolean asString,
        boolean filterMV) {
  FeatureVector features = (FeatureVector) inst.getData();
  Object target = inst.getTarget();

  // The ignore flag is only consulted when filtering was requested.
  Object ignoreFlag = filterMV
          ? inst.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV)
          : null;
  if (ignoreFlag != null && ignoreFlag.equals(true)) {
    return null;
  }

  StringBuilder out = new StringBuilder();
  out.append("[");  // outermost list
  out.append(featureVector2String(features, nrFeatures, attrs, asString));
  // The target is only written when there is one; a null target is omitted.
  if (target != null) {
    out.append(", ");
    out.append(target2String(target, targetAlphabet, asString));
  }
  out.append("]");  // close outer list
  return out.toString();
}
 
Example 7
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Extracts a nominal attribute whose feature value is a list joined with the declared list
 * separator ":" and checks that every list element becomes its own feature with value 1.0.
 * The extraction is done for two instance annotations; the second instance reuses the alphabet
 * of the first.
 */
@Test
public void extractSimpleList2() {
  String spec = "<ROOT>"+
          "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE><LISTSEP>:</LISTSEP></ATTRIBUTE>"+
          "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();
  // common prefix of all feature names created for this attribute
  String prefix = "theType┆feature1╬A═";

  // prepare the document: two instance annotations, each covering one "theType" annotation
  Annotation firstInstance = addAnn(doc, "", 0, 10, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1","lval1:lval2:lval3"));
  Annotation secondInstance = addAnn(doc, "", 11, 20, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 12, 15, "theType", gate.Utils.featureMap("feature1","lval1:lval4:lval5"));

  // first instance
  String[] firstValues = {"lval1", "lval2", "lval3"};
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), firstInstance);
  FeatureVector fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList2a: "+fv.toString(true));
  for (String listValue : firstValues) {
    assertTrue(inst.getAlphabet().contains(prefix + listValue));
  }
  assertEquals(3, fv.numLocations());
  for (String listValue : firstValues) {
    assertEquals(1.0, fv.value(prefix + listValue), EPS);
  }

  // second instance, sharing the alphabet of the first one
  String[] secondValues = {"lval1", "lval4", "lval5"};
  inst = newInstance(inst.getAlphabet());
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), secondInstance);
  fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList2b: "+fv.toString(true));
  for (String listValue : secondValues) {
    assertTrue(inst.getAlphabet().contains(prefix + listValue));
  }
  assertEquals(3, fv.numLocations());
  for (String listValue : secondValues) {
    assertEquals(1.0, fv.value(prefix + listValue), EPS);
  }
}
 
Example 8
Source File: RemoveStopwords.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Replaces the token sequence of an instance with a new sequence that contains only the tokens
 * whose text is not in {@code stopwords}; the order of the remaining tokens is kept.
 *
 * @param carrier instance whose data is a {@link TokenSequence}
 * @return the same instance, carrying the filtered token sequence
 */
@Override
public Instance pipe(Instance carrier) {
  TokenSequence original = (TokenSequence) carrier.getData();
  TokenSequence kept = new TokenSequence();
  for (Token token : original) {
    if (stopwords.contains(token.getText())) {
      continue;
    }
    kept.add(token);
  }
  carrier.setData(kept);
  return carrier;
}
 
Example 9
Source File: BrainRegionPipesTest.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Runs a text through the brain region pipes and checks the resulting feature vectors.
 *
 * <p>The i-th list in {@code features} holds the features expected in the i-th feature vector
 * of the piped instance. Feature vectors beyond the number of supplied lists are not checked.
 *
 * @param txt text to tokenize and pipe
 * @param features expected features, one list per feature vector
 * @throws Exception if building the test CAS or piping the instance fails
 */
private void pipe(String txt, List<String>... features) throws Exception {
    // it might not have all the aes, though...
    JCas jCas = getOpenNlpTokenizedTestCas(txt);

    SerialPipes pipes = new SerialPipes(BrainRegionPipes.getPipes());
    InstanceList il = new InstanceList(pipes);
    il.addThruPipe(new Instance(jCas, null, 1, jCas));

    FeatureVectorSequence data = (FeatureVectorSequence) il.iterator().next().getData();

    java.util.Iterator<List<String>> expectedIt = asList(features).iterator();
    Iterator vectors = data.iterator();
    while (vectors.hasNext()) {
        FeatureVector featureVector = vectors.next();
        if (!expectedIt.hasNext()) {
            // no expectations left for this position; keep walking the sequence
            continue;
        }
        for (String expectedFeature : expectedIt.next()) {
            String message = "could not find expected feature '"
                    + expectedFeature + "', FeatureVector = \n"
                    + featureVector;
            assertTrue(message, featureVector.contains(expectedFeature));
        }
    }
}
 
Example 10
Source File: TokenTransform.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Adds a feature to every token for which the token transformer yields a result.
 *
 * <p>The feature name is {@code featureName} followed by the transformed token text and its
 * value is 1.0. Tokens for which the transformer returns {@code null} are left unchanged.
 *
 * @param carrier instance whose data is a {@link TokenSequence}
 * @return the same instance, with its tokens updated in place
 */
public Instance pipe (Instance carrier) {
	TokenSequence tokens = (TokenSequence) carrier.getData();
	for (int pos = 0; pos < tokens.size(); pos++) {
		Token current = tokens.get(pos);
		String transformed = tokenTransformer.transform(current.getText());
		if (transformed == null) {
			continue;
		}
		current.setFeatureValue((featureName + transformed), 1.0);
	}
	return carrier;
}
 
Example 11
Source File: EngineMBWekaWrapper.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Applies the model to every instance annotation, in document order, by sending each feature
 * vector to the external Weka process and converting the reply into a {@link ModelApplication}.
 *
 * <p>Growth of the corpus representation is stopped while the model is applied and restarted in
 * a {@code finally} block, so it is restored even when applying the model fails.
 *
 * @param instanceAS annotations for which a prediction is made
 * @param inputAS annotations the features are extracted from
 * @param sequenceAS not used by this implementation
 * @param parms not used by this implementation
 * @return one model application per instance annotation
 * @throws RuntimeException if the reply of the process is not a {@code double[]}, or if its
 *     length contradicts the regression/classification mode implied by the target alphabet
 */
@Override
public List<ModelApplication> applyModel(AnnotationSet instanceAS, AnnotationSet inputAS, 
        AnnotationSet sequenceAS, String parms) {
  CorpusRepresentationMalletTarget data = (CorpusRepresentationMalletTarget)corpusRepresentation;
  List<ModelApplication> gcs = new ArrayList<>();
  data.stopGrowth();
  try {
    LFPipe pipe = (LFPipe)data.getRepresentationMallet().getPipe();
    for(Annotation instAnn : instanceAS.inDocumentOrder()) {
      Instance inst = data.extractIndependentFeatures(instAnn, inputAS);
      inst = pipe.instanceFrom(inst);
      FeatureVector fv = (FeatureVector)inst.getData();

      // NaN marks "no instance weight set"
      double weight = Double.NaN;
      Object weightObj = inst.getProperty("instanceWeight");
      if(weightObj != null) {
        weight = (double)weightObj;
      }
      // Convert to the sparse vector we use to send to the weka process
      int locs = fv.numLocations();
      SparseDoubleVector sdv = new SparseDoubleVector(locs);
      sdv.setInstanceWeight(weight);
      int[] locations = sdv.getLocations();
      double[] values = sdv.getValues();
      for(int i=0;i<locs;i++) {
        locations[i] = fv.indexAtLocation(i);
        values[i] = fv.value(locations[i]);
      }
      // send the vector over to the weka process
      process.writeObject(sdv);
      // get the result back
      Object obj = process.readObject();
      // check that it is an array of double
      if(!(obj instanceof double[])) {
        // this is an error, lets panic for now; null-safe so a missing reply is reported as such
        throw new RuntimeException("Got a response from the Weka process which is not double[] but "
                +(obj == null ? null : obj.getClass()));
      }
      // if the array has one element, the model treated it as regression, otherwise classification
      double[] ret = (double[])obj;

      ModelApplication gc;
      // now check if the mallet representation and the weka process agree 
      // on if we have regression or classification
      if(pipe.getTargetAlphabet() == null) {
        // we expect a regression result, i.e ret should have 1 element
        if(ret.length != 1) {
          throw new RuntimeException("We think we have regression but the Weka process sent a ret of length "+ret.length);
        }
        gc = new ModelApplication(instAnn, ret[0]);
      } else {
        // classification, we expect ret to have length >= 2
        if(ret.length < 2) {
          throw new RuntimeException("We think we have classification but Weka process sent a ret of length "+ret.length);
        }
        double bestprob = 0.0;
        int bestlabel = 0;
        List<String> classList = new ArrayList<>();
        List<Double> confidenceList = new ArrayList<>();
        for (int i = 0; i < ret.length; i++) {
          double thisprob = ret[i];
          String labelstr = pipe.getTargetAlphabet().lookupObject(i).toString();
          classList.add(labelstr);
          confidenceList.add(thisprob);
          // strict comparison: on ties the first (lowest index) label wins
          if (thisprob > bestprob) {
            bestlabel = i;
            bestprob = thisprob;
          }
        }

        String cl
                = pipe.getTargetAlphabet().lookupObject(bestlabel).toString();

        gc = new ModelApplication(
                instAnn, cl, bestprob, classList, confidenceList);
      }
      gcs.add(gc);
    }
  } finally {
    // always re-enable growth, also when applying the model failed half way
    data.startGrowth();
  }
  return gcs;
}
 
Example 12
Source File: EngineMBMalletSeq.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Applies the sequence model to every sequence annotation and creates one
 * {@link ModelApplication} per contained instance annotation, carrying the label with the
 * highest marginal probability at that position.
 *
 * <p>Growth of the corpus representation is stopped while the model is applied and restarted in
 * a {@code finally} block, so it is restored even when applying the model fails.
 *
 * <p>If the number of instance annotations inside a sequence differs from the length of the
 * feature vector sequence created for it, a warning is logged and nothing is assigned for that
 * sequence.
 *
 * @param instanceAS instance annotations (the elements of the sequences)
 * @param inputAS annotations the features are extracted from
 * @param sequenceAS annotations that span the sequences
 * @param parms not used by this implementation
 * @return the model applications for all sequences that could be mapped back
 */
@Override
public List<ModelApplication> applyModel(
        AnnotationSet instanceAS, AnnotationSet inputAS, AnnotationSet sequenceAS, 
        String parms) {
  CorpusRepresentationMalletSeq data = (CorpusRepresentationMalletSeq)corpusRepresentation;
  Transducer crf = (Transducer)model;
  List<ModelApplication> gcs = new ArrayList<>();

  // stop growth
  data.stopGrowth();
  try {
    for(Annotation sequenceAnn : sequenceAS) {
      int sequenceSpanId = sequenceAnn.getId();
      Instance inst = data.getInstanceForSequence( 
              instanceAS, sequenceAnn, inputAS, null, null, TargetType.NONE, null, null);

      //Always put the instance through the same pipe used for training.
      inst = crf.getInputPipe().instanceFrom(inst);
      FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData();

      SumLatticeDefault sl = new SumLatticeDefault(crf, fvs);

      List<Annotation> instanceAnnotations = gate.Utils.getContainedAnnotations(
              instanceAS, sequenceAnn).inDocumentOrder();

      //Sanity check that we're mapping the probs back onto the right anns.
      //This being wrong might follow from errors reading in the data to mallet inst.
      if (instanceAnnotations.size() != fvs.size()) {
        LOGGER.warn("LearningFramework: CRF output length: "
                + fvs.size()
                + ", GATE instances: " + instanceAnnotations.size()
                + ". Can't assign.");
        continue;
      }

      int i = 0;
      for (Annotation instanceAnn : instanceAnnotations) {
        // incremented before use on purpose: the lattice position is one past the token index
        i++;

        String bestLabel = null;
        double bestProb = 0.0;

        //For each label option ..

        // NOTE: Transducer has no getOutputAlphabet method (CRF has), so we use
        // getInputPipe().getTargetAlphabet() instead (this seems to be what 
        // is used inside CRF anyway.)
        for (int j = 0; j < crf.getInputPipe().getTargetAlphabet().size(); j++) {
          String label = crf.getInputPipe().getTargetAlphabet().lookupObject(j).toString();

          //Get the probability of being in state j at position i+1
          //Note that the plus one is because the labels are on the
          //transitions. Positions are between transitions.
          double marg = sl.getGammaProbability(i, crf.getState(j));
          if (marg > bestProb) {
            bestLabel = label;
            bestProb = marg;
          }
        }
        ModelApplication gc = new ModelApplication(
                instanceAnn, bestLabel, bestProb, sequenceSpanId);

        gcs.add(gc);
      }
    }
  } finally {
    // always re-enable growth, also when applying the model failed half way
    data.startGrowth();
  }
  return gcs;
}
 
Example 13
Source File: CorpusExporterMRARFF.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Renders one instance as a sparse ARFF data row.
 *
 * <p>The row has the form <code>{index value, index value, ...}</code>, with the target (if the
 * instance has one) appended as the last index/value pair. If the instance carries an
 * "instanceWeight" property, <code>, {weight}</code> is appended after the row. NaN values are
 * written as the placeholder "?".
 *
 * @param inst instance whose data must be a {@link FeatureVector}
 * @param attrs attribute definitions used to decide how each value is written
 * @param filterMVs if true, instances flagged with
 *     {@code FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV} are skipped
 * @return the ARFF row, or {@code null} if the instance was filtered out
 * @throws RuntimeException if the data is not a feature vector or the kind of target does not
 *     match the datatype of the target attribute
 */
private String instance2WekaArffLine(Instance inst, Attributes attrs, boolean filterMVs) {
  if(filterMVs) {
    // A set flag means the instance is to be ignored; null tells the caller so.
    Object ignoreFlag = inst.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV);
    if(ignoreFlag != null && ignoreFlag.equals(true)) {
      return null;
    }
  }
  Double weight = (Double)inst.getProperty("instanceWeight");
  Object data = inst.getData();
  if(!(data instanceof FeatureVector)) {
    throw new RuntimeException("Cannot export, instance is not a feature vector but "+data.getClass());
  }
  FeatureVector vector = (FeatureVector)data;

  StringBuilder line = new StringBuilder();
  line.append("{");
  for(int loc=0; loc<vector.numLocations(); loc++) {
    if(loc > 0) {
      line.append(", ");
    }
    int featureIndex = vector.indexAtLocation(loc);
    line.append(featureIndex).append(" ");
    double value = vector.valueAtLocation(loc);
    if(Double.isNaN(value)) {
      line.append("?");
      continue;
    }
    // TODO: proper handling of missing values!!!
    // Also: codeas may be null sometimes, make sure if we have a datatype
    // where codeas is relevant, we ALWAYS have codeas set to the correct value!
    Attribute attr = attrs.getAttribute(featureIndex);
    boolean writeAsNumber = attr.datatype==Datatype.numeric
            || (attr.datatype==Datatype.nominal && attr.codeAs!=CodeAs.number);
    if(writeAsNumber) {
      line.append(value);
    } else if(attr.datatype==Datatype.bool) {
      // TODO: check for missing value, also use the special alphabet we created?
      line.append(value<0.5 ? "false" : "true");
    } else if(attr.datatype==Datatype.nominal) {
      // TODO: check for how to exactly handling missing values, for now we simply output
      // the Weka missing value placeholder
      int code = (int)value;
      if(code==-1) {
        line.append("?");
      } else {
        line.append(escape4Arff((String)attr.alphabet.lookupObject(code)));
      }
    } else {
      // guard for forgetting about here when we add datatypes later
      line.append("GOTCHA!!!! DATATYPE NOT SUPPORTED IN THE EXPORT CODE");
    }
  }

  // Now also add location and value for the target, if we have one.
  // A null target simply produces a row without a target.
  Object target = inst.getTarget();
  if(target!=null) {
    Attribute targetAttr = attrs.getTargetAttribute();
    line.append(", ").append(targetAttr.index).append(" ");
    // we expect this to be either a Label instance or something that can be cast to double
    if(target instanceof Label) {
      if(targetAttr.datatype != Datatype.nominal) {
        throw new RuntimeException("Target is a label but datatype for attribute is not nominal");
      }
      // TODO: could check here if the label index is the same as expected from
      // the attribute defintion!
      line.append(escape4Arff(((Label)target).toString()));
    } else {
      if(targetAttr.datatype != Datatype.numeric) {
        throw new RuntimeException("Target is a number but datatype for attribute is not  numeric");
      }
      line.append((double)target);
    }
  }
  line.append("}");

  if(weight!=null) {
    line.append(", {").append(weight).append("}");
  }
  return line.toString();
}
 
Example 14
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Extracts an attribute list with an explicitly specified name ("myAttList") around an instance
 * annotation in the middle of ten consecutive "theType" annotations. The extraction is checked
 * twice: first for the window -2..2, then for the window -1..1 restricted to the covering
 * "within" annotation.
 */
@Test
public void extractList2() {
  String spec = "<ROOT>"+
          "<ATTRIBUTELIST><NAME>myAttList</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><DATATYPE>nominal</DATATYPE><FROM>-2</FROM><TO>2</TO></ATTRIBUTELIST>"+
          "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  // prepare the document: tok1..tok10 cover [0,2), [2,4), ..., [18,20)
  Annotation instAnn = addAnn(doc, "", 10, 12, "instanceType", gate.Utils.featureMap());
  for (int k = 0; k < 10; k++) {
    addAnn(doc,"",2*k,2*k+2,"theType",gate.Utils.featureMap("theFeature","tok"+(k+1)));
  }
  addAnn(doc,"",8,14,"within",gate.Utils.featureMap());

  // window -2..2 without restriction
  String[] unrestricted = {
    "myAttList╬L-2═tok4",
    "myAttList╬L-1═tok5",
    "myAttList╬L0═tok6",
    "myAttList╬L1═tok7",
    "myAttList╬L2═tok8"
  };
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(0)+" (list -1to1) FV="+inst.getData());
  System.err.println("Alphabet L2="+inst.getAlphabet());
  assertEquals(5,inst.getAlphabet().size());
  System.err.println("Alphabet is "+inst.getAlphabet());
  FeatureVector fv = (FeatureVector)inst.getData();
  System.err.println("extractList2-all: "+fv.toString(true));
  for (String name : unrestricted) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(5,fv.numLocations());
  for (String name : unrestricted) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // Do the test again, but this time with a declaration that limits it to within the within annotation
  spec = "<ROOT>"+
          "<ATTRIBUTELIST><NAME>myAttList</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><DATATYPE>nominal</DATATYPE><FROM>-1</FROM><TO>1</TO><WITHIN>within</WITHIN></ATTRIBUTELIST>"+
          "</ROOT>";
  as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  fv = (FeatureVector)inst.getData();
  System.err.println("extractList2-within: "+fv.toString(true));
  assertEquals(5,inst.getAlphabet().size());
  String[] expectedInAlphabet = {
    "myAttList╬L-1═tok5",
    "myAttList╬L0═tok6",
    "myAttList╬L1═tok7",
    "myAttList╬L-1═╔START╗",
    "myAttList╬L1═╔STOP╗"
  };
  for (String name : expectedInAlphabet) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  assertEquals(5,fv.numLocations());
  // the value of the STOP feature is not checked
  String[] expectedValues = {
    "myAttList╬L-1═tok5",
    "myAttList╬L0═tok6",
    "myAttList╬L1═tok7",
    "myAttList╬L-1═╔START╗"
  };
  for (String name : expectedValues) {
    assertEquals(1.0,fv.value(name),EPS);
  }
}
 
Example 15
Source File: TestFeatureExtraction.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Extracts a nominal attribute whose feature value is a set of strings and checks that every
 * element of the set becomes its own feature with value 1.0. The extraction is done for two
 * instance annotations; the second instance reuses the alphabet of the first.
 */
@Test
public void extractSimpleList1() {
  String spec = "<ROOT>"+
          "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE></ATTRIBUTE>"+
          "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();
  // common prefix of all feature names created for this attribute
  String prefix = "theType┆feature1╬A═";
  String[] firstValues = {"setval1", "setval2", "setval3"};
  String[] secondValues = {"setval1", "setval4", "setval5"};

  // prepare the document: two instance annotations, each covering one "theType" annotation
  Annotation firstInstance = addAnn(doc, "", 0, 10, "instanceType", gate.Utils.featureMap());
  HashSet<String> v1 = new HashSet<>();
  for (String element : firstValues) {
    v1.add(element);
  }
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1",v1));

  Annotation secondInstance = addAnn(doc, "", 11, 20, "instanceType", gate.Utils.featureMap());
  HashSet<String> v2 = new HashSet<>();
  for (String element : secondValues) {
    v2.add(element);
  }
  addAnn(doc, "", 12, 15, "theType", gate.Utils.featureMap("feature1",v2));

  // first instance
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), firstInstance);
  FeatureVector fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList1a: "+fv.toString(true));
  for (String element : firstValues) {
    assertTrue(inst.getAlphabet().contains(prefix + element));
  }
  assertEquals(3, fv.numLocations());
  for (String element : firstValues) {
    assertEquals(1.0, fv.value(prefix + element), EPS);
  }

  // second instance, sharing the alphabet of the first one
  inst = newInstance(inst.getAlphabet());
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), secondInstance);
  fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList1b: "+fv.toString(true));
  for (String element : secondValues) {
    assertTrue(inst.getAlphabet().contains(prefix + element));
  }
  assertEquals(3, fv.numLocations());
  for (String element : secondValues) {
    assertEquals(1.0, fv.value(prefix + element), EPS);
  }
}
 
Example 16
Source File: MultiSegmentationEvaluator.java    From bluima with Apache License 2.0 4 votes vote down vote up
/**
 * Evaluates the transducer of a trainer on a list of instances and prints the results to
 * {@code System.err}: the token accuracy, followed by segment counts, precision, recall and F1
 * for every segment start tag and for all tags together ("OVERALL").
 *
 * <p>A segment counts as correct when truth and prediction start a segment of the same tag at
 * the same position and agree on where its continuation ends.
 *
 * @param tt trainer whose transducer is evaluated
 * @param data instances; data and target of each instance are cast to {@code Sequence}
 * @param description text that is prefixed to the printed result lines
 */
public void evaluateInstanceList(TransducerTrainer tt, InstanceList data,
        String description) {
    Transducer model = tt.getTransducer();
    int numCorrectTokens, totalTokens;
    int[] numTrueSegments, numPredictedSegments, numCorrectSegments;
    // The count arrays have one slot per start tag plus a final slot (allIndex) for the totals.
    int allIndex = segmentStartTags.length;
    numTrueSegments = new int[allIndex + 1];
    numPredictedSegments = new int[allIndex + 1];
    numCorrectSegments = new int[allIndex + 1];

    totalTokens = numCorrectTokens = 0;
    // Explicit reset; freshly allocated int arrays already contain zeros.
    for (int n = 0; n < numTrueSegments.length; n++)
        numTrueSegments[n] = numPredictedSegments[n] = numCorrectSegments[n] = 0;
    for (int i = 0; i < data.size(); i++) {
        Instance instance = data.get(i);
        Sequence input = (Sequence) instance.getData();
        // String tokens = null;
        // if (instance.getSource() != null)
        // tokens = (String) instance.getSource().toString();
        Sequence trueOutput = (Sequence) instance.getTarget();
        // The length checks are plain asserts, so they only run when assertions are enabled.
        assert (input.size() == trueOutput.size());
        Sequence predOutput = model.transduce(input);
        assert (predOutput.size() == trueOutput.size());
        int trueStart, predStart; // -1 for non-start, otherwise index into
                                  // segmentStartTag
        for (int j = 0; j < trueOutput.size(); j++) {
            totalTokens++;
            if (trueOutput.get(j).equals(predOutput.get(j)))
                numCorrectTokens++;
            trueStart = predStart = -1;
            // Count true segment starts
            for (int n = 0; n < segmentStartTags.length; n++) {
                if (segmentStartTags[n].equals(trueOutput.get(j))) {
                    numTrueSegments[n]++;
                    numTrueSegments[allIndex]++;
                    trueStart = n;
                    break;
                }
            }
            // Count predicted segment starts
            // NOTE(review): unlike the loop above there is no break here, so if several start
            // tags equal the same output every match is counted and predStart ends up as the
            // last one - confirm whether that is intended.
            for (int n = 0; n < segmentStartTags.length; n++) {
                if (segmentStartTags[n].equals(predOutput.get(j))) {
                    numPredictedSegments[n]++;
                    numPredictedSegments[allIndex]++;
                    predStart = n;
                }
            }
            if (trueStart != -1 && trueStart == predStart) {
                // Truth and Prediction both agree that the same segment
                // tag-type is starting now
                int m;
                boolean trueContinue = false;
                boolean predContinue = false;
                // Walk forward while both sequences continue the segment; stop at the first
                // position where at least one of them does not.
                for (m = j + 1; m < trueOutput.size(); m++) {
                    trueContinue = segmentContinueTags[predStart]
                            .equals(trueOutput.get(m));
                    predContinue = segmentContinueTags[predStart]
                            .equals(predOutput.get(m));
                    if (!trueContinue || !predContinue) {
                        if (trueContinue == predContinue) {
                            // They agree about a segment is ending somehow
                            numCorrectSegments[predStart]++;
                            numCorrectSegments[allIndex]++;
                        }
                        break;
                    }
                }
                // for the case of the end of the sequence
                if (m == trueOutput.size()) {
                    if (trueContinue == predContinue) {
                        numCorrectSegments[predStart]++;
                        numCorrectSegments[allIndex]++;
                    }
                }
            }
        }
    }
    DecimalFormat f = new DecimalFormat("0.####");
    // With no tokens at all this is the floating point division 0.0 / 0, i.e. NaN.
    System.err.println(description + " tokenaccuracy="
            + f.format(((double) numCorrectTokens) / totalTokens));
    for (int n = 0; n < numCorrectSegments.length; n++) {
        System.err.println((n < allIndex ? segmentStartTags[n].toString()
                : "OVERALL") + ' ');
        // Precision is defined as 1 when nothing was predicted, recall as 1 when there is
        // nothing to find, and F1 as 0 when both are 0.
        double precision = numPredictedSegments[n] == 0 ? 1
                : ((double) numCorrectSegments[n])
                        / numPredictedSegments[n];
        double recall = numTrueSegments[n] == 0 ? 1
                : ((double) numCorrectSegments[n]) / numTrueSegments[n];
        double f1 = recall + precision == 0.0 ? 0.0
                : (2.0 * recall * precision) / (recall + precision);
        System.err.println(" " + description + " segments true="
                + numTrueSegments[n] + " pred=" + numPredictedSegments[n]
                + " correct=" + numCorrectSegments[n] + " misses="
                + (numTrueSegments[n] - numCorrectSegments[n]) + " alarms="
                + (numPredictedSegments[n] - numCorrectSegments[n]));
        System.err.println(" " + description + " precision="
                + f.format(precision) + " recall=" + f.format(recall)
                + " f1=" + f.format(f1));
    }

}