cc.mallet.types.Instance#getData

Source File: PipeScaleMeanVarAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

6 votes

/**
 * Standardizes the values of the instance's feature vector in place.
 *
 * <p>For every location whose feature index is flagged in {@code normalize}, the value
 * {@code v} is replaced by {@code (v - mean) / sqrt(variance)} using the stored
 * per-feature statistics. Features whose stored variance is not strictly positive are
 * left unchanged, because dividing by zero would write NaN or Infinity into the vector.
 *
 * @param carrier the instance to process; its data must be a {@code FeatureVector}
 * @return the same instance, with its feature vector modified in place
 * @throws IllegalArgumentException if the instance data is not a {@code FeatureVector}
 *         (including {@code null} data)
 * @throws GateRuntimeException if the stored statistics do not match the data alphabet size
 */
@Override
public Instance pipe(Instance carrier) {
  Object data = carrier.getData();
  if (!(data instanceof FeatureVector)) {
    // Null-safe: null data must produce the intended exception, not an NPE.
    Object dataClass = (data == null) ? null : data.getClass();
    System.out.println(dataClass);
    throw new IllegalArgumentException("Data must be of type FeatureVector not " + dataClass + " we got " + data);
  }

  int alphabetSize = this.getDataAlphabet().size();
  if (this.means.length != alphabetSize || this.variances.length != alphabetSize) {
    throw new GateRuntimeException("Size mismatch, alphabet="+alphabetSize+", stats="+means.length);
  }

  FeatureVector fv = (FeatureVector) data;
  int[] indices = fv.getIndices();
  double[] values = fv.getValues();
  for (int i = 0; i < indices.length; i++) {
    int index = indices[i];
    if (!normalize[index]) {
      continue;
    }
    double variance = variances[index];
    // A zero (or NaN) variance cannot be used as a divisor; keep the original value.
    if (variance > 0.0) {
      double newvalue = (values[i] - means[index]) / Math.sqrt(variance);
      fv.setValue(index, newvalue);
    }
  }
  return carrier;
}

Source File: RemoveStopwordsTest.java From baleen with Apache License 2.0

6 votes

/**
 * Pipes a four-token sequence through {@code RemoveStopwords} configured with two of the
 * tokens as stopwords and checks that exactly the other two tokens remain.
 */
@Test
public void testStopwordsAreRemoved() {
  final String firstStopword = "stop";
  final String secondStopword = "word";
  final String firstKept = "white";
  final String secondKept = "list";

  TokenSequence tokens =
      new TokenSequence(
          ImmutableList.of(
              new Token(firstStopword),
              new Token(secondStopword),
              new Token(firstKept),
              new Token(secondKept)));
  Instance input = new Instance(tokens, null, null, null);

  RemoveStopwords filter =
      new RemoveStopwords(ImmutableList.of(firstStopword, secondStopword));
  TokenSequence remaining = (TokenSequence) filter.pipe(input).getData();

  // Only the two non-stopword tokens may survive.
  assertEquals(2, remaining.size());
  assertEquals(
      ImmutableSet.of(firstKept, secondKept),
      remaining.stream().map(Token::getText).collect(Collectors.toSet()));
}

Source File: PipeScaleMinMaxAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Rescales the values of the instance's feature vector in place to the stored
 * per-feature range.
 *
 * <p>For every location whose feature index is flagged in {@code normalize} and whose
 * stored range {@code max - min} is strictly positive, the value {@code v} is replaced
 * by {@code (v - min) / (max - min)}. All other values are left unchanged.
 *
 * @param carrier the instance to process; its data must be a {@code FeatureVector}
 * @return the same instance, with its feature vector modified in place
 * @throws IllegalArgumentException if the instance data is not a {@code FeatureVector}
 *         (including {@code null} data)
 * @throws GateRuntimeException if the stored statistics do not match the data alphabet size
 */
@Override
public Instance pipe(Instance carrier) {
  Object data = carrier.getData();
  if (!(data instanceof FeatureVector)) {
    // Null-safe: null data must produce the intended exception, not an NPE.
    Object dataClass = (data == null) ? null : data.getClass();
    System.out.println(dataClass);
    throw new IllegalArgumentException("Data must be of type FeatureVector not " + dataClass + " we got " + data);
  }

  int alphabetSize = getDataAlphabet().size();
  if (min.length != alphabetSize || max.length != alphabetSize) {
    throw new GateRuntimeException("Size mismatch, alphabet="+alphabetSize+", stats="+min.length);
  }

  FeatureVector fv = (FeatureVector) data;
  int[] indices = fv.getIndices();
  double[] values = fv.getValues();
  for (int i = 0; i < indices.length; i++) {
    int index = indices[i];
    double mi = min[index];
    double ma = max[index];
    double span = ma - mi;
    // A zero span means the feature was constant; there is nothing to rescale.
    if(normalize[index] && span > 0.0) {
      double value = values[i];
      // NOTE: this could in theory cause an overflow error but we ignore this here!
      double newvalue = (value - mi) / span;
      fv.setValue(index, newvalue);
    }
  }
  return carrier;
}

Source File: CorpusRepresentationLibSVM.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Build a LibSVM problem from a Mallet corpus representation.
 * <p>
 * Each Mallet instance becomes one row: the target is stored in {@code y} (the label
 * index for a {@code Label}, the value itself for a {@code Double}) and the sparse
 * feature data is copied into {@code x} with indices shifted by one.
 *
 * @param crm mallet representation
 * @return libsvm representation
 */
public static svm_problem getFromMallet(CorpusRepresentationMallet crm) {
  InstanceList malletInstances = crm.getRepresentationMallet();
  int rowCount = malletInstances.size();

  svm_problem problem = new svm_problem();
  problem.l = rowCount;
  problem.y = new double[problem.l];
  problem.x = new svm_node[problem.l][];

  for (int row = 0; row < rowCount; row++) {
    Instance current = malletInstances.get(row);

    // Target: only labels and doubles can be represented, anything else is an error.
    Object target = current.getTarget();
    if (!(target instanceof Label) && !(target instanceof Double)) {
      throw new GateRuntimeException("Odd target in mallet instance, cannot convert to LIBSVM: " + target);
    }
    if (target instanceof Label) {
      problem.y[row] = ((Label) target).getIndex();
    } else {
      problem.y[row] = (double) target;
    }

    // Features: one svm_node per stored location of the sparse vector.
    SparseVector sparse = (SparseVector) current.getData();
    int[] featureIndices = sparse.getIndices();
    double[] featureValues = sparse.getValues();
    svm_node[] nodes = new svm_node[featureIndices.length];
    problem.x[row] = nodes;
    for (int col = 0; col < featureIndices.length; col++) {
      svm_node node = new svm_node();
      // NOTE: LibSVM location indices have to start with 1
      node.index = featureIndices[col] + 1;
      node.value = featureValues[col];
      nodes[col] = node;
    }
  }
  return problem;
}

Source File: FVStatsMeanVarAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Create the statistics from all instances of an instance list.
 * <p>
 * The data of every instance is cast to {@code FeatureVector} and handed to
 * {@code addFeatureVector}; {@code finish()} is called once after the last instance.
 *
 * @param instances instances
 */
public FVStatsMeanVarAll(InstanceList instances) {
  for (Instance current : instances) {
    addFeatureVector((FeatureVector) current.getData());
  }
  finish();
}

Source File: CorpusExporterMRJsonTarget.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Render an instance as a bracketed list: the feature vector string, followed by the
 * target string if the instance has a target.
 *
 * @param inst instance; its data must be a FeatureVector
 * @param targetAlphabet target alphabet, passed on to the target conversion
 * @param attrs attributes, passed on to the feature vector conversion
 * @param nrFeatures number of features
 * @param asString represent as quoted string
 * @param filterMV if true, instances flagged with PROP_IGNORE_HAS_MV are filtered
 * @return string representation, or null if the instance got filtered
 */
public String instance2String(
        Instance inst,
        LabelAlphabet targetAlphabet,
        Attributes attrs,
        int nrFeatures,
        boolean asString,
        boolean filterMV) {
  FeatureVector features = (FeatureVector)inst.getData();
  Object target = inst.getTarget();

  // An instance flagged as "ignore because of missing values" yields null.
  if (filterMV) {
    Object ignoreFlag = inst.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV);
    if (ignoreFlag != null && ignoreFlag.equals(true)) {
        return null;
    }
  }

  StringBuilder out = new StringBuilder();
  out.append("[");  // open outer list
  out.append(featureVector2String(features, nrFeatures, attrs, asString));
  // the target is only appended if there is one; a null target is simply left out
  if (target != null) {
    out.append(", ");
    out.append(target2String(target, targetAlphabet, asString));
  }
  out.append("]");  // close outer list
  return out.toString();
}

Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Extracts a nominal attribute whose value is a ':'-separated list for two different
 * instance annotations and checks that every list element becomes its own feature
 * with value 1.0.
 */
@Test
public void extractSimpleList2() {
  String spec = "<ROOT>"
          + "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE><LISTSEP>:</LISTSEP></ATTRIBUTE>"
          + "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  // prepare the document: two instance annotations, each covering one list-valued token
  Annotation firstInstance = addAnn(doc, "", 0, 10, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1","lval1:lval2:lval3"));
  Annotation secondInstance = addAnn(doc, "", 11, 20, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 12, 15, "theType", gate.Utils.featureMap("feature1","lval1:lval4:lval5"));

  String prefix = "theType┆feature1╬A═";
  String[] firstExpected = {"lval1", "lval2", "lval3"};
  String[] secondExpected = {"lval1", "lval4", "lval5"};

  // first instance annotation
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), firstInstance);
  FeatureVector fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList2a: "+fv.toString(true));
  for (String listValue : firstExpected) {
    assertTrue(inst.getAlphabet().contains(prefix + listValue));
  }
  assertEquals(3,((FeatureVector)inst.getData()).numLocations());
  for (String listValue : firstExpected) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(prefix + listValue),EPS);
  }

  // second instance annotation, re-using the alphabet built so far
  inst = newInstance(inst.getAlphabet());
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), secondInstance);
  fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList2b: "+fv.toString(true));
  for (String listValue : secondExpected) {
    assertTrue(inst.getAlphabet().contains(prefix + listValue));
  }
  assertEquals(3,((FeatureVector)inst.getData()).numLocations());
  for (String listValue : secondExpected) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(prefix + listValue),EPS);
  }
}

Source File: RemoveStopwords.java From baleen with Apache License 2.0

5 votes

/**
 * Replaces the instance's token sequence with a new sequence that contains, in the
 * original order, only the tokens whose text is not in {@code stopwords}.
 *
 * @param carrier instance whose data is a {@code TokenSequence}
 * @return the same instance, carrying the filtered sequence
 */
@Override
public Instance pipe(Instance carrier) {
  TokenSequence original = (TokenSequence) carrier.getData();
  TokenSequence kept = new TokenSequence();
  for (Token token : original) {
    boolean isStopword = stopwords.contains(token.getText());
    if (isStopword) {
      continue;
    }
    kept.add(token);
  }
  carrier.setData(kept);
  return carrier;
}

Source File: BrainRegionPipesTest.java From bluima with Apache License 2.0

5 votes

/**
 * Runs the text through the brain region pipes and checks, position by position, that
 * each resulting feature vector contains the expected features.
 * <p>
 * The n-th list in {@code features} holds the features expected in the n-th feature
 * vector; vectors beyond the last given list are not checked.
 *
 * @param txt the text to tokenize and pipe
 * @param features expected features, one list per feature vector position
 */
private void pipe(String txt, List<String>... features) throws Exception {
    // it might not have all the aes, though...
    JCas jCas = getOpenNlpTokenizedTestCas(txt);

    InstanceList instances = new InstanceList(//
            new SerialPipes(BrainRegionPipes.getPipes()));
    instances.addThruPipe(new Instance(jCas, null, 1, jCas));

    Instance piped = instances.iterator().next();
    FeatureVectorSequence vectors = (FeatureVectorSequence) piped.getData();

    java.util.Iterator<List<String>> expectedIt = asList(features)
            .iterator();
    for (Iterator vectorIt = vectors.iterator(); vectorIt.hasNext();) {
        FeatureVector vector = vectorIt.next();

        // no expectations left for this position: nothing to check
        if (!expectedIt.hasNext()) {
            continue;
        }
        for (String expected : expectedIt.next()) {
            assertTrue("could not find expected feature '"
                    + expected + "', FeatureVector = \n"
                    + vector,
                    vector.contains(expected));
        }
    }
}

Source File: TokenTransform.java From bluima with Apache License 2.0

5 votes

/**
 * Adds a feature to every token for which the token transformer yields a result.
 * The feature name is {@code featureName} followed by the transformed text, and its
 * value is 1.0; tokens whose transform is null are left untouched.
 *
 * @param carrier instance whose data is a TokenSequence
 * @return the same instance
 */
public Instance pipe (Instance carrier) {
	TokenSequence tokens = (TokenSequence) carrier.getData();
	for (int pos = 0; pos < tokens.size(); pos++) {
		Token current = tokens.get(pos);
		String transformed = tokenTransformer.transform(current.getText());
		if (transformed == null) {
			continue;
		}
		current.setFeatureValue(featureName + transformed, 1.0);
	}
	return carrier;
}

Source File: EngineMBWekaWrapper.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Applies the model held by the external Weka process to every instance annotation.
 *
 * <p>For each annotation (in document order) the features are extracted, put through
 * the Mallet pipe, converted to a {@code SparseDoubleVector} and sent to the process.
 * The reply must be a {@code double[]}: one element for regression (no target alphabet
 * in the pipe), two or more class probabilities for classification.
 *
 * <p>Alphabet growth is stopped for the duration of the call and is started again even
 * if the call fails with an exception.
 *
 * @param instanceAS the instance annotations to classify
 * @param inputAS the annotations used for feature extraction
 * @param sequenceAS not used by this method
 * @param parms not used by this method
 * @return one model application per instance annotation, in document order
 * @throws RuntimeException if the process reply is not a {@code double[]} or its length
 *         does not fit the learning problem (regression vs. classification)
 */
@Override
public List<ModelApplication> applyModel(AnnotationSet instanceAS, AnnotationSet inputAS, 
        AnnotationSet sequenceAS, String parms) {
  CorpusRepresentationMalletTarget data = (CorpusRepresentationMalletTarget)corpusRepresentation;
  data.stopGrowth();
  List<ModelApplication> gcs = new ArrayList<>();
  try {
    LFPipe pipe = (LFPipe)data.getRepresentationMallet().getPipe();
    for(Annotation instAnn : instanceAS.inDocumentOrder()) {
      Instance inst = data.extractIndependentFeatures(instAnn, inputAS);
      inst = pipe.instanceFrom(inst);
      FeatureVector fv = (FeatureVector)inst.getData();

      // NaN signals "no instance weight" to the receiving side
      double weight = Double.NaN;
      Object weightObj = inst.getProperty("instanceWeight");
      if(weightObj != null) {
        weight = (double)weightObj;
      }

      // Convert to the sparse vector we use to send to the weka process
      int locs = fv.numLocations();
      SparseDoubleVector sdv = new SparseDoubleVector(locs);
      sdv.setInstanceWeight(weight);
      int[] locations = sdv.getLocations();
      double[] values = sdv.getValues();
      for(int i=0;i<locs;i++) {
        locations[i] = fv.indexAtLocation(i);
        values[i] = fv.value(locations[i]);
      }

      // send the vector over to the weka process and get the result back
      process.writeObject(sdv);
      Object obj = process.readObject();
      if(!(obj instanceof double[])) {
        // this is an error, lets panic for now; null-safe so a null reply is reported
        // with this message instead of a NullPointerException
        throw new RuntimeException("Got a response from the Weka process which is not double[] but "
                +(obj == null ? "null" : obj.getClass()));
      }
      // if the array has one element, the model treated it as regression, otherwise classification
      double[] ret = (double[])obj;

      ModelApplication gc;
      // now check if the mallet representation and the weka process agree 
      // on if we have regression or classification
      if(pipe.getTargetAlphabet() == null) {
        // we expect a regression result, i.e ret should have 1 element
        if(ret.length != 1) {
          throw new RuntimeException("We think we have regression but the Weka process sent a ret of length "+ret.length);
        }
        gc = new ModelApplication(instAnn, ret[0]);
      } else {
        // classification, we expect ret to have length >= 2
        if(ret.length < 2) {
          throw new RuntimeException("We think we have classification but Weka process sent a ret of length "+ret.length);
        }
        double bestprob = 0.0;
        int bestlabel = 0;
        List<String> classList = new ArrayList<>();
        List<Double> confidenceList = new ArrayList<>();
        for (int i = 0; i < ret.length; i++) {
          double thisprob = ret[i];
          String labelstr = pipe.getTargetAlphabet().lookupObject(i).toString();
          classList.add(labelstr);
          confidenceList.add(thisprob);
          // strict comparison: the first label with the highest probability wins
          if (thisprob > bestprob) {
            bestlabel = i;
            bestprob = thisprob;
          }
        }

        String cl
                = pipe.getTargetAlphabet().lookupObject(bestlabel).toString();

        gc = new ModelApplication(
                instAnn, cl, bestprob, classList, confidenceList);
      }
      gcs.add(gc);
    }
  } finally {
    // always re-enable growth, also when applying the model failed
    data.startGrowth();
  }
  return gcs;
}

Source File: EngineMBMalletSeq.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Applies the sequence model to every sequence annotation and returns, for each
 * contained instance annotation, the label with the highest marginal probability.
 *
 * <p>If the number of instance annotations within a sequence differs from the length of
 * the feature vector sequence, a warning is logged and the sequence is skipped.
 *
 * <p>Alphabet growth is stopped for the duration of the call and is started again even
 * if the call fails with an exception.
 *
 * @param instanceAS the instance annotations
 * @param inputAS the annotations used for feature extraction
 * @param sequenceAS the sequence annotations, each spanning the instances of one sequence
 * @param parms not used by this method
 * @return one model application per instance annotation of every usable sequence
 */
@Override
public List<ModelApplication> applyModel(
        AnnotationSet instanceAS, AnnotationSet inputAS, AnnotationSet sequenceAS, 
        String parms) {
  // stop growth
  CorpusRepresentationMalletSeq data = (CorpusRepresentationMalletSeq)corpusRepresentation;
  data.stopGrowth();
  
  List<ModelApplication> gcs = new ArrayList<>();
  try {
    Transducer crf = (Transducer)model;

    for(Annotation sequenceAnn : sequenceAS) {
      int sequenceSpanId = sequenceAnn.getId();
      Instance inst = data.getInstanceForSequence( 
              instanceAS, sequenceAnn, inputAS, null, null, TargetType.NONE, null, null);

      //Always put the instance through the same pipe used for training.
      inst = crf.getInputPipe().instanceFrom(inst);
      FeatureVectorSequence fvs = (FeatureVectorSequence) inst.getData();

      SumLatticeDefault sl = new SumLatticeDefault(crf, fvs);

      List<Annotation> instanceAnnotations = gate.Utils.getContainedAnnotations(
              instanceAS, sequenceAnn).inDocumentOrder();

      //Sanity check that we're mapping the probs back onto the right anns.
      //This being wrong might follow from errors reading in the data to mallet inst.
      if (instanceAnnotations.size() != fvs.size()) {
        LOGGER.warn("LearningFramework: CRF output length: "
                + fvs.size()
                + ", GATE instances: " + instanceAnnotations.size()
                + ". Can't assign.");
        continue;
      }

      int i = 0;
      for (Annotation instanceAnn : instanceAnnotations) {
        // incremented before use: the lattice is queried at position i+1, see below
        i++;

        String bestLabel = null;
        double bestProb = 0.0;

        //For each label option ..

        // NOTE: for CRF we had this code:
        //for (int j = 0; j < crf.getOutputAlphabet().size(); j++) {
        //  String label = crf.getOutputAlphabet().lookupObject(j).toString();
        // but for Transducer we do not have the getOutputAlphabet method so we use
        // model.getInputPipe().getTargetAlphabet() instead (this seems to be what 
        // is used inside CRF anyway.)
        for (int j = 0; j < crf.getInputPipe().getTargetAlphabet().size(); j++) {
          String label = crf.getInputPipe().getTargetAlphabet().lookupObject(j).toString();

          //Get the probability of being in state j at position i+1
          //Note that the plus one is because the labels are on the
          //transitions. Positions are between transitions.
          double marg = sl.getGammaProbability(i, crf.getState(j));
          if (marg > bestProb) {
            bestLabel = label;
            bestProb = marg;
          }
        }
        ModelApplication gc = new ModelApplication(
                instanceAnn, bestLabel, bestProb, sequenceSpanId);

        gcs.add(gc);
      }
    }
  } finally {
    // always re-enable growth, also when applying the model failed
    data.startGrowth();
  }
  return gcs;
}

Source File: CorpusExporterMRARFF.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Convert an instance to one row of a sparse ARFF file: {@code {idx value, idx value, ...}},
 * followed by {@code , {weight}} if the instance has an "instanceWeight" property.
 *
 * <p>NaN feature values and nominal values coded as -1 are written as the missing value
 * placeholder {@code ?}. If the instance has a target, it is written as the last entry
 * of the row using the index of the target attribute.
 *
 * @param inst the instance; its data must be a FeatureVector
 * @param attrs the attributes describing the features and the target
 * @param filterMVs if true, instances flagged with PROP_IGNORE_HAS_MV are filtered
 * @return the row, or null if the instance got filtered
 * @throws RuntimeException if the data is not a feature vector or the kind of target
 *         does not match the datatype of the target attribute
 */
private String instance2WekaArffLine(Instance inst, Attributes attrs, boolean filterMVs) {
  if(filterMVs) {
    Object ignore = inst.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV);    
    // If the flag says the instance should get ignored, return null
    // to indicate to the caller that this is an ignored instance.
    if(ignore != null && ignore.equals(true)) {
      return null;
    }
  }
  Double instanceWeight = (Double)inst.getProperty("instanceWeight");
  Object data = inst.getData();
  if(!(data instanceof FeatureVector)) {
    // null-safe so that missing data is reported with this message, not as an NPE
    throw new RuntimeException("Cannot export, instance is not a feature vector but "
            +(data == null ? "null" : data.getClass()));
  }

  FeatureVector vector = (FeatureVector)data;
  StringBuilder sb = new StringBuilder();
  sb.append("{");
  // true as long as nothing has been written between the braces yet
  boolean first = true;
  for(int i=0; i<vector.numLocations(); i++) {   
    int idx = vector.indexAtLocation(i);
    if(first) { 
      first = false;
    } else {
      sb.append(", ");
    } 
    sb.append(idx);
    sb.append(" ");
    double value = vector.valueAtLocation(i);
    if(Double.isNaN(value)) {
      sb.append("?");
    } else {
      // TODO: proper handling of missing values!!!
      // Also: codeas may be null sometimes, make sure if we have a datatype
      // where codeas is relevant, we ALWAYS have codeas set to the correct value!
      Attribute attr = attrs.getAttribute(idx);
      if(attr.datatype==Datatype.numeric || (attr.datatype==Datatype.nominal && attr.codeAs!=CodeAs.number)) {
        sb.append(value);
      } else if(attr.datatype==Datatype.bool) {
        // TODO: check for missing value, also use the special alphabet we created?
        if(value<0.5) { sb.append("false"); } else { sb.append("true"); }
      } else if(attr.datatype==Datatype.nominal) {
        // TODO: check for how to exactly handling missing values, for now we simply output
        // the Weka missing value placeholder
        if(((int)value)==-1) {
          sb.append("?");
        } else {
          sb.append(escape4Arff((String)attr.alphabet.lookupObject((int) value)));
        }
      } else {
        // guard for forgetting about here when we add datatypes later
        sb.append("GOTCHA!!!! DATATYPE NOT SUPPORTED IN THE EXPORT CODE");
      }                  
    }
  } // for 

  // Now also add location and value for the target, if we have one.
  // If the target is null we simply create the row without a target.
  Object target = inst.getTarget();
  if(target!=null) {
    Attribute targetAttr = attrs.getTargetAttribute();
    // Only separate from a preceding feature entry: with an empty feature vector
    // an unconditional separator would produce the malformed row "{, idx value}".
    if(!first) {
      sb.append(", ");
    }
    sb.append(targetAttr.index);
    sb.append(" ");
    // we expect this to be either a Label instance or something that can be cast to double
    if(target instanceof Label) {
      if(targetAttr.datatype != Datatype.nominal) {
        throw new RuntimeException("Target is a label but datatype for attribute is not nominal");
      }
      // TODO: could check here if the label index is the same as expected from
      // the attribute defintion!
      sb.append(escape4Arff(((Label)target).toString()));
    } else {
      if(targetAttr.datatype != Datatype.numeric) {
        throw new RuntimeException("Target is a number but datatype for attribute is not  numeric");
      }
      double targetValue = (double)target;
      sb.append(targetValue);
    }
  }
  sb.append("}");
  if(instanceWeight!=null) {
    sb.append(", {");
    sb.append(instanceWeight);
    sb.append("}");
  }
  return sb.toString();
}

Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Extracts an attribute list with an explicitly specified name over a row of ten
 * adjacent tokens: first for the window -2..2 without restriction, then for the
 * window -1..1 restricted to a "within" annotation.
 */
@Test
public void extractList2() {
  String spec = "<ROOT>"
          + "<ATTRIBUTELIST><NAME>myAttList</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><DATATYPE>nominal</DATATYPE><FROM>-2</FROM><TO>2</TO></ATTRIBUTELIST>"
          + "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  // prepare the document: the instance covers 10..12, tokens tok1..tok10 are two
  // characters wide each and cover 0..20, the within annotation covers 8..14
  Annotation instAnn = addAnn(doc, "", 10, 12, "instanceType", gate.Utils.featureMap());
  for (int k = 0; k < 10; k++) {
    addAnn(doc,"",2 * k,2 * k + 2,"theType",gate.Utils.featureMap("theFeature","tok" + (k + 1)));
  }
  addAnn(doc,"",8,14,"within",gate.Utils.featureMap());

  String[] expectedAll = {
    "myAttList╬L-2═tok4",
    "myAttList╬L-1═tok5",
    "myAttList╬L0═tok6",
    "myAttList╬L1═tok7",
    "myAttList╬L2═tok8"
  };

  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(0)+" (list -1to1) FV="+inst.getData());
  System.err.println("Alphabet L2="+inst.getAlphabet());
  assertEquals(5,inst.getAlphabet().size());
  System.err.println("Alphabet is "+inst.getAlphabet());
  FeatureVector fv = (FeatureVector)inst.getData();
  System.err.println("extractList2-all: "+fv.toString(true));
  for (String featureName : expectedAll) {
    assertTrue(inst.getAlphabet().contains(featureName));
  }
  assertEquals(5,((FeatureVector)inst.getData()).numLocations());
  for (String featureName : expectedAll) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(featureName),EPS);
  }

  // Do the test again, but this time with a declaration that limits it to within the within annotation
  spec = "<ROOT>"
          + "<ATTRIBUTELIST><NAME>myAttList</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><DATATYPE>nominal</DATATYPE><FROM>-1</FROM><TO>1</TO><WITHIN>within</WITHIN></ATTRIBUTELIST>"
          + "</ROOT>";
  as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  inst = newInstance();

  String[] expectedInAlphabet = {
    "myAttList╬L-1═tok5",
    "myAttList╬L0═tok6",
    "myAttList╬L1═tok7",
    "myAttList╬L-1═╔START╗",
    "myAttList╬L1═╔STOP╗"
  };
  // the value of the STOP feature is not checked, only its presence in the alphabet
  String[] expectedValues = {
    "myAttList╬L-1═tok5",
    "myAttList╬L0═tok6",
    "myAttList╬L1═tok7",
    "myAttList╬L-1═╔START╗"
  };

  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  fv = (FeatureVector)inst.getData();
  System.err.println("extractList2-within: "+fv.toString(true));
  assertEquals(5,inst.getAlphabet().size());
  for (String featureName : expectedInAlphabet) {
    assertTrue(inst.getAlphabet().contains(featureName));
  }
  assertEquals(5,((FeatureVector)inst.getData()).numLocations());
  for (String featureName : expectedValues) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(featureName),EPS);
  }
}

Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Extracts a nominal attribute whose value is a set of strings for two different
 * instance annotations and checks that every set element becomes its own feature
 * with value 1.0.
 */
@Test
public void extractSimpleList1() {
  String spec = "<ROOT>"
          + "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE></ATTRIBUTE>"
          + "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  String prefix = "theType┆feature1╬A═";
  String[] firstExpected = {"setval1", "setval2", "setval3"};
  String[] secondExpected = {"setval1", "setval4", "setval5"};

  // prepare the document: two instance annotations, each covering one set-valued token
  Annotation firstInstance = addAnn(doc, "", 0, 10, "instanceType", gate.Utils.featureMap());
  HashSet<String> firstSet = new HashSet<>();
  for (String member : firstExpected) {
    firstSet.add(member);
  }
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1",firstSet));

  Annotation secondInstance = addAnn(doc, "", 11, 20, "instanceType", gate.Utils.featureMap());
  HashSet<String> secondSet = new HashSet<>();
  for (String member : secondExpected) {
    secondSet.add(member);
  }
  addAnn(doc, "", 12, 15, "theType", gate.Utils.featureMap("feature1",secondSet));

  // first instance annotation
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), firstInstance);
  FeatureVector fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList1a: "+fv.toString(true));
  for (String member : firstExpected) {
    assertTrue(inst.getAlphabet().contains(prefix + member));
  }
  assertEquals(3,((FeatureVector)inst.getData()).numLocations());
  for (String member : firstExpected) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(prefix + member),EPS);
  }

  // second instance annotation, re-using the alphabet built so far
  inst = newInstance(inst.getAlphabet());
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), secondInstance);
  fv = (FeatureVector)inst.getData();
  System.err.println("FeatureExtraction SimpleList1b: "+fv.toString(true));
  for (String member : secondExpected) {
    assertTrue(inst.getAlphabet().contains(prefix + member));
  }
  assertEquals(3,((FeatureVector)inst.getData()).numLocations());
  for (String member : secondExpected) {
    assertEquals(1.0,((FeatureVector)inst.getData()).value(prefix + member),EPS);
  }
}

Source File: MultiSegmentationEvaluator.java From bluima with Apache License 2.0

4 votes

/**
 * Evaluates the transducer of the given trainer on a list of instances and prints the
 * results to {@code System.err}: the token accuracy, and for every entry of
 * {@code segmentStartTags} plus an "OVERALL" entry the segment counts, precision,
 * recall and F1.
 *
 * <p>A segment counts as correct when the true and the predicted output start a segment
 * of the same tag type at the same position and both stop continuing it (as judged by
 * {@code segmentContinueTags}) at the same position.
 *
 * @param tt the trainer whose transducer is evaluated
 * @param data the instances; instance data is used as input sequence, the instance
 *        target as the true output sequence
 * @param description text that prefixes the printed result lines
 */
public void evaluateInstanceList(TransducerTrainer tt, InstanceList data,
        String description) {
    Transducer model = tt.getTransducer();
    int numCorrectTokens, totalTokens;
    int[] numTrueSegments, numPredictedSegments, numCorrectSegments;
    // one counter per start tag, plus a final slot (allIndex) for the totals over all tags
    int allIndex = segmentStartTags.length;
    numTrueSegments = new int[allIndex + 1];
    numPredictedSegments = new int[allIndex + 1];
    numCorrectSegments = new int[allIndex + 1];

    totalTokens = numCorrectTokens = 0;
    for (int n = 0; n < numTrueSegments.length; n++)
        numTrueSegments[n] = numPredictedSegments[n] = numCorrectSegments[n] = 0;
    for (int i = 0; i < data.size(); i++) {
        Instance instance = data.get(i);
        Sequence input = (Sequence) instance.getData();
        // String tokens = null;
        // if (instance.getSource() != null)
        // tokens = (String) instance.getSource().toString();
        Sequence trueOutput = (Sequence) instance.getTarget();
        assert (input.size() == trueOutput.size());
        Sequence predOutput = model.transduce(input);
        assert (predOutput.size() == trueOutput.size());
        int trueStart, predStart; // -1 for non-start, otherwise index into
                                  // segmentStartTag
        for (int j = 0; j < trueOutput.size(); j++) {
            totalTokens++;
            if (trueOutput.get(j).equals(predOutput.get(j)))
                numCorrectTokens++;
            trueStart = predStart = -1;
            // Count true segment starts
            // (stops at the first matching start tag)
            for (int n = 0; n < segmentStartTags.length; n++) {
                if (segmentStartTags[n].equals(trueOutput.get(j))) {
                    numTrueSegments[n]++;
                    numTrueSegments[allIndex]++;
                    trueStart = n;
                    break;
                }
            }
            // Count predicted segment starts
            // (no break here: every matching start tag is counted, the last match
            // ends up in predStart)
            for (int n = 0; n < segmentStartTags.length; n++) {
                if (segmentStartTags[n].equals(predOutput.get(j))) {
                    numPredictedSegments[n]++;
                    numPredictedSegments[allIndex]++;
                    predStart = n;
                }
            }
            if (trueStart != -1 && trueStart == predStart) {
                // Truth and Prediction both agree that the same segment
                // tag-type is starting now
                int m;
                boolean trueContinue = false;
                boolean predContinue = false;
                // scan forward until at least one of the two stops continuing the segment
                for (m = j + 1; m < trueOutput.size(); m++) {
                    trueContinue = segmentContinueTags[predStart]
                            .equals(trueOutput.get(m));
                    predContinue = segmentContinueTags[predStart]
                            .equals(predOutput.get(m));
                    if (!trueContinue || !predContinue) {
                        if (trueContinue == predContinue) {
                            // They agree about a segment is ending somehow
                            numCorrectSegments[predStart]++;
                            numCorrectSegments[allIndex]++;
                        }
                        break;
                    }
                }
                // for the case of the end of the sequence
                if (m == trueOutput.size()) {
                    if (trueContinue == predContinue) {
                        numCorrectSegments[predStart]++;
                        numCorrectSegments[allIndex]++;
                    }
                }
            }
        }
    }
    DecimalFormat f = new DecimalFormat("0.####");
    System.err.println(description + " tokenaccuracy="
            + f.format(((double) numCorrectTokens) / totalTokens));
    for (int n = 0; n < numCorrectSegments.length; n++) {
        System.err.println((n < allIndex ? segmentStartTags[n].toString()
                : "OVERALL") + ' ');
        // precision and recall are defined as 1 when the respective denominator is 0
        double precision = numPredictedSegments[n] == 0 ? 1
                : ((double) numCorrectSegments[n])
                        / numPredictedSegments[n];
        double recall = numTrueSegments[n] == 0 ? 1
                : ((double) numCorrectSegments[n]) / numTrueSegments[n];
        double f1 = recall + precision == 0.0 ? 0.0
                : (2.0 * recall * precision) / (recall + precision);
        System.err.println(" " + description + " segments true="
                + numTrueSegments[n] + " pred=" + numPredictedSegments[n]
                + " correct=" + numCorrectSegments[n] + " misses="
                + (numTrueSegments[n] - numCorrectSegments[n]) + " alarms="
                + (numPredictedSegments[n] - numCorrectSegments[n]));
        System.err.println(" " + description + " precision="
                + f.format(precision) + " recall=" + f.format(recall)
                + " f1=" + f.format(f1));
    }

}

Java Code Examples for cc.mallet.types.Instance#getData()