Java Code Examples for cc.mallet.types.Instance#getData()
The following examples show how to use
cc.mallet.types.Instance#getData().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: PipeScaleMeanVarAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 6 votes |
/**
 * Standardizes the features of the instance's feature vector in place.
 *
 * <p>Each feature present in the vector whose {@code normalize} flag is set is replaced by
 * {@code (value - mean) / sqrt(variance)}, using the per-feature statistics held in
 * {@code means} and {@code variances}. Features whose variance is not strictly positive are
 * left unchanged, so a constant feature never turns into NaN or Infinity.
 *
 * @param carrier instance whose data must be a {@link FeatureVector}
 * @return the same {@code carrier}, with its feature vector modified in place
 * @throws IllegalArgumentException if the instance data is null or not a FeatureVector
 * @throws GateRuntimeException if the statistics arrays do not match the data alphabet size
 */
@Override
public Instance pipe(Instance carrier) {
  Object data = carrier.getData();
  if (!(data instanceof FeatureVector)) {
    // instanceof is false for null, so the message must not dereference data.
    throw new IllegalArgumentException(
        "Data must be of type FeatureVector not "
            + (data == null ? "null" : data.getClass())
            + " we got "
            + data);
  }
  if (this.means.length != this.getDataAlphabet().size()
      || this.variances.length != this.getDataAlphabet().size()) {
    throw new GateRuntimeException(
        "Size mismatch, alphabet=" + getDataAlphabet().size() + ", stats=" + means.length);
  }
  FeatureVector fv = (FeatureVector) data;
  int[] indices = fv.getIndices();
  double[] values = fv.getValues();
  for (int i = 0; i < indices.length; i++) {
    int index = indices[i];
    if (!normalize[index]) {
      continue;
    }
    double variance = variances[index];
    // A zero (or NaN/negative) variance would yield NaN or Infinity; keep the raw value
    // instead, mirroring the span > 0 guard of the min/max scaling pipe.
    if (!(variance > 0.0)) {
      continue;
    }
    double newvalue = (values[i] - means[index]) / Math.sqrt(variance);
    fv.setValue(index, newvalue);
  }
  return carrier;
}
Example 2
Source File: RemoveStopwordsTest.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Pipes a four-token sequence through {@code RemoveStopwords} configured with two of the
 * tokens as stopwords and checks that exactly the other two tokens remain.
 */
@Test
public void testStopwordsAreRemoved() {
  final String firstStopword = "stop";
  final String secondStopword = "word";
  final String firstKept = "white";
  final String secondKept = "list";

  ImmutableList<Token> tokens =
      ImmutableList.of(
          new Token(firstStopword),
          new Token(secondStopword),
          new Token(firstKept),
          new Token(secondKept));
  Instance input = new Instance(new TokenSequence(tokens), null, null, null);

  RemoveStopwords remover = new RemoveStopwords(ImmutableList.of(firstStopword, secondStopword));
  TokenSequence remaining = (TokenSequence) remover.pipe(input).getData();

  assertEquals(2, remaining.size());
  assertEquals(
      ImmutableSet.of(firstKept, secondKept),
      remaining.stream().map(Token::getText).collect(Collectors.toSet()));
}
Example 3
Source File: PipeScaleMinMaxAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
@Override public Instance pipe(Instance carrier) { if (!(carrier.getData() instanceof FeatureVector)) { System.out.println(carrier.getData().getClass()); throw new IllegalArgumentException("Data must be of type FeatureVector not " + carrier.getData().getClass() + " we got " + carrier.getData()); } if (min.length != getDataAlphabet().size() || max.length != getDataAlphabet().size()) { throw new GateRuntimeException("Size mismatch, alphabet="+getDataAlphabet().size()+", stats="+min.length); } FeatureVector fv = (FeatureVector) carrier.getData(); int[] indices = fv.getIndices(); double[] values = fv.getValues(); for (int i = 0; i < indices.length; i++) { int index = indices[i]; double mi = min[index]; double ma = max[index]; double span = ma - mi; if(normalize[index] && span > 0.0) { double value = values[i]; // NOTE: this could in theory cause an overflow error but we ignore this here! double newvalue = (value - mi) / span; fv.setValue(index, newvalue); } } return carrier; }
Example 4
Source File: CorpusRepresentationLibSVM.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
/** * Create libsvm representation from Mallet. * * @param crm mallet representation * @return libsvm representation */ public static svm_problem getFromMallet(CorpusRepresentationMallet crm) { InstanceList instances = crm.getRepresentationMallet(); svm_problem prob = new svm_problem(); int numTrainingInstances = instances.size(); prob.l = numTrainingInstances; prob.y = new double[prob.l]; prob.x = new svm_node[prob.l][]; for (int i = 0; i < numTrainingInstances; i++) { Instance instance = instances.get(i); //Labels // convert the target: if we get a label, convert to index, // if we get a double, use it directly Object tobj = instance.getTarget(); if (tobj instanceof Label) { prob.y[i] = ((Label) instance.getTarget()).getIndex(); } else if (tobj instanceof Double) { prob.y[i] = (double) tobj; } else { throw new GateRuntimeException("Odd target in mallet instance, cannot convert to LIBSVM: " + tobj); } //Features SparseVector data = (SparseVector) instance.getData(); int[] indices = data.getIndices(); double[] values = data.getValues(); prob.x[i] = new svm_node[indices.length]; for (int j = 0; j < indices.length; j++) { svm_node node = new svm_node(); node.index = indices[j]+1; // NOTE: LibSVM location indices have to start with 1 node.value = values[j]; prob.x[i][j] = node; } } return prob; }
Example 5
Source File: FVStatsMeanVarAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
/**
 * Builds the statistics from an instance list: the data of every instance is cast to a
 * {@link FeatureVector} and added, then {@code finish()} is called once.
 *
 * @param instances instances whose data are feature vectors
 */
public FVStatsMeanVarAll(InstanceList instances) {
  for (Instance current : instances) {
    addFeatureVector((FeatureVector) current.getData());
  }
  finish();
}
Example 6
Source File: CorpusExporterMRJsonTarget.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
/** * Convert instance to string. * * @param inst instance * @param targetAlphabet target alphabet * @param attrs attributes * @param nrFeatures number of features * @param asString represent as quoted string * @param filterMV filter missing values * @return string representation */ public String instance2String( Instance inst, LabelAlphabet targetAlphabet, Attributes attrs, int nrFeatures, boolean asString, boolean filterMV) { StringBuilder sb = new StringBuilder(); sb.append("["); // outermost list FeatureVector fv = (FeatureVector)inst.getData(); Object targetObject = inst.getTarget(); if (filterMV) { Object ignore = inst.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV); if (ignore != null && ignore.equals(true)) { return null; } } sb.append(featureVector2String(fv, nrFeatures, attrs, asString)); // for now, we always try to output the target, even if it is null, this may change // in the future if (targetObject!=null) { sb.append(", "); sb.append(target2String(targetObject, targetAlphabet, asString)); } sb.append("]"); // close outer list return sb.toString(); }
Example 7
Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
/**
 * Extracts a nominal attribute declared with the list separator ":" from a string-valued
 * feature, for two different instance annotations, and checks that each colon-separated
 * element shows up as its own alphabet entry with value 1.0.
 */
@Test
public void extractSimpleList2() {
  String spec =
      "<ROOT>"
          + "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE><LISTSEP>:</LISTSEP></ATTRIBUTE>"
          + "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  // prepare the document: two instance annotations, each overlapping one "theType" token
  Annotation firstInstAnn = addAnn(doc, "", 0, 10, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1", "lval1:lval2:lval3"));
  Annotation secondInstAnn = addAnn(doc, "", 11, 20, "instanceType", gate.Utils.featureMap());
  addAnn(doc, "", 12, 15, "theType", gate.Utils.featureMap("feature1", "lval1:lval4:lval5"));

  // first instance annotation
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), firstInstAnn);
  FeatureVector fv = (FeatureVector) inst.getData();
  System.err.println("FeatureExtraction SimpleList2a: " + fv.toString(true));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═lval1"));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═lval2"));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═lval3"));
  assertEquals(3, fv.numLocations());
  assertEquals(1.0, fv.value("theType┆feature1╬A═lval1"), EPS);
  assertEquals(1.0, fv.value("theType┆feature1╬A═lval2"), EPS);
  assertEquals(1.0, fv.value("theType┆feature1╬A═lval3"), EPS);

  // second instance annotation, reusing the alphabet built so far
  inst = newInstance(inst.getAlphabet());
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), secondInstAnn);
  fv = (FeatureVector) inst.getData();
  System.err.println("FeatureExtraction SimpleList2b: " + fv.toString(true));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═lval1"));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═lval4"));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═lval5"));
  assertEquals(3, fv.numLocations());
  assertEquals(1.0, fv.value("theType┆feature1╬A═lval1"), EPS);
  assertEquals(1.0, fv.value("theType┆feature1╬A═lval4"), EPS);
  assertEquals(1.0, fv.value("theType┆feature1╬A═lval5"), EPS);
}
Example 8
Source File: RemoveStopwords.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Replaces the instance's token sequence with a new sequence that contains, in the original
 * order, only the tokens whose text is not in {@code stopwords}.
 *
 * @param carrier instance whose data is a {@link TokenSequence}
 * @return the same {@code carrier}, now holding the filtered sequence
 */
@Override
public Instance pipe(Instance carrier) {
  TokenSequence original = (TokenSequence) carrier.getData();
  TokenSequence kept = new TokenSequence();
  for (Token token : original) {
    if (stopwords.contains(token.getText())) {
      continue;
    }
    kept.add(token);
  }
  carrier.setData(kept);
  return carrier;
}
Example 9
Source File: BrainRegionPipesTest.java From bluima with Apache License 2.0 | 5 votes |
private void pipe(String txt, List<String>... features) throws Exception { // it might not have all the aes, though... JCas jCas = getOpenNlpTokenizedTestCas(txt); InstanceList il = new InstanceList(// new SerialPipes(BrainRegionPipes.getPipes())); Instance instance = new Instance(jCas, null, 1, jCas); il.addThruPipe(instance); Instance pipedInstance = il.iterator().next(); FeatureVectorSequence data = (FeatureVectorSequence) pipedInstance .getData(); java.util.Iterator<List<String>> featuresIt = asList(features) .iterator(); Iterator it = data.iterator(); while (it.hasNext()) { FeatureVector featureVector = it.next(); if (featuresIt.hasNext()) { for (String expectedFeature : featuresIt.next()) { assertTrue("could not find expected feature '" + expectedFeature + "', FeatureVector = \n" + featureVector, featureVector.contains(expectedFeature)); } } } }
Example 10
Source File: TokenTransform.java From bluima with Apache License 2.0 | 5 votes |
/**
 * Adds a transform-derived feature to every token of the instance's token sequence.
 *
 * <p>For each token, {@code tokenTransformer} is applied to the token text. When the result
 * is non-null, the feature named {@code featureName + result} is set to 1.0 on that token.
 *
 * @param carrier instance whose data is a {@link TokenSequence}
 * @return the same {@code carrier}
 */
public Instance pipe (Instance carrier) {
  TokenSequence tokens = (TokenSequence) carrier.getData();
  for (Token current : tokens) {
    String transformed = tokenTransformer.transform(current.getText());
    if (transformed != null) {
      current.setFeatureValue((featureName + transformed), 1.0);
    }
  }
  return carrier;
}
Example 11
Source File: EngineMBWekaWrapper.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
@Override public List<ModelApplication> applyModel(AnnotationSet instanceAS, AnnotationSet inputAS, AnnotationSet sequenceAS, String parms) { CorpusRepresentationMalletTarget data = (CorpusRepresentationMalletTarget)corpusRepresentation; data.stopGrowth(); //System.err.println("Running EngineWeka.applyModel on document "+instanceAS.getDocument().getName()); List<ModelApplication> gcs = new ArrayList<>(); LFPipe pipe = (LFPipe)data.getRepresentationMallet().getPipe(); for(Annotation instAnn : instanceAS.inDocumentOrder()) { Instance inst = data.extractIndependentFeatures(instAnn, inputAS); //FeatureVector fv = (FeatureVector)inst.getData(); //System.out.println("Mallet instance, fv: "+fv.toString(true)+", len="+fv.numLocations()); inst = pipe.instanceFrom(inst); FeatureVector fv = (FeatureVector)inst.getData(); //System.out.println("Mallet instance, fv: "+fv.toString(true)+", len="+fv.numLocations()); double weight = Double.NaN; Object weightObj = inst.getProperty("instanceWeight"); if(weightObj != null) { weight = (double)weightObj; } // Convert to the sparse vector we use to send to the weka process int locs = fv.numLocations(); SparseDoubleVector sdv = new SparseDoubleVector(locs); sdv.setInstanceWeight(weight); int[] locations = sdv.getLocations(); double[] values = sdv.getValues(); for(int i=0;i<locs;i++) { locations[i] = fv.indexAtLocation(i); values[i] = fv.value(locations[i]); } // send the vector over to the weka process process.writeObject(sdv); // get the result back Object obj = process.readObject(); // check that it is an array of double double[] ret = null; if(obj instanceof double[]) { // if the array has one element, the model treated it as regression, otherwise classification ret = (double[])obj; } else { // this is an error, lets panic for now throw new RuntimeException("Got a response from the Weka process which is not double[] but "+obj.getClass()); } //System.err.println("Sent vector: 
locs/values="+Arrays.toString(locations)+"/"+Arrays.toString(values)+", ret="+Arrays.toString(ret)); ModelApplication gc = null; // now check if the mallet representation and the weka process agree // on if we have regression or classification if(pipe.getTargetAlphabet() == null) { // we expect a regression result, i.e ret should have 1 element if(ret.length != 1) { throw new RuntimeException("We think we have regression but the Weka process sent a ret of length "+ret.length); } gc = new ModelApplication(instAnn, ret[0]); } else { // classification, we expect ret to have length >= 2 if(ret.length < 2) { throw new RuntimeException("We think we have classification but Weka process sent a ret of length "+ret.length); } double bestprob = 0.0; int bestlabel = 0; /* System.err.print("DEBUG: got classes from pipe: "); Object[] cls = pipe.getTargetAlphabet().toArray(); boolean first = true; for(Object cl : cls) { if(first) { first = false; } else { System.err.print(", "); } System.err.print(">"+cl+"<"); } System.err.println(); */ List<String> classList = new ArrayList<>(); List<Double> confidenceList = new ArrayList<>(); for (int i = 0; i < ret.length; i++) { int thislabel = i; double thisprob = ret[i]; String labelstr = pipe.getTargetAlphabet().lookupObject(thislabel).toString(); classList.add(labelstr); confidenceList.add(thisprob); if (thisprob > bestprob) { bestlabel = thislabel; bestprob = thisprob; } } // end for i < predictionDistribution.length String cl = pipe.getTargetAlphabet().lookupObject(bestlabel).toString(); gc = new ModelApplication( instAnn, cl, bestprob, classList, confidenceList); } gcs.add(gc); } data.startGrowth(); return gcs; }
Example 12
Source File: EngineMBMalletSeq.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
@Override public List<ModelApplication> applyModel( AnnotationSet instanceAS, AnnotationSet inputAS, AnnotationSet sequenceAS, String parms) { // stop growth CorpusRepresentationMalletSeq data = (CorpusRepresentationMalletSeq)corpusRepresentation; data.stopGrowth(); List<ModelApplication> gcs = new ArrayList<>(); Transducer crf = (Transducer)model; for(Annotation sequenceAnn : sequenceAS) { int sequenceSpanId = sequenceAnn.getId(); Instance inst = data.getInstanceForSequence( instanceAS, sequenceAnn, inputAS, null, null, TargetType.NONE, null, null); //Always put the instance through the same pipe used for training. inst = crf.getInputPipe().instanceFrom(inst); SumLatticeDefault sl = new SumLatticeDefault(crf, (FeatureVectorSequence) inst.getData()); List<Annotation> instanceAnnotations = gate.Utils.getContainedAnnotations( instanceAS, sequenceAnn).inDocumentOrder(); //Sanity check that we're mapping the probs back onto the right anns. //This being wrong might follow from errors reading in the data to mallet inst. if (instanceAnnotations.size() != ((FeatureVectorSequence) inst.getData()).size()) { LOGGER.warn("LearningFramework: CRF output length: " + ((FeatureVectorSequence) inst.getData()).size() + ", GATE instances: " + instanceAnnotations.size() + ". Can't assign."); } else { int i = 0; for (Annotation instanceAnn : instanceAnnotations) { i++; String bestLabel = null; double bestProb = 0.0; //For each label option .. // NOTE: for CRF we had this code: //for (int j = 0; j < crf.getOutputAlphabet().size(); j++) { // String label = crf.getOutputAlphabet().lookupObject(j).toString(); // but for Transducer we do not have the getOutputAlphabet method so we use // model.getInputPipe().getTargetAlphabet() instead (this seems to be what // is used inside CRF anyway.) 
for (int j = 0; j < crf.getInputPipe().getTargetAlphabet().size(); j++) { String label = crf.getInputPipe().getTargetAlphabet().lookupObject(j).toString(); //Get the probability of being in state j at position i+1 //Note that the plus one is because the labels are on the //transitions. Positions are between transitions. double marg = sl.getGammaProbability(i, crf.getState(j)); if (marg > bestProb) { bestLabel = label; bestProb = marg; } } ModelApplication gc = new ModelApplication( instanceAnn, bestLabel, bestProb, sequenceSpanId); gcs.add(gc); } } } data.startGrowth(); return gcs; }
Example 13
Source File: CorpusExporterMRARFF.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
private String instance2WekaArffLine(Instance inst, Attributes attrs, boolean filterMVs) { StringBuilder sb = new StringBuilder(); if(filterMVs) { Object ignore = inst.getProperty(FeatureExtractionMalletSparse.PROP_IGNORE_HAS_MV); // If the flag says the instance should get ignored, return null // to indicate to the caller that this is an ignored instance. if(ignore != null && ignore.equals(true)) { return null; } } Double instanceWeight = (Double)inst.getProperty("instanceWeight"); Object data = inst.getData(); if(data instanceof FeatureVector) { FeatureVector vector = (FeatureVector)data; sb.append("{"); boolean first = true; // TODO: maybe it is easier to do // for(int idx : vector.getIndices) for(int i=0; i<vector.numLocations(); i++) { int idx = vector.indexAtLocation(i); if(first) { first = false; } else { sb.append(", "); } sb.append(idx); sb.append(" "); double value = vector.valueAtLocation(i); if(Double.isNaN(value)) { sb.append("?"); } else { // TODO: proper handling of missing values!!! // Also: codeas may be null sometimes, make sure if we have a datatype // where codeas is relevant, we ALWAYS have codeas set to the correct value! Attribute attr = attrs.getAttribute(idx); if(attr.datatype==Datatype.numeric || (attr.datatype==Datatype.nominal && attr.codeAs!=CodeAs.number)) { sb.append(value); } else if(attr.datatype==Datatype.bool) { // TODO: check for missing value, also use the special alphabet we created? if(value<0.5) { sb.append("false"); } else { sb.append("true"); } } else if(attr.datatype==Datatype.nominal) { // TODO: check for how to exactly handling missing values, for now we simply output // the Weka missing value placeholder if(((int)value)==-1) { sb.append("?"); } else { sb.append(escape4Arff((String)attr.alphabet.lookupObject((int) value))); } } else { // guard for forgetting about here when we add datatypes later sb.append("GOTCHA!!!! 
DATATYPE NOT SUPPORTED IN THE EXPORT CODE"); } } } // for // Now also add location and value for the target, if we have one Object target = inst.getTarget(); if(target!=null) { Attribute targetAttr = attrs.getTargetAttribute(); sb.append(", "); sb.append(targetAttr.index); sb.append(" "); // we expect this to be either a Label instance or something that can be cast to double if(target instanceof Label) { if(targetAttr.datatype != Datatype.nominal) { throw new RuntimeException("Target is a label but datatype for attribute is not nominal"); } Label malletLabel = (Label)target; String targetString = malletLabel.toString(); sb.append(escape4Arff(targetString)); // TODO: could check here if the label index is the same as expected from // the attribute defintion! } else { if(targetAttr.datatype != Datatype.numeric) { throw new RuntimeException("Target is a number but datatype for attribute is not numeric"); } double targetValue = (double)target; sb.append(targetValue); } } else { // target is null: do nothing, simply create the row without a target // TODO: not sure what I was thinking here, but admittedly, exporting without // a target or a missing target could have its uses, so we leave this as it is } sb.append("}"); if(instanceWeight!=null) { sb.append(", {"); sb.append(instanceWeight); sb.append("}"); } } else { throw new RuntimeException("Cannot export, instance is not a feature vector but "+data.getClass()); } return sb.toString(); }
Example 14
Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
@Test public void extractList2() { // same as extractList2, but with explicitly specified name String spec = "<ROOT>"+ "<ATTRIBUTELIST><NAME>myAttList</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><DATATYPE>nominal</DATATYPE><FROM>-2</FROM><TO>2</TO></ATTRIBUTELIST>"+ "</ROOT>"; List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes(); Instance inst = newInstance(); // prepare the document Annotation instAnn = addAnn(doc, "", 10, 12, "instanceType", gate.Utils.featureMap()); addAnn(doc,"",0,2,"theType",gate.Utils.featureMap("theFeature","tok1")); addAnn(doc,"",2,4,"theType",gate.Utils.featureMap("theFeature","tok2")); addAnn(doc,"",4,6,"theType",gate.Utils.featureMap("theFeature","tok3")); addAnn(doc,"",6,8,"theType",gate.Utils.featureMap("theFeature","tok4")); addAnn(doc,"",8,10,"theType",gate.Utils.featureMap("theFeature","tok5")); addAnn(doc,"",10,12,"theType",gate.Utils.featureMap("theFeature","tok6")); addAnn(doc,"",12,14,"theType",gate.Utils.featureMap("theFeature","tok7")); addAnn(doc,"",14,16,"theType",gate.Utils.featureMap("theFeature","tok8")); addAnn(doc,"",16,18,"theType",gate.Utils.featureMap("theFeature","tok9")); addAnn(doc,"",18,20,"theType",gate.Utils.featureMap("theFeature","tok10")); Annotation withinAnn = addAnn(doc,"",8,14,"within",gate.Utils.featureMap()); FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn); System.err.println("After "+as.get(0)+" (list -1to1) FV="+inst.getData()); System.err.println("Alphabet L2="+inst.getAlphabet()); assertEquals(5,inst.getAlphabet().size()); System.err.println("Alphabet is "+inst.getAlphabet()); FeatureVector fv = (FeatureVector)inst.getData(); System.err.println("extractList2-all: "+fv.toString(true)); assertTrue(inst.getAlphabet().contains("myAttList╬L-2═tok4")); assertTrue(inst.getAlphabet().contains("myAttList╬L-1═tok5")); assertTrue(inst.getAlphabet().contains("myAttList╬L0═tok6")); 
assertTrue(inst.getAlphabet().contains("myAttList╬L1═tok7")); assertTrue(inst.getAlphabet().contains("myAttList╬L2═tok8")); assertEquals(5,((FeatureVector)inst.getData()).numLocations()); assertEquals(1.0,((FeatureVector)inst.getData()).value("myAttList╬L-2═tok4"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("myAttList╬L-1═tok5"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("myAttList╬L0═tok6"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("myAttList╬L1═tok7"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("myAttList╬L2═tok8"),EPS); // Do the test again, but this time with a declaration that limits it to within the within annotation spec = "<ROOT>"+ "<ATTRIBUTELIST><NAME>myAttList</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><DATATYPE>nominal</DATATYPE><FROM>-1</FROM><TO>1</TO><WITHIN>within</WITHIN></ATTRIBUTELIST>"+ "</ROOT>"; as = new FeatureSpecification(spec).getFeatureInfo().getAttributes(); inst = newInstance(); FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn); fv = (FeatureVector)inst.getData(); System.err.println("extractList2-within: "+fv.toString(true)); assertEquals(5,inst.getAlphabet().size()); assertTrue(inst.getAlphabet().contains("myAttList╬L-1═tok5")); assertTrue(inst.getAlphabet().contains("myAttList╬L0═tok6")); assertTrue(inst.getAlphabet().contains("myAttList╬L1═tok7")); assertTrue(inst.getAlphabet().contains("myAttList╬L-1═╔START╗")); assertTrue(inst.getAlphabet().contains("myAttList╬L1═╔STOP╗")); assertEquals(5,((FeatureVector)inst.getData()).numLocations()); assertEquals(1.0,((FeatureVector)inst.getData()).value("myAttList╬L-1═tok5"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("myAttList╬L0═tok6"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("myAttList╬L1═tok7"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("myAttList╬L-1═╔START╗"),EPS); }
Example 15
Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
/**
 * Extracts a nominal attribute from a set-valued feature, for two different instance
 * annotations, and checks that each set element shows up as its own alphabet entry with
 * value 1.0.
 */
@Test
public void extractSimpleList1() {
  String spec =
      "<ROOT>"
          + "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE></ATTRIBUTE>"
          + "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  Instance inst = newInstance();

  // prepare the document: two instance annotations, each overlapping one "theType" token
  // whose feature value is a set of strings
  Annotation firstInstAnn = addAnn(doc, "", 0, 10, "instanceType", gate.Utils.featureMap());
  HashSet<String> firstValues = new HashSet<>();
  firstValues.add("setval1");
  firstValues.add("setval2");
  firstValues.add("setval3");
  addAnn(doc, "", 0, 5, "theType", gate.Utils.featureMap("feature1", firstValues));

  Annotation secondInstAnn = addAnn(doc, "", 11, 20, "instanceType", gate.Utils.featureMap());
  HashSet<String> secondValues = new HashSet<>();
  secondValues.add("setval1");
  secondValues.add("setval4");
  secondValues.add("setval5");
  addAnn(doc, "", 12, 15, "theType", gate.Utils.featureMap("feature1", secondValues));

  // first instance annotation
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), firstInstAnn);
  FeatureVector fv = (FeatureVector) inst.getData();
  System.err.println("FeatureExtraction SimpleList1a: " + fv.toString(true));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═setval1"));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═setval2"));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═setval3"));
  assertEquals(3, fv.numLocations());
  assertEquals(1.0, fv.value("theType┆feature1╬A═setval1"), EPS);
  assertEquals(1.0, fv.value("theType┆feature1╬A═setval2"), EPS);
  assertEquals(1.0, fv.value("theType┆feature1╬A═setval3"), EPS);

  // second instance annotation, reusing the alphabet built so far
  inst = newInstance(inst.getAlphabet());
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), secondInstAnn);
  fv = (FeatureVector) inst.getData();
  System.err.println("FeatureExtraction SimpleList1b: " + fv.toString(true));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═setval1"));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═setval4"));
  assertTrue(inst.getAlphabet().contains("theType┆feature1╬A═setval5"));
  assertEquals(3, fv.numLocations());
  assertEquals(1.0, fv.value("theType┆feature1╬A═setval1"), EPS);
  assertEquals(1.0, fv.value("theType┆feature1╬A═setval4"), EPS);
  assertEquals(1.0, fv.value("theType┆feature1╬A═setval5"), EPS);
}
Example 16
Source File: MultiSegmentationEvaluator.java From bluima with Apache License 2.0 | 4 votes |
public void evaluateInstanceList(TransducerTrainer tt, InstanceList data, String description) { Transducer model = tt.getTransducer(); int numCorrectTokens, totalTokens; int[] numTrueSegments, numPredictedSegments, numCorrectSegments; int allIndex = segmentStartTags.length; numTrueSegments = new int[allIndex + 1]; numPredictedSegments = new int[allIndex + 1]; numCorrectSegments = new int[allIndex + 1]; totalTokens = numCorrectTokens = 0; for (int n = 0; n < numTrueSegments.length; n++) numTrueSegments[n] = numPredictedSegments[n] = numCorrectSegments[n] = 0; for (int i = 0; i < data.size(); i++) { Instance instance = data.get(i); Sequence input = (Sequence) instance.getData(); // String tokens = null; // if (instance.getSource() != null) // tokens = (String) instance.getSource().toString(); Sequence trueOutput = (Sequence) instance.getTarget(); assert (input.size() == trueOutput.size()); Sequence predOutput = model.transduce(input); assert (predOutput.size() == trueOutput.size()); int trueStart, predStart; // -1 for non-start, otherwise index into // segmentStartTag for (int j = 0; j < trueOutput.size(); j++) { totalTokens++; if (trueOutput.get(j).equals(predOutput.get(j))) numCorrectTokens++; trueStart = predStart = -1; // Count true segment starts for (int n = 0; n < segmentStartTags.length; n++) { if (segmentStartTags[n].equals(trueOutput.get(j))) { numTrueSegments[n]++; numTrueSegments[allIndex]++; trueStart = n; break; } } // Count predicted segment starts for (int n = 0; n < segmentStartTags.length; n++) { if (segmentStartTags[n].equals(predOutput.get(j))) { numPredictedSegments[n]++; numPredictedSegments[allIndex]++; predStart = n; } } if (trueStart != -1 && trueStart == predStart) { // Truth and Prediction both agree that the same segment // tag-type is starting now int m; boolean trueContinue = false; boolean predContinue = false; for (m = j + 1; m < trueOutput.size(); m++) { trueContinue = segmentContinueTags[predStart] .equals(trueOutput.get(m)); 
predContinue = segmentContinueTags[predStart] .equals(predOutput.get(m)); if (!trueContinue || !predContinue) { if (trueContinue == predContinue) { // They agree about a segment is ending somehow numCorrectSegments[predStart]++; numCorrectSegments[allIndex]++; } break; } } // for the case of the end of the sequence if (m == trueOutput.size()) { if (trueContinue == predContinue) { numCorrectSegments[predStart]++; numCorrectSegments[allIndex]++; } } } } } DecimalFormat f = new DecimalFormat("0.####"); System.err.println(description + " tokenaccuracy=" + f.format(((double) numCorrectTokens) / totalTokens)); for (int n = 0; n < numCorrectSegments.length; n++) { System.err.println((n < allIndex ? segmentStartTags[n].toString() : "OVERALL") + ' '); double precision = numPredictedSegments[n] == 0 ? 1 : ((double) numCorrectSegments[n]) / numPredictedSegments[n]; double recall = numTrueSegments[n] == 0 ? 1 : ((double) numCorrectSegments[n]) / numTrueSegments[n]; double f1 = recall + precision == 0.0 ? 0.0 : (2.0 * recall * precision) / (recall + precision); System.err.println(" " + description + " segments true=" + numTrueSegments[n] + " pred=" + numPredictedSegments[n] + " correct=" + numCorrectSegments[n] + " misses=" + (numTrueSegments[n] - numCorrectSegments[n]) + " alarms=" + (numPredictedSegments[n] - numCorrectSegments[n])); System.err.println(" " + description + " precision=" + f.format(precision) + " recall=" + f.format(recall) + " f1=" + f.format(f1)); } }