cc.mallet.types.Alphabet Java Examples
The following examples show how to use
cc.mallet.types.Alphabet.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: EngineMB.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 6 votes |
protected void updateInfo() { //System.err.println("In updateInfo, model is "+model); if(model!=null) { info.modelClass = model.getClass().getName(); } info.nrTrainingInstances = corpusRepresentation.getRepresentationMallet().size(); info.nrTrainingDimensions = corpusRepresentation.getRepresentationMallet().getDataAlphabet().size(); LFPipe pipe = corpusRepresentation.getPipe(); Alphabet targetAlph = pipe.getTargetAlphabet(); if(targetAlph == null) { info.nrTargetValues = 0; } else { info.nrTargetValues = targetAlph.size(); //info.classLabels = Object[] objs = targetAlph.toArray(); ArrayList<String> labels = new ArrayList<>(); for(Object obj : objs) { labels.add(obj.toString()); } info.classLabels = labels; } }
Example #2
Source File: PipeScaleMinMaxAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 6 votes |
/**
 * Constructor from alphabet and feature stats.
 *
 * For every feature whose {@code binary} flag is set and false, the minimum and maximum
 * from the stats are recorded and the feature is flagged for normalization. All other
 * features (binary ones, or ones without a {@code binary} flag) are left unflagged and
 * keep the default min/max of 0.0.
 *
 * @param alphabet alphabet
 * @param stats feature stats
 */
public PipeScaleMinMaxAll(Alphabet alphabet, FVStatsMeanVarAll stats) {
  super(alphabet, null);
  List<PerFeatureStats> pfss = stats.getStats();
  int n = pfss.size();
  min = new double[n];
  max = new double[n];
  normalize = new boolean[n];
  for (int i = 0; i < n; i++) {
    PerFeatureStats pfs = pfss.get(i);
    // we do not normalize binary features and we do not normalize features with no
    // values at all
    if (pfs.binary != null && !pfs.binary) {
      min[i] = pfs.min;
      max[i] = pfs.max;
      // Flag the feature as one to normalize. Without this assignment the flag array
      // stays all-false, since new boolean[n] is initialized to false.
      normalize[i] = true;
    } else {
      normalize[i] = false;
    }
  }
  System.err.println("DEBUG: Creating PipeScaleMinMaxAll instance with mins="+Arrays.toString(min)+",maxs="+Arrays.toString(max));
}
Example #3
Source File: PipeScaleMeanVarAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 6 votes |
/**
 * Constructor from alphabet and stats.
 *
 * Features whose {@code binary} flag is set and false get their mean and variance copied
 * from the stats and are flagged for normalization. Every other feature gets NaN for both
 * values and is not flagged.
 *
 * @param alphabet alphabet
 * @param stats feature stats
 */
public PipeScaleMeanVarAll(Alphabet alphabet, FVStatsMeanVarAll stats) {
  super(alphabet, null);
  List<PerFeatureStats> perFeature = stats.getStats();
  int count = perFeature.size();
  means = new double[count];
  variances = new double[count];
  normalize = new boolean[count];
  for (int idx = 0; idx < count; idx++) {
    PerFeatureStats featureStats = perFeature.get(idx);
    // binary features and features with no values at all are not scaled
    boolean scale = featureStats.binary != null && !featureStats.binary;
    if (scale) {
      means[idx] = featureStats.mean;
      variances[idx] = featureStats.var;
    } else {
      means[idx] = Double.NaN;
      variances[idx] = Double.NaN;
    }
    normalize[idx] = scale;
  }
}
Example #4
Source File: MaxEntClassifierTrainer.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Invert the {@code labelsAndFeatures} mapping into alphabet indices.
 *
 * Each label is looked up in the target alphabet and each of its features in the data
 * alphabet; the result maps a feature index to the list of label indices it was listed
 * under.
 *
 * @param dataAlphabet alphabet used to look up feature indices
 * @param targetAlphabet alphabet used to look up label indices
 * @return map from feature index to the label indices associated with that feature
 */
private HashMap<Integer, ArrayList<Integer>> mapFeaturesToLabels(
    Alphabet dataAlphabet, Alphabet targetAlphabet) {
  HashMap<Integer, ArrayList<Integer>> labelsByFeature = new HashMap<>();
  labelsAndFeatures.forEach(
      (labelName, featureNames) -> {
        Integer labelIndex = targetAlphabet.lookupIndex(labelName);
        featureNames.forEach(
            featureName -> {
              Integer featureIndex = dataAlphabet.lookupIndex(featureName);
              // create the label list the first time a feature index is seen
              labelsByFeature
                  .computeIfAbsent(featureIndex, unused -> new ArrayList<>())
                  .add(labelIndex);
            });
      });
  return labelsByFeature;
}
Example #5
Source File: EngineMBPythonNetworksBase.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
protected AbstractMap.SimpleEntry<String,Integer> findOutMode(CorpusRepresentationMalletTarget crm) { InstanceList instances = crm.getRepresentationMallet(); // we pass on a "mode" for the learning problem, which is one of the following: // - classind: predict the index of a class // - classcosts: targets are vectors of class costs // - regr: regression // we also pass on another parameter which provides details of the learning problem: // - the number of class indices in case of classind and classcosts // - 0 as a dummy value in case of "regr" int nrClasses = 0; String mode = "regr"; Alphabet ta = crm.getPipe().getTargetAlphabet(); if(ta != null) { // if this is invoked for training, we should have a first instance, but for // application, we do not have any instances yet. If we do not have any instances, we // just use dummy values for now since at the moment we do not need this information // at application time. Should we ever need it we need to store this in the pipe! if(instances==null || instances.isEmpty()) { mode="classind"; nrClasses=-1; } else { Instance firstInstance = instances.get(0); Object targetObj = firstInstance.getTarget(); if(targetObj instanceof NominalTargetWithCosts) { NominalTargetWithCosts target = (NominalTargetWithCosts)targetObj; nrClasses = target.getCosts().length; mode = "classcosts"; } else { mode = "classind"; nrClasses = ta.size(); } } } AbstractMap.SimpleEntry<String,Integer> ret = new AbstractMap.SimpleEntry<>(mode,nrClasses); return ret; }
Example #6
Source File: TopicModelPipe.java From baleen with Apache License 2.0 | 5 votes |
/** * Construct topic model pipe with given stopwords and alphabets * * @param stopwords to be removed * @param alphabet to use */ public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) { // @formatter:off super( ImmutableList.of( new CharSequenceLowercase(), new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")), new RemoveStopwords(stopwords), new TokenSequence2FeatureSequence(alphabet))); // @formatter:on }
Example #7
Source File: CorpusExporterMRARFF.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
/** * Convert alphabet to ARFF declaration string. * @param alph Mallet alphabet * @param mvt missing value treatment setting * @return ARFF declaration */ public String alphabet2Arff(Alphabet alph, MissingValueTreatment mvt) { // NOTE: mvt can be null, if this is used for a target!! StringBuilder sb = new StringBuilder(); sb.append("{"); for(int i=0; i<alph.size(); i++) { if(i>0) sb.append(","); String val = alph.lookupObject(i).toString(); sb.append(escape4Arff(val)); } // TODO: we may need to add the definition for the missing value here, // but by default, we do not do that. sb.append("}"); return sb.toString(); }
Example #8
Source File: CorpusRepresentationMallet.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
/**
 * Return the string forms of all entries of the pipe's target alphabet.
 *
 * @return a new list of labels; empty when the pipe has no target alphabet
 */
@Override
public List<String> getLabelList() {
  List<String> labels = new ArrayList<>();
  Alphabet targetAlphabet = pipe.getTargetAlphabet();
  if (targetAlphabet == null) {
    return labels;
  }
  for (Object label : targetAlphabet.toArray()) {
    labels.add(label.toString());
  }
  return labels;
}
Example #9
Source File: FeatureExtractionMalletSparse.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
/** * Extract the class for an instance for sequence tagging. * * In the case of sequence tagging, we construct the class based on the instance's position * relative to the class annotation annType. If it occurs at the beginning of the class * annotation, it's a "beginning". In the middle or at the end, it's an "inside". Instances that * don't occur in the span of a class annotation are an "outside". * * @param inst instance * @param alph the label alphabet to use, must be an instance of LabelAlphabet * @param classAS class annotation set * @param instanceAnnotation the instance annotation, e.g. "Token". * @param seqEncoder sequence encoder instance */ public static void extractClassForSeqTagging(Instance inst, Alphabet alph, AnnotationSet classAS, Annotation instanceAnnotation, SeqEncoder seqEncoder) { String target; Document doc = classAS.getDocument(); if (!(alph instanceof LabelAlphabet)) { throw new GateRuntimeException("LF extractClassForSeqTagging: the alphabet must be of type LabelAlphabet" + " for instance annotation at offset " + gate.Utils.start(instanceAnnotation) + " in document " + doc.getName()); } LabelAlphabet labelalph = (LabelAlphabet) alph; AnnotationSet overlappingClassAnns = Utils.getOverlappingAnnotations(classAS, instanceAnnotation); // NOTE: previously we only allowed at most one class annotation, but now we are as flexible // as possible here: any number of class annotations of any number of types can overlap. // The class label for each instance is generated from the complete list of what overlaps, // e.g. beginning of T1, beginning of another T1, continuation of T2 and end of T3 // The class labels for such combinations only get generated if an overlap actually occurs, // so if we only ever see nicely separated annotations, then we will never see the combined labels. // Labels are dynamically generated as a string of pipe-separated type names, with the flag // (beginning=B, inside=I) appended, or class "O" if outside of all types. 
// The ordering of types in the class label name must be consistent: TODO!! // NOTE: this should be one of several possible ways to do it, implemented in several // methods/classes and choosable through e.g. the "algorithmParameter" settings. // Then we could use approaches like BIO, BMEWO, BMEWO+ (see // https://lingpipe-blog.com/2009/10/14/coding-chunkers-as-taggers-io-bio-bmewo-and-bmewo/) // or the ones listed in http://cs229.stanford.edu/proj2005/KrishnanGanapathy-NamedEntityRecognition.pdf // Whenever we choose a strategy here, the strategy needs to get stored in the // model info file and re-used at application time! // NOTE: need to see if the label alphabet growing setting is handled correctly! // if there is at least one overlapping class annotation if (overlappingClassAnns.size() > 0) { // convert the set of annotation types to a list of type|code names // this should eventually be parametrizable so we can choose one of several methods // ideally we implement this as a method of one of an instance of several Seq2Class // subclasses. If it is an instance we could maybe also implement methods where we // need to remember something about the last instance for which we did it! target = seqEncoder.seqAnns2ClassLabel(overlappingClassAnns, instanceAnnotation, doc); if(target.isEmpty()) { target = SeqEncoder.CODE_OUTSIDE; } } else { //No overlapping mentions so it's an outside target = SeqEncoder.CODE_OUTSIDE; } // if debugging is enabled, we put the // the target class on the instance annotation if (DEBUG_SEQUENCE_CLASS) { instanceAnnotation.getFeatures().put("LF_sequenceClass", target); } // we now have the target label as a string, now set the target of the instance to // to the actual label // NOTE: the target alphabet for such an instance MUST be a LabelAlphabet! synchronized(labelalph) { inst.setTarget(labelalph.lookupLabel(target)); } }
Example #10
Source File: CorpusRepresentationMalletLDA.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
/** * Get a Mallet FeatureSequence Instance for the tokens in the span. * The span is what is covered by the original instance annotation. * @param from start offset * @param to end offset * @param tokenAS annotation set containing the token-like annotations * @param tokenFeatureName feature in the token-like annotations to use or empty for document text * @return mallet instance containing a feature sequence */ public Instance getInstanceFor( long from, long to, AnnotationSet tokenAS, String tokenFeatureName) { if(tokenFeatureName == null) { tokenFeatureName = ""; } Document doc = tokenAS.getDocument(); List<Annotation> tokenAnnotations = tokenAS.get(from, to).inDocumentOrder(); // System.err.println("DEBUG: getInstanceFor from="+from+", to="+to+", tokenanns="+tokenAnnotations.size()); List<String> tokenList = new ArrayList<>(); String str; for(Annotation tokenAnnotation : tokenAnnotations) { if(tokenFeatureName.isEmpty()) { str = gate.Utils.cleanStringFor(doc, tokenAnnotation); } else { str = (String)tokenAnnotation.getFeatures().get(tokenFeatureName); } if(str != null && !str.isEmpty()) { tokenList.add(str); } } TokenSequence tokenSeq = new TokenSequence(tokenList.toArray()); //System.err.println("DEBUG: tokensequence="+tokenSeq); //System.err.println("DEBUG: alphabet growStopped()="+instances.getAlphabet().growthStopped()); // NOTE: the following will create a feature sequence that contains -1 entries // for tokens which are not in the alphabet, if alphabet growth has been stopped // FeatureSequence featSeq = tokenSeq.toFeatureSequence(instances.getAlphabet()); // Instead we create the FeatureSequence ourselves FeatureSequence featSeq = new FeatureSequence(instances.getAlphabet(), tokenSeq.size()); Alphabet alph = instances.getAlphabet(); for(int i=0; i<tokenSeq.size(); i++) { int idx = alph.lookupIndex(tokenSeq.get(i).getText()); if(idx > -1) { featSeq.add(idx); } } /* System.err.println("DEBUG: fseq size="+featSeq.size()); System.err.println("DEBUG: fseq 
length="+featSeq.getLength()); System.err.println("DEBUG: fseq feats="+Arrays.toString(featSeq.getFeatures())); System.err.println("DEBUG: fseq feats="+Arrays.toString(featSeq.getFeatures())); System.err.println("DEBUG: fseq featIndexSequence="+Arrays.toString(featSeq.toFeatureIndexSequence())); */ // append the start offset to the document name, using a pipe character return new Instance(featSeq, null, doc.getName()+"|"+from, null); }
Example #11
Source File: TestPipeSerialization.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
@Test public void testPipeSerialization1() throws ResourceInstantiationException, IOException, ClassNotFoundException { String spec = "<ROOT>"+ "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE><CODEAS>number</CODEAS></ATTRIBUTE>"+ "</ROOT>"; FeatureInfo fi = new FeatureSpecification(spec).getFeatureInfo(); // Create a pipe with a data and target alphabet Pipe tmppipe = new Noop(new LFAlphabet(),new LabelAlphabet()); List<Pipe> pipes = new ArrayList<>(); pipes.add(tmppipe); LFPipe pipe = new LFPipe(pipes); pipe.setFeatureInfo(fi); // add an entry to the data alphabet pipe.getDataAlphabet().lookupIndex("feature1"); // extract an instance - this should create/update the alphabet for the number representation of the feature Document doc = newDocument(); Annotation instAnn = addAnn(doc,"",0,0,"theType",gate.Utils.featureMap("feature1","val1")); Instance inst = newInstance(); FeatureSpecAttribute attr = fi.getAttributes().get(0); // make sure the attribute is a SimpleAttribute as expected assertEquals(FeatureSpecSimpleAttribute.class, attr.getClass()); FeatureSpecSimpleAttribute sa = (FeatureSpecSimpleAttribute)attr; FeatureExtractionMalletSparse.extractFeature(inst, sa, doc.getAnnotations(), instAnn); // verify that we do have an alphabet in the attribute info assertNotNull(sa.alphabet); System.err.println("DEBUG: the alphabet we have is "+sa.alphabet); assertTrue(sa.alphabet.contains("val1")); // remember that alphabet for later Alphabet valuealphabet = sa.alphabet; // No serialize the lfpipe File tmpFile = File.createTempFile("LF_test",".pipe"); tmpFile.deleteOnExit(); try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(tmpFile))) { oos.writeObject(pipe); } LFPipe pipe2; try ( // Now read it back and check if everything is there ObjectInputStream ois = new ObjectInputStream (new FileInputStream(tmpFile))) { pipe2 = (LFPipe) ois.readObject(); } // check if the data and target alphabets match 
assertTrue(pipe2.alphabetsMatch(pipe)); // Do we have a feature info? assertNotNull(pipe2.getFeatureInfo()); // do we have attributes? assertNotNull(pipe2.getFeatureInfo().getAttributes()); // is there exactly one attribute assertEquals(1, pipe2.getFeatureInfo().getAttributes().size()); // does that attribute have an alphabet assertNotNull(((FeatureSpecSimpleAttribute)pipe2.getFeatureInfo().getAttributes().get(0)).alphabet); // is the alphabet identical to what we originally had assertEquals(valuealphabet,((FeatureSpecSimpleAttribute)pipe2.getFeatureInfo().getAttributes().get(0)).alphabet); }
Example #12
Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
/**
 * Extracts 1-, 2- and 3-grams over five consecutive "theType" annotations and checks the
 * generated feature names (derived from type and feature) and their values.
 */
@Test
public void extractNgram1() {
  String spec = "<ROOT>"+
          "<NGRAM><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>1</NUMBER></NGRAM>"+
          "<NGRAM><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>2</NUMBER></NGRAM>"+
          "<NGRAM><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>3</NUMBER></NGRAM>"+
          "</ROOT>";
  FeatureInfo featureInfo = new FeatureSpecification(spec).getFeatureInfo();
  List<FeatureSpecAttribute> specs = featureInfo.getAttributes();

  Alphabet alphabet = new LFAlphabet();
  AugmentableFeatureVector vector = new AugmentableFeatureVector(alphabet);
  Instance inst = new Instance(vector,null,null,null);

  // prepare the document: one instance annotation and tokens tok1..tok5, two chars each
  Annotation instAnn = addAnn(doc, "", 0, 20, "instanceType", gate.Utils.featureMap());
  for (int t = 0; t < 5; t++) {
    addAnn(doc,"",2*t,2*t+2,"theType",gate.Utils.featureMap("theFeature","tok"+(t+1)));
  }

  // the one-grams
  String[] unigrams = {
    "theType┆theFeature╬N1═tok1",
    "theType┆theFeature╬N1═tok2",
    "theType┆theFeature╬N1═tok3",
    "theType┆theFeature╬N1═tok4",
    "theType┆theFeature╬N1═tok5",
  };
  FeatureExtractionMalletSparse.extractFeature(inst, specs.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+specs.get(0)+" (one-grams) FV="+inst.getData());
  assertEquals(5,inst.getAlphabet().size());
  System.err.println("Alphabet N1="+inst.getAlphabet());
  for (String name : unigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  FeatureVector fv = (FeatureVector)inst.getData();
  assertEquals(5,fv.numLocations());
  for (String name : unigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // now the bigrams
  String[] bigrams = {
    "theType┆theFeature╬N2═tok1┋tok2",
    "theType┆theFeature╬N2═tok2┋tok3",
    "theType┆theFeature╬N2═tok3┋tok4",
    "theType┆theFeature╬N2═tok4┋tok5",
  };
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, specs.get(1), doc.getAnnotations(), instAnn);
  System.err.println("After "+specs.get(1)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N2="+inst.getAlphabet());
  assertEquals(4,inst.getAlphabet().size());
  for (String name : bigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  fv = (FeatureVector)inst.getData();
  assertEquals(4,fv.numLocations());
  for (String name : bigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // and the 3-grams
  String[] trigrams = {
    "theType┆theFeature╬N3═tok1┋tok2┋tok3",
    "theType┆theFeature╬N3═tok2┋tok3┋tok4",
    "theType┆theFeature╬N3═tok3┋tok4┋tok5",
  };
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, specs.get(2), doc.getAnnotations(), instAnn);
  System.err.println("After "+specs.get(2)+" (tri-grams) FV="+inst.getData());
  System.err.println("Alphabet N3="+inst.getAlphabet());
  assertEquals(3,inst.getAlphabet().size());
  for (String name : trigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  fv = (FeatureVector)inst.getData();
  assertEquals(3,fv.numLocations());
  for (String name : trigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }
}
Example #13
Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
/**
 * Essentially the same as extractNgram1, but each NGRAM specification explicitly gives
 * the NAME to use as internal feature name, so the generated feature names start with
 * that name instead of the type/feature combination.
 */
@Test
public void extractNgram2() {
  String spec = "<ROOT>"+
          "<NGRAM><NAME>ng1</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>1</NUMBER></NGRAM>"+
          "<NGRAM><NAME>ngram2</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>2</NUMBER></NGRAM>"+
          "<NGRAM><NAME>someName</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>3</NUMBER></NGRAM>"+
          "</ROOT>";
  FeatureInfo fi = new FeatureSpecification(spec).getFeatureInfo();
  List<FeatureSpecAttribute> as = fi.getAttributes();
  System.err.println("NGRAMS with explicitly specified name!!");

  Alphabet a = new LFAlphabet();
  AugmentableFeatureVector afv = new AugmentableFeatureVector(a);
  Instance inst = new Instance(afv,null,null,null);

  // prepare the document: one instance annotation and tokens tok1..tok5, two chars each
  Annotation instAnn = addAnn(doc, "", 0, 20, "instanceType", gate.Utils.featureMap());
  for (int t = 0; t < 5; t++) {
    addAnn(doc,"",2*t,2*t+2,"theType",gate.Utils.featureMap("theFeature","tok"+(t+1)));
  }

  // the one-grams
  String[] unigrams = {
    "ng1╬N1═tok1",
    "ng1╬N1═tok2",
    "ng1╬N1═tok3",
    "ng1╬N1═tok4",
    "ng1╬N1═tok5",
  };
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(0)+" (one-grams) FV="+inst.getData());
  assertEquals(5,inst.getAlphabet().size());
  System.err.println("Alphabet N1="+inst.getAlphabet());
  for (String name : unigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  FeatureVector fv = (FeatureVector)inst.getData();
  assertEquals(5,fv.numLocations());
  for (String name : unigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // now the bigrams
  String[] bigrams = {
    "ngram2╬N2═tok1┋tok2",
    "ngram2╬N2═tok2┋tok3",
    "ngram2╬N2═tok3┋tok4",
    "ngram2╬N2═tok4┋tok5",
  };
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(1), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(1)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N2="+inst.getAlphabet());
  assertEquals(4,inst.getAlphabet().size());
  for (String name : bigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  fv = (FeatureVector)inst.getData();
  assertEquals(4,fv.numLocations());
  for (String name : bigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // and the 3-grams
  String[] trigrams = {
    "someName╬N3═tok1┋tok2┋tok3",
    "someName╬N3═tok2┋tok3┋tok4",
    "someName╬N3═tok3┋tok4┋tok5",
  };
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(2), doc.getAnnotations(), instAnn);
  // label corrected: this diagnostic line reports the 3-gram extraction, not the bigrams
  System.err.println("After "+as.get(2)+" (tri-grams) FV="+inst.getData());
  System.err.println("Alphabet N3="+inst.getAlphabet());
  assertEquals(3,inst.getAlphabet().size());
  for (String name : trigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  fv = (FeatureVector)inst.getData();
  assertEquals(3,fv.numLocations());
  for (String name : trigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }
}
Example #14
Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
@Test public void extractNgram3() { // same as Ngram2 but also use featureName4Value and test the filtering if we have a null // value for the second token. String spec = "<ROOT>"+ "<NGRAM><NAME>ng1</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>1</NUMBER><FEATURENAME4VALUE>val</FEATURENAME4VALUE></NGRAM>"+ "<NGRAM><NAME>ngram2</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>2</NUMBER><FEATURENAME4VALUE>val</FEATURENAME4VALUE></NGRAM>"+ "<NGRAM><NAME>someName</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>3</NUMBER><FEATURENAME4VALUE>val</FEATURENAME4VALUE></NGRAM>"+ "</ROOT>"; FeatureInfo fi = new FeatureSpecification(spec).getFeatureInfo(); List<FeatureSpecAttribute> as = fi.getAttributes(); System.err.println("NGRAMS with explicitly specified name, filtered by featurename4value!!"); Alphabet a = new LFAlphabet(); AugmentableFeatureVector afv = new AugmentableFeatureVector(a); Instance inst = new Instance(afv,null,null,null); // prepare the document Annotation instAnn = addAnn(doc, "", 0, 20, "instanceType", gate.Utils.featureMap()); addAnn(doc,"",0,2,"theType",gate.Utils.featureMap("theFeature","tok1","val",1.0)); addAnn(doc,"",2,4,"theType",gate.Utils.featureMap("theFeature","tok2")); addAnn(doc,"",4,6,"theType",gate.Utils.featureMap("theFeature","tok3","val",1.0)); addAnn(doc,"",6,8,"theType",gate.Utils.featureMap("theFeature","tok4","val",1.0)); addAnn(doc,"",8,10,"theType",gate.Utils.featureMap("theFeature","tok5","val",1.0)); FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn); System.err.println("Ngram3: After N1 extract "+as.get(0)+" (one-grams) FV="+inst.getData()); assertEquals(4,inst.getAlphabet().size()); System.err.println("Ngram3: Alphabet N1="+inst.getAlphabet()); assertTrue(inst.getAlphabet().contains("ng1╬N1═tok1")); assertTrue(inst.getAlphabet().contains("ng1╬N1═tok3")); assertTrue(inst.getAlphabet().contains("ng1╬N1═tok4")); 
assertTrue(inst.getAlphabet().contains("ng1╬N1═tok5")); assertEquals(4,((FeatureVector)inst.getData()).numLocations()); assertEquals(1.0,((FeatureVector)inst.getData()).value("ng1╬N1═tok1"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("ng1╬N1═tok3"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("ng1╬N1═tok4"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("ng1╬N1═tok5"),EPS); // now the bigrams inst = newInstance(); FeatureExtractionMalletSparse.extractFeature(inst, as.get(1), doc.getAnnotations(), instAnn); System.err.println("Ngram3: After N2 extract "+as.get(1)+" (bi-grams) FV="+inst.getData()); System.err.println("Alphabet N2="+inst.getAlphabet()); assertEquals(2,inst.getAlphabet().size()); assertTrue(inst.getAlphabet().contains("ngram2╬N2═tok3┋tok4")); assertTrue(inst.getAlphabet().contains("ngram2╬N2═tok4┋tok5")); assertEquals(2,((FeatureVector)inst.getData()).numLocations()); assertEquals(1.0,((FeatureVector)inst.getData()).value("ngram2╬N2═tok3┋tok4"),EPS); assertEquals(1.0,((FeatureVector)inst.getData()).value("ngram2╬N2═tok4┋tok5"),EPS); // and the 3-grams inst = newInstance(); FeatureExtractionMalletSparse.extractFeature(inst, as.get(2), doc.getAnnotations(), instAnn); System.err.println("Ngram3: After N3 extract "+as.get(2)+" (bi-grams) FV="+inst.getData()); System.err.println("Alphabet N3="+inst.getAlphabet()); assertEquals(1,inst.getAlphabet().size()); assertEquals(1,((FeatureVector)inst.getData()).numLocations()); assertEquals(1.0,((FeatureVector)inst.getData()).value("someName╬N3═tok3┋tok4┋tok5"),EPS); }
Example #15
Source File: Utils.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
/**
 * Create an instance whose data is a new AugmentableFeatureVector over the given
 * alphabet; target, name and source are all null.
 *
 * @param alph alphabet for the feature vector
 * @return the new instance
 */
public static Instance newInstance(Alphabet alph) {
  AugmentableFeatureVector data = new AugmentableFeatureVector(alph);
  return new Instance(data, null, null, null);
}
Example #16
Source File: LDAModelEstimator.java From RankSys with Mozilla Public License 2.0 | 4 votes |
/**
 * Returns the data alphabet referenced by {@code alphabet}.
 *
 * @return the alphabet, exactly as stored (no copy is made)
 */
@Override
public Alphabet getDataAlphabet() {
  return alphabet;
}
Example #17
Source File: MaxEntClassifierTrainer.java From baleen with Apache License 2.0 | 4 votes |
@Override protected void execute(JobSettings settings) throws AnalysisEngineProcessException { Pipe pipe = new MaxEntClassifierPipe(labelsAndFeatures.keySet(), stopwords); InstanceList instances = new InstanceList(pipe); instances.addThruPipe(getDocumentsFromMongoWithRandonLabelAssignement()); Alphabet targetAlphabet = instances.getTargetAlphabet(); HashMap<Integer, ArrayList<Integer>> featuresAndLabels = mapFeaturesToLabels(instances.getDataAlphabet(), targetAlphabet); int numLabels = targetAlphabet.size(); HashMap<Integer, double[]> constraintsMap = FeatureConstraintUtil.setTargetsUsingHeuristic(featuresAndLabels, numLabels, 0.9); MaxEntKLFLGEConstraints geConstraints = new MaxEntKLFLGEConstraints(instances.getDataAlphabet().size(), numLabels, false); constraintsMap .entrySet() .forEach(e -> geConstraints.addConstraint(e.getKey(), e.getValue(), 1)); ArrayList<MaxEntGEConstraint> constraints = new ArrayList<>(); constraints.add(geConstraints); // Create a classifier trainer, and use it to create a classifier MaxEntGETrainer trainer = new MaxEntGETrainer(constraints); trainer.setMaxIterations(numIterations); trainer.setGaussianPriorVariance(variance); instances.forEach( i -> { i.unLock(); i.setTarget(null); i.lock(); }); Classifier classifier = trainer.train(instances); List<Classification> classify = classifier.classify(instances); writeClassificationToMongo(classify); new ObjectFile(classifier, modelFile).write(); }
Example #18
Source File: Attributes.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
/** * Generate the attributes object from the information in the pipe. * The pipe should be a LFPipe, but we also try to come up with something * if it is an ordinary pipe. * * @param pipe mallet pipe * @param instanceType instance type */ public Attributes(Pipe pipe, String instanceType) { // first create the attributes (independent vars) Alphabet dataAlphabet = pipe.getDataAlphabet(); // if we can, also represent the pipe as LFPipe LFPipe lfPipe; FeatureInfo featureInfo = null; if(pipe instanceof LFPipe) { lfPipe = (LFPipe)pipe; featureInfo = lfPipe.getFeatureInfo(); } // the alphabet we use if we have a boolean variable LFAlphabet booleanAlph = new LFAlphabet(); booleanAlph.lookupIndex("false"); booleanAlph.lookupIndex("true"); for(int i =0; i<dataAlphabet.size(); i++) { String malletFeatureName = (String) dataAlphabet.lookupObject(i); // create an attribute with default settings for datatype, code and // alphabet, if we got more information about it we will override later Attribute attr = new Attribute( malletFeatureName, i, Datatype.numeric, null, null, null); // add it attributes.add(attr); name2index.put(malletFeatureName, i); // If we have a LFPipe, also get some additional info about the type, values etc. // NOTE that the default type for features that indicate the presence of // strings, ngrams etc. (which we assume when nothing else is declared) // is numeric, so that instead of 0/1 we can have counts or tf/idf or // other scores. So only if there is an explicity declaration of a different // type, we will change the default values. 
if(featureInfo != null) { FeatureSpecAttribute fsAttr = FeatureExtractionMalletSparse.lookupAttributeForFeatureName( featureInfo.getAttributes(), malletFeatureName, instanceType); if(fsAttr instanceof FeatureSpecAttributeList) { FeatureSpecAttributeList fsAttrList = (FeatureSpecAttributeList)fsAttr; attr.codeAs = fsAttrList.codeas; attr.mvTreatment = fsAttrList.missingValueTreatment; attr.datatype = fsAttrList.datatype; if(fsAttrList.datatype == Datatype.bool) { attr.alphabet = booleanAlph; } else if(fsAttrList.datatype == Datatype.nominal) { if(fsAttrList.codeas == CodeAs.number) { attr.alphabet = fsAttrList.alphabet; } } } else if(fsAttr instanceof FeatureSpecSimpleAttribute) { FeatureSpecSimpleAttribute fsAttrSimple = (FeatureSpecSimpleAttribute)fsAttr; attr.codeAs = fsAttrSimple.codeas; attr.mvTreatment = fsAttrSimple.missingValueTreatment; attr.datatype = fsAttrSimple.datatype; if(fsAttrSimple.datatype == Datatype.bool) { attr.alphabet = booleanAlph; } else if(fsAttrSimple.datatype == Datatype.nominal) { if(fsAttrSimple.codeas == CodeAs.number) { attr.alphabet = fsAttrSimple.alphabet; } } } else if(fsAttr instanceof FeatureSpecNgram) { // nothing to do here } else if(fsAttr==null) { // This can also happen if we try to look up a START/STOP feature which // is created by us and for which not specification exists. 
In this case, // we simply do nothing and use the default attr we have created above if(malletFeatureName.endsWith(FeatureExtractionMalletSparse.START_SYMBOL) || malletFeatureName.endsWith(FeatureExtractionMalletSparse.STOP_SYMBOL)) { // do nothing } else { throw new RuntimeException("FeatureSpecification is null for feature "+ i+", name="+malletFeatureName+ "\nFeatureSpecification is "+featureInfo); } } else { throw new RuntimeException( "Impossible: found odd FeatureSpecAttribute type "+fsAttr.getClass()); } } } @SuppressWarnings("unchecked") LabelAlphabet targetAlphabet = (LabelAlphabet)pipe.getTargetAlphabet(); // if the target alphabet exists, we assume a nominal target // The target index is the next index after the last independent attribute // index. This is convenient for Weka. targetAttribute = new Attribute("target", attributes.size(), Datatype.numeric, null, null, null); if(targetAlphabet != null) { targetAttribute.alphabet = targetAlphabet; targetAttribute.datatype = Datatype.nominal; } }
Example #19
Source File: TopicModelPipe.java From baleen with Apache License 2.0 | 2 votes |
/**
 * Construct topic model pipe with given stopwords and a newly created {@link Alphabet}.
 *
 * Delegates to the two-argument constructor.
 *
 * @param stopwords to be removed
 */
public TopicModelPipe(Collection<String> stopwords) {
  this(stopwords, new Alphabet());
}