cc.mallet.types.Alphabet Java Examples

Source File: EngineMB.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

6 votes

/**
 * Refresh the {@code info} object from the current model and corpus representation.
 *
 * Stores the model class name (when a model is present), the number of training
 * instances and data dimensions, and the number of target values. When the pipe has
 * a target alphabet, the string form of every alphabet entry is stored as the class
 * labels; without a target alphabet the class labels are left untouched.
 */
protected void updateInfo() {
  if (model != null) {
    info.modelClass = model.getClass().getName();
  }
  info.nrTrainingInstances = corpusRepresentation.getRepresentationMallet().size();
  info.nrTrainingDimensions =
          corpusRepresentation.getRepresentationMallet().getDataAlphabet().size();

  Alphabet targetAlphabet = corpusRepresentation.getPipe().getTargetAlphabet();
  if (targetAlphabet == null) {
    // no target alphabet: nothing to enumerate
    info.nrTargetValues = 0;
    return;
  }

  info.nrTargetValues = targetAlphabet.size();
  ArrayList<String> classLabels = new ArrayList<>();
  for (Object entry : targetAlphabet.toArray()) {
    classLabels.add(entry.toString());
  }
  info.classLabels = classLabels;
}

Source File: PipeScaleMinMaxAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

6 votes

/**
 * Constructor from alphabet and feature stats.
 *
 * For every feature that has values and is not binary, the minimum and maximum from
 * the stats are recorded and the feature is flagged for normalization. All other
 * features keep the array defaults (0.0) and are flagged as not to be normalized.
 *
 * @param alphabet alphabet
 * @param stats feature stats
 */
public PipeScaleMinMaxAll(Alphabet alphabet, FVStatsMeanVarAll stats) {
  super(alphabet, null);
  List<PerFeatureStats> pfss = stats.getStats();
  int n = pfss.size();
  min = new double[n];
  max = new double[n];
  normalize = new boolean[n];
  for(int i=0; i<n; i++) {
    PerFeatureStats pfs = pfss.get(i);
    // we do not normalize binary features and we do not normalize features with no
    // values at all
    if(pfs.binary != null && pfs.binary != true) {
      min[i] = pfs.min;
      max[i] = pfs.max;
      // the flag must be set here, otherwise it stays false for every feature
      // (same handling as in PipeScaleMeanVarAll)
      normalize[i] = true;
    } else {
      normalize[i] = false;
    }
  }
  System.err.println("DEBUG: Creating PipeScaleMinMaxAll instance with mins="+Arrays.toString(min)+",maxs="+Arrays.toString(max));
}

Source File: PipeScaleMeanVarAll.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

6 votes

/**
 * Constructor from alphabet and stats.
 *
 * Features that have values and are not binary get their mean and variance copied
 * from the stats and are flagged for normalization; every other feature gets NaN
 * for both mean and variance and is flagged as not to be normalized.
 *
 * @param alphabet alphabet
 * @param stats feature stats
 */
public PipeScaleMeanVarAll(Alphabet alphabet, FVStatsMeanVarAll stats) {
  super(alphabet, null);
  List<PerFeatureStats> allStats = stats.getStats();
  int nrFeatures = allStats.size();
  means = new double[nrFeatures];
  variances = new double[nrFeatures];
  normalize = new boolean[nrFeatures];
  for (int idx = 0; idx < nrFeatures; idx++) {
    PerFeatureStats featureStats = allStats.get(idx);
    // binary features and features without any values are left alone
    boolean scalable = featureStats.binary != null && !featureStats.binary;
    normalize[idx] = scalable;
    means[idx] = scalable ? featureStats.mean : Double.NaN;
    variances[idx] = scalable ? featureStats.var : Double.NaN;
  }
}

Source File: MaxEntClassifierTrainer.java From baleen with Apache License 2.0

6 votes

/**
 * Invert the label-to-features mapping held in {@code labelsAndFeatures} into a
 * mapping from feature index to the indices of all labels listing that feature.
 *
 * Indices are obtained with {@code lookupIndex} on the respective alphabet. A label
 * index is appended once per (label, feature) pair encountered, so duplicates in the
 * input are preserved in the output lists.
 *
 * @param dataAlphabet alphabet used to look up feature indices
 * @param targetAlphabet alphabet used to look up label indices
 * @return map from feature index to the list of label indices for that feature
 */
private HashMap<Integer, ArrayList<Integer>> mapFeaturesToLabels(
    Alphabet dataAlphabet, Alphabet targetAlphabet) {

  HashMap<Integer, ArrayList<Integer>> featuresAndLabels = new HashMap<>();

  labelsAndFeatures.forEach(
      (k, v) -> {
        Integer label = targetAlphabet.lookupIndex(k);
        v.forEach(
            f -> {
              Integer feature = dataAlphabet.lookupIndex(f);
              // create the per-feature list on first sight, then append the label
              featuresAndLabels.computeIfAbsent(feature, key -> new ArrayList<>()).add(label);
            });
      });

  return featuresAndLabels;
}

Source File: EngineMBPythonNetworksBase.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Determine the "mode" of the learning problem and its detail parameter.
 *
 * The mode is one of:
 * <ul>
 * <li>"classind": predict the index of a class</li>
 * <li>"classcosts": targets are vectors of class costs</li>
 * <li>"regr": regression</li>
 * </ul>
 * The detail parameter is the number of classes for "classind" and "classcosts",
 * 0 as a dummy value for "regr", and -1 when there is a target alphabet but no
 * instance is available to inspect.
 *
 * @param crm corpus representation to inspect
 * @return entry of mode name and number of classes
 */
protected AbstractMap.SimpleEntry<String,Integer> findOutMode(CorpusRepresentationMalletTarget crm)  {
  InstanceList instances = crm.getRepresentationMallet();
  Alphabet targetAlphabet = crm.getPipe().getTargetAlphabet();

  // without a target alphabet this is a regression problem
  if (targetAlphabet == null) {
    return new AbstractMap.SimpleEntry<>("regr", 0);
  }

  // For training we should have a first instance, but at application time there are
  // no instances yet. In that case we return dummy values, since this information is
  // currently not needed at application time. Should it ever be needed, it has to be
  // stored in the pipe!
  if (instances == null || instances.isEmpty()) {
    return new AbstractMap.SimpleEntry<>("classind", -1);
  }

  // the kind of target on the first instance decides between costs and plain indices
  Object firstTarget = instances.get(0).getTarget();
  if (firstTarget instanceof NominalTargetWithCosts) {
    int nrCosts = ((NominalTargetWithCosts) firstTarget).getCosts().length;
    return new AbstractMap.SimpleEntry<>("classcosts", nrCosts);
  }
  return new AbstractMap.SimpleEntry<>("classind", targetAlphabet.size());
}

Source File: TopicModelPipe.java From baleen with Apache License 2.0

5 votes

/**
 * Construct topic model pipe with given stopwords and alphabet.
 *
 * The pipe is a fixed sequence of four stages handed to the superclass constructor,
 * in this order: {@code CharSequenceLowercase}, {@code CharSequence2TokenSequence},
 * {@code RemoveStopwords} and {@code TokenSequence2FeatureSequence}.
 *
 * @param stopwords to be removed
 * @param alphabet to use
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
  // @formatter:off
  super(
      ImmutableList.of(
          new CharSequenceLowercase(),
          // The pattern matches a letter, then one or more letters or punctuation
          // characters, then a letter - so a match is at least three characters long
          // and starts and ends with a letter.
          // NOTE(review): presumably each match becomes one token - confirm against
          // CharSequence2TokenSequence.
          new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
          new RemoveStopwords(stopwords),
          new TokenSequence2FeatureSequence(alphabet)));
  // @formatter:on
}

Source File: CorpusExporterMRARFF.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Convert alphabet to ARFF declaration string.
 *
 * The result is the comma-separated list of the escaped string forms of all alphabet
 * entries, in index order, enclosed in curly braces.
 *
 * @param alph Mallet alphabet
 * @param mvt missing value treatment setting, currently not used by this method
 * @return ARFF declaration
 */
public String alphabet2Arff(Alphabet alph, MissingValueTreatment mvt) {
  // NOTE: mvt can be null, if this is used for a target!!
  StringBuilder declaration = new StringBuilder("{");
  String separator = "";
  for (int idx = 0; idx < alph.size(); idx++) {
    String entry = alph.lookupObject(idx).toString();
    declaration.append(separator).append(escape4Arff(entry));
    separator = ",";
  }
  // TODO: we may need to add the definition for the missing value here,
  // but by default, we do not do that.
  return declaration.append("}").toString();
}

Source File: CorpusRepresentationMallet.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Return the string forms of all entries of the pipe's target alphabet.
 *
 * @return a new list of label strings, empty if the pipe has no target alphabet
 */
@Override
public List<String> getLabelList() {
  List<String> labels = new ArrayList<>();
  Alphabet targetAlphabet = pipe.getTargetAlphabet();
  if (targetAlphabet == null) {
    return labels;
  }
  for (Object entry : targetAlphabet.toArray()) {
    labels.add(entry.toString());
  }
  return labels;
}

Source File: FeatureExtractionMalletSparse.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Extract the class for an instance for sequence tagging.
 *
 * The class label is derived from the class annotations that overlap the instance
 * annotation: the sequence encoder turns the overlapping annotations into a label
 * string. If nothing overlaps, or the encoder returns an empty string, the instance
 * gets the "outside" code. The resulting label is looked up in the label alphabet
 * and set as the target of the instance.
 *
 * @param inst instance
 * @param alph the label alphabet to use, must be an instance of LabelAlphabet
 * @param classAS class annotation set
 * @param instanceAnnotation  the instance annotation, e.g. "Token".
 * @param seqEncoder sequence encoder instance
 */
public static void extractClassForSeqTagging(Instance inst, Alphabet alph, AnnotationSet classAS, Annotation instanceAnnotation, SeqEncoder seqEncoder) {
  Document doc = classAS.getDocument();
  if (!(alph instanceof LabelAlphabet)) {
    throw new GateRuntimeException("LF extractClassForSeqTagging: the alphabet must be of type LabelAlphabet"
            + " for instance annotation at offset " + gate.Utils.start(instanceAnnotation)
            + " in document " + doc.getName());
  }
  LabelAlphabet labelAlphabet = (LabelAlphabet) alph;
  AnnotationSet overlapping = Utils.getOverlappingAnnotations(classAS, instanceAnnotation);

  // NOTE: any number of class annotations of any number of types may overlap the
  // instance. The label is generated from the complete list of what overlaps, so
  // combined labels only ever appear if such an overlap actually occurs in the data.
  // The ordering of types in the class label name must be consistent: TODO!!
  // NOTE: this should become one of several selectable strategies (BIO, BMEWO,
  // BMEWO+, ...), see
  // https://lingpipe-blog.com/2009/10/14/coding-chunkers-as-taggers-io-bio-bmewo-and-bmewo/
  // and http://cs229.stanford.edu/proj2005/KrishnanGanapathy-NamedEntityRecognition.pdf
  // Whichever strategy is chosen needs to get stored in the model info file and
  // re-used at application time!
  // NOTE: need to see if the label alphabet growing setting is handled correctly!

  // start out as "outside" and only replace that if the encoder yields a label
  String target = SeqEncoder.CODE_OUTSIDE;
  if (overlapping.size() > 0) {
    String encoded = seqEncoder.seqAnns2ClassLabel(overlapping, instanceAnnotation, doc);
    if (!encoded.isEmpty()) {
      target = encoded;
    }
  }

  // when debugging is enabled, record the target class on the instance annotation
  if (DEBUG_SEQUENCE_CLASS) {
    instanceAnnotation.getFeatures().put("LF_sequenceClass", target);
  }

  // turn the label string into the actual label and set it as the instance target
  // NOTE: the target alphabet for such an instance MUST be a LabelAlphabet!
  synchronized (labelAlphabet) {
    inst.setTarget(labelAlphabet.lookupLabel(target));
  }
}

Source File: CorpusRepresentationMalletLDA.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Get a Mallet FeatureSequence Instance for the tokens in the span.
 * The span is what is covered by the original instance annotation.
 *
 * For each token annotation in the span, in document order, a string is taken either
 * from the document text or from the given feature; null or empty strings are skipped.
 * Only strings for which the alphabet lookup yields a non-negative index end up in
 * the feature sequence. The instance name is the document name, a pipe character and
 * the start offset.
 *
 * @param from start offset
 * @param to end offset
 * @param tokenAS  annotation set containing the token-like annotations
 * @param tokenFeatureName feature in the token-like annotations to use, or null/empty for document text
 * @return  mallet instance containing a feature sequence
 */
public Instance getInstanceFor(
        long from,
        long to,
        AnnotationSet tokenAS,
        String tokenFeatureName) {

  // null and empty feature name both mean: use the document text
  boolean useDocumentText = tokenFeatureName == null || tokenFeatureName.isEmpty();
  Document doc = tokenAS.getDocument();

  List<String> tokenStrings = new ArrayList<>();
  for (Annotation tokenAnn : tokenAS.get(from, to).inDocumentOrder()) {
    String tokenString = useDocumentText
            ? gate.Utils.cleanStringFor(doc, tokenAnn)
            : (String) tokenAnn.getFeatures().get(tokenFeatureName);
    if (tokenString == null || tokenString.isEmpty()) {
      continue;
    }
    tokenStrings.add(tokenString);
  }
  TokenSequence tokenSeq = new TokenSequence(tokenStrings.toArray());

  // NOTE: tokenSeq.toFeatureSequence(alphabet) would create a feature sequence that
  // contains -1 entries for tokens which are not in the alphabet if alphabet growth
  // has been stopped, so the feature sequence is built here and such tokens are dropped.
  Alphabet alph = instances.getAlphabet();
  FeatureSequence featSeq = new FeatureSequence(alph, tokenSeq.size());
  for (int pos = 0; pos < tokenSeq.size(); pos++) {
    int featureIndex = alph.lookupIndex(tokenSeq.get(pos).getText());
    if (featureIndex >= 0) {
      featSeq.add(featureIndex);
    }
  }

  // append the start offset to the document name, using a pipe character
  String instanceName = doc.getName() + "|" + from;
  return new Instance(featSeq, null, instanceName, null);

}

Source File: TestPipeSerialization.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Round-trip an LFPipe through Java serialization and check that the data/target
 * alphabets, the feature info, and the value alphabet of the single attribute are
 * all still there afterwards.
 */
@Test
public void testPipeSerialization1() throws ResourceInstantiationException, IOException, ClassNotFoundException {
  String spec = "<ROOT>"+
          "<ATTRIBUTE><TYPE>theType</TYPE><FEATURE>feature1</FEATURE><DATATYPE>nominal</DATATYPE><CODEAS>number</CODEAS></ATTRIBUTE>"+
          "</ROOT>";
  FeatureInfo fi = new FeatureSpecification(spec).getFeatureInfo();

  // build an LFPipe around a no-op pipe that carries a data and a target alphabet
  List<Pipe> pipes = new ArrayList<>();
  pipes.add(new Noop(new LFAlphabet(),new LabelAlphabet()));
  LFPipe pipe = new LFPipe(pipes);
  pipe.setFeatureInfo(fi);

  // add an entry to the data alphabet
  pipe.getDataAlphabet().lookupIndex("feature1");

  // extracting the feature for an instance should create/update the alphabet that is
  // used for the number representation of the feature
  Document doc = newDocument();
  Annotation instAnn = addAnn(doc,"",0,0,"theType",gate.Utils.featureMap("feature1","val1"));
  Instance inst = newInstance();
  FeatureSpecAttribute attr = fi.getAttributes().get(0);
  // the attribute is expected to be a simple attribute
  assertEquals(FeatureSpecSimpleAttribute.class, attr.getClass());
  FeatureSpecSimpleAttribute sa = (FeatureSpecSimpleAttribute)attr;
  FeatureExtractionMalletSparse.extractFeature(inst, sa, doc.getAnnotations(), instAnn);

  // the attribute info must now hold an alphabet that knows the value
  assertNotNull(sa.alphabet);
  System.err.println("DEBUG: the alphabet we have is "+sa.alphabet);
  assertTrue(sa.alphabet.contains("val1"));
  // keep it for the comparison after deserialization
  Alphabet valuealphabet = sa.alphabet;

  // Now serialize the pipe to a temporary file ...
  File tmpFile = File.createTempFile("LF_test",".pipe");
  tmpFile.deleteOnExit();
  try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(tmpFile))) {
    oos.writeObject(pipe);
  }
  // ... and read it back
  LFPipe restored;
  try (ObjectInputStream ois = new ObjectInputStream (new FileInputStream(tmpFile))) {
    restored = (LFPipe) ois.readObject();
  }

  // data and target alphabets must match the original pipe
  assertTrue(restored.alphabetsMatch(pipe));
  // feature info with exactly one attribute must be present
  assertNotNull(restored.getFeatureInfo());
  assertNotNull(restored.getFeatureInfo().getAttributes());
  assertEquals(1, restored.getFeatureInfo().getAttributes().size());
  // that attribute must have an alphabet equal to the one we had originally
  FeatureSpecSimpleAttribute restoredAttr =
          (FeatureSpecSimpleAttribute)restored.getFeatureInfo().getAttributes().get(0);
  assertNotNull(restoredAttr.alphabet);
  assertEquals(valuealphabet,restoredAttr.alphabet);
}

Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Extract 1-, 2- and 3-grams over five consecutive token annotations, using
 * NGRAM specifications without an explicit name, and check the generated feature
 * names and values.
 */
@Test
public void extractNgram1() {
  String spec = "<ROOT>"+
          "<NGRAM><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>1</NUMBER></NGRAM>"+
          "<NGRAM><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>2</NUMBER></NGRAM>"+
          "<NGRAM><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>3</NUMBER></NGRAM>"+
          "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();

  // expected feature names per n-gram size
  String[] unigrams = {
    "theType┆theFeature╬N1═tok1",
    "theType┆theFeature╬N1═tok2",
    "theType┆theFeature╬N1═tok3",
    "theType┆theFeature╬N1═tok4",
    "theType┆theFeature╬N1═tok5"
  };
  String[] bigrams = {
    "theType┆theFeature╬N2═tok1┋tok2",
    "theType┆theFeature╬N2═tok2┋tok3",
    "theType┆theFeature╬N2═tok3┋tok4",
    "theType┆theFeature╬N2═tok4┋tok5"
  };
  String[] trigrams = {
    "theType┆theFeature╬N3═tok1┋tok2┋tok3",
    "theType┆theFeature╬N3═tok2┋tok3┋tok4",
    "theType┆theFeature╬N3═tok3┋tok4┋tok5"
  };

  Instance inst = new Instance(new AugmentableFeatureVector(new LFAlphabet()),null,null,null);

  // prepare the document: one instance annotation covering five token annotations
  Annotation instAnn = addAnn(doc, "", 0, 20, "instanceType", gate.Utils.featureMap());
  addAnn(doc,"",0,2,"theType",gate.Utils.featureMap("theFeature","tok1"));
  addAnn(doc,"",2,4,"theType",gate.Utils.featureMap("theFeature","tok2"));
  addAnn(doc,"",4,6,"theType",gate.Utils.featureMap("theFeature","tok3"));
  addAnn(doc,"",6,8,"theType",gate.Utils.featureMap("theFeature","tok4"));
  addAnn(doc,"",8,10,"theType",gate.Utils.featureMap("theFeature","tok5"));

  // the one-grams
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(0)+" (one-grams) FV="+inst.getData());
  assertEquals(5,inst.getAlphabet().size());
  System.err.println("Alphabet N1="+inst.getAlphabet());
  for (String name : unigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  FeatureVector fv = (FeatureVector)inst.getData();
  assertEquals(5,fv.numLocations());
  for (String name : unigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // now the bigrams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(1), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(1)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N2="+inst.getAlphabet());
  assertEquals(4,inst.getAlphabet().size());
  for (String name : bigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  fv = (FeatureVector)inst.getData();
  assertEquals(4,fv.numLocations());
  for (String name : bigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // and the 3-grams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(2), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(2)+" (tri-grams) FV="+inst.getData());
  System.err.println("Alphabet N3="+inst.getAlphabet());
  assertEquals(3,inst.getAlphabet().size());
  for (String name : trigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  fv = (FeatureVector)inst.getData();
  assertEquals(3,fv.numLocations());
  for (String name : trigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }
}

Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Essentially the same as extractNgram1, but each NGRAM specification explicitly
 * names the internal feature name to use, and the generated feature names are
 * expected to start with that name.
 */
@Test
public void extractNgram2() {
  String spec = "<ROOT>"+
          "<NGRAM><NAME>ng1</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>1</NUMBER></NGRAM>"+
          "<NGRAM><NAME>ngram2</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>2</NUMBER></NGRAM>"+
          "<NGRAM><NAME>someName</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>3</NUMBER></NGRAM>"+
          "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  System.err.println("NGRAMS with explicitly specified name!!");

  // expected feature names per n-gram size
  String[] unigrams = {
    "ng1╬N1═tok1",
    "ng1╬N1═tok2",
    "ng1╬N1═tok3",
    "ng1╬N1═tok4",
    "ng1╬N1═tok5"
  };
  String[] bigrams = {
    "ngram2╬N2═tok1┋tok2",
    "ngram2╬N2═tok2┋tok3",
    "ngram2╬N2═tok3┋tok4",
    "ngram2╬N2═tok4┋tok5"
  };
  String[] trigrams = {
    "someName╬N3═tok1┋tok2┋tok3",
    "someName╬N3═tok2┋tok3┋tok4",
    "someName╬N3═tok3┋tok4┋tok5"
  };

  Instance inst = new Instance(new AugmentableFeatureVector(new LFAlphabet()),null,null,null);

  // prepare the document: one instance annotation covering five token annotations
  Annotation instAnn = addAnn(doc, "", 0, 20, "instanceType", gate.Utils.featureMap());
  addAnn(doc,"",0,2,"theType",gate.Utils.featureMap("theFeature","tok1"));
  addAnn(doc,"",2,4,"theType",gate.Utils.featureMap("theFeature","tok2"));
  addAnn(doc,"",4,6,"theType",gate.Utils.featureMap("theFeature","tok3"));
  addAnn(doc,"",6,8,"theType",gate.Utils.featureMap("theFeature","tok4"));
  addAnn(doc,"",8,10,"theType",gate.Utils.featureMap("theFeature","tok5"));

  // the one-grams
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(0)+" (one-grams) FV="+inst.getData());
  assertEquals(5,inst.getAlphabet().size());
  System.err.println("Alphabet N1="+inst.getAlphabet());
  for (String name : unigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  FeatureVector fv = (FeatureVector)inst.getData();
  assertEquals(5,fv.numLocations());
  for (String name : unigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // now the bigrams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(1), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(1)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N2="+inst.getAlphabet());
  assertEquals(4,inst.getAlphabet().size());
  for (String name : bigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  fv = (FeatureVector)inst.getData();
  assertEquals(4,fv.numLocations());
  for (String name : bigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // and the 3-grams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(2), doc.getAnnotations(), instAnn);
  System.err.println("After "+as.get(2)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N3="+inst.getAlphabet());
  assertEquals(3,inst.getAlphabet().size());
  for (String name : trigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  fv = (FeatureVector)inst.getData();
  assertEquals(3,fv.numLocations());
  for (String name : trigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }
}

Source File: TestFeatureExtraction.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Same as extractNgram2, but the specifications also use FEATURENAME4VALUE. The
 * second token annotation has no "val" feature, and the test expects that no
 * n-gram containing that token is generated.
 */
@Test
public void extractNgram3() {
  String spec = "<ROOT>"+
          "<NGRAM><NAME>ng1</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>1</NUMBER><FEATURENAME4VALUE>val</FEATURENAME4VALUE></NGRAM>"+
          "<NGRAM><NAME>ngram2</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>2</NUMBER><FEATURENAME4VALUE>val</FEATURENAME4VALUE></NGRAM>"+
          "<NGRAM><NAME>someName</NAME><TYPE>theType</TYPE><FEATURE>theFeature</FEATURE><NUMBER>3</NUMBER><FEATURENAME4VALUE>val</FEATURENAME4VALUE></NGRAM>"+
          "</ROOT>";
  List<FeatureSpecAttribute> as = new FeatureSpecification(spec).getFeatureInfo().getAttributes();
  System.err.println("NGRAMS with explicitly specified name, filtered by featurename4value!!");

  // expected feature names per n-gram size; nothing involving tok2 is expected
  String[] unigrams = {
    "ng1╬N1═tok1",
    "ng1╬N1═tok3",
    "ng1╬N1═tok4",
    "ng1╬N1═tok5"
  };
  String[] bigrams = {
    "ngram2╬N2═tok3┋tok4",
    "ngram2╬N2═tok4┋tok5"
  };

  Instance inst = new Instance(new AugmentableFeatureVector(new LFAlphabet()),null,null,null);

  // prepare the document: the second token deliberately has no "val" feature
  Annotation instAnn = addAnn(doc, "", 0, 20, "instanceType", gate.Utils.featureMap());
  addAnn(doc,"",0,2,"theType",gate.Utils.featureMap("theFeature","tok1","val",1.0));
  addAnn(doc,"",2,4,"theType",gate.Utils.featureMap("theFeature","tok2"));
  addAnn(doc,"",4,6,"theType",gate.Utils.featureMap("theFeature","tok3","val",1.0));
  addAnn(doc,"",6,8,"theType",gate.Utils.featureMap("theFeature","tok4","val",1.0));
  addAnn(doc,"",8,10,"theType",gate.Utils.featureMap("theFeature","tok5","val",1.0));

  // the one-grams
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(0), doc.getAnnotations(), instAnn);
  System.err.println("Ngram3: After N1 extract "+as.get(0)+" (one-grams) FV="+inst.getData());
  assertEquals(4,inst.getAlphabet().size());
  System.err.println("Ngram3: Alphabet N1="+inst.getAlphabet());
  for (String name : unigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  FeatureVector fv = (FeatureVector)inst.getData();
  assertEquals(4,fv.numLocations());
  for (String name : unigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // now the bigrams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(1), doc.getAnnotations(), instAnn);
  System.err.println("Ngram3: After N2 extract "+as.get(1)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N2="+inst.getAlphabet());
  assertEquals(2,inst.getAlphabet().size());
  for (String name : bigrams) {
    assertTrue(inst.getAlphabet().contains(name));
  }
  fv = (FeatureVector)inst.getData();
  assertEquals(2,fv.numLocations());
  for (String name : bigrams) {
    assertEquals(1.0,fv.value(name),EPS);
  }

  // and the 3-grams
  inst = newInstance();
  FeatureExtractionMalletSparse.extractFeature(inst, as.get(2), doc.getAnnotations(), instAnn);
  System.err.println("Ngram3: After N3 extract "+as.get(2)+" (bi-grams) FV="+inst.getData());
  System.err.println("Alphabet N3="+inst.getAlphabet());
  assertEquals(1,inst.getAlphabet().size());
  fv = (FeatureVector)inst.getData();
  assertEquals(1,fv.numLocations());
  assertEquals(1.0,fv.value("someName╬N3═tok3┋tok4┋tok5"),EPS);
}

Source File: Utils.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Create a new instance whose data is a fresh AugmentableFeatureVector over the
 * given alphabet; target, name and source are all null.
 *
 * @param alph alphabet for the feature vector
 * @return the new instance
 */
public static Instance newInstance(Alphabet alph) {
  AugmentableFeatureVector emptyVector = new AugmentableFeatureVector(alph);
  return new Instance(emptyVector, null, null, null);
}

Source File: LDAModelEstimator.java From RankSys with Mozilla Public License 2.0

4 votes

/**
 * Returns the data alphabet held by this object.
 *
 * @return the {@code alphabet} field
 */
@Override
public Alphabet getDataAlphabet() {
    return this.alphabet;
}

Source File: MaxEntClassifierTrainer.java From baleen with Apache License 2.0

4 votes

/**
 * Train a maximum entropy classifier with generalized expectation constraints and
 * persist the results.
 *
 * Steps: load the documents through the classifier pipe, derive per-feature label
 * constraints from {@code labelsAndFeatures}, clear the targets of all instances,
 * train, classify the same instances, write the classifications to Mongo and write
 * the classifier to {@code modelFile}.
 *
 * @param settings job settings, not read by this method
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
protected void execute(JobSettings settings) throws AnalysisEngineProcessException {

  InstanceList instances =
      new InstanceList(new MaxEntClassifierPipe(labelsAndFeatures.keySet(), stopwords));
  instances.addThruPipe(getDocumentsFromMongoWithRandonLabelAssignement());

  Alphabet dataAlphabet = instances.getDataAlphabet();
  Alphabet targetAlphabet = instances.getTargetAlphabet();
  HashMap<Integer, ArrayList<Integer>> featuresAndLabels =
      mapFeaturesToLabels(dataAlphabet, targetAlphabet);

  // sizes are read only after the mapping above has done its alphabet lookups
  int numLabels = targetAlphabet.size();
  HashMap<Integer, double[]> constraintsMap =
      FeatureConstraintUtil.setTargetsUsingHeuristic(featuresAndLabels, numLabels, 0.9);

  MaxEntKLFLGEConstraints geConstraints =
      new MaxEntKLFLGEConstraints(dataAlphabet.size(), numLabels, false);
  constraintsMap.forEach(
      (featureIndex, targets) -> geConstraints.addConstraint(featureIndex, targets, 1));
  ArrayList<MaxEntGEConstraint> constraints = new ArrayList<>();
  constraints.add(geConstraints);

  // Create a classifier trainer, and use it to create a classifier
  MaxEntGETrainer trainer = new MaxEntGETrainer(constraints);
  trainer.setMaxIterations(numIterations);
  trainer.setGaussianPriorVariance(variance);

  // drop the targets assigned while loading before training on the instances
  instances.forEach(
      instance -> {
        instance.unLock();
        instance.setTarget(null);
        instance.lock();
      });

  Classifier classifier = trainer.train(instances);

  List<Classification> classifications = classifier.classify(instances);
  writeClassificationToMongo(classifications);

  new ObjectFile(classifier, modelFile).write();
}

Source File: Attributes.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Build the attributes object from the information held by the pipe.
 * Ideally the pipe is a LFPipe, which gives access to the feature info; for
 * any other pipe every attribute keeps its default settings.
 * 
 * @param pipe  mallet pipe
 * @param instanceType instance type
 */
public Attributes(Pipe pipe, String instanceType) {
  // The independent variables are the entries of the data alphabet
  Alphabet dataAlphabet = pipe.getDataAlphabet();
  // Feature info is only available if the pipe really is a LFPipe
  FeatureInfo featureInfo =
          (pipe instanceof LFPipe) ? ((LFPipe) pipe).getFeatureInfo() : null;
  // One shared alphabet for all attributes of boolean type
  LFAlphabet booleanAlph = new LFAlphabet();
  booleanAlph.lookupIndex("false");
  booleanAlph.lookupIndex("true");
  for(int idx = 0; idx < dataAlphabet.size(); idx++) {
    String featureName = (String) dataAlphabet.lookupObject(idx);
    // Start out with the defaults for datatype, code and alphabet; these get
    // overridden below if the feature specification says something different
    Attribute attribute = new Attribute(
            featureName, idx, Datatype.numeric, null, null, null);
    attributes.add(attribute);
    name2index.put(featureName, idx);
    // Without feature info there is nothing more we can find out
    if(featureInfo == null) {
      continue;
    }
    // NOTE: the default type for features that indicate the presence of
    // strings, ngrams etc. (assumed when nothing else is declared) is
    // numeric, so that instead of 0/1 we can have counts, tf/idf or other
    // scores. Only an explicit declaration of a different type changes
    // the defaults.
    FeatureSpecAttribute spec =
            FeatureExtractionMalletSparse.lookupAttributeForFeatureName(
              featureInfo.getAttributes(),
              featureName,
              instanceType);
    if(spec == null) {
      // This can happen for a START/STOP feature, which is created by us and
      // for which no specification exists: keep the default attribute then.
      // Any other feature without a specification is an error.
      boolean startOrStop =
              featureName.endsWith(FeatureExtractionMalletSparse.START_SYMBOL) ||
              featureName.endsWith(FeatureExtractionMalletSparse.STOP_SYMBOL);
      if(!startOrStop) {
        throw new RuntimeException("FeatureSpecification is null for feature "+
              idx+", name="+featureName+
              "\nFeatureSpecification is "+featureInfo);
      }
      continue;
    }
    if(spec instanceof FeatureSpecAttributeList) {
      FeatureSpecAttributeList listSpec = (FeatureSpecAttributeList) spec;
      attribute.codeAs = listSpec.codeas;
      attribute.mvTreatment = listSpec.missingValueTreatment;
      attribute.datatype = listSpec.datatype;
      if(listSpec.datatype == Datatype.bool) {
        attribute.alphabet = booleanAlph;
      } else if(listSpec.datatype == Datatype.nominal
              && listSpec.codeas == CodeAs.number) {
        // the spec's own alphabet is only taken over for nominal values coded as number
        attribute.alphabet = listSpec.alphabet;
      }
    } else if(spec instanceof FeatureSpecSimpleAttribute) {
      FeatureSpecSimpleAttribute simpleSpec = (FeatureSpecSimpleAttribute) spec;
      attribute.codeAs = simpleSpec.codeas;
      attribute.mvTreatment = simpleSpec.missingValueTreatment;
      attribute.datatype = simpleSpec.datatype;
      if(simpleSpec.datatype == Datatype.bool) {
        attribute.alphabet = booleanAlph;
      } else if(simpleSpec.datatype == Datatype.nominal
              && simpleSpec.codeas == CodeAs.number) {
        // the spec's own alphabet is only taken over for nominal values coded as number
        attribute.alphabet = simpleSpec.alphabet;
      }
    } else if(!(spec instanceof FeatureSpecNgram)) {
      // ngram specifications need no changes to the defaults; anything else is unexpected
      throw new RuntimeException(
              "Impossible: found odd FeatureSpecAttribute type "+spec.getClass());
    }
  }
  @SuppressWarnings("unchecked")
  LabelAlphabet labelAlphabet = (LabelAlphabet) pipe.getTargetAlphabet();
  // The target gets the next index after the last independent attribute
  // index, which is convenient for Weka. It is numeric by default ...
  targetAttribute = new Attribute("target", attributes.size(), Datatype.numeric, null, null, null);
  // ... but if a target alphabet exists, we assume a nominal target
  if(labelAlphabet != null) {
    targetAttribute.alphabet = labelAlphabet;
    targetAttribute.datatype = Datatype.nominal;
  }
}

Source File: TopicModelPipe.java From baleen with Apache License 2.0

2 votes

/**
 * Construct a topic model pipe with the given stopwords.
 *
 * <p>Delegates to the two-argument constructor, passing a newly created {@link Alphabet}.
 *
 * @param stopwords the stopwords to be removed
 */
public TopicModelPipe(Collection<String> stopwords) {
  this(stopwords, new Alphabet());
}

cc.mallet.types.Alphabet Java Examples