cc.mallet.pipe.Pipe Java Examples

Source File: CorpusRepresentationMalletTarget.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

6 votes

/**
 * Build a Mallet instance holding the independent features of one instance annotation.
 *
 * Every attribute listed in the featureInfo object is extracted into a fresh
 * instance whose data starts out as an AugmentableFeatureVector over the data
 * alphabet of the given pipe. Before returning, the data is converted to a plain
 * FeatureVector. Target, name and source of the returned instance are null.
 *
 * NOTE: this method is static so that it can be used in the CorpusRepresentationMalletSeq class too.
 *
 * @param instanceAnnotation instance annotation
 * @param inputAS input annotation set
 * @param featureInfo feature info instance
 * @param pipe mallet pipe
 * @return Instance
 */
static Instance extractIndependentFeaturesHelper(
        Annotation instanceAnnotation,
        AnnotationSet inputAS,
        FeatureInfo featureInfo,
        Pipe pipe) {

  // Instance constructor arguments are: data, target, name, source
  Instance result = new Instance(
          new AugmentableFeatureVector(pipe.getDataAlphabet()), null, null, null);

  for (FeatureSpecAttribute featureSpec : featureInfo.getAttributes()) {
    FeatureExtractionMalletSparse.extractFeature(result, featureSpec, inputAS, instanceAnnotation);
  }

  // TODO: the augmentable vector is destructively swapped for a FeatureVector;
  // it is not clear if this is beneficial - the assumption is that it is.
  AugmentableFeatureVector collected = (AugmentableFeatureVector) result.getData();
  result.setData(collected.toFeatureVector());
  return result;
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

6 votes

/**
 * Appends the brain-region lexicon pipes, followed by the n-gram pipes derived
 * from the "AllRegions" lexicon, and records the group name "BrainRegions".
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipes are appended to
 * @param ignoreCase passed through to every lexicon pipe
 */
public static void addBrainRegionLexicons(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException,
        Exception {
    usedPipeNames.add("BrainRegions");

    // { feature name, lexicon file under LEXICON_HOME }
    final String[][] regionLexicons = {
            { "NNHu", "NN2002Human.txt" },
            { "NNMouseRat", "NN2007RatMouse.txt" },
            { "Allen", "Allen.txt" },
            { "BAMS", "BAMS.txt" },
            { "AllRegions", "AllRegions.txt" } };

    for (String[] entry : regionLexicons) {
        File lexiconFile = new File(LEXICON_HOME + entry[1]);
        pipes.add(new TrieLexiconMembership(entry[0], lexiconFile, ignoreCase));
    }

    File allRegions = new File(LEXICON_HOME + "AllRegions.txt");
    pipes.addAll(NGramPipeFactory.getAllGramsPipes("AllRegions", allRegions,
            ignoreCase));
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

6 votes

/**
 * Appends the TextPresso lexicon pipes: one per word-length file (1 to 7),
 * one for the combined file, and the n-gram pipes built from the combined
 * file. Records the group name "TextPresso".
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipes are appended to
 * @param ignoreCase passed through to every lexicon pipe
 */
public static void addTextPressoPipes(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException,
        Exception {
    usedPipeNames.add("TextPresso");

    // the TextPresso files are split by how many tokens an entry has
    for (int tokenCount = 1; tokenCount <= 7; tokenCount++) {
        File byLength = new File(LEXICON_HOME + "TextPresso-wordLength-"
                + tokenCount + ".txt");
        pipes.add(new TrieLexiconMembership("textPresso" + tokenCount,
                byLength, ignoreCase));
    }

    File everything = new File(LEXICON_HOME + "TextPresso-all.txt");
    pipes.add(new TrieLexiconMembership("textPressoAll", everything,
            ignoreCase));

    File everythingForGrams = new File(LEXICON_HOME + "TextPresso-all.txt");
    pipes.addAll(NGramPipeFactory.getAllGramsPipes("textPressoAll",
            everythingForGrams, ignoreCase));
}

Source File: MaxEntClassifierTrainerTest.java From baleen with Apache License 2.0

6 votes

/**
 * Loads the classifier stored at modelPath and checks that it knows the labels
 * "pos" and "neg" and assigns them to one positive and one negative sentence.
 */
@Test
public void testTaskProducesValidModelFile() throws Exception {

  File savedModel = modelPath.toFile();
  assertTrue(savedModel.exists());

  Classifier model = new FileObject<Classifier>(savedModel.getPath()).object();
  assertTrue(model.getLabelAlphabet().contains("pos"));
  assertTrue(model.getLabelAlphabet().contains("neg"));

  // Run both sample sentences through the pipe the classifier was trained with.
  Pipe trainedPipe = model.getInstancePipe();
  InstanceList samples = new InstanceList(trainedPipe);
  Instance positive = new Instance("I love this amazing awesome classifier.", null, null, null);
  samples.addThruPipe(positive);
  Instance negative = new Instance("I can't stand this horrible test.", null, null, null);
  samples.addThruPipe(negative);

  String firstLabel = model.classify(samples.get(0)).getLabeling().getBestLabel().toString();
  assertEquals("pos", firstLabel);

  String secondLabel = model.classify(samples.get(1)).getLabeling().getBestLabel().toString();
  assertEquals("neg", secondLabel);
}

Source File: MalletClassifierTrainerTest.java From baleen with Apache License 2.0

6 votes

/**
 * Loads the classifier stored at modelPath, checks that its label alphabet
 * contains "pos" and "neg", and verifies that both sample sentences are
 * classified with one of those two labels.
 */
private void validateModel() {
  File savedModel = modelPath.toFile();
  assertTrue(savedModel.exists());

  Classifier model = new FileObject<Classifier>(savedModel.getPath()).object();
  assertTrue(model.getLabelAlphabet().contains("pos"));
  assertTrue(model.getLabelAlphabet().contains("neg"));

  InstanceList samples = new InstanceList(model.getInstancePipe());
  samples.addThruPipe(new Instance("I love this amazing awesome classifier.", "", null, null));
  samples.addThruPipe(new Instance("I can't stand this horrible test.", "", null, null));

  // Only membership in the label set is checked, not which label was chosen.
  ImmutableSet<String> knownLabels = ImmutableSet.of("pos", "neg");
  for (int index = 0; index < 2; index++) {
    String predicted =
        model.classify(samples.get(index)).getLabeling().getBestLabel().toString();
    assertTrue(knownLabels.contains(predicted));
  }
}

Source File: MalletClassifierTrainer.java From baleen with Apache License 2.0

6 votes

/**
 * Pipes the documents returned by getDocumentsFromMongo() into an instance
 * list and hands it to processTrainerDefinitions. When forTesting is greater
 * than zero, the list is first split into a training part (1 - forTesting)
 * and a testing part (forTesting); otherwise everything is used for training
 * and the testing list is null.
 */
@Override
protected void execute(JobSettings settings) throws AnalysisEngineProcessException {

  InstanceList allInstances = new InstanceList(new ClassifierPipe(stopwords));
  allInstances.addThruPipe(getDocumentsFromMongo());

  if (!(forTesting > 0.0)) {
    // No hold-out portion requested: train on everything.
    processTrainerDefinitions(allInstances, null);
    return;
  }

  InstanceList[] parts = allInstances.split(new double[] {1 - forTesting, forTesting});
  processTrainerDefinitions(parts[0], parts[1]);
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends one case-insensitive "contains substring" regex pipe per hard-coded
 * word fragment and records the group name "Substring regexes".
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipes are appended to
 */
public static void addSubstringRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Substring regexes");

    // "thalamic" and "nuclei" are probably covered by the 1-grams already
    final String[] fragments = { "cortic", "cerebel" };
    for (String fragment : fragments) {
        String featureName = fragment + "Regex";
        pipes.add(new RegexMatches(featureName, compile(".*" + fragment
                + ".*", CASE_INSENSITIVE)));
    }
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Assembles the pipe list: input conversion pipes first, then everything
 * added by addAllGoodPipes, then a feature window and the final conversion
 * to feature vector sequences.
 *
 * @return the pipes in the order they were added
 */
public static List<Pipe> getPipes() throws Exception {
    List<Pipe> pipeline = newArrayList();

    // input conversion
    pipeline.add(new Jcas2TokenSequence());
    pipeline.add(new Target2LabelSequence());

    // feature pipes
    addAllGoodPipes(pipeline);

    pipeline.add(new FeatureWindow(window, window));
    // for debugging, a PrintInputAndTarget pipe can be added here
    pipeline.add(new TokenSequence2FeatureVectorSequence());

    return pipeline;
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends every feature pipe group to the given list by calling the
 * individual add...Pipes helpers in a fixed order. The full-text pipes are
 * only added when Jcas2TokenSequence.NEW_FEATURES is set.
 *
 * @param pipes list the new pipes are appended to
 */
public static void addAllGoodPipes(List<Pipe> pipes) throws Exception {
    // receives the group name registered by each helper; local to this call
    List<String> groupNames = new LinkedList<String>();

    // formerly guarded by the "TextPipe" option
    addTextPipe(groupNames, pipes);

    // (addOriginalMarkupPipes is disabled)
    addAreaRegexPipes(groupNames, pipes);
    // this catches tracts, change?
    // formerly guarded by the "SubstringRegexPipes" option
    addSubstringRegexPipes(groupNames, pipes);
    addSpineRegexPipes(groupNames, pipes);

    // formerly guarded by the option
    // "SmallLexicons_TextPressoPipes_BrainRegionLexicons_AbbreviationLexiconPipes_AreaLexicons"
    addSmallLexicons(groupNames, pipes, ignoreCase);
    addTextPressoPipes(groupNames, pipes, ignoreCase);
    addBrainRegionLexicons(groupNames, pipes, ignoreCase);
    // (addPigeonLexicon is disabled)
    addAbbreviationLexiconPipes(groupNames, pipes);
    addAreaLexicons(groupNames, pipes, ignoreCase);

    addLengthPipes(groupNames, pipes);

    if (Jcas2TokenSequence.NEW_FEATURES) {
        addFullTextPipes(groupNames, pipes);
    }

    // formerly guarded by the "HandMadeRegexPipes_MalletNEPipes" option
    addHandMadeRegexPipes(groupNames, pipes);
    addMalletNEPipes(groupNames, pipes);
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Pipes added based on experience with full text.
 *
 * @param usedPipeNames not modified by this method
 * @param pipes list the new pipes are appended to
 */
private static void addFullTextPipes(List<String> usedPipeNames,
        List<Pipe> pipes) {

    // example: blabla 24 24
    Pattern digitOtherDigit = Pattern.compile("\\d+[^\\d]+\\d+");
    pipes.add(new LongRegexSpaced("digit_then_other_then_digit",
            digitOtherDigit, 2, 4));

    // example: 30 mM K SO , 5 mM MgCl 6H O, 10 mM 24 24 22 HEPES
    Pattern digitOtherDigitOtherDigit = Pattern
            .compile(".*\\d+[^\\d\\n]+\\d+[^\\d\\n]+\\d+.*");
    pipes.add(new LongRegexSpaced(
            "digit_then_other_then_digit_then_other_then_digit",
            digitOtherDigitOtherDigit, 4, 9));

    // examples: "n 19", "n 5", with and without surrounding parentheses
    Pattern nDigit = Pattern.compile("n \\d+");
    pipes.add(new LongRegexSpaced("n_space_digit", nDigit, 2, 2));

    Pattern parenNDigitParen = Pattern.compile("\\( n \\d+ \\)");
    pipes.add(new LongRegexSpaced("parenthesis_n_space_digit_parenthesis",
            parenNDigitParen, 3, 4));

    Pattern nDigitParen = Pattern.compile("n \\d+ \\)");
    pipes.add(new LongRegexSpaced("n_space_digit_parenthesis", nDigitParen,
            3, 4));

    Pattern parenNDigit = Pattern.compile("\\( n \\d+");
    pipes.add(new LongRegexSpaced("parenthesis_n_space_digit", parenNDigit,
            3, 4));

    // "Fig" is never found in any lexicon
    Pattern figure = Pattern.compile(".*Fig.*");
    pipes.add(new RegexMatches("Figure", figure));
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends the abbreviation lexicon pipes (human and rat/mouse) plus the
 * prefix regex pipes derived from the same two files, and records the group
 * name "AbbrevLex". Matching is always case-insensitive here.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipes are appended to
 * @throws IOException if a lexicon file cannot be read
 */
private static void addAbbreviationLexiconPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws IOException {

    usedPipeNames.add("AbbrevLex");
    File ratMouse = new File(LEXICON_HOME + "NN2007RatMouseAbbrev.txt");
    File human = new File(LEXICON_HOME + "NN2002HumanAbbrev.txt");
    // deliberately local: abbreviations are matched case-insensitively
    boolean ignoreCase = true;
    // should be one word only but may not..
    pipes.add(new TrieLexiconMembership("NNHumanAbbrev", human, ignoreCase));
    pipes.add(new TrieLexiconMembership("NNRatMouseAbbrev", ratMouse,
            ignoreCase));

    // Each prefix feature is named after the lexicon it is built from
    // (the two names were previously attached to the wrong files).
    addPrefixPipes(pipes, human, "NNHumanAbbrevPrefix");
    addPrefixPipes(pipes, ratMouse, "NNRatMouseAbbrevPrefix");
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends one case-insensitive regex pipe per non-blank line of the given
 * file. Each pipe matches the trimmed line, taken literally, followed by one
 * to three arbitrary characters.
 *
 * @param pipes list the new pipes are appended to
 * @param file lexicon file, one entry per line
 * @param name feature name given to every created pipe
 * @throws IOException if the file cannot be read
 */
public static void addPrefixPipes(List<Pipe> pipes, File file, String name)
        throws IOException {
    for (String line : linesFrom(file.getAbsolutePath())) {
        String prefix = line.trim();
        if (prefix.isEmpty()) {
            // a blank line would yield "(.{1,3})", which matches any short token
            continue;
        }
        // quote the entry so regex metacharacters in it are matched literally
        // instead of being interpreted (or failing to compile)
        pipes.add(new RegexMatches(name, compile("(" + Pattern.quote(prefix)
                + ".{1,3})", CASE_INSENSITIVE)));
    }
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends two multi-token regex pipes for "area(s) ..." expressions and
 * records the group name "Area regexes".
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipes are appended to
 */
public static void addAreaRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) {
    usedPipeNames.add("Area regexes");

    // numbered areas only
    Pattern numberedAreas = Pattern
            .compile("areas? \\d+((, ?\\d)*,? (or|and) \\d+)?");
    pipes.add(new LongRegexSpaced("Brodmann", numberedAreas, 2, 9));

    // a looser version that also allows upper-case letters
    Pattern letteredAreas = compile("areas? (\\p{Upper}|\\d)+((, ?(\\p{Upper}|\\d))*,? (or|and) (\\p{Upper}|\\d)+)?");
    pipes.add(new LongRegexSpaced("Areas", letteredAreas, 2, 9));
}

Source File: NGramPipeFactory.java From bluima with Apache License 2.0

5 votes

/**
 * Builds one n-gram pipe per gram size, from startGramSize up to and
 * including 6 (the loop bound of 7 is exclusive).
 *
 * @param name base name passed to each n-gram pipe
 * @param inputFile lexicon file the n-grams are taken from
 * @param ignoreCase passed through to each n-gram pipe
 * @param startGramSize first gram size to create a pipe for
 * @return the created pipes, in increasing gram size
 */
public static List<Pipe> getAllGramsPipes(String name, File inputFile,
        boolean ignoreCase, int startGramSize) throws Exception {
    List<Pipe> gramPipes = new LinkedList<Pipe>();
    int gramSize = startGramSize;
    // go up to seven?
    while (gramSize < 7) {
        gramPipes.add(getNGramPipe(name, inputFile, ignoreCase, gramSize));
        gramSize++;
    }
    return gramPipes;
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends a regex pipe for spinal segment labels and records the group name
 * "SpineRegex".
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipe is appended to
 */
public static void addSpineRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("SpineRegex");

    // intended labels: T1-T12, L1-L5, S1-S5, C1-C8
    Pattern spinalSegment = Pattern
            .compile("([LS][1-5])|T((1[0-2]?)|[2-9])|(C[1-8])");
    pipes.add(new LongRegexMatches("SpinalParts", spinalSegment, 1, 2));
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends the small word-list lexicon pipes and records the group name
 * "SmallLex".
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipes are appended to
 * @param ignoreCase passed through to every lexicon pipe
 */
public static void addSmallLexicons(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException {
    usedPipeNames.add("SmallLex");

    // { feature name, lexicon file under LEXICON_HOME }
    final String[][] wordLists = {
            { "chudlerListWord", "chudler.txt" },
            { "directionWord", "directions.txt" },
            { "extendedDirectionWord", "extendedDirections.txt" },
            { "stopWord", "stop.txt" } };

    for (String[] entry : wordLists) {
        File wordList = new File(LEXICON_HOME + entry[1]);
        pipes.add(new LexiconMembership(entry[0], wordList, ignoreCase));
    }
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends the hand-written multi-token regex pipes and records the group
 * name "Handmade regexes". Every pipe uses 2 as its first numeric argument;
 * only the second one varies.
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipes are appended to
 */
public static void addHandMadeRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Handmade regexes");

    // parallel arrays: feature name, regex, second numeric argument
    final String[] featureNames = { "of_The", "part_Of", "neurnEnd",
            "nucleiEnd", "nclusEnd", "fieldEnd", "cortexEnd", "areaEnd",
            "territoryEnd" };
    final String[] regexes = { "of the", "part of", "(.* neurons)",
            "(.* nuclei)", "(.* nucleus)", "(.* field)", "(.* cortex)",
            "(.* area)", "(.* territory)|(.* territories)" };
    final int[] upperBounds = { 2, 2, 3, 3, 5, 4, 3, 4, 4 };

    for (int k = 0; k < featureNames.length; k++) {
        pipes.add(new LongRegexSpaced(featureNames[k], compile(regexes[k]),
                2, upperBounds[k]));
    }
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

5 votes

/**
 * Appends two token-length binning pipes and records the group name
 * "Length".
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipes are appended to
 */
public static void addLengthPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Length");

    // length feature - binary bins
    final int[] fineBins = { 1, 2, 3, 5, 8, 11, 14, 18, 22 };
    pipes.add(new LengthBins("Length", fineBins));

    // from some calcs the average brain token is 6.92 while the outside is
    // 4.64 (~3.55 stdev), hence a single threshold at 6
    final int[] threshold = { 6 };
    pipes.add(new LengthBins("LengthThreshold", threshold));
}

Source File: LDA.java From topic-detection with Apache License 2.0

5 votes

/**
 * Creates a list of Mallet instances from a list of documents.
 * Each text is tokenized, lower-cased, stripped of stopwords and converted
 * to a feature sequence.
 * @param texts a list of documents
 * @return a list of Mallet instances
 * @throws IOException
 */
private InstanceList createInstanceList(List<String> texts) throws IOException
{
	ArrayList<Pipe> steps = new ArrayList<Pipe>();
	// tokenize, then normalize, then filter, then index
	steps.add(new CharSequence2TokenSequence());
	steps.add(new TokenSequenceLowercase());
	steps.add(new TokenSequenceRemoveStopwords());
	steps.add(new TokenSequence2FeatureSequence());

	SerialPipes preprocessing = new SerialPipes(steps);
	InstanceList result = new InstanceList(preprocessing);
	result.addThruPipe(new ArrayIterator(texts));
	return result;
}

Source File: NGramPipeFactory.java From bluima with Apache License 2.0

5 votes

/**
 * Writes the grams of every non-blank line of the input file to a temporary
 * file (one gram per line) and returns a lexicon pipe backed by that file.
 * The temporary file is registered for deletion when the JVM exits.
 *
 * @param name base name; the pipe is named name + "-" + gram + "-gram"
 * @param inputFile lexicon file, one entry per line
 * @param ignoreCase passed through to the lexicon pipe
 * @param gram gram size handed to getGrams
 * @return a lexicon membership pipe over the generated grams
 * @throws Exception if reading, writing or building the pipe fails
 */
public static Pipe getNGramPipe(String name, File inputFile,
        boolean ignoreCase, int gram) throws Exception {
    File tempFile = File.createTempFile("ngram", ".txt");
    // Arrange for it to be deleted at exit.
    tempFile.deleteOnExit();

    // Both streams are closed in finally blocks so that a failure while
    // reading or writing does not leak file handles.
    BufferedReader br = new BufferedReader(new FileReader(inputFile));
    try {
        BufferedWriter bw = new BufferedWriter(new FileWriter(tempFile));
        try {
            String rawLine;
            // readLine() returns null at end of input; relying on ready()
            // could hand a null line to trim()
            while ((rawLine = br.readLine()) != null) {
                String line = rawLine.trim();
                if (line.equals("")) {
                    continue; // ignore blank lines
                }
                String[] gramStrings = getGrams(line, gram);
                if (gramStrings != null) {
                    for (String gramString : gramStrings) {
                        bw.write(gramString);
                        bw.newLine();
                    }
                }
            }
        } finally {
            bw.close();
        }
    } finally {
        br.close();
    }
    return new TrieLexiconMembership(name + "-" + gram + "-gram", tempFile,
            ignoreCase);
}

Source File: ReferencesClassifierTrainer.java From bluima with Apache License 2.0

5 votes

/**
 * Assembles the pipe list: target to label, input to regex tokens, tokens to
 * feature sequence, feature sequence to feature vector.
 *
 * @return the pipes in the order they were added
 */
static List<Pipe> getPipes() {
    List<Pipe> pipeline = newArrayList();

    pipeline.add(new Target2Label());
    pipeline.add(new MyInput2RegexTokens());
    // for debugging, a PrintInputAndTarget pipe can be added here
    pipeline.add(new TokenSequence2FeatureSequence());
    pipeline.add(new FeatureSequence2FeatureVector());

    return pipeline;
}

Source File: CorpusRepresentationMalletTarget.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Create a new, empty corpus representation from a FeatureInfo.
 *
 * The pipe consists of a single Noop pipe with a fresh data alphabet. A target
 * (label) alphabet is only created when the target type is NOMINAL; for any
 * other target type it is null.
 *
 * @param fi FeatureInfo instance
 * @param targetType type of target
 */
public CorpusRepresentationMalletTarget(FeatureInfo fi, TargetType targetType) {
  featureInfo = fi;
  scalingMethod = fi.getGlobalScalingMethod();
  this.targetType = targetType;

  LabelAlphabet labels = null;
  if (targetType == TargetType.NOMINAL) {
    labels = new LabelAlphabet();
  }

  List<Pipe> stages = new ArrayList<>();
  stages.add(new Noop(new LFAlphabet(), labels));

  pipe = new LFPipe(stages);
  pipe.setFeatureInfo(fi);
  instances = new LFInstanceList(pipe);
}

Source File: CorpusRepresentationMalletLDA.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Create a new, empty corpus representation for LDA. The scaling method is
 * fixed to NONE, the target type to NONE, and the pipe has no target alphabet.
 *
 * @param fi feature info; according to the original author this is always null
 */
public CorpusRepresentationMalletLDA(FeatureInfo fi) {
  featureInfo = fi;  // always null
  // a null feature info is always passed, so scaling is hard-wired to NONE
  scalingMethod = ScalingMethod.NONE;

  // TODO: none of the following is really needed, figure out if this can be
  // simplified - keeping it should not do any harm though!
  List<Pipe> stages = new ArrayList<>();
  stages.add(new Noop(new LFAlphabet(), null));

  pipe = new LFPipe(stages);
  pipe.setFeatureInfo(fi);
  instances = new LFInstanceList(pipe);
  targetType = TargetType.NONE;
}

Source File: CorpusRepresentationMalletSeq.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

5 votes

/**
 * Create a new, empty corpus representation for sequence tagging. The pipe
 * gets a fresh data alphabet and a fresh label alphabet, and the target type
 * is fixed to NOMINAL.
 *
 * @param fi FeatureInfo instance; also supplies the global scaling method
 */
public CorpusRepresentationMalletSeq(FeatureInfo fi) {
  featureInfo = fi;
  scalingMethod = fi.getGlobalScalingMethod();

  List<Pipe> stages = new ArrayList<>();
  stages.add(new Noop(new LFAlphabet(), new LabelAlphabet()));

  pipe = new LFPipe(stages);
  pipe.setFeatureInfo(fi);
  instances = new LFInstanceList(pipe);
  targetType = TargetType.NOMINAL;
}

Source File: NGramPipeFactory.java From bluima with Apache License 2.0

4 votes

/**
 * Convenience overload that starts at gram size 1.
 *
 * @param name base name passed to each n-gram pipe
 * @param inputFile lexicon file the n-grams are taken from
 * @param ignoreCase passed through to each n-gram pipe
 * @return the pipes created by the four-argument overload
 */
public static List<Pipe> getAllGramsPipes(String name, File inputFile,
        boolean ignoreCase) throws Exception {
    final int firstGramSize = 1;
    return getAllGramsPipes(name, inputFile, ignoreCase, firstGramSize);
}

Source File: NEPipes.java From bluima with Apache License 2.0

4 votes

/**
 * Creates the collection of general named-entity regex pipes by handing a
 * fixed array of RegexMatches (single token) and LongRegexMatches
 * (multi-token) pipes to the superclass constructor. The regex strings are
 * constants defined elsewhere in this class.
 */
public NEPipes() {
    super(
            new Pipe[] {
                    //new TokenText( "text=" ),

                    // --- character class / casing features
                    new RegexMatches( "SingleLetter", Pattern.compile( "[A-Za-z]" ) ),
                    new RegexMatches( "AllCaps", Pattern.compile( ALLCAPS ) ),
                    new RegexMatches( "AllLower", Pattern.compile( ALLLOWER ) ),
                    new RegexMatches( "InitCaps", Pattern.compile( INITCAPS ) ),
                    new RegexMatches( "MixedCase", Pattern.compile( MIXEDCASE ) ),
                    new RegexMatches( "MixedNum", Pattern.compile( MIXEDNUM ) ),
                    new RegexMatches( "EndSentPunc", Pattern.compile( ENDSENTENCE ) ),
                    new RegexMatches( "Punc", Pattern.compile( PUNCTUATION ) ),
                    new RegexMatches( "Bracket", Pattern.compile( BRACKET ) ),
                    new RegexMatches( "Ordinal", Pattern.compile( ORDINAL, Pattern.CASE_INSENSITIVE ) ),

                    // --- punctuation-based multi-token features
                    new LongRegexMatches( "Quoted", Pattern.compile( QUOTED ), 1, 4 ),
                    new LongRegexMatches( "Bracketed", Pattern.compile( BRACKETED ), 1, 4 ),
                    new LongRegexMatches( "Initial", Pattern.compile( INITIAL ), 2, 2 ),
                    new LongRegexMatches( "Ellipse", Pattern.compile( DOTS ), 1, 2 ),
                    new LongRegexMatches( "Dashes", Pattern.compile( DASHES ), 2, 2 ),
                    new LongRegexMatches( "Fraction", Pattern.compile( FRACTION ), 1, 3 ),
                    new LongRegexMatches( "DotDecimal", Pattern.compile( DOTDECIMAL ), 1, 3 ),

                    // --- numbers and money
                    new LongRegexMatches( "Percent", Pattern.compile( "(" + RANGE + "|" + DECIMAL + ")%" ), 2, 4 ),
                    new RegexMatches( "10^3n", Pattern.compile( ILLION, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "Numeric", Pattern.compile( DECIMAL ), 1, 3 ),
                    new LongRegexMatches( "BigNumber", Pattern.compile( COMMA_DECIMAL ), 1, 7 ),
                    new LongRegexMatches( "kmbNumber",
                            Pattern.compile( DECIMAL + ILLION, Pattern.CASE_INSENSITIVE ), 1, 4 ),
                    new RegexMatches( "kmbMixed", Pattern.compile( MIXED_ILLION, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "Dollars", Pattern.compile( "[$](" + RANGE + "|" + DECIMAL + "|"
                            + COMMA_DECIMAL + "|" + DECIMAL + ILLION + "|" + MIXED_ILLION + ")",
                            Pattern.CASE_INSENSITIVE ), 2, 8 ),

                    new RegexMatches( "NumberWord", Pattern.compile( NUMBER_WORD, Pattern.CASE_INSENSITIVE ) ),
                   //FIXME useful beyond this?
                    new RegexMatches( "Currency", Pattern.compile( CURRENCY, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "MoneyWords", Pattern.compile( MONEYWORDS, Pattern.CASE_INSENSITIVE ), 2,
                            4 ),

                    // --- times
                    new LongRegexMatches( "AmPm", Pattern.compile( AMPM, Pattern.CASE_INSENSITIVE ), 1, 4 ),
                    new RegexMatches( "MixedAmPm", Pattern.compile( MIXED_AMPM, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "TimeNum", Pattern.compile( TIMENUM ), 3, 5 ),
                    new RegexMatches( "TimeZone", Pattern.compile( TIMEZONES, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "Time", Pattern.compile( TIME, Pattern.CASE_INSENSITIVE ), 1, 9 ),
                    new LongRegexMatches( "TimeRange", Pattern.compile( TIMERANGE, Pattern.CASE_INSENSITIVE ), 3,
                            19 ),

                    // --- phone numbers
                    new LongRegexMatches( "P10", Pattern.compile( P10 ), 3, 7 ),
                    // the "P5" feature must use the P5 pattern; it was compiled
                    // from P10 before, duplicating the feature above
                    new LongRegexMatches( "P5", Pattern.compile( P5 ), 3, 3 ),
                    new LongRegexMatches( "Phone", Pattern.compile( P10 + "|" + P5 ), 3, 7 ),

                    // --- months
                    new RegexMatches( "UncasedMonthName", Pattern.compile( MONTHNAME, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "UncasedMonthAbbr",
                            Pattern.compile( MONTHABBR, Pattern.CASE_INSENSITIVE ), 1, 2 ),
                    new LongRegexMatches( "CasedMonth", Pattern.compile( MONTH ), 1, 2 ),
                    new LongRegexMatches( "UncasedMonth", Pattern.compile( MONTH, Pattern.CASE_INSENSITIVE ), 1, 2 ),

                    // --- weekdays
                    new RegexMatches( "UncasedWeekdayName", Pattern.compile( WEEKDAYNAME, Pattern.CASE_INSENSITIVE ) ),
                    new LongRegexMatches( "UncasedWeekdayAbbr", Pattern.compile( WEEKDAYABBR,
                            Pattern.CASE_INSENSITIVE ), 1, 2 ),
                    new LongRegexMatches( "CasedWeekday", Pattern.compile( WEEKDAY ), 1, 2 ),
                    new LongRegexMatches( "UncasedWeekday", Pattern.compile( WEEKDAY, Pattern.CASE_INSENSITIVE ),
                            1, 2 ),

                    // --- dates
                    new LongRegexMatches( "MonthDay", Pattern.compile( MONTHDAY, Pattern.CASE_INSENSITIVE ), 2, 3 ),
                    new LongRegexMatches( "DayMonthDay", Pattern.compile( DAYMONTHDAY, Pattern.CASE_INSENSITIVE ),
                            3, 6 ),
                    new LongRegexMatches( "MonthYear", Pattern.compile( MONTHYEAR, Pattern.CASE_INSENSITIVE ), 2, 4 ),
                    new LongRegexMatches( "MonthDayYear",
                            Pattern.compile( MONTHDAYYEAR, Pattern.CASE_INSENSITIVE ), 3, 5 ),
                    new LongRegexMatches( "DayMonthDayYear", Pattern.compile( DAYMONTHDAYYEAR,
                            Pattern.CASE_INSENSITIVE ), 4, 8 ),

                    new LongRegexMatches( "SeparatorDate", Pattern.compile( SEPDATE ), 3, 5 ),
                    new LongRegexMatches( "FullSeparatorDate", Pattern.compile( FULLSEPDATE ), 5, 5 ),
            } );
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

4 votes

/**
 * Appends all pipes of a fresh NEPipes instance (general NER features) and
 * records the group name "Mallet NE".
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipes are appended to
 */
public static void addMalletNEPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Mallet NE");

    // random pipes from general NER
    NEPipes generalNer = new NEPipes();
    pipes.addAll(generalNer.pipes());
}

Source File: Attributes.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * Generate the attributes object from the information in the pipe.
 * The pipe should be a LFPipe, but we also try to come up with something
 * if it is an ordinary pipe. 
 * 
 * One attribute is created per entry of the pipe's data alphabet, using the
 * alphabet index as the attribute index and numeric as the default datatype.
 * If the pipe is a LFPipe and provides a feature info, the matching feature
 * specification is looked up and used to override datatype, coding, missing
 * value treatment and alphabet. Finally the target attribute is created with
 * the index following the last independent attribute.
 * 
 * @param pipe  mallet pipe
 * @param instanceType instance type
 * @throws RuntimeException if a feature info is present but no specification
 * is found for a feature whose name does not end with the START or STOP
 * symbol, or if the specification has an unexpected type
 */
public Attributes(Pipe pipe, String instanceType) {
  // first create the attributes (independent vars)    
  Alphabet dataAlphabet = pipe.getDataAlphabet();
  // if we can, also represent the pipe as LFPipe
  LFPipe lfPipe;
  // stays null for an ordinary pipe, which disables all overrides below
  FeatureInfo featureInfo = null;
  if(pipe instanceof LFPipe) {
    lfPipe = (LFPipe)pipe;
    featureInfo = lfPipe.getFeatureInfo();
  }
  // the alphabet we use if we have a boolean variable
  // ("false" is looked up first, then "true")
  LFAlphabet booleanAlph = new LFAlphabet();
  booleanAlph.lookupIndex("false");
  booleanAlph.lookupIndex("true");    
  for(int i =0; i<dataAlphabet.size(); i++) {
    String malletFeatureName = (String) dataAlphabet.lookupObject(i);
    // create an attribute with default settings for datatype, code and 
    // alphabet, if we got more information about it we will override later
    Attribute attr = new Attribute(
            malletFeatureName, i, Datatype.numeric, null, null, null);
    // add it
    attributes.add(attr);
    name2index.put(malletFeatureName, i);
    // If we have a LFPipe, also get some additional info about the type, values etc.
    // NOTE that the default type for features that indicate the presence of
    // strings, ngrams etc. (which we assume when nothing else is declared)
    // is numeric, so that instead of 0/1 we can have counts or tf/idf or 
    // other scores. So only if there is an explicit declaration of a different
    // type, we will change the default values.
    if(featureInfo != null) {
      FeatureSpecAttribute fsAttr = 
              FeatureExtractionMalletSparse.lookupAttributeForFeatureName(
                featureInfo.getAttributes(),
                malletFeatureName,
                instanceType);
      if(fsAttr instanceof FeatureSpecAttributeList) {
        // attribute list specification: copy coding, missing value treatment
        // and datatype; the alphabet is only set for bool and for nominal
        // features coded as number
        FeatureSpecAttributeList fsAttrList = (FeatureSpecAttributeList)fsAttr;
        attr.codeAs = fsAttrList.codeas;
        attr.mvTreatment = fsAttrList.missingValueTreatment;
        attr.datatype = fsAttrList.datatype;
        if(fsAttrList.datatype == Datatype.bool) {
          attr.alphabet = booleanAlph;
        } else if(fsAttrList.datatype == Datatype.nominal) {
          if(fsAttrList.codeas == CodeAs.number) {
            attr.alphabet = fsAttrList.alphabet;
          }
        } 
      } else if(fsAttr instanceof FeatureSpecSimpleAttribute) {
        // simple attribute specification: same overrides as for the list case
        FeatureSpecSimpleAttribute fsAttrSimple = (FeatureSpecSimpleAttribute)fsAttr;
        attr.codeAs = fsAttrSimple.codeas;
        attr.mvTreatment = fsAttrSimple.missingValueTreatment;
        attr.datatype = fsAttrSimple.datatype;
        if(fsAttrSimple.datatype == Datatype.bool) {
          attr.alphabet = booleanAlph;
        } else if(fsAttrSimple.datatype == Datatype.nominal) {
          if(fsAttrSimple.codeas == CodeAs.number) {
            attr.alphabet = fsAttrSimple.alphabet;
          }
        }           
      } else if(fsAttr instanceof FeatureSpecNgram) {
        // nothing to do here: the numeric default is kept for ngrams
      } else if(fsAttr==null) {
        // This can also happen if we try to look up a START/STOP feature which 
        // is created by us and for which no specification exists. In this case,
        // we simply do nothing and use the default attr we have created above
        if(malletFeatureName.endsWith(FeatureExtractionMalletSparse.START_SYMBOL) || 
           malletFeatureName.endsWith(FeatureExtractionMalletSparse.STOP_SYMBOL)) {
          // do nothing
        } else {
          throw new RuntimeException("FeatureSpecification is null for feature "+
                i+", name="+malletFeatureName+ 
                "\nFeatureSpecification is "+featureInfo);
        }
      } else {
        throw new RuntimeException(
                "Impossible: found odd FeatureSpecAttribute type "+fsAttr.getClass());
      }
    }
  }
  @SuppressWarnings("unchecked")
  LabelAlphabet targetAlphabet = (LabelAlphabet)pipe.getTargetAlphabet();
  // if the target alphabet exists, we assume a nominal target
  // The target index is the next index after the last independent attribute
  // index. This is convenient for Weka.
  // The target starts out numeric and is only switched to nominal below.
  targetAttribute = new Attribute("target", attributes.size(), Datatype.numeric, null, null, null);
  if(targetAlphabet != null) {
    targetAttribute.alphabet = targetAlphabet;
    targetAttribute.datatype = Datatype.nominal;
  }
}

Source File: BrainRegionPipes.java From bluima with Apache License 2.0

4 votes

/**
 * Appends the "areawords" lexicon pipe and records the group name
 * "Areawords".
 *
 * @param usedPipeNames list receiving the name of this pipe group
 * @param pipes list the new pipe is appended to
 * @param ignoreCase passed through to the lexicon pipe
 */
public static void addAreaLexicons(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException {
    usedPipeNames.add("Areawords");

    File areaWordList = new File(LEXICON_HOME + "areawords.txt");
    pipes.add(new LexiconMembership("areawords", areaWordList, ignoreCase));
}

Source File: ITEngineMalletClass.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1

4 votes

/**
 * End-to-end test of the Mallet C4.5 classification engine on the ionosphere
 * test data: train a model, save the engine to the current directory, load
 * it again, apply the restored model to the same document and compare the
 * predicted "target" feature with the original "class" feature.
 * 
 * NOTE: the steps depend on each other and on files written to the current
 * directory, so their order must not be changed.
 */
@Test
public void testEngineMalletClass1() throws MalformedURLException, ResourceInstantiationException {
  // create the engine from the feature specification file
  File configFile = new File("tests/cl-ionosphere/feats.xml");
  FeatureSpecification spec = new FeatureSpecification(configFile);
  FeatureInfo featureInfo = spec.getFeatureInfo();
  Engine engine = Engine.create(AlgorithmClassification.MalletC45_CL_MR, "", featureInfo, TargetType.NOMINAL, null);
  CorpusRepresentationMalletTarget crm = (CorpusRepresentationMalletTarget)engine.getCorpusRepresentation();
  System.err.println("TESTS: have engine "+engine);
  
  // load a document and train the model
  Document doc = loadDocument(new File("tests/cl-ionosphere/ionosphere_gate.xml"));
  System.err.println("TESTS: have document");
  
  // instances are the "Mention" annotations; no sequence or class annotation
  // sets are used, the target is read from the "class" feature
  AnnotationSet instanceAS = doc.getAnnotations().get("Mention");
  AnnotationSet sequenceAS = null;
  AnnotationSet inputAS = doc.getAnnotations();
  AnnotationSet classAS = null;
  String targetFeature = "class";
  String nameFeature = null;
  crm.add(instanceAS, sequenceAS, inputAS, classAS, targetFeature, TargetType.NOMINAL, "", nameFeature, null);
  System.err.println("TESTS: added instances, number of instances now: "+crm.getRepresentationMallet().size());
  engine.trainModel(null,"","");
  System.err.println("TESTS: model trained");
  System.err.println("TESTS: engine before saving: "+engine);
  // the engine is saved into the current working directory
  engine.saveEngine(new File("."));
  
  // Now check if we can restore the engine and thus the corpus representation
  Engine engine2 = Engine.load(new File(".").toURI().toURL(), "");
  System.err.println("RESTORED engine is "+engine2);
  
  // check if the corpusRepresentation has been restored correctly
  CorpusRepresentation cr2 = engine2.getCorpusRepresentation();
  assertNotNull(cr2);
  assertTrue(cr2 instanceof CorpusRepresentationMalletTarget);
  CorpusRepresentationMalletTarget crmc2 = (CorpusRepresentationMalletTarget)cr2;
  Pipe pipe = crmc2.getPipe();
  assertNotNull(pipe);
  assertTrue(pipe instanceof LFPipe);
  LFPipe lfpipe = (LFPipe)pipe;
  FeatureInfo fi = lfpipe.getFeatureInfo();
  assertNotNull(fi);
  
  // apply the restored model and write the results into the "LF" annotation set
  AnnotationSet lfAS = doc.getAnnotations("LF");
  String parms = "";
  List<ModelApplication> gcs = engine2.applyModel(instanceAS, inputAS, sequenceAS, parms);
  System.err.println("Number of classifications: "+gcs.size());
  ModelApplication.applyClassification(doc, gcs, "target", lfAS, null);
  
  System.err.println("Original instances: "+instanceAS.size()+", classification: "+lfAS.size());
  
  // quick and dirty evaluation: go through all the original annotations, get the 
  // co-extensive annotations from LF, and compare the values from the "class" feature
  // of the original with the "target" feature of the LF annotation
  int total = 0;
  int correct = 0;
  for(Annotation orig : instanceAS) {
    total++;
    Annotation lf = gate.Utils.getOnlyAnn(gate.Utils.getCoextensiveAnnotations(lfAS, orig));
    //System.err.println("ORIG="+orig+", lf="+lf);
    if(orig.getFeatures().get("class").equals(lf.getFeatures().get("target"))) {
      correct++;
    }
  }
  
  // accuracy on the training document itself, expected to be 0.9630 +/- 0.01
  double acc = (double)correct / (double)total;
  System.err.println("Got total="+total+", correct="+correct+", acc="+acc);
  assertEquals(0.9630, acc, 0.01);
}

cc.mallet.pipe.Pipe Java Examples