cc.mallet.pipe.Pipe Java Examples
The following examples show how to use
cc.mallet.pipe.Pipe.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CorpusRepresentationMalletTarget.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 6 votes |
/** * Extract the independent features for a single instance annotation. * * Extract the independent features for a single annotation according to the information * in the featureInfo object. The information in the featureInfo instance gets updated * by this. * * NOTE: this method is static so that it can be used in the CorpusRepresentationMalletSeq class too. * * @param instanceAnnotation instance annotation * @param inputAS input annotation set * @param targetFeatureName feature name of target * @param featureInfo feature info instance * @param pipe mallet pipe * @param nameFeature name feature * @return Instance */ static Instance extractIndependentFeaturesHelper( Annotation instanceAnnotation, AnnotationSet inputAS, FeatureInfo featureInfo, Pipe pipe) { AugmentableFeatureVector afv = new AugmentableFeatureVector(pipe.getDataAlphabet()); // Constructor parms: data, target, name, source Instance inst = new Instance(afv, null, null, null); for(FeatureSpecAttribute attr : featureInfo.getAttributes()) { FeatureExtractionMalletSparse.extractFeature(inst, attr, inputAS, instanceAnnotation); } // TODO: we destructively replace the AugmentableFeatureVector by a FeatureVector here, // but it is not clear if this is beneficial - our assumption is that yes. inst.setData(((AugmentableFeatureVector)inst.getData()).toFeatureVector()); return inst; }
Example #2
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 6 votes |
public static void addBrainRegionLexicons(List<String> usedPipeNames, List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException, Exception { usedPipeNames.add("BrainRegions"); // BRAINREGION Lexicons pipes.add(new TrieLexiconMembership("NNHu", new File(LEXICON_HOME + "NN2002Human.txt"), ignoreCase)); pipes.add(new TrieLexiconMembership("NNMouseRat", new File(LEXICON_HOME + "NN2007RatMouse.txt"), ignoreCase)); pipes.add(new TrieLexiconMembership("Allen", new File(LEXICON_HOME + "Allen.txt"), ignoreCase)); pipes.add(new TrieLexiconMembership("BAMS", new File(LEXICON_HOME + "BAMS.txt"), ignoreCase)); pipes.add(new TrieLexiconMembership("AllRegions", new File(LEXICON_HOME + "AllRegions.txt"), ignoreCase)); pipes.addAll(NGramPipeFactory.getAllGramsPipes("AllRegions", new File( LEXICON_HOME + "AllRegions.txt"), ignoreCase)); }
Example #3
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 6 votes |
public static void addTextPressoPipes(List<String> usedPipeNames, List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException, Exception { usedPipeNames.add("TextPresso"); // TEXTPRESSO files, files are split by how many tokens for (int i = 1; i < 8; i++) { pipes.add(new TrieLexiconMembership("textPresso" + i, new File( LEXICON_HOME + "TextPresso-wordLength-" + i + ".txt"), ignoreCase)); } pipes.add(new TrieLexiconMembership("textPressoAll", new File( LEXICON_HOME + "TextPresso-all.txt"), ignoreCase)); pipes.addAll(NGramPipeFactory.getAllGramsPipes("textPressoAll", new File(LEXICON_HOME + "TextPresso-all.txt"), ignoreCase)); }
Example #4
Source File: MaxEntClassifierTrainerTest.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Checks that the model file exists, that the stored classifier knows the
 * "pos" and "neg" labels, and that it classifies one clearly positive and one
 * clearly negative sentence accordingly.
 */
@Test
public void testTaskProducesValidModelFile() throws Exception {
    File modelFile = modelPath.toFile();
    assertTrue(modelFile.exists());

    Classifier classifier = new FileObject<Classifier>(modelFile.getPath()).object();
    assertTrue(classifier.getLabelAlphabet().contains("pos"));
    assertTrue(classifier.getLabelAlphabet().contains("neg"));

    // Run two unlabeled sentences through the classifier's own pipe.
    Pipe pipe = classifier.getInstancePipe();
    InstanceList instanceList = new InstanceList(pipe);
    instanceList.addThruPipe(
            new Instance("I love this amazing awesome classifier.", null, null, null));
    instanceList.addThruPipe(
            new Instance("I can't stand this horrible test.", null, null, null));

    // Expected best label per instance, in insertion order.
    String[] expectedLabels = { "pos", "neg" };
    for (int i = 0; i < expectedLabels.length; i++) {
        assertEquals(
                expectedLabels[i],
                classifier.classify(instanceList.get(i)).getLabeling().getBestLabel().toString());
    }
}
Example #5
Source File: MalletClassifierTrainerTest.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Checks that the model file exists, that the stored classifier knows the
 * "pos" and "neg" labels, and that classifying two sample sentences yields
 * one of those two labels each time.
 */
private void validateModel() {
    File modelFile = modelPath.toFile();
    assertTrue(modelFile.exists());

    Classifier classifier = new FileObject<Classifier>(modelFile.getPath()).object();
    assertTrue(classifier.getLabelAlphabet().contains("pos"));
    assertTrue(classifier.getLabelAlphabet().contains("neg"));

    // Run two sentences with an empty target through the classifier's own pipe.
    InstanceList instanceList = new InstanceList(classifier.getInstancePipe());
    instanceList.addThruPipe(
            new Instance("I love this amazing awesome classifier.", "", null, null));
    instanceList.addThruPipe(
            new Instance("I can't stand this horrible test.", "", null, null));

    ImmutableSet<String> labels = ImmutableSet.of("pos", "neg");
    for (int i = 0; i < 2; i++) {
        String bestLabel =
                classifier.classify(instanceList.get(i)).getLabeling().getBestLabel().toString();
        assertTrue(labels.contains(bestLabel));
    }
}
Example #6
Source File: MalletClassifierTrainer.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Reads the documents from Mongo through a {@code ClassifierPipe}, optionally
 * splits them into training and testing sets, and hands both to
 * {@code processTrainerDefinitions}.
 *
 * When {@code forTesting} is not greater than zero, all instances are used
 * for training and the testing list is {@code null}.
 */
@Override
protected void execute(JobSettings settings) throws AnalysisEngineProcessException {
    InstanceList instances = new InstanceList(new ClassifierPipe(stopwords));
    instances.addThruPipe(getDocumentsFromMongo());

    final InstanceList training;
    final InstanceList testing;
    if (forTesting > 0.0) {
        // proportions: { training share, testing share }
        double[] proportions = new double[] {1 - forTesting, forTesting};
        InstanceList[] parts = instances.split(proportions);
        training = parts[0];
        testing = parts[1];
    } else {
        training = instances;
        testing = null;
    }

    processTrainerDefinitions(training, testing);
}
Example #7
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
public static void addSubstringRegexPipes(List<String> usedPipeNames, List<Pipe> pipes) throws Exception { usedPipeNames.add("Substring regexes"); // "thalamic" and nuclie are probably in the 1-grams for (String substring : new String[] { "cortic", "cerebel" }) { pipes.add(new RegexMatches(substring + "Regex", compile(".*" + substring + ".*", CASE_INSENSITIVE))); } }
Example #8
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
public static List<Pipe> getPipes() throws Exception { List<Pipe> pipes = newArrayList(); pipes.add(new Jcas2TokenSequence()); pipes.add(new Target2LabelSequence()); // more piiiiipes addAllGoodPipes(pipes); pipes.add(new FeatureWindow(window, window)); // for debugging pipes.add(new PrintInputAndTarget()); pipes.add(new TokenSequence2FeatureVectorSequence()); return pipes; }
Example #9
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
/** Creates a local usedPipeNames list and passes it, together with {@code pipes}, to the individual add*Pipes / add*Lexicons helpers. NOTE(review): the line breaks of this snippet were lost, so every "//" marker below swallows an unknown amount of the text after it; the calls following the "// / if (StaticOption..." markers are presumably live and only the guards are commented out - confirm against the original BrainRegionPipes.java before editing. */
public static void addAllGoodPipes(List<Pipe> pipes) throws Exception { List<String> usedPipeNames = new LinkedList<String>(); // / if (StaticOption.getBoolean("TextPipe")) addTextPipe(usedPipeNames, pipes); // ren/ addOriginalMarkupPipes(); addAreaRegexPipes(usedPipeNames, pipes); // this catches tracts, change? // /if (StaticOption.getBoolean("SubstringRegexPipes")) addSubstringRegexPipes(usedPipeNames, pipes); addSpineRegexPipes(usedPipeNames, pipes); // /if (StaticOption // .getBoolean("SmallLexicons_TextPressoPipes_BrainRegionLexicons_AbbreviationLexiconPipes_AreaLexicons")) addSmallLexicons(usedPipeNames, pipes, ignoreCase); addTextPressoPipes(usedPipeNames, pipes, ignoreCase); addBrainRegionLexicons(usedPipeNames, pipes, ignoreCase); // ren/ addPigeonLexicon(usedPipeNames, pipes, ignoreCase); addAbbreviationLexiconPipes(usedPipeNames, pipes); addAreaLexicons(usedPipeNames, pipes, ignoreCase); addLengthPipes(usedPipeNames, pipes); if (Jcas2TokenSequence.NEW_FEATURES) addFullTextPipes(usedPipeNames, pipes); // / if (StaticOption.getBoolean("HandMadeRegexPipes_MalletNEPipes")) { addHandMadeRegexPipes(usedPipeNames, pipes); addMalletNEPipes(usedPipeNames, pipes); }
Example #10
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
/** Pipes added based on experience with full text */ private static void addFullTextPipes(List<String> usedPipeNames, List<Pipe> pipes) { // blabla 24 24 pipes.add(new LongRegexSpaced("digit_then_other_then_digit", Pattern .compile("\\d+[^\\d]+\\d+"), 2, 4)); // 30 mM K SO , 5 mM MgCl 6H O, 10 mM 24 24 22 HEPES pipes.add(new LongRegexSpaced( "digit_then_other_then_digit_then_other_then_digit", Pattern .compile(".*\\d+[^\\d\\n]+\\d+[^\\d\\n]+\\d+.*"), 4, 9)); // n 19 // n 5 pipes.add(new LongRegexSpaced("n_space_digit", Pattern .compile("n \\d+"), 2, 2)); pipes.add(new LongRegexSpaced("parenthesis_n_space_digit_parenthesis", Pattern.compile("\\( n \\d+ \\)"), 3, 4)); pipes.add(new LongRegexSpaced("n_space_digit_parenthesis", Pattern .compile("n \\d+ \\)"), 3, 4)); pipes.add(new LongRegexSpaced("parenthesis_n_space_digit", Pattern .compile("\\( n \\d+"), 3, 4)); // Fig is never found in any lexicon pipes.add(new RegexMatches("Figure", Pattern.compile(".*Fig.*"))); }
Example #11
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
private static void addAbbreviationLexiconPipes(List<String> usedPipeNames, List<Pipe> pipes) throws IOException { usedPipeNames.add("AbbrevLex"); File ratMouse = new File(LEXICON_HOME + "NN2007RatMouseAbbrev.txt"); File human = new File(LEXICON_HOME + "NN2002HumanAbbrev.txt"); boolean ignoreCase = true; // should be one word only but may not.. pipes.add(new TrieLexiconMembership("NNHumanAbbrev", human, ignoreCase)); pipes.add(new TrieLexiconMembership("NNRatMouseAbbrev", ratMouse, ignoreCase)); addPrefixPipes(pipes, ratMouse, "NNHumanAbbrevPrefix"); addPrefixPipes(pipes, human, "NNRatMouseAbbrevPrefix"); }
Example #12
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
/**
 * Appends one case-insensitive regex pipe per non-blank line of {@code file};
 * each pipe matches the line's text followed by one to three further
 * characters.
 *
 * The line is treated as literal text (it is quoted before being embedded in
 * the regex), so entries containing characters such as '.', '(' or '+' can
 * neither break pattern compilation nor change the pattern's meaning. Blank
 * lines are skipped: they would otherwise produce the pattern "(.{1,3})",
 * which matches every token of one to three characters.
 *
 * @param pipes list the regex pipes are appended to
 * @param file lexicon file, one entry per line
 * @param name feature name shared by all pipes created from this file
 * @throws IOException if the file cannot be read
 */
public static void addPrefixPipes(List<Pipe> pipes, File file, String name)
        throws IOException {
    for (String line : linesFrom(file.getAbsolutePath())) {
        String prefix = line.trim();
        if (prefix.isEmpty()) {
            continue; // nothing to anchor the prefix on
        }
        pipes.add(new RegexMatches(name,
                compile("(" + Pattern.quote(prefix) + ".{1,3})", CASE_INSENSITIVE)));
    }
}
Example #13
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
public static void addAreaRegexPipes(List<String> usedPipeNames, List<Pipe> pipes) { usedPipeNames.add("Area regexes"); pipes.add(new LongRegexSpaced("Brodmann", Pattern .compile("areas? \\d+((, ?\\d)*,? (or|and) \\d+)?"), 2, 9)); // a looser version that allows just letters pipes.add(new LongRegexSpaced( "Areas", compile("areas? (\\p{Upper}|\\d)+((, ?(\\p{Upper}|\\d))*,? (or|and) (\\p{Upper}|\\d)+)?"), 2, 9)); }
Example #14
Source File: NGramPipeFactory.java From bluima with Apache License 2.0 | 5 votes |
public static List<Pipe> getAllGramsPipes(String name, File inputFile, boolean ignoreCase, int startGramSize) throws Exception { // go up to seven? List<Pipe> pipes = new LinkedList<Pipe>(); for (int i = startGramSize; i < 7; i++) { pipes.add(getNGramPipe(name, inputFile, ignoreCase, i)); } return pipes; }
Example #15
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
public static void addSpineRegexPipes(List<String> usedPipeNames, List<Pipe> pipes) throws Exception { usedPipeNames.add("SpineRegex"); // T1-T12 // L1-L5 // S1-S5 // C1-C8 pipes.add(new LongRegexMatches("SpinalParts", Pattern .compile("([LS][1-5])|T((1[0-2]?)|[2-9])|(C[1-8])"), 1, 2)); }
Example #16
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
/**
 * Appends the small word-list lexicon pipes and records the group name
 * "SmallLex".
 *
 * @param usedPipeNames receives the name of this pipe group
 * @param pipes list the lexicon pipes are appended to
 * @param ignoreCase passed through to every lexicon pipe
 * @throws FileNotFoundException declared for the lexicon pipe constructors
 */
public static void addSmallLexicons(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException {
    usedPipeNames.add("SmallLex");

    // { feature name, lexicon file name } pairs, added in this order
    String[][] lexicons = {
        { "chudlerListWord", "chudler.txt" },
        { "directionWord", "directions.txt" },
        { "extendedDirectionWord", "extendedDirections.txt" },
        { "stopWord", "stop.txt" },
    };
    for (String[] lexicon : lexicons) {
        File lexiconFile = new File(LEXICON_HOME + lexicon[1]);
        pipes.add(new LexiconMembership(lexicon[0], lexiconFile, ignoreCase));
    }
}
Example #17
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
/**
 * Appends the hand-written phrase regex pipes ("of the", "part of", and
 * phrases ending in neurons, nuclei, nucleus, field, cortex, area,
 * territory/territories) and records the group name "Handmade regexes".
 *
 * @param usedPipeNames receives the name of this pipe group
 * @param pipes list the regex pipes are appended to
 */
public static void addHandMadeRegexPipes(List<String> usedPipeNames,
        List<Pipe> pipes) throws Exception {
    usedPipeNames.add("Handmade regexes");

    // Parallel tables: feature name, regex, and the last numeric argument.
    // The first numeric argument is 2 for every pipe.
    String[] names = {
        "of_The", "part_Of", "neurnEnd", "nucleiEnd", "nclusEnd",
        "fieldEnd", "cortexEnd", "areaEnd", "territoryEnd",
    };
    String[] regexes = {
        "of the", "part of", "(.* neurons)", "(.* nuclei)", "(.* nucleus)",
        "(.* field)", "(.* cortex)", "(.* area)",
        "(.* territory)|(.* territories)",
    };
    int[] upperBounds = { 2, 2, 3, 3, 5, 4, 3, 4, 4 };

    for (int i = 0; i < names.length; i++) {
        pipes.add(new LongRegexSpaced(names[i], compile(regexes[i]), 2, upperBounds[i]));
    }
}
Example #18
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 5 votes |
public static void addLengthPipes(List<String> usedPipeNames, List<Pipe> pipes) throws Exception { usedPipeNames.add("Length"); // length feature - binary bins pipes.add(new LengthBins("Length", new int[] { 1, 2, 3, 5, 8, 11, 14, 18, 22 })); // from some calcs the average brain token is 6.92 while the outside is // 4.64 (~3.55 stdev) pipes.add(new LengthBins("LengthThreshold", new int[] { 6 })); }
Example #19
Source File: LDA.java From topic-detection with Apache License 2.0 | 5 votes |
/**
 * Creates a list of Mallet instances from a list of documents.
 *
 * Each document is tokenized, lower-cased, stripped of stopwords and turned
 * into a feature sequence.
 *
 * @param texts a list of documents
 * @return a list of Mallet instances
 * @throws IOException declared by the signature
 */
private InstanceList createInstanceList(List<String> texts) throws IOException
{
    // Pre-processing steps, applied in this order.
    ArrayList<Pipe> steps = new ArrayList<Pipe>();
    steps.add(new CharSequence2TokenSequence());
    steps.add(new TokenSequenceLowercase());
    steps.add(new TokenSequenceRemoveStopwords());
    steps.add(new TokenSequence2FeatureSequence());

    SerialPipes pipeline = new SerialPipes(steps);
    InstanceList result = new InstanceList(pipeline);
    result.addThruPipe(new ArrayIterator(texts));
    return result;
}
Example #20
Source File: NGramPipeFactory.java From bluima with Apache License 2.0 | 5 votes |
public static Pipe getNGramPipe(String name, File inputFile, boolean ignoreCase, int gram) throws Exception { File tempFile = File.createTempFile("ngram", ".txt"); // System.out.println( "Your temp file is " + // tempFile.getCanonicalPath() ); // Arrange for it to be deleted at exit. tempFile.deleteOnExit(); BufferedWriter bw = new BufferedWriter(new FileWriter(tempFile)); BufferedReader br = new BufferedReader(new FileReader(inputFile)); while (br.ready()) { String line = br.readLine().trim(); if (line.equals("")) continue; // ignore blank lines // check null String[] gramStrings = getGrams(line, gram); if (gramStrings != null) { for (String gramString : gramStrings) { bw.write(gramString); // System.out.println(gramString); bw.newLine(); } } } bw.close(); br.close(); return new TrieLexiconMembership(name + "-" + gram + "-gram", tempFile, ignoreCase); }
Example #21
Source File: ReferencesClassifierTrainer.java From bluima with Apache License 2.0 | 5 votes |
static List<Pipe> getPipes() { List<Pipe> pipes = newArrayList(); pipes.add(new Target2Label()); pipes.add(new MyInput2RegexTokens()); // pipes.add(new PrintInputAndTarget()); pipes.add(new TokenSequence2FeatureSequence()); pipes.add(new FeatureSequence2FeatureVector()); return pipes; }
Example #22
Source File: CorpusRepresentationMalletTarget.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
/**
 * Creates a new CorpusRepresentation from a FeatureInfo.
 *
 * A label alphabet for the target is only created for nominal targets; for
 * every other target type the inner pipe gets a null target alphabet.
 *
 * @param fi FeatureInfo instance
 * @param targetType type of target
 */
public CorpusRepresentationMalletTarget(FeatureInfo fi, TargetType targetType) {
    featureInfo = fi;
    scalingMethod = fi.getGlobalScalingMethod();
    this.targetType = targetType;

    LabelAlphabet targetAlphabet = null;
    if (targetType == TargetType.NOMINAL) {
        targetAlphabet = new LabelAlphabet();
    }

    // The LFPipe wraps a single no-op pipe that only carries the alphabets.
    List<Pipe> pipes = new ArrayList<>();
    pipes.add(new Noop(new LFAlphabet(), targetAlphabet));
    pipe = new LFPipe(pipes);
    pipe.setFeatureInfo(fi);

    instances = new LFInstanceList(pipe);
}
Example #23
Source File: CorpusRepresentationMalletLDA.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
public CorpusRepresentationMalletLDA(FeatureInfo fi) { featureInfo = fi; // always null // since we always pass a null feature info, the scaling method is hard-wired to be NONE scalingMethod = ScalingMethod.NONE; // TODO: we really do not need any of this, figure out if we can simplify, // but keeping this should not really do any harm! Pipe innerPipe = new Noop(new LFAlphabet(), null); List<Pipe> pipes = new ArrayList<>(); pipes.add(innerPipe); pipe = new LFPipe(pipes); pipe.setFeatureInfo(fi); instances = new LFInstanceList(pipe); targetType = TargetType.NONE; }
Example #24
Source File: CorpusRepresentationMalletSeq.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 5 votes |
/**
 * Creates the corpus representation for sequence learning: the scaling
 * method comes from the feature info, the target type is nominal, and the
 * LFPipe wraps a single no-op pipe carrying a data and a label alphabet.
 *
 * @param fi FeatureInfo instance
 */
public CorpusRepresentationMalletSeq(FeatureInfo fi) {
    featureInfo = fi;
    scalingMethod = fi.getGlobalScalingMethod();

    // The LFPipe wraps a single no-op pipe that only carries the alphabets.
    List<Pipe> pipes = new ArrayList<>();
    pipes.add(new Noop(new LFAlphabet(), new LabelAlphabet()));
    pipe = new LFPipe(pipes);
    pipe.setFeatureInfo(fi);

    instances = new LFInstanceList(pipe);
    targetType = TargetType.NOMINAL;
}
Example #25
Source File: NGramPipeFactory.java From bluima with Apache License 2.0 | 4 votes |
/**
 * Convenience overload of the four-argument {@code getAllGramsPipes} that
 * starts at gram size 1.
 *
 * @param name base name handed to each n-gram pipe
 * @param inputFile lexicon file the n-grams are built from
 * @param ignoreCase passed through to each n-gram pipe
 * @return the n-gram pipes, starting with the 1-gram pipe
 */
public static List<Pipe> getAllGramsPipes(String name, File inputFile,
        boolean ignoreCase) throws Exception {
    final int firstGramSize = 1;
    return getAllGramsPipes(name, inputFile, ignoreCase, firstGramSize);
}
Example #26
Source File: NEPipes.java From bluima with Apache License 2.0 | 4 votes |
/**
 * Creates the fixed set of general-purpose named-entity regex pipes and
 * passes it to the superclass constructor.
 *
 * RegexMatches pipes take a feature name and a pattern; LongRegexMatches
 * pipes additionally take two integer bounds (presumably the minimum and
 * maximum number of tokens a match may span - confirm against
 * LongRegexMatches).
 *
 * The "P5" feature compiles the P5 pattern; it previously compiled P10
 * again, so the feature did not test the pattern it is named after.
 */
public NEPipes() {
    super(new Pipe[] {
        //new TokenText( "text=" ),

        // --- character class and case shape ---
        new RegexMatches("SingleLetter", Pattern.compile("[A-Za-z]")),
        new RegexMatches("AllCaps", Pattern.compile(ALLCAPS)),
        new RegexMatches("AllLower", Pattern.compile(ALLLOWER)),
        new RegexMatches("InitCaps", Pattern.compile(INITCAPS)),
        new RegexMatches("MixedCase", Pattern.compile(MIXEDCASE)),
        new RegexMatches("MixedNum", Pattern.compile(MIXEDNUM)),

        // --- punctuation ---
        new RegexMatches("EndSentPunc", Pattern.compile(ENDSENTENCE)),
        new RegexMatches("Punc", Pattern.compile(PUNCTUATION)),
        new RegexMatches("Bracket", Pattern.compile(BRACKET)),
        new RegexMatches("Ordinal", Pattern.compile(ORDINAL, Pattern.CASE_INSENSITIVE)),
        new LongRegexMatches("Quoted", Pattern.compile(QUOTED), 1, 4),
        new LongRegexMatches("Bracketed", Pattern.compile(BRACKETED), 1, 4),
        new LongRegexMatches("Initial", Pattern.compile(INITIAL), 2, 2),
        new LongRegexMatches("Ellipse", Pattern.compile(DOTS), 1, 2),
        new LongRegexMatches("Dashes", Pattern.compile(DASHES), 2, 2),

        // --- numbers ---
        new LongRegexMatches("Fraction", Pattern.compile(FRACTION), 1, 3),
        new LongRegexMatches("DotDecimal", Pattern.compile(DOTDECIMAL), 1, 3),
        new LongRegexMatches("Percent",
                Pattern.compile("(" + RANGE + "|" + DECIMAL + ")%"), 2, 4),
        new RegexMatches("10^3n", Pattern.compile(ILLION, Pattern.CASE_INSENSITIVE)),
        new LongRegexMatches("Numeric", Pattern.compile(DECIMAL), 1, 3),
        new LongRegexMatches("BigNumber", Pattern.compile(COMMA_DECIMAL), 1, 7),
        new LongRegexMatches("kmbNumber",
                Pattern.compile(DECIMAL + ILLION, Pattern.CASE_INSENSITIVE), 1, 4),
        new RegexMatches("kmbMixed",
                Pattern.compile(MIXED_ILLION, Pattern.CASE_INSENSITIVE)),
        new LongRegexMatches("Dollars",
                Pattern.compile("[$](" + RANGE + "|" + DECIMAL + "|" + COMMA_DECIMAL
                        + "|" + DECIMAL + ILLION + "|" + MIXED_ILLION + ")",
                        Pattern.CASE_INSENSITIVE), 2, 8),
        new RegexMatches("NumberWord",
                Pattern.compile(NUMBER_WORD, Pattern.CASE_INSENSITIVE)),
        //FIXME useful beyond this?

        // --- money ---
        new RegexMatches("Currency", Pattern.compile(CURRENCY, Pattern.CASE_INSENSITIVE)),
        new LongRegexMatches("MoneyWords",
                Pattern.compile(MONEYWORDS, Pattern.CASE_INSENSITIVE), 2, 4),

        // --- times ---
        new LongRegexMatches("AmPm", Pattern.compile(AMPM, Pattern.CASE_INSENSITIVE), 1, 4),
        new RegexMatches("MixedAmPm",
                Pattern.compile(MIXED_AMPM, Pattern.CASE_INSENSITIVE)),
        new LongRegexMatches("TimeNum", Pattern.compile(TIMENUM), 3, 5),
        new RegexMatches("TimeZone", Pattern.compile(TIMEZONES, Pattern.CASE_INSENSITIVE)),
        new LongRegexMatches("Time", Pattern.compile(TIME, Pattern.CASE_INSENSITIVE), 1, 9),
        new LongRegexMatches("TimeRange",
                Pattern.compile(TIMERANGE, Pattern.CASE_INSENSITIVE), 3, 19),

        // --- phone numbers ---
        new LongRegexMatches("P10", Pattern.compile(P10), 3, 7),
        // fixed: this feature used to compile P10 instead of P5
        new LongRegexMatches("P5", Pattern.compile(P5), 3, 3),
        new LongRegexMatches("Phone", Pattern.compile(P10 + "|" + P5), 3, 7),

        // --- months and weekdays ---
        new RegexMatches("UncasedMonthName",
                Pattern.compile(MONTHNAME, Pattern.CASE_INSENSITIVE)),
        new LongRegexMatches("UncasedMonthAbbr",
                Pattern.compile(MONTHABBR, Pattern.CASE_INSENSITIVE), 1, 2),
        new LongRegexMatches("CasedMonth", Pattern.compile(MONTH), 1, 2),
        new LongRegexMatches("UncasedMonth",
                Pattern.compile(MONTH, Pattern.CASE_INSENSITIVE), 1, 2),
        new RegexMatches("UncasedWeekdayName",
                Pattern.compile(WEEKDAYNAME, Pattern.CASE_INSENSITIVE)),
        new LongRegexMatches("UncasedWeekdayAbbr",
                Pattern.compile(WEEKDAYABBR, Pattern.CASE_INSENSITIVE), 1, 2),
        new LongRegexMatches("CasedWeekday", Pattern.compile(WEEKDAY), 1, 2),
        new LongRegexMatches("UncasedWeekday",
                Pattern.compile(WEEKDAY, Pattern.CASE_INSENSITIVE), 1, 2),

        // --- dates ---
        new LongRegexMatches("MonthDay",
                Pattern.compile(MONTHDAY, Pattern.CASE_INSENSITIVE), 2, 3),
        new LongRegexMatches("DayMonthDay",
                Pattern.compile(DAYMONTHDAY, Pattern.CASE_INSENSITIVE), 3, 6),
        new LongRegexMatches("MonthYear",
                Pattern.compile(MONTHYEAR, Pattern.CASE_INSENSITIVE), 2, 4),
        new LongRegexMatches("MonthDayYear",
                Pattern.compile(MONTHDAYYEAR, Pattern.CASE_INSENSITIVE), 3, 5),
        new LongRegexMatches("DayMonthDayYear",
                Pattern.compile(DAYMONTHDAYYEAR, Pattern.CASE_INSENSITIVE), 4, 8),
        new LongRegexMatches("SeparatorDate", Pattern.compile(SEPDATE), 3, 5),
        new LongRegexMatches("FullSeparatorDate", Pattern.compile(FULLSEPDATE), 5, 5),
    });
}
Example #27
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 4 votes |
public static void addMalletNEPipes(List<String> usedPipeNames, List<Pipe> pipes) throws Exception { usedPipeNames.add("Mallet NE"); // random pipes from general NER pipes.addAll(new NEPipes().pipes()); }
Example #28
Source File: Attributes.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
/** * Generate the attributes object from the information in the pipe. * The pipe should be a LFPipe, but we also try to come up with something * if it is an ordinary pipe. * * @param pipe mallet pipe * @param instanceType instance type */ public Attributes(Pipe pipe, String instanceType) { // first create the attributes (independent vars) Alphabet dataAlphabet = pipe.getDataAlphabet(); // if we can, also represent the pipe as LFPipe LFPipe lfPipe; FeatureInfo featureInfo = null; if(pipe instanceof LFPipe) { lfPipe = (LFPipe)pipe; featureInfo = lfPipe.getFeatureInfo(); } // the alphabet we use if we have a boolean variable LFAlphabet booleanAlph = new LFAlphabet(); booleanAlph.lookupIndex("false"); booleanAlph.lookupIndex("true"); for(int i =0; i<dataAlphabet.size(); i++) { String malletFeatureName = (String) dataAlphabet.lookupObject(i); // create an attribute with default settings for datatype, code and // alphabet, if we got more information about it we will override later Attribute attr = new Attribute( malletFeatureName, i, Datatype.numeric, null, null, null); // add it attributes.add(attr); name2index.put(malletFeatureName, i); // If we have a LFPipe, also get some additional info about the type, values etc. // NOTE that the default type for features that indicate the presence of // strings, ngrams etc. (which we assume when nothing else is declared) // is numeric, so that instead of 0/1 we can have counts or tf/idf or // other scores. So only if there is an explicity declaration of a different // type, we will change the default values. 
if(featureInfo != null) { FeatureSpecAttribute fsAttr = FeatureExtractionMalletSparse.lookupAttributeForFeatureName( featureInfo.getAttributes(), malletFeatureName, instanceType); if(fsAttr instanceof FeatureSpecAttributeList) { FeatureSpecAttributeList fsAttrList = (FeatureSpecAttributeList)fsAttr; attr.codeAs = fsAttrList.codeas; attr.mvTreatment = fsAttrList.missingValueTreatment; attr.datatype = fsAttrList.datatype; if(fsAttrList.datatype == Datatype.bool) { attr.alphabet = booleanAlph; } else if(fsAttrList.datatype == Datatype.nominal) { if(fsAttrList.codeas == CodeAs.number) { attr.alphabet = fsAttrList.alphabet; } } } else if(fsAttr instanceof FeatureSpecSimpleAttribute) { FeatureSpecSimpleAttribute fsAttrSimple = (FeatureSpecSimpleAttribute)fsAttr; attr.codeAs = fsAttrSimple.codeas; attr.mvTreatment = fsAttrSimple.missingValueTreatment; attr.datatype = fsAttrSimple.datatype; if(fsAttrSimple.datatype == Datatype.bool) { attr.alphabet = booleanAlph; } else if(fsAttrSimple.datatype == Datatype.nominal) { if(fsAttrSimple.codeas == CodeAs.number) { attr.alphabet = fsAttrSimple.alphabet; } } } else if(fsAttr instanceof FeatureSpecNgram) { // nothing to do here } else if(fsAttr==null) { // This can also happen if we try to look up a START/STOP feature which // is created by us and for which not specification exists. 
In this case, // we simply do nothing and use the default attr we have created above if(malletFeatureName.endsWith(FeatureExtractionMalletSparse.START_SYMBOL) || malletFeatureName.endsWith(FeatureExtractionMalletSparse.STOP_SYMBOL)) { // do nothing } else { throw new RuntimeException("FeatureSpecification is null for feature "+ i+", name="+malletFeatureName+ "\nFeatureSpecification is "+featureInfo); } } else { throw new RuntimeException( "Impossible: found odd FeatureSpecAttribute type "+fsAttr.getClass()); } } } @SuppressWarnings("unchecked") LabelAlphabet targetAlphabet = (LabelAlphabet)pipe.getTargetAlphabet(); // if the target alphabet exists, we assume a nominal target // The target index is the next index after the last independent attribute // index. This is convenient for Weka. targetAttribute = new Attribute("target", attributes.size(), Datatype.numeric, null, null, null); if(targetAlphabet != null) { targetAttribute.alphabet = targetAlphabet; targetAttribute.datatype = Datatype.nominal; } }
Example #29
Source File: BrainRegionPipes.java From bluima with Apache License 2.0 | 4 votes |
/**
 * Appends the "areawords" lexicon pipe and records the group name
 * "Areawords".
 *
 * @param usedPipeNames receives the name of this pipe group
 * @param pipes list the lexicon pipe is appended to
 * @param ignoreCase passed through to the lexicon pipe
 * @throws FileNotFoundException declared for the lexicon pipe constructor
 */
public static void addAreaLexicons(List<String> usedPipeNames,
        List<Pipe> pipes, boolean ignoreCase) throws FileNotFoundException {
    usedPipeNames.add("Areawords");

    File areaWords = new File(LEXICON_HOME + "areawords.txt");
    pipes.add(new LexiconMembership("areawords", areaWords, ignoreCase));
}
Example #30
Source File: ITEngineMalletClass.java From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 | 4 votes |
@Test public void testEngineMalletClass1() throws MalformedURLException, ResourceInstantiationException { File configFile = new File("tests/cl-ionosphere/feats.xml"); FeatureSpecification spec = new FeatureSpecification(configFile); FeatureInfo featureInfo = spec.getFeatureInfo(); Engine engine = Engine.create(AlgorithmClassification.MalletC45_CL_MR, "", featureInfo, TargetType.NOMINAL, null); CorpusRepresentationMalletTarget crm = (CorpusRepresentationMalletTarget)engine.getCorpusRepresentation(); System.err.println("TESTS: have engine "+engine); // load a document and train the model Document doc = loadDocument(new File("tests/cl-ionosphere/ionosphere_gate.xml")); System.err.println("TESTS: have document"); AnnotationSet instanceAS = doc.getAnnotations().get("Mention"); AnnotationSet sequenceAS = null; AnnotationSet inputAS = doc.getAnnotations(); AnnotationSet classAS = null; String targetFeature = "class"; String nameFeature = null; crm.add(instanceAS, sequenceAS, inputAS, classAS, targetFeature, TargetType.NOMINAL, "", nameFeature, null); System.err.println("TESTS: added instances, number of instances now: "+crm.getRepresentationMallet().size()); engine.trainModel(null,"",""); System.err.println("TESTS: model trained"); System.err.println("TESTS: engine before saving: "+engine); engine.saveEngine(new File(".")); // Now check if we can restore the engine and thus the corpus representation Engine engine2 = Engine.load(new File(".").toURI().toURL(), ""); System.err.println("RESTORED engine is "+engine2); // check if the corpusRepresentation has been restored correctly CorpusRepresentation cr2 = engine2.getCorpusRepresentation(); assertNotNull(cr2); assertTrue(cr2 instanceof CorpusRepresentationMalletTarget); CorpusRepresentationMalletTarget crmc2 = (CorpusRepresentationMalletTarget)cr2; Pipe pipe = crmc2.getPipe(); assertNotNull(pipe); assertTrue(pipe instanceof LFPipe); LFPipe lfpipe = (LFPipe)pipe; FeatureInfo fi = lfpipe.getFeatureInfo(); assertNotNull(fi); 
AnnotationSet lfAS = doc.getAnnotations("LF"); String parms = ""; List<ModelApplication> gcs = engine2.applyModel(instanceAS, inputAS, sequenceAS, parms); System.err.println("Number of classifications: "+gcs.size()); ModelApplication.applyClassification(doc, gcs, "target", lfAS, null); System.err.println("Original instances: "+instanceAS.size()+", classification: "+lfAS.size()); // quick and dirty evaluation: go through all the original annotations, get the // co-extensive annotations from LF, and compare the values from the "class" feature int total = 0; int correct = 0; for(Annotation orig : instanceAS) { total++; Annotation lf = gate.Utils.getOnlyAnn(gate.Utils.getCoextensiveAnnotations(lfAS, orig)); //System.err.println("ORIG="+orig+", lf="+lf); if(orig.getFeatures().get("class").equals(lf.getFeatures().get("target"))) { correct++; } } double acc = (double)correct / (double)total; System.err.println("Got total="+total+", correct="+correct+", acc="+acc); assertEquals(0.9630, acc, 0.01); }