opennlp.tools.util.ObjectStream Java Examples

Source File: AbstractTaggerTrainer.java From ixa-pipe-pos with Apache License 2.0

6 votes

/**
 * Construct an AbstractTaggerTrainer from the training parameters.
 * <p>
 * The language, the "TrainSet" and "TestSet" data set locations, and the
 * automatic-dictionary and ngram-dictionary cutoffs are read from the
 * parameters via {@code Flags}. The training data is opened twice: once for
 * the training samples and once more for the dictionary samples, so each
 * consumer gets its own stream.
 *
 * @param params
 *          the training parameters
 * @throws IOException
 *           if an io error is raised while setting up the sample streams
 */
public AbstractTaggerTrainer(final TrainingParameters params) throws IOException {
  this.lang = Flags.getLanguage(params);
  final String trainPath = Flags.getDataSet("TrainSet", params);
  final String testPath = Flags.getDataSet("TestSet", params);
  this.trainSamples = new MorphoSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(trainPath));
  this.testSamples = new MorphoSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(testPath));
  // A separate stream over the training data, dedicated to dictionary building.
  setDictSamples(new MorphoSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(trainPath)));
  this.dictCutOff = Flags.getAutoDictFeatures(params);
  this.ngramCutOff = Flags.getNgramDictFeatures(params);
}

Source File: AbstractTaggerTrainer.java From ixa-pipe-pos with Apache License 2.0

6 votes

/**
 * Automatically create a tag dictionary from training data.
 * <p>
 * Nothing is done when the cutoff equals {@code Flags.DEFAULT_DICT_CUTOFF}.
 * Otherwise the tag dictionary of the POS tagger factory is fetched (an empty
 * one is created and registered on the factory if none exists yet), populated
 * from the samples, and the sample stream is reset so it can be read again.
 *
 * @param aDictSamples
 *          the dictSamples created from training data
 * @param aDictCutOff
 *          the cutoff to create the dictionary
 * @throws IllegalArgumentException
 *           if the factory's tag dictionary is not a MutableTagDictionary
 * @throws TerminateToolException
 *           if an io error occurs while populating the dictionary or
 *           resetting the stream
 */
protected final void createAutomaticDictionary(
    final ObjectStream<POSSample> aDictSamples, final int aDictCutOff) {
  if (aDictCutOff == Flags.DEFAULT_DICT_CUTOFF) {
    return;
  }
  try {
    TagDictionary dict = getPosTaggerFactory().getTagDictionary();
    if (dict == null) {
      dict = getPosTaggerFactory().createEmptyTagDictionary();
      getPosTaggerFactory().setTagDictionary(dict);
    }
    if (!(dict instanceof MutableTagDictionary)) {
      throw new IllegalArgumentException("Can't extend a POSDictionary"
          + " that does not implement MutableTagDictionary.");
    }
    POSTaggerME.populatePOSDictionary(aDictSamples,
        (MutableTagDictionary) dict, aDictCutOff);
    // Rewind the stream that was just consumed (the parameter), not an
    // unrelated field, so the same samples can be read again afterwards.
    aDictSamples.reset();
  } catch (final IOException e) {
    throw new TerminateToolException(-1,
        "IO error while creating/extending POS Dictionary: "
            + e.getMessage(), e);
  }
}

Source File: AbstractTaggerTrainer.java From ixa-pipe-pos with Apache License 2.0

6 votes

/**
 * Create ngram dictionary from training data.
 * <p>
 * When the cutoff equals {@code Flags.DEFAULT_DICT_CUTOFF} no dictionary is
 * built and {@code null} is returned. Otherwise the dictionary is built from
 * the samples and the sample stream is reset so it can be read again.
 * Progress is reported on {@code System.err}.
 *
 * @param aDictSamples
 *          the training data
 * @param aNgramCutoff
 *          the cutoff
 * @return ngram dictionary, or {@code null} if none was built
 * @throws TerminateToolException
 *           if an io error occurs while building the dictionary or resetting
 *           the stream
 */
protected final Dictionary createNgramDictionary(
    final ObjectStream<POSSample> aDictSamples, final int aNgramCutoff) {
  if (aNgramCutoff == Flags.DEFAULT_DICT_CUTOFF) {
    return null;
  }
  System.err.print("Building ngram dictionary ... ");
  final Dictionary ngramDict;
  try {
    ngramDict = POSTaggerME
        .buildNGramDictionary(aDictSamples, aNgramCutoff);
    // Rewind the stream that was just consumed (the parameter), not an
    // unrelated field, so the same samples can be read again afterwards.
    aDictSamples.reset();
  } catch (final IOException e) {
    throw new TerminateToolException(-1,
        "IO error while building NGram Dictionary: " + e.getMessage(), e);
  }
  System.err.println("done");
  return ngramDict;
}

Source File: LanguageDetectorAndTrainingDataUnitTest.java From tutorials with MIT License

6 votes

/**
 * Trains a language detector model from the sample file with a Naive Bayes
 * configuration and checks the predicted languages and confidences for a
 * Portuguese sentence.
 */
@Test
public void givenLanguageDictionary_whenLanguageDetect_thenLanguageIsDetected() throws FileNotFoundException, IOException {
    InputStreamFactory dataIn = new MarkableFileInputStreamFactory(new File("src/main/resources/models/DoccatSample.txt"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
    params.put(TrainingParameters.CUTOFF_PARAM, 5);
    params.put("DataIndexer", "TwoPass");
    params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");

    LanguageDetectorModel model;
    // Typed stream instead of a raw ObjectStream; try-with-resources makes sure
    // the training file is released even if training throws.
    try (ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
            LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream)) {
        model = LanguageDetectorME.train(sampleStream, params, new LanguageDetectorFactory());
    }

    LanguageDetector ld = new LanguageDetectorME(model);
    Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");

    assertThat(Arrays.asList(languages)).extracting("lang", "confidence").contains(tuple("pob", 0.9999999950605625),
             tuple("ita", 4.939427661577956E-9), tuple("spa", 9.665954064665144E-15),
            tuple("fra", 8.250349924885834E-25));
}

Source File: Chapter4.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

/**
 * Trains a person name finder model from "en-ner-person.train" and
 * serializes it to "modelFile". IO errors are printed to standard error.
 */
private static void trainingOpenNLPNERModel() {
    // Both the model output and the training input are managed by the
    // try-with-resources so neither file handle leaks when training fails.
    try (OutputStream modelOutputStream = new BufferedOutputStream(
            new FileOutputStream(new File("modelFile")));
            FileInputStream trainingInput = new FileInputStream("en-ner-person.train")) {
        ObjectStream<String> lineStream = new PlainTextByLineStream(
                trainingInput, "UTF-8");
        ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);

        TokenNameFinderModel model = NameFinderME.train("en", "person", sampleStream,
                null, 100, 5);

        model.serialize(modelOutputStream);
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}

Source File: POSCrossValidator.java From ixa-pipe-pos with Apache License 2.0

5 votes

/**
 * Construct a CrossValidator from the training parameters.
 * <p>
 * The language, the "TrainSet" data set location, the automatic-dictionary
 * cutoff and the number of folds are read from the parameters via
 * {@code Flags}; afterwards the POS factory and the evaluation listeners are
 * set up from the same parameters.
 *
 * @param params
 *          the training parameters
 * @throws IOException
 *           if an io error is raised while setting up the cross validator
 */
public POSCrossValidator(final TrainingParameters params) throws IOException {
  this.lang = Flags.getLanguage(params);
  final String trainingPath = Flags.getDataSet("TrainSet", params);
  this.trainSamples = new WordTagSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(trainingPath));
  this.dictCutOff = Flags.getAutoDictFeatures(params);
  this.folds = Flags.getFolds(params);
  createPOSFactory(params);
  getEvalListeners(params);
}

Source File: IntentDocumentSampleStream.java From org.openhab.ui.habot with Eclipse Public License 1.0

4 votes

/**
 * Creates a sample stream that keeps the given category together with the
 * underlying string stream.
 *
 * @param category the category stored on this stream
 * @param stream the underlying stream of strings
 */
public IntentDocumentSampleStream(String category, ObjectStream<String> stream) {
    this.stream = stream;
    this.category = category;
}

Source File: IntentDocumentSampleStream.java From nlp-intent-toolkit with The Unlicense

4 votes

/**
 * Creates a sample stream that keeps the given category together with the
 * underlying string stream.
 *
 * @param category the category stored on this stream
 * @param stream the underlying stream of strings
 */
public IntentDocumentSampleStream(String category, ObjectStream<String> stream) {
    this.stream = stream;
    this.category = category;
}

Source File: LemmaSampleSequenceStream.java From ixa-pipe-pos with Apache License 2.0

4 votes

/**
 * Creates a sequence stream that keeps the given lemma samples together with
 * the context generator.
 *
 * @param samples the stream of lemma samples
 * @param contextGenerator the context generator stored on this stream
 */
public LemmaSampleSequenceStream(ObjectStream<LemmaSample> samples,
    LemmatizerContextGenerator contextGenerator) {
  this.contextGenerator = contextGenerator;
  this.samples = samples;
}

Source File: LemmaSampleStream.java From ixa-pipe-pos with Apache License 2.0

4 votes

/**
 * Creates a lemma sample stream by handing the given string stream to the
 * superclass constructor.
 *
 * @param samples the underlying stream of strings
 */
public LemmaSampleStream(ObjectStream<String> samples) {
  super(samples);
}

Source File: MorphoSampleStream.java From ixa-pipe-pos with Apache License 2.0

4 votes

/**
 * Creates a morpho sample stream by handing the given string stream to the
 * superclass constructor.
 *
 * @param samples the underlying stream of strings
 */
public MorphoSampleStream(ObjectStream<String> samples) {
  super(samples);
}

Source File: AbstractLemmatizerTrainer.java From ixa-pipe-pos with Apache License 2.0

3 votes

/**
 * Construct an AbstractLemmatizerTrainer from the training parameters.
 * <p>
 * The language and the "TrainSet" and "TestSet" data set locations are read
 * from the parameters via {@code Flags}; each data set is then wrapped in a
 * {@code LemmaSampleStream}.
 *
 * @param params
 *          the training parameters
 * @throws IOException
 *           if an io error is raised while setting up the sample streams
 */
public AbstractLemmatizerTrainer(final TrainingParameters params) throws IOException {
  this.lang = Flags.getLanguage(params);
  final String trainPath = Flags.getDataSet("TrainSet", params);
  final String testPath = Flags.getDataSet("TestSet", params);
  this.trainSamples = new LemmaSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(trainPath));
  this.testSamples = new LemmaSampleStream(
      InputOutputUtils.readFileIntoMarkableStreamFactory(testPath));
}

Source File: LemmaSampleEventStream.java From ixa-pipe-pos with Apache License 2.0

2 votes

/**
 * Creates a new event stream based on the specified data stream using the
 * specified context generator.
 *
 * @param d the data stream for this event stream; passed to the superclass
 *          constructor
 * @param cg the context generator stored on this event stream, which should
 *           be used in the creation of events
 */
public LemmaSampleEventStream(ObjectStream<LemmaSample> d, LemmatizerContextGenerator cg) {
  super(d);
  this.contextGenerator = cg;
}

opennlp.tools.util.ObjectStream Java Examples