Java Code Examples for org.apache.uima.collection.CollectionReader#hasNext()

The following examples show how to use org.apache.uima.collection.CollectionReader#hasNext(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: DataMajorityNerRecommenderTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Reads all documents that a {@link Conll2002Reader} produces for the given files, each into
 * its own freshly created CAS.
 *
 * @param ds
 *            the dataset; only its language is used to configure the reader
 * @param files
 *            the files handed to the reader as its patterns parameter
 * @return the CASes in the order the reader delivered them
 * @throws UIMAException
 *             if creating the reader or a CAS fails
 * @throws IOException
 *             if reading a document fails
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    CollectionReader conllReader = createReader(Conll2002Reader.class,
        Conll2002Reader.PARAM_PATTERNS, files,
        Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(),
        Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(),
        Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true,
        Conll2002Reader.PARAM_HAS_HEADER, true,
        Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true);

    List<CAS> documents = new ArrayList<>();
    while (conllReader.hasNext()) {
        // One new CAS per document so that earlier results are not overwritten
        CAS document = JCasFactory.createJCas().getCas();
        conllReader.getNext(document);
        documents.add(document);
    }
    return documents;
}
 
Example 2
Source File: StringMatchingRecommenderTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Reads all documents that a {@link Conll2002Reader} produces for the given files, each into
 * its own freshly created CAS, and registers every CAS with the CAS storage session.
 * <p>
 * Each CAS is registered under a distinct name ({@code testDataCas1}, {@code testDataCas2},
 * ...). Previously the counter was never advanced, so every CAS was registered under
 * {@code testDataCas1}.
 *
 * @param ds
 *            the dataset; only its language is used to configure the reader
 * @param files
 *            the files handed to the reader as its patterns parameter
 * @return the CASes in the order the reader delivered them
 * @throws UIMAException
 *             if creating the reader or a CAS fails
 * @throws IOException
 *             if reading a document fails
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    CollectionReader reader = createReader(Conll2002Reader.class,
        Conll2002Reader.PARAM_PATTERNS, files,
        Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(),
        Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(),
        Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true,
        Conll2002Reader.PARAM_HAS_HEADER, true,
        Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true);

    List<CAS> casList = new ArrayList<>();
    int n = 1;
    while (reader.hasNext()) {
        JCas cas = JCasFactory.createJCas();
        reader.getNext(cas.getCas());
        casList.add(cas.getCas());
        casStorageSession.add("testDataCas" + n, EXCLUSIVE_WRITE_ACCESS, cas.getCas());
        // Advance the counter so the next CAS gets its own name in the session
        n++;
    }

    return casList;
}
 
Example 3
Source File: OpenNlpDoccatRecommenderTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Reads all documents the reader produces for the given files and runs a
 * {@link BreakIteratorSegmenter} (configured not to write sentences) over each of them.
 *
 * @param ds
 *            the dataset; only its language is used to configure the reader
 * @param files
 *            the files handed to the reader as its patterns parameter
 * @return the segmented CASes in the order the reader delivered them
 * @throws UIMAException
 *             if creating or running a component fails
 * @throws IOException
 *             if reading a document fails
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    CollectionReader documentReader = createReader(Reader.class,
            Reader.PARAM_PATTERNS, files,
            Reader.PARAM_LANGUAGE, ds.getLanguage());

    AnalysisEngine tokenizer = createEngine(BreakIteratorSegmenter.class,
            BreakIteratorSegmenter.PARAM_WRITE_SENTENCE, false);

    List<CAS> segmented = new ArrayList<>();
    while (documentReader.hasNext()) {
        JCas document = JCasFactory.createJCas();
        CAS documentCas = document.getCas();
        documentReader.getNext(documentCas);
        tokenizer.process(document);
        segmented.add(document.getCas());
    }
    return segmented;
}
 
Example 4
Source File: OpenNlpNerRecommenderTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Loads every document of the given CoNLL 2002 files into a separate CAS.
 *
 * @param ds
 *            the dataset; only its language is used to configure the reader
 * @param files
 *            the files handed to the reader as its patterns parameter
 * @return one CAS per document, in reading order
 * @throws UIMAException
 *             if creating the reader or a CAS fails
 * @throws IOException
 *             if reading a document fails
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    CollectionReader source = createReader(
        Conll2002Reader.class,
        Conll2002Reader.PARAM_PATTERNS, files,
        Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(),
        Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(),
        Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true,
        Conll2002Reader.PARAM_HAS_HEADER, true,
        Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true);

    List<CAS> loaded = new ArrayList<>();
    for (; source.hasNext(); ) {
        JCas target = JCasFactory.createJCas();
        source.getNext(target.getCas());
        loaded.add(target.getCas());
    }
    return loaded;
}
 
Example 5
Source File: ExternalRecommenderIntegrationTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Reads all documents that a {@link Conll2002Reader} produces for the given files. Each
 * document goes into its own CAS whose type system additionally contains the internal
 * CasMetadata type.
 * <p>
 * The merged type system does not depend on the document being read, so it is built once
 * before the loop instead of once per document.
 *
 * @param ds
 *            the dataset; only its language is used to configure the reader
 * @param files
 *            the files handed to the reader as its patterns parameter
 * @return the CASes in the order the reader delivered them
 * @throws UIMAException
 *             if creating the reader, the type system or a CAS fails
 * @throws IOException
 *             if reading a document fails
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    CollectionReader reader = createReader(Conll2002Reader.class,
        Conll2002Reader.PARAM_PATTERNS, files,
        Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(),
        Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(),
        Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true,
        Conll2002Reader.PARAM_HAS_HEADER, true,
        Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true);

    // Add the CasMetadata type to the type system used for every CAS
    List<TypeSystemDescription> typeSystems = new ArrayList<>();
    typeSystems.add(createTypeSystemDescription());
    typeSystems.add(CasMetadataUtils.getInternalTypeSystem());
    TypeSystemDescription mergedTypeSystem = mergeTypeSystems(typeSystems);

    List<CAS> casList = new ArrayList<>();
    while (reader.hasNext()) {
        JCas cas = JCasFactory.createJCas(mergedTypeSystem);
        reader.getNext(cas.getCas());
        casList.add(cas.getCas());
    }
    return casList;
}
 
Example 6
Source File: NamedEntityLinkerTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the given CoNLL 2002 files document by document, creating a new CAS for each one.
 *
 * @param ds
 *            the dataset; only its language is used to configure the reader
 * @param files
 *            the files handed to the reader as its patterns parameter
 * @return the populated CASes, in reading order
 * @throws UIMAException
 *             if creating the reader or a CAS fails
 * @throws IOException
 *             if reading a document fails
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    CollectionReader corpusReader = createReader(
        Conll2002Reader.class,
        Conll2002Reader.PARAM_PATTERNS, files,
        Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(),
        Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(),
        Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true,
        Conll2002Reader.PARAM_HAS_HEADER, true,
        Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true);

    List<CAS> result = new ArrayList<>();
    while (corpusReader.hasNext()) {
        CAS current = JCasFactory.createJCas().getCas();
        corpusReader.getNext(current);
        result.add(current);
    }
    return result;
}
 
Example 7
Source File: PubmedWholeDatabaseCRTest.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Reads one document and then 10000 further documents from a {@code PubmedWholeDatabaseCR},
 * printing the text of each of the 10000 to stderr, and finally checks the {@code Header}
 * annotations of the last CAS that was read.
 */
@Test
public void test() throws Exception {

    CollectionReader cr = createReader(PubmedWholeDatabaseCR.class);

    // First document; note that hasNext() is not consulted before this read
    CAS cas = createCas(cr.getProcessingResourceMetaData());
    cr.getNext(cas);

    for (int i = 0; i < 10000; i++) {
        // Return value is ignored; NOTE(review): presumably called only for a side effect
        // of the reader (e.g. advancing/prefetching) - confirm against PubmedWholeDatabaseCR
        cr.hasNext();
        // A fresh CAS per document; the previous one is dropped
        cas = createCas(cr.getProcessingResourceMetaData());
        cr.getNext(cas);
        System.err.println(cas.getDocumentText());
    }

    // FIXME hangs up cr.close();
    // (the reader is therefore left unclosed at the end of this test)

    // 'cas' is the last document read by the loop above.
    // NOTE(review): expecting DocId "1" on the 10001st document looks suspicious - verify
    // what assertResultsContains actually checks.
    Collection<Header> h = select(cas.getJCas(), Header.class);
    assertResultsContains(h, "DocId", "1");
}
 
Example 8
Source File: BioNLPGeniaEventsReaderTest.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Counts the documents that the {@code BioNLPGeniaEventsCollectionReader} delivers for
 * {@code TEST_DIR}, logging each one at debug level, and expects exactly three.
 */
@Test
public void testCount() throws Exception {

    CollectionReader reader = CollectionReaderFactory.createReader(
            BioNLPGeniaEventsCollectionReader.class,
            BlueUima.PARAM_INPUT_DIRECTORY, TEST_DIR);

    int documentCount = 0;
    // The counter is advanced only after a document has been read and logged
    for (; reader.hasNext(); documentCount++) {
        CAS document = CasCreationUtils.createCas(reader.getProcessingResourceMetaData());
        reader.getNext(document);
        LOG.debug(To.string("cas nr " + documentCount, document.getJCas()));
    }
    reader.close();
    assertEquals(3, documentCount);
}
 
Example 9
Source File: BioNLPGeniaEventsCollectionReaderTest.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Reads every document from a {@code BioNLPGeniaEventsCollectionReader} created without
 * explicit parameters and expects 259 documents in total.
 */
@Test
public void test() throws Exception {

    CollectionReader reader = CollectionReaderFactory
            .createReader(BioNLPGeniaEventsCollectionReader.class);

    int documentCount = 0;
    for (; reader.hasNext(); documentCount++) {
        CAS document = CasCreationUtils.createCas(reader.getProcessingResourceMetaData());
        reader.getNext(document);

        // Optional HTML dump, kept disabled:
        // if (createHtml)
        // viewer.createHtml(cas.getJCas(), cas.getTypeSystem(),
        // styleMapFile, new File("target/" + i));
    }
    reader.close();
    assertEquals(259, documentCount);
}
 
Example 10
Source File: OpenNlpPosRecommenderTest.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Reads all documents that a {@link Conll2006Reader} produces for the given files, each into
 * its own freshly created CAS.
 *
 * @param ds
 *            the dataset; only its language is used to configure the reader
 * @param files
 *            the files handed to the reader as its patterns parameter
 * @return the CASes in the order the reader delivered them
 * @throws UIMAException
 *             if creating the reader or a CAS fails
 * @throws IOException
 *             if reading a document fails
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    CollectionReader conllReader = createReader(Conll2006Reader.class,
        Conll2006Reader.PARAM_PATTERNS, files,
        Conll2006Reader.PARAM_LANGUAGE, ds.getLanguage());

    List<CAS> documents = new ArrayList<>();
    for (; conllReader.hasNext(); ) {
        CAS document = JCasFactory.createJCas().getCas();
        conllReader.getNext(document);
        documents.add(document);
    }
    return documents;
}
 
Example 11
Source File: LappsRecommenderIntegrationTest.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the first document that a {@link ConllUReader} produces for the given file.
 * <p>
 * Only the first document is read; it is no longer necessary to materialize every document
 * just to return the first one. If the reader yields nothing, a descriptive
 * {@link IOException} is thrown instead of an {@code IndexOutOfBoundsException} from an
 * empty list.
 *
 * @param aFile
 *            the file handed to the reader as its patterns parameter
 * @return the CAS holding the first document
 * @throws UIMAException
 *             if creating the reader or the CAS fails
 * @throws IOException
 *             if reading fails or the reader does not provide any document
 */
private static CAS loadData(File aFile) throws UIMAException, IOException
{
    CollectionReader reader = createReader(ConllUReader.class,
            ConllUReader.PARAM_PATTERNS, aFile);

    if (!reader.hasNext()) {
        throw new IOException("No document could be read from [" + aFile + "]");
    }

    JCas cas = JCasFactory.createJCas();
    reader.getNext(cas.getCas());
    return cas.getCas();
}
 
Example 12
Source File: LappsGridRecommenderConformityTest.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the first document that an {@link XmiReader} produces for the given file.
 * <p>
 * The patterns parameter is now referenced through {@code XmiReader}, the reader actually
 * being configured, rather than through the unrelated {@code ConllUReader}. Only the first
 * document is read, and an empty reader results in a descriptive {@link IOException} instead
 * of an {@code IndexOutOfBoundsException} from an empty list.
 *
 * @param aFile
 *            the file handed to the reader as its patterns parameter
 * @return the CAS holding the first document
 * @throws UIMAException
 *             if creating the reader or the CAS fails
 * @throws IOException
 *             if reading fails or the reader does not provide any document
 */
private static CAS loadData(File aFile) throws UIMAException, IOException
{
    CollectionReader reader = createReader(XmiReader.class,
            XmiReader.PARAM_PATTERNS, aFile);

    if (!reader.hasNext()) {
        throw new IOException("No document could be read from [" + aFile + "]");
    }

    JCas cas = JCasFactory.createJCas();
    reader.getNext(cas.getCas());
    return cas.getCas();
}
 
Example 13
Source File: SparkUimaUtils.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Drains the given reader, wrapping every document it delivers in an {@code SCAS}.
 *
 * @param reader
 *          the collection reader to read from until {@code hasNext()} returns false
 * @return one {@code SCAS} per document, in reading order
 * @throws IOException
 *           if reading a document fails
 * @throws UIMAException
 *           if creating a CAS or reading a document fails
 */
public static List<SCAS> readFrom(CollectionReader reader) throws IOException, UIMAException {
  List<SCAS> collected = new ArrayList<>();
  for (; reader.hasNext(); ) {
    // Each document gets its own CAS because the SCAS wrapper receives the CAS itself
    CAS target = JCasFactory.createJCas().getCas();
    reader.getNext(target);
    collected.add(new SCAS(target));
  }
  return collected;
}
 
Example 14
Source File: CollectionReaderFactoryExternalResourceTest.java    From uima-uimafit with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code TestReader} whose resource parameter is given as an external resource
 * description and then calls {@code hasNext()} on it. The test has no assertions of its own;
 * it passes when neither step throws.
 */
@Test
public void testAutoExternalResourceBinding() throws UIMAException, IOException {
  CollectionReader readerWithResource = createReader(TestReader.class,
          TestReader.PARAM_RESOURCE, createResourceDescription(
                  TestExternalResource.class,
                  TestExternalResource.PARAM_VALUE,
                  TestExternalResource.EXPECTED_VALUE));

  // Result intentionally unused
  readerWithResource.hasNext();
}
 
Example 15
Source File: GeniaCorpusCollectionReaderTest.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the first document from a {@code GeniaCorpusCollectionReader} and checks the first
 * 20 characters of its text.
 */
@Test
public void testFaster() throws Exception {
    CollectionReader reader = createReader(GeniaCorpusCollectionReader.class);
    // Return value is not checked; the document is read regardless
    reader.hasNext();

    CAS firstDocument = createCas(reader.getProcessingResourceMetaData());
    reader.getNext(firstDocument);

    String text = firstDocument.getJCas().getDocumentText();
    assertEquals("Activation of the CD", text.substring(0, 20));
    reader.close();
}
 
Example 16
Source File: SimplePipeline.java    From uima-uimafit with Apache License 2.0 4 votes vote down vote up
/**
 * <p>
 * Runs a pipeline consisting of an already instantiated collection reader and a sequence of
 * already instantiated analysis engines. Once all CASes provided by the reader have been
 * processed, {@link AnalysisEngine#collectionProcessComplete() collectionProcessComplete()} is
 * called on the engines. Neither {@link AnalysisEngine#destroy()} nor
 * {@link CollectionReader#destroy()} is called: the components were instantiated by the caller
 * and must therefore be managed (i.e. destroyed) by the caller.
 * </p>
 * <p>
 * A single CAS is created from the metadata of the reader and all engines and is reset and
 * reused for every document.
 * </p>
 * <p>
 * External resources can only be shared between the reader and/or the analysis engines if the
 * reader/engines have been previously instantiated using a shared resource manager.
 * </p>
 * 
 * @param aResMgr
 *          a resource manager. Normally the same one used by the collection reader and analysis
 *          engines.
 * @param reader
 *          a collection reader
 * @param engines
 *          a sequence of analysis engines
 * @throws IOException
 *           if there is an I/O problem in the reader
 * @throws ResourceInitializationException 
 *           if there is a problem initializing or running the pipeline.
 * @throws CollectionException 
 *           if there is a problem initializing or running the pipeline.
 * @throws AnalysisEngineProcessException 
 *           if there is a problem initializing or running the pipeline.
 */
public static void runPipeline(final ResourceManager aResMgr, final CollectionReader reader,
        final AnalysisEngine... engines) throws IOException, ResourceInitializationException,
        AnalysisEngineProcessException, CollectionException {
  // Gather the metadata of all components: reader first, then the engines in order
  final List<ResourceMetaData> componentMetaData = new ArrayList<ResourceMetaData>();
  componentMetaData.add(reader.getMetaData());
  for (int i = 0; i < engines.length; i++) {
    componentMetaData.add(engines[i].getMetaData());
  }

  // One shared CAS built from the combined metadata
  final CAS sharedCas = CasCreationUtils.createCas(componentMetaData, null, aResMgr);
  reader.typeSystemInit(sharedCas.getTypeSystem());

  while (reader.hasNext()) {
    reader.getNext(sharedCas);
    runPipeline(sharedCas, engines);
    // Clear the CAS so it can take the next document
    sharedCas.reset();
  }

  collectionProcessComplete(engines);
}
 
Example 17
Source File: Biocreative2GeneCollectionReaderTest.java    From bluima with Apache License 2.0 4 votes vote down vote up
/**
 * Reads documents from a {@code Biocreative2GeneCollectionReader} in "train" mode and checks
 * the {@code BioEntityMention} annotations of the first document and of the document reached
 * after eleven further reads. Sample data for the first document:
 *
 * <pre>
 * GENE.eval
 * P00001606T0076|14 33|alkaline phosphatases
 * P00001606T0076|37 50|5-nucleotidase
 * 
 * train.in:
 * P00001606T0076 Comparison with alkaline phosphatases and 5-nucleotidase
 * </pre>
 */
@Test
public void testTrainCorpus() throws Exception {

    CollectionReader cr = CollectionReaderFactory.createReader(
            Biocreative2GeneCollectionReader.class,
            BlueUima.PARAM_MODE, "train");

    // First document; hasNext() is not consulted before this read
    CAS cas = CasCreationUtils
            .createCas(cr.getProcessingResourceMetaData());
    cr.getNext(cas);

    // The first document must carry exactly the two gene mentions, in this order
    Collection<BioEntityMention> genes = JCasUtil.select(cas.getJCas(),
            BioEntityMention.class);
    assertEquals(2, genes.size());
    Iterator<BioEntityMention> iterator = genes.iterator();
    BioEntityMention gene = iterator.next();
    assertEquals("alkaline phosphatases", gene.getCoveredText());
    gene = iterator.next();
    assertEquals("5-nucleotidase", gene.getCoveredText());

    // move to 'P00027739T0000 Serum gamma glutamyltransferase in the
    // diagnosis of liver disease in cattle.' to test ALTGENE annotations
    for (int i = 0; i < 11; i++) {
        // A fresh CAS per document; after the loop 'cas' holds the last one read
        cas = CasCreationUtils
                .createCas(cr.getProcessingResourceMetaData());
        // Return value is ignored here
        cr.hasNext();
        cr.getNext(cas);
        Header header = JCasUtil.selectSingle(cas.getJCas(), Header.class);
        LOG.debug("docid:{}, text:{}", header.getDocId(),
                cas.getDocumentText());
    }

    genes = JCasUtil.select(cas.getJCas(), BioEntityMention.class);
    iterator = genes.iterator();

    // check the 2 alternate forms
    assertEquals(2, genes.size());
    gene = iterator.next();
    LOG.debug(gene.getCoveredText());
    assertEquals("Serum gamma glutamyltransferase", gene.getCoveredText());
    gene = iterator.next();
    LOG.debug(gene.getCoveredText());
    assertEquals("gamma glutamyltransferase", gene.getCoveredText());
}
 
Example 18
Source File: ImportExportServiceImpl.java    From webanno with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the given file with the reader of the given format into a new CAS that uses the
 * project type system, adding sentence and token annotations if the reader did not create
 * them.
 * <p>
 * The directory to read from is derived from the absolute form of {@code aFile}, so a file
 * given by a bare relative name (which has no parent) no longer causes a
 * {@code NullPointerException}. The "not found" message reports that directory.
 *
 * @param aFile
 *            the file to import
 * @param aProject
 *            the project; used to look up the type system if none is passed in
 * @param aFormatId
 *            the ID of the format whose reader should be used
 * @param aFullProjectTypeSystem
 *            the type system to use, or {@code null} to obtain it from the annotation service
 * @return the CAS containing the imported document
 * @throws UIMAException
 *             if creating the CAS or the reader fails
 * @throws IOException
 *             if no reader is available for the format, the reader does not find the file,
 *             reading fails, or the document has no tokens or sentences after segmentation
 */
@Override
public CAS importCasFromFile(File aFile, Project aProject, String aFormatId,
        TypeSystemDescription aFullProjectTypeSystem)
    throws UIMAException, IOException
{
    TypeSystemDescription tsd = aFullProjectTypeSystem;

    if (tsd == null) {
        tsd = annotationService.getFullProjectTypeSystem(aProject);
    }

    // Prepare a CAS with the project type system
    CAS cas = CasFactory.createCas(tsd);

    // Convert the source document to CAS
    FormatSupport format = getReadableFormatById(aFormatId).orElseThrow(() ->
            new IOException("No reader available for format [" + aFormatId + "]"));

    // Resolve against the absolute file: getParentFile() is null for a bare relative name
    String sourceLocation = aFile.getAbsoluteFile().getParentFile().getAbsolutePath();

    CollectionReaderDescription readerDescription = format.getReaderDescription(tsd);
    addConfigurationParameters(readerDescription,
            ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, sourceLocation,
            ResourceCollectionReaderBase.PARAM_PATTERNS, "[+]" + aFile.getName());
    CollectionReader reader = createReader(readerDescription);

    if (!reader.hasNext()) {
        throw new FileNotFoundException(
                "Source file [" + aFile.getName() + "] not found in [" + sourceLocation + "]");
    }
    reader.getNext(cas);

    // Create sentence / token annotations if they are missing
    boolean hasTokens = exists(cas, getType(cas, Token.class));
    boolean hasSentences = exists(cas, getType(cas, Sentence.class));

    if (!hasSentences) {
        splitSentences(cas);
    }

    if (!hasTokens) {
        tokenize(cas);
    }

    // Re-check after segmentation: a document that still has neither is treated as empty
    if (!exists(cas, getType(cas, Token.class)) || !exists(cas, getType(cas, Sentence.class))) {
        throw new IOException("The document appears to be empty. Unable to detect any "
                + "tokens or sentences. Empty documents cannot be imported.");
    }

    return cas;
}
 
Example 19
Source File: SimplePipeline.java    From uima-uimafit with Apache License 2.0 3 votes vote down vote up
/**
 * <p>
 * Runs the given CollectionReader together with analysis engines created from the given
 * descriptions. After all CASes provided by the reader have been processed,
 * {@link AnalysisEngine#collectionProcessComplete() collectionProcessComplete()} is called on
 * the engines, and the engines are destroyed in any case (also on failure). The life-cycle
 * methods are <b>NOT</b> called on the reader: it was instantiated by the caller and must
 * therefore be managed (i.e. destroyed) by the caller.
 * </p>
 * <p>
 * Note that with this method, external resources cannot be shared between the reader and the
 * analysis engines. They can be shared amongst the analysis engines.
 * </p>
 * <p>
 * The CAS is created using the resource manager used by the collection reader and is reset and
 * reused for every document.
 * </p>
 * 
 * @param reader
 *          The CollectionReader that loads the documents into the CAS.
 * @param descs
 *          Primitive AnalysisEngineDescriptions that process the CAS, in order. If you have a mix
 *          of primitive and aggregate engines, then please create the AnalysisEngines yourself
 *          and call the other runPipeline method.
 * @throws IOException
 *           if there is an I/O problem in the reader
 * @throws ResourceInitializationException 
 *           if there is a problem initializing or running the pipeline.
 * @throws CollectionException 
 *           if there is a problem initializing or running the pipeline.
 * @throws AnalysisEngineProcessException 
 *           if there is a problem initializing or running the pipeline.
 */
public static void runPipeline(final CollectionReader reader,
        final AnalysisEngineDescription... descs) throws IOException,
        ResourceInitializationException, AnalysisEngineProcessException, CollectionException {
  AnalysisEngine aggregate = null;
  try {
    // Combine the descriptions into one aggregate and instantiate it
    aggregate = createEngine(createEngineDescription(descs));

    // The CAS must know the types of both the reader and the aggregate
    final CAS sharedCas = CasCreationUtils.createCas(
            asList(reader.getMetaData(), aggregate.getMetaData()), null,
            reader.getResourceManager());
    reader.typeSystemInit(sharedCas.getTypeSystem());

    // Read, process, clear - one document at a time
    while (reader.hasNext()) {
      reader.getNext(sharedCas);
      aggregate.process(sharedCas);
      sharedCas.reset();
    }

    // Tell the engines that the collection is complete
    aggregate.collectionProcessComplete();
  } finally {
    // The engine was created here, so it is destroyed here; the reader is left alone
    LifeCycleUtil.destroy(aggregate);
  }
}
 
Example 20
Source File: SimplePipeline.java    From uima-uimafit with Apache License 2.0 3 votes vote down vote up
/**
 * <p>
 * Instantiates the CollectionReader and the AnalysisEngines from the given descriptions and
 * runs them as a pipeline. After all CASes provided by the reader have been processed,
 * {@link AnalysisEngine#collectionProcessComplete() collectionProcessComplete()} is called on
 * the engines. The reader, the engines and the resource manager are destroyed in any case
 * (also on failure).
 * </p>
 * <p>
 * External resources can be shared between the reader and the analysis engines, because all
 * components are produced with the same resource manager.
 * </p>
 * <p>
 * This method is suitable for the batch-processing of sets of documents where the overhead
 * of instantiating the pipeline components does not significantly impact the overall runtime
 * of the pipeline. If you need to avoid this overhead, e.g. because you wish to run a pipeline
 * on individual documents, then you should not use this method. Instead, create a CAS using
 * {@link JCasFactory}, create a reader instance using {@link CollectionReaderFactory#createReader},
 * create an engine instance using {@link AnalysisEngineFactory#createEngine} and then use
 * a loop to process the data, resetting the CAS after each step.
 * </p>
 * <pre><code>
 *   while (reader.hasNext()) {
 *     reader.getNext(cas);
 *     engine.process(cas);
 *     cas.reset();
 *   }
 * </code></pre>
 * 
 * @param readerDesc
 *          The CollectionReader that loads the documents into the CAS.
 * @param descs
 *          Primitive AnalysisEngineDescriptions that process the CAS, in order. If you have a mix
 *          of primitive and aggregate engines, then please create the AnalysisEngines yourself
 *          and call the other runPipeline method.
 * @throws IOException
 *           if there is an I/O problem in the reader
 * @throws ResourceInitializationException 
 *           if there is a problem initializing or running the pipeline.
 * @throws CollectionException 
 *           if there is a problem initializing or running the pipeline.
 * @throws AnalysisEngineProcessException 
 *           if there is a problem initializing or running the pipeline.
 */
public static void runPipeline(final CollectionReaderDescription readerDesc,
        final AnalysisEngineDescription... descs) throws IOException,
        ResourceInitializationException, AnalysisEngineProcessException, CollectionException {
  CollectionReader reader = null;
  AnalysisEngine aggregate = null;
  ResourceManager sharedResMgr = null;
  try {
    sharedResMgr = ResourceManagerFactory.newResourceManager();

    // Reader and aggregate engine are both produced with the shared resource manager
    reader = UIMAFramework.produceCollectionReader(readerDesc, sharedResMgr, null);
    aggregate = UIMAFramework.produceAnalysisEngine(createEngineDescription(descs),
            sharedResMgr, null);

    // The CAS must know the types of both the reader and the aggregate
    final CAS sharedCas = CasCreationUtils.createCas(
            asList(reader.getMetaData(), aggregate.getMetaData()), null, sharedResMgr);
    reader.typeSystemInit(sharedCas.getTypeSystem());

    // Read, process, clear - one document at a time
    while (reader.hasNext()) {
      reader.getNext(sharedCas);
      aggregate.process(sharedCas);
      sharedCas.reset();
    }

    // Tell the engines that the collection is complete
    aggregate.collectionProcessComplete();
  } finally {
    // Everything created in this method is destroyed in this method
    LifeCycleUtil.destroy(reader);
    LifeCycleUtil.destroy(aggregate);
    LifeCycleUtil.destroy(sharedResMgr);
  }
}