org.apache.uima.collection.CollectionReader Java Examples

The following examples show how to use org.apache.uima.collection.CollectionReader. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CurationTestUtils.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Reads one XMI document from the test resources folder into a newly created CAS.
 *
 * @param aPath
 *            path of the XMI file, appended to {@code src/test/resources/}
 * @param aType
 *            extra type system that is merged with the one returned by
 *            {@code TypeSystemDescriptionFactory.createTypeSystemDescription()}; when
 *            {@code null}, the CAS is created via the no-argument factory method
 * @return the CAS after a single {@code getNext} call on the reader
 */
public static CAS readXMI(String aPath, TypeSystemDescription aType)
    throws UIMAException, IOException
{
    String location = "src/test/resources/" + aPath;
    CollectionReader xmiReader = createReader(XmiReader.class,
            XmiReader.PARAM_SOURCE_LOCATION, location);

    CAS target;
    if (aType == null) {
        target = JCasFactory.createJCas().getCas();
    }
    else {
        List<TypeSystemDescription> typeSystems = new ArrayList<>();
        typeSystems.add(TypeSystemDescriptionFactory.createTypeSystemDescription());
        typeSystems.add(aType);
        target = JCasFactory.createJCas(CasCreationUtils.mergeTypeSystems(typeSystems))
                .getCas();
    }

    xmiReader.getNext(target);
    return target;
}
 
Example #2
Source File: BioNLPGeniaEventsCollectionReaderTest.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Counts the documents delivered by a {@code BioNLPGeniaEventsCollectionReader} created
 * without explicit parameters and expects exactly 259 of them.
 */
@Test
public void test() throws Exception {

    CollectionReader cr = CollectionReaderFactory.createReader(
            BioNLPGeniaEventsCollectionReader.class);

    int i = 0;
    try {
        while (cr.hasNext()) {
            // a fresh CAS per document, built from the reader's own metadata
            CAS cas = CasCreationUtils.createCas(cr.getProcessingResourceMetaData());
            cr.getNext(cas);
            i++;
        }
    } finally {
        // close the reader even when reading a document fails
        cr.close();
    }
    assertEquals(259, i);
}
 
Example #3
Source File: LineOrientedTextReaderTest.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Reads {@code LICENSE.txt} through the {@code LineOrientedTextReader} and checks that the
 * document then contains 169 {@code Sentence} annotations and no {@code Token} annotations.
 */
@Test
public void test()
    throws Exception
{
    JCas document = JCasFactory.createJCas();

    CollectionReader lineReader = createReader(LineOrientedTextReader.class,
            LineOrientedTextReader.PARAM_SOURCE_LOCATION, "LICENSE.txt");
    lineReader.getNext(document.getCas());

    int sentenceCount = select(document, Sentence.class).size();
    int tokenCount = select(document, Token.class).size();

    assertEquals(169, sentenceCount);
    assertEquals(0, tokenCount);
}
 
Example #4
Source File: BaleenPipeline.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a pipeline from a raw YAML string by wrapping it in a {@link
 * YamlPipelineConfiguration} and delegating to the configuration-based constructor.
 *
 * @param name the pipeline name
 * @param originalYaml the YAML text the pipeline was built from
 * @param orderer the {@link IPipelineOrderer} passed on to the delegate constructor
 * @param collectionReader the collection reader of the pipeline
 * @param annotators the annotators passed on to the delegate constructor
 * @param consumers the consumers passed on to the delegate constructor
 * @throws IOException if the configuration cannot be read
 * @deprecated Use {@link BaleenPipeline#BaleenPipeline(String, PipelineConfiguration,
 *     IPipelineOrderer, CollectionReader, List, List)}
 */
@Deprecated
public BaleenPipeline(
    String name,
    String originalYaml,
    IPipelineOrderer orderer,
    CollectionReader collectionReader,
    List<AnalysisEngine> annotators,
    List<AnalysisEngine> consumers)
    throws IOException {
  this(name, new YamlPipelineConfiguration(originalYaml), orderer,
      collectionReader, annotators, consumers);
}
 
Example #5
Source File: BioNLPGeniaEventsReaderTest.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Counts the documents the {@code BioNLPGeniaEventsCollectionReader} delivers for
 * {@code TEST_DIR} and expects exactly three. Each CAS is also written to the debug log.
 */
@Test
public void testCount() throws Exception {

    CollectionReader cr = CollectionReaderFactory.createReader(
            BioNLPGeniaEventsCollectionReader.class,
            BlueUima.PARAM_INPUT_DIRECTORY, TEST_DIR);

    int i = 0;
    try {
        while (cr.hasNext()) {
            // a fresh CAS per document, built from the reader's own metadata
            CAS cas = CasCreationUtils.createCas(cr
                    .getProcessingResourceMetaData());
            cr.getNext(cas);
            LOG.debug(To.string("cas nr " + i, cas.getJCas()));
            i++;
        }
    } finally {
        // close the reader even when reading or logging a document fails
        cr.close();
    }
    assertEquals(3, i);
}
 
Example #6
Source File: FormatSupportDescription.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the reader description for the reader class configured in {@code readerClass}.
 *
 * @param aTSD
 *            type system description handed to {@code createReaderDescription}
 * @return the description created for the configured reader class
 * @throws UnsupportedOperationException
 *             if {@code isReadable()} returns {@code false}
 * @throws ResourceInitializationException
 *             if the configured class cannot be found or is not a {@code CollectionReader}
 */
@Override
public CollectionReaderDescription getReaderDescription(TypeSystemDescription aTSD)
    throws ResourceInitializationException
{
    if (!isReadable()) {
        throw new UnsupportedOperationException("The format [" + getName() + "] cannot be read");
    }
    
    Class<? extends CollectionReader> readerClazz;
    try {
        // asSubclass checks the type when the class is loaded, so no unchecked cast is
        // needed and a misconfigured class name fails here instead of further downstream
        readerClazz = Class.forName(readerClass).asSubclass(CollectionReader.class);
    }
    catch (ClassNotFoundException | ClassCastException e) {
        throw new ResourceInitializationException(e);
    }
    
    return createReaderDescription(readerClazz, aTSD);
}
 
Example #7
Source File: PubmedDatabaseCRTest.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Reads documents from the {@code PubmedDatabaseCR} with {@code PARAM_BETWEEN} set to
 * {@code {0, 1}} and compares the fore and last names of each header's authors, position by
 * position, with the expected values. Also checks the header's copyright value.
 */
@Test
public void testAuthors() throws Exception {

    // http://www.ncbi.nlm.nih.gov/pubmed/?term=1&report=xml&format=text
    CollectionReader cr = createReader(PubmedDatabaseCR.class,
            BlueUima.PARAM_BETWEEN, new int[] { 0, 1 },
            BlueUima.PARAM_SKIP_EMPTY_DOCS, false);

    // AB___A B___Makar__-__KE___K
    // E___McMartin__-__M___M___Palese__-__TR___T R___Tephly
    final String[] expectedLastNames = { "Makar", "McMartin", "Palese", "Tephly" };
    final String[] expectedForeNames = { "A B", "K E", "M", "T R" };

    for (JCas document : asList(cr)) {
        Header header = JCasUtil.selectSingle(document, Header.class);
        FSArray authors = header.getAuthors();

        int idx = 0;
        while (idx < authors.size()) {
            AuthorInfo author = (AuthorInfo) authors.get(idx);
            assertEquals(expectedForeNames[idx], author.getForeName());
            assertEquals(expectedLastNames[idx], author.getLastName());
            idx++;
        }

        assertEquals("1976-01-16", header.getCopyright());
    }
}
 
Example #8
Source File: NamedEntityLinkerTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Reads all documents the {@code Conll2002Reader} produces for the given files, each into
 * its own newly created CAS.
 *
 * @param ds
 *            dataset whose language is passed to the reader
 * @param files
 *            files passed to the reader as patterns
 * @return the CASes in the order they were read
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    String tabSeparator = Conll2002Reader.ColumnSeparators.TAB.getName();
    CollectionReader conllReader = createReader(
        Conll2002Reader.class,
        Conll2002Reader.PARAM_PATTERNS, files, 
        Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(), 
        Conll2002Reader.PARAM_COLUMN_SEPARATOR, tabSeparator,
        Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, 
        Conll2002Reader.PARAM_HAS_HEADER, true, 
        Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true);

    List<CAS> result = new ArrayList<>();
    while (conllReader.hasNext()) {
        CAS document = JCasFactory.createJCas().getCas();
        conllReader.getNext(document);
        result.add(document);
    }
    return result;
}
 
Example #9
Source File: DL4JSequenceRecommenderTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the {@code germeval2014-de} dataset and reads the first document the
 * {@code Conll2002Reader} delivers for the development files of its default split.
 *
 * @return the JCas after a single {@code getNext} call on the reader
 */
private JCas loadNerDevelopmentData() throws IOException, UIMAException
{
    Dataset germeval = loader.load("germeval2014-de", CONTINUE);
    String tabSeparator = Conll2002Reader.ColumnSeparators.TAB.getName();

    CollectionReader conllReader = createReader(Conll2002Reader.class,
        Conll2002Reader.PARAM_PATTERNS, germeval.getDefaultSplit().getDevelopmentFiles(), 
        Conll2002Reader.PARAM_LANGUAGE, germeval.getLanguage(), 
        Conll2002Reader.PARAM_COLUMN_SEPARATOR, tabSeparator,
        Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, 
        Conll2002Reader.PARAM_HAS_HEADER, true, 
        Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true);

    JCas document = JCasFactory.createJCas();
    conllReader.getNext(document.getCas());
    return document;
}
 
Example #10
Source File: XmiCollectionReaderTest.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Round trip: writes one text document with the {@code XWriter} into the {@code serdeser}
 * folder below {@code testDir}, reads that folder back with the {@code XCollectionReader}
 * and expects exactly one document carrying the original text.
 */
@Test
public void testSerializeDeserializeXmi() throws Exception {

    // serialize
    CollectionReader textReader = createReader(TextArrayReader.class,
             PARAM_INPUT, new String[] { "this is a test" });
    AnalysisEngine xmiWriter = createEngine(XWriter.class,
            PARAM_OUTPUT_DIR, testDir + "/serdeser");
    runPipeline(textReader, xmiWriter);

    // deserialize
    CollectionReader xmiReader = createReader(XCollectionReader.class, 
            PARAM_INPUT_DIRECTORY, testDir + "/serdeser");
    List<JCas> restored = asList(xmiReader);

    assertEquals(1, restored.size());
    JCas onlyDocument = restored.get(0);
    assertEquals("this is a test", onlyDocument.getDocumentText());
}
 
Example #11
Source File: ExternalRecommenderIntegrationTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Reads all documents the {@code Conll2002Reader} produces for the given files. Every CAS is
 * created with a type system merged from {@code createTypeSystemDescription()} and
 * {@code CasMetadataUtils.getInternalTypeSystem()}.
 *
 * @param ds
 *            dataset whose language is passed to the reader
 * @param files
 *            files passed to the reader as patterns
 * @return the CASes in the order they were read
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    String tabSeparator = Conll2002Reader.ColumnSeparators.TAB.getName();
    CollectionReader conllReader = createReader(Conll2002Reader.class,
        Conll2002Reader.PARAM_PATTERNS, files,
        Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(),
        Conll2002Reader.PARAM_COLUMN_SEPARATOR, tabSeparator,
        Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true,
        Conll2002Reader.PARAM_HAS_HEADER, true,
        Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true);

    List<CAS> result = new ArrayList<>();
    while (conllReader.hasNext()) {
        // Add the CasMetadata type to the CAS
        List<TypeSystemDescription> parts = new ArrayList<>();
        parts.add(createTypeSystemDescription());
        parts.add(CasMetadataUtils.getInternalTypeSystem());

        CAS document = JCasFactory.createJCas(mergeTypeSystems(parts)).getCas();
        conllReader.getNext(document);
        result.add(document);
    }
    return result;
}
 
Example #12
Source File: DiffTestUtils.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Reads one WebAnno TSV document from the test resources folder with the
 * {@code WebannoTsv2Reader}.
 *
 * @param aPath
 *            path of the file, appended to {@code src/test/resources/}
 * @param aType
 *            extra type system that is merged with the one returned by
 *            {@code TypeSystemDescriptionFactory.createTypeSystemDescription()}; when
 *            {@code null}, the JCas is created via the no-argument factory method
 * @return the JCas after a single {@code getNext} call on the reader
 */
public static JCas readWebAnnoTSV(String aPath, TypeSystemDescription aType)
    throws UIMAException, IOException
{
    String location = "src/test/resources/" + aPath;
    CollectionReader tsvReader = createReader(WebannoTsv2Reader.class,
            WebannoTsv2Reader.PARAM_SOURCE_LOCATION, location);

    JCas target;
    if (aType == null) {
        target = JCasFactory.createJCas();
    }
    else {
        List<TypeSystemDescription> typeSystems = new ArrayList<>();
        typeSystems.add(TypeSystemDescriptionFactory.createTypeSystemDescription());
        typeSystems.add(aType);
        target = JCasFactory.createJCas(CasCreationUtils.mergeTypeSystems(typeSystems));
    }

    tsvReader.getNext(target.getCas());
    return target;
}
 
Example #13
Source File: DiffTestUtils.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Reads one XMI document from the test resources folder with the {@code XmiReader}.
 *
 * @param aPath
 *            path of the file, appended to {@code src/test/resources/}
 * @param aType
 *            extra type system that is merged with the one returned by
 *            {@code TypeSystemDescriptionFactory.createTypeSystemDescription()}; when
 *            {@code null}, the JCas is created via the no-argument factory method
 * @return the JCas after a single {@code getNext} call on the reader
 */
public static JCas readXMI(String aPath, TypeSystemDescription aType)
    throws UIMAException, IOException
{
    String location = "src/test/resources/" + aPath;
    CollectionReader xmiReader = createReader(XmiReader.class,
            XmiReader.PARAM_SOURCE_LOCATION, location);

    JCas target;
    if (aType == null) {
        target = JCasFactory.createJCas();
    }
    else {
        List<TypeSystemDescription> typeSystems = new ArrayList<>();
        typeSystems.add(TypeSystemDescriptionFactory.createTypeSystemDescription());
        typeSystems.add(aType);
        target = JCasFactory.createJCas(CasCreationUtils.mergeTypeSystems(typeSystems));
    }

    xmiReader.getNext(target.getCas());
    return target;
}
 
Example #14
Source File: OpenNlpDoccatRecommenderTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Reads all documents the {@code Reader} produces for the given files and runs the
 * {@code BreakIteratorSegmenter} (sentence writing switched off) over each of them.
 *
 * @param ds
 *            dataset whose language is passed to the reader
 * @param files
 *            files passed to the reader as patterns
 * @return the processed CASes in the order they were read
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    CollectionReader corpusReader = createReader(Reader.class,
            Reader.PARAM_PATTERNS, files, 
            Reader.PARAM_LANGUAGE, ds.getLanguage());

    AnalysisEngine tokenizer = createEngine(BreakIteratorSegmenter.class,
            BreakIteratorSegmenter.PARAM_WRITE_SENTENCE, false);

    List<CAS> result = new ArrayList<>();
    while (corpusReader.hasNext()) {
        JCas document = JCasFactory.createJCas();
        CAS documentCas = document.getCas();
        corpusReader.getNext(documentCas);
        tokenizer.process(document);
        result.add(documentCas);
    }
    return result;
}
 
Example #15
Source File: StringMatchingRecommenderTest.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Reads all documents the {@code Conll2002Reader} produces for the given files and registers
 * each CAS with {@code casStorageSession} for exclusive write access under the name
 * {@code testDataCas<n>}, where {@code n} starts at 1 and increases per document.
 *
 * @param ds
 *            dataset whose language is passed to the reader
 * @param files
 *            files passed to the reader as patterns
 * @return the CASes in the order they were read
 */
private List<CAS> loadData(Dataset ds, File ... files) throws UIMAException, IOException
{
    CollectionReader reader = createReader(Conll2002Reader.class,
        Conll2002Reader.PARAM_PATTERNS, files, 
        Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(), 
        Conll2002Reader.PARAM_COLUMN_SEPARATOR, Conll2002Reader.ColumnSeparators.TAB.getName(),
        Conll2002Reader.PARAM_HAS_TOKEN_NUMBER, true, 
        Conll2002Reader.PARAM_HAS_HEADER, true, 
        Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true);

    List<CAS> casList = new ArrayList<>();
    int n = 1;
    while (reader.hasNext()) {
        JCas cas = JCasFactory.createJCas();
        reader.getNext(cas.getCas());
        casList.add(cas.getCas());
        casStorageSession.add("testDataCas" + n, EXCLUSIVE_WRITE_ACCESS, cas.getCas());
        // advance the counter so that every CAS is registered under its own name
        n++;
    }
    
    return casList;
}
 
Example #16
Source File: PubmedCentralCollectionReaderTest.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Counts the documents delivered by the reader that
 * {@code PubmedCentralCollectionReader.getCR("pmc_test_archive")} returns and expects six.
 * The test is currently disabled.
 */
@Test
@Ignore
// FIXME
public void testCount() throws Exception {

    CollectionReader cr = PubmedCentralCollectionReader.getCR("pmc_test_archive");

    int i = 0;
    try {
        while (cr.hasNext()) {
            // a fresh CAS per document, built from the reader's own metadata
            CAS cas = CasCreationUtils.createCas(cr.getProcessingResourceMetaData());
            cr.getNext(cas);
            i++;
        }
    } finally {
        // close the reader even when reading a document fails
        cr.close();
    }
    assertEquals(6, i);
}
 
Example #17
Source File: JobBuilderTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a job from {@code jobConfig2.yaml} and verifies its name, its original and ordered
 * configuration, the collection reader, the two annotators and the empty consumer list.
 */
@Test
public void testValid2() throws Exception {
  final String yaml =
      Files.asCharSource(getFile("jobConfig2.yaml"), Charset.defaultCharset()).read();

  final JobBuilder builder = new JobBuilder("Test Job", new YamlPipelineConfiguration(yaml));
  final BaleenJob job = (BaleenJob) builder.createNewPipeline();

  // job level
  assertEquals("Test Job", job.getName());
  assertEquals(yaml, job.originalConfig());
  assertEquals(yaml, job.orderedConfig());

  // collection reader
  final CollectionReader scheduler = job.collectionReader();
  assertEquals(BaleenDefaults.DEFAULT_SCHEDULER, scheduler.getMetaData().getName());
  assertEquals("Foo", scheduler.getConfigParameterValue("key"));

  // annotators
  final List<AnalysisEngine> annotators = job.annotators();
  assertEquals(2, annotators.size());

  final AnalysisEngine first = annotators.get(0);
  assertEquals("uk.gov.dstl.baleen.testing.DummyTask", first.getMetaData().getName());
  assertEquals("Foo", first.getConfigParameterValue("key"));

  final AnalysisEngine second = annotators.get(1);
  assertEquals("uk.gov.dstl.baleen.testing.DummyTaskParams", second.getMetaData().getName());
  assertEquals("Bar", second.getConfigParameterValue("key"));

  // consumers
  final List<AnalysisEngine> consumers = job.consumers();
  assertEquals(0, consumers.size());
}
 
Example #18
Source File: DL4JSequenceRecommenderTest.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the {@code conll2000-en} dataset and reads the first document the
 * {@code Conll2000Reader} delivers for the test files of its default split.
 *
 * @return the JCas after a single {@code getNext} call on the reader
 */
private JCas loadPosDevelopmentData() throws IOException, UIMAException
{
    Dataset conll2000 = loader.load("conll2000-en", CONTINUE);

    CollectionReader conllReader = createReader(Conll2000Reader.class,
            Conll2000Reader.PARAM_PATTERNS, conll2000.getDefaultSplit().getTestFiles(), 
            Conll2000Reader.PARAM_LANGUAGE, conll2000.getLanguage());

    JCas document = JCasFactory.createJCas();
    conllReader.getNext(document.getCas());
    return document;
}
 
Example #19
Source File: PubmedDatabaseCRTest.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Reads from the {@code PubmedDatabaseCR} with {@code PARAM_BETWEEN} set to {@code {0, 9}},
 * expects nine documents and prints the header document id of each to standard error.
 */
@Test
public void testLimit10() throws Exception {

    CollectionReader pubmedReader = createReader(PubmedDatabaseCR.class,
            BlueUima.PARAM_BETWEEN, new int[] { 0, 9 },
            BlueUima.PARAM_SKIP_EMPTY_DOCS, false);

    ArrayList<JCas> documents = asList(pubmedReader);
    assertEquals(9, documents.size());

    for (int idx = 0; idx < documents.size(); idx++) {
        int docId = getHeaderIntDocId(documents.get(idx));
        System.err.println(docId);
    }
}
 
Example #20
Source File: LappsGridRecommenderConformityTest.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the first document the {@code XmiReader} delivers for the given file.
 *
 * @param aFile
 *            file passed to the reader as pattern
 * @return the CAS holding the first document
 * @throws IndexOutOfBoundsException
 *             if the reader has no document to deliver
 */
private static CAS loadData(File aFile) throws UIMAException, IOException
{
    // use the reader's own parameter constant instead of the one from ConllUReader
    CollectionReader reader = createReader(XmiReader.class,
            XmiReader.PARAM_PATTERNS, aFile);

    // only the first document is returned, so there is no need to load any further ones;
    // the exception type matches what the former casList.get(0) threw on an empty list
    if (!reader.hasNext()) {
        throw new IndexOutOfBoundsException(
                "No document could be read from [" + aFile + "]");
    }

    JCas cas = JCasFactory.createJCas();
    reader.getNext(cas.getCas());
    return cas.getCas();
}
 
Example #21
Source File: ZipXmiCollectionReaderTest.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Round trip: writes one text document with the {@code ZipXWriter} into the {@code serdeser}
 * folder below {@code testDir}, reads the first document back with the
 * {@code ZipXmiCollectionReader} and expects the original text.
 */
@Test
public void testSerializeDeserializeXmi() throws Exception {

    // serialize
    CollectionReader textReader = createReader(TextArrayReader.class,
             PARAM_INPUT, new String[] { "this is a test" });
    AnalysisEngine zipWriter = createEngine(ZipXWriter.class,
            BlueUima.PARAM_OUTPUT_DIR, testDir + "/serdeser");
    SimplePipeline.runPipeline(textReader, zipWriter);

    // deserialize
    CollectionReader zipReader = createReader(ZipXmiCollectionReader.class, 
            PARAM_INPUT_DIRECTORY, testDir + "/serdeser");
    CAS restored = createCas(zipReader.getProcessingResourceMetaData());
    zipReader.getNext(restored);

    String restoredText = restored.getDocumentText();
    assertEquals("this is a test", restoredText);
}
 
Example #22
Source File: SparkUimaUtils.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Drains the given reader: every document is read into its own newly created CAS and
 * wrapped in an {@code SCAS}.
 *
 * @param reader the collection reader to read from until {@code hasNext()} is false
 * @return the wrapped CASes in the order they were read
 */
public static List<SCAS> readFrom(CollectionReader reader) throws IOException, UIMAException {
  List<SCAS> result = new ArrayList<>();
  while (reader.hasNext()) {
    CAS document = JCasFactory.createJCas().getCas();
    reader.getNext(document);
    SCAS wrapped = new SCAS(document);
    result.add(wrapped);
  }
  return result;
}
 
Example #23
Source File: PdfCollectionReaderTest.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Disabled manual run: reads the {@code pdf_srikanth} input directory with the
 * {@code PdfCollectionReader} and passes the documents to a {@code DocumentTextWriter}
 * configured with a hard-coded output directory.
 */
@Test
@Ignore
public void testOnSampleForSrikanth() throws Exception {

    final String inputDirectory = "pdf_srikanth";
    final String outputDirectory = "/Users/richarde/Desktop/";

    CollectionReader pdfReader = createReader(PdfCollectionReader.class,
             PARAM_INPUT_DIRECTORY, inputDirectory);
    AnalysisEngine textWriter = createEngine(DocumentTextWriter.class,
            PARAM_OUTPUT_DIR, outputDirectory);

    SimplePipeline.runPipeline(pdfReader, textWriter);
}
 
Example #24
Source File: PdfCollectionReaderTest.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Disabled manual run: reads a hard-coded sample directory with the
 * {@code PdfCollectionReader} ({@code PARAM_EXPAND_ABBREVIATIONS} set to true) and passes
 * the documents to a {@code DocumentTextWriter} configured with a hard-coded output
 * directory.
 */
@Test
@Ignore
public void testOnSampleForExtractionQuality() throws Exception {

    final String inputDirectory =
            "/Users/richarde/data_hdd/_papers_etc/pubmed/sample_pdfs_68/pdfs";
    final String outputDirectory = "/Users/richarde/Desktop/";

    CollectionReader pdfReader = createReader(PdfCollectionReader.class,
            PARAM_INPUT_DIRECTORY, inputDirectory,
            PARAM_EXPAND_ABBREVIATIONS, true);
    AnalysisEngine textWriter = createEngine(DocumentTextWriter.class,
            PARAM_OUTPUT_DIR, outputDirectory);

    runPipeline(pdfReader, textWriter);
}
 
Example #25
Source File: PdfCollectionReaderTest.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Disabled manual run: reads a hard-coded sample directory with the
 * {@code PdfCollectionReader} and passes the documents to a {@code TableWriter}.
 */
@Test
@Ignore
public void testExtractTablesOnSample() throws Exception {

    final String inputDirectory = "/Users/richarde/data/_papers_etc/pmc_pdfs_sample";

    CollectionReader pdfReader = createReader(PdfCollectionReader.class,
             PARAM_INPUT_DIRECTORY, inputDirectory);
    AnalysisEngine tableWriter = createEngine(TableWriter.class);

    SimplePipeline.runPipeline(pdfReader, tableWriter);
}
 
Example #26
Source File: AgreementTestUtils.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Reads one document from the test resources folder with the {@code Conll2006Reader}.
 *
 * @param aPath
 *            path of the file, appended to {@code src/test/resources/}
 * @return the CAS after a single {@code getNext} call on the reader
 */
public static CAS readConll2006(String aPath)
    throws UIMAException, IOException
{
    String location = "src/test/resources/" + aPath;
    CollectionReader conllReader = createReader(Conll2006Reader.class,
            Conll2006Reader.PARAM_SOURCE_LOCATION, location);

    JCas document = JCasFactory.createJCas();
    CAS target = document.getCas();
    conllReader.getNext(target);
    return target;
}
 
Example #27
Source File: Txt2PubmedIdIndexer.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Runs a pipeline that reads from the {@code PubmedWholeDatabaseCR} (connection settings are
 * hard-coded) and passes the documents to {@code MyIndexer} and {@code StatsAnnotatorPlus}.
 *
 * @param args command line arguments; not used
 */
public static void main(String[] args) throws Exception {

    // connection settings handed to the reader as PARAM_DB_CONNECTION
    String[] dbConnection = new String[] { "localhost", "bb_pubmed", "root", "" };

    // cr
    CollectionReader pubmedReader = CollectionReaderFactory.createReader(
            PubmedWholeDatabaseCR.class, PARAM_DB_CONNECTION, dbConnection);

    SimplePipeline.runPipeline(
            pubmedReader,
            createEngineDescription(MyIndexer.class),
            createEngineDescription(StatsAnnotatorPlus.class, PARAM_PRINT_EVERY, 50000));
}
 
Example #28
Source File: AgreementTestUtils.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Reads one document from the given file with the {@code WebannoTsv3XReader}.
 *
 * @param aPath
 *            file passed to the reader as source location
 * @return the JCas after a single {@code getNext} call on the reader
 */
public static JCas loadWebAnnoTsv3(File aPath) throws UIMAException, IOException
{
    CollectionReader tsvReader = createReader(WebannoTsv3XReader.class,
            WebannoTsv3XReader.PARAM_SOURCE_LOCATION, aPath);

    JCas document = JCasFactory.createJCas();
    CAS target = document.getCas();
    tsvReader.getNext(target);

    return document;
}
 
Example #29
Source File: AgreementTestUtils.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Reads one document from the test resources folder with the {@code Conll2006Reader}.
 *
 * @param aPath
 *            path of the file, appended to {@code src/test/resources/}
 * @return the CAS after a single {@code getNext} call on the reader
 */
public static CAS read(String aPath)
    throws UIMAException, IOException
{
    String location = "src/test/resources/" + aPath;
    CollectionReader conllReader = createReader(Conll2006Reader.class,
            Conll2006Reader.PARAM_SOURCE_LOCATION, location);

    JCas document = JCasFactory.createJCas();
    CAS target = document.getCas();
    conllReader.getNext(target);
    return target;
}
 
Example #30
Source File: AgreementTestUtils.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Reads one document from the test resources folder with the {@code WebannoTsv3XReader}.
 *
 * @param aPath
 *            path of the file, appended to {@code src/test/resources/}
 * @return the JCas after a single {@code getNext} call on the reader
 */
public static JCas loadWebAnnoTsv3(String aPath) throws UIMAException, IOException
{
    String location = "src/test/resources/" + aPath;
    CollectionReader tsvReader = createReader(WebannoTsv3XReader.class,
            WebannoTsv3XReader.PARAM_SOURCE_LOCATION, location);

    JCas document = JCasFactory.createJCas();
    CAS target = document.getCas();
    tsvReader.getNext(target);

    return document;
}