Java Code Examples for org.apache.uima.jcas.tcas.DocumentAnnotation#setSourceUri()

The following examples show how to use org.apache.uima.jcas.tcas.DocumentAnnotation#setSourceUri(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: DocumentTypeByLocationTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Sets the source URI to {@code tmp}'s absolute path, runs the annotator with {@code
 * BASE_DIRECTORY} pointing at {@code topDir}, and expects the doc type to be the part of the path
 * lying strictly between the base directory and the file name.
 */
@Test
public void testBaseDirectoryTwoLayers() throws Exception {
  final DocumentAnnotation annotation = getDocumentAnnotation();
  final String sourcePath = tmp.getAbsolutePath();
  annotation.setSourceUri(sourcePath);

  processJCas(BASE_DIRECTORY, topDir.getAbsolutePath());

  // Skip "<topDir>" plus one separator at the front; drop one separator plus the file name at the end.
  final int from = topDir.getAbsolutePath().length() + 1;
  final int to = sourcePath.length() - tmp.getName().length() - 1;
  final String expectedType = sourcePath.substring(from, to);

  assertEquals(expectedType, annotation.getDocType());
}
 
Example 2
Source File: DocumentTypeByLocationTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Sets the source URI to {@code tmp}'s absolute path, runs the annotator with {@code
 * BASE_DIRECTORY} pointing at {@code parentDir}, and expects the doc type to be the part of the
 * path lying strictly between the base directory and the file name.
 */
@Test
public void testBaseDirectoryOneLayers() throws Exception {
  final DocumentAnnotation annotation = getDocumentAnnotation();
  final String sourcePath = tmp.getAbsolutePath();
  annotation.setSourceUri(sourcePath);

  processJCas(BASE_DIRECTORY, parentDir.getAbsolutePath());

  // Skip "<parentDir>" plus one separator at the front; drop one separator plus the file name at the end.
  final int from = parentDir.getAbsolutePath().length() + 1;
  final int to = sourcePath.length() - tmp.getName().length() - 1;
  final String expectedType = sourcePath.substring(from, to);

  assertEquals(expectedType, annotation.getDocType());
}
 
Example 3
Source File: DocumentTypeByLocationTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the annotator without parameters and expects the doc type to equal {@code childDir}'s
 * absolute path with any leading file separator removed. {@code tmp} is deleted afterwards
 * regardless of the outcome.
 */
@Test
public void test() throws Exception {
  try {
    final DocumentAnnotation annotation = getDocumentAnnotation();
    annotation.setSourceUri(tmp.getAbsolutePath());

    processJCas();

    // Strip the leading separator (required for Unix paths)
    final String childPath = childDir.getAbsolutePath();
    final String expectedType =
        childPath.startsWith(File.separator) ? childPath.substring(1) : childPath;

    assertEquals(expectedType, annotation.getDocType());
  } finally {
    tmp.delete();
  }
}
 
Example 4
Source File: ElasticsearchTestBase.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Resets the shared JCas and fills it with a small English document that carries document-level
 * metadata (timestamp, source, type, classification, caveats, releasability) but no entities.
 *
 * @return the timestamp (epoch millis) stored on the document annotation
 */
protected long createNoEntitiesDocument() {
  jCas.reset();
  jCas.setDocumentText("Hello World");
  jCas.setDocumentLanguage("en");

  final long createdAt = System.currentTimeMillis();

  final DocumentAnnotation annotation = getDocumentAnnotation(jCas);
  annotation.setTimestamp(createdAt);
  annotation.setSourceUri("test/no_entities");
  annotation.setDocType("test");
  annotation.setDocumentClassification("OFFICIAL");
  annotation.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("TEST_A", "TEST_B")));
  annotation.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, Arrays.asList("ENG", "SCO", "WAL")));

  return createdAt;
}
 
Example 5
Source File: AbstractBaleenFileConsumerTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@code TestFileConsumer} with no base path configured and checks that a file named {@code
 * FILENAME} exists afterwards.
 *
 * <p>The file is removed in a {@code finally} block so that a failed assertion or a failing {@code
 * process} call does not leave it behind for later tests.
 */
@Test
public void testNullBasePath() throws Exception {
  AnalysisEngine consumer =
      AnalysisEngineFactory.createEngine(
          TestFileConsumer.class, TypeSystemSingleton.getTypeSystemDescriptionInstance());

  DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri(FILENAME);

  File f = new File(FILENAME);
  try {
    consumer.process(jCas);

    assertTrue(f.exists());
  } finally {
    // Best-effort cleanup; delete() simply returns false if the file was never created.
    f.delete();
  }
}
 
Example 6
Source File: DocumentTypeByFilenameTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Configures a pattern together with a default type and expects the default to be used as the doc
 * type for this source URI.
 */
@Test
public void testPatternNoMatch() throws Exception {
  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  // Pattern asks for two lower-case letters; the test expects it not to match the file name above.
  final String lowerCasePattern = "([a-z]{2}).*";
  final String fallbackType = "unknown";
  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN,
      lowerCasePattern,
      DocumentTypeByFilename.PARAM_DEFAULT,
      fallbackType);

  assertEquals(fallbackType, annotation.getDocType());
}
 
Example 7
Source File: MoveSourceFileTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code MoveSourceFile} moves the source file into the destination folder and
 * records the new location as {@code MOVED_DOCUMENT_LOCATION} metadata.
 *
 * <p>Cleanup of the moved file and the temporary destination folder runs in a {@code finally}
 * block so that a failing assertion does not leak them.
 */
@Test
public void testMove() throws Exception {
  File destinationFolder = Files.createTempDir();
  File f = new File(sourceFolder, BALEEN_TXT);
  File f2 = new File(destinationFolder, BALEEN_TXT);

  try {
    AnalysisEngine consumer =
        AnalysisEngineFactory.createEngine(
            MoveSourceFile.class,
            TypeSystemSingleton.getTypeSystemDescriptionInstance(),
            MoveSourceFile.PARAM_DESTINATION,
            destinationFolder.getPath());

    // Make sure there is a source file to move.
    if (!f.exists()) {
      f.createNewFile();
    }

    assertEquals(false, f2.exists());

    DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
    da.setSourceUri(f.getPath());

    consumer.process(jCas);

    // The file must have left the source folder and arrived in the destination folder.
    assertEquals(false, f.exists());
    assertEquals(true, f2.exists());

    Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
    assertNotNull(md);
    assertEquals(MOVED_DOCUMENT_LOCATION, md.getKey());
    assertEquals(f2.getPath(), md.getValue());
  } finally {
    // Delete the file first: File.delete() refuses to remove a non-empty directory.
    f2.delete();
    destinationFolder.delete();
  }
}
 
Example 8
Source File: ConsumerUtilsTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Pins the values returned by {@code ConsumerUtils.getExternalId}: first with the flag set to
 * {@code true} on a document that has text only, then with the flag set to {@code false} after a
 * source URI has been assigned.
 */
@Test
public void testExternalId() throws UIMAException {
  final JCas cas = JCasSingleton.getJCasInstance();
  cas.setDocumentText("Hello World");
  final DocumentAnnotation annotation = UimaSupport.getDocumentAnnotation(cas);

  final String expectedWithFlag =
      "a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e";
  assertEquals(expectedWithFlag, ConsumerUtils.getExternalId(annotation, true));

  annotation.setSourceUri("http://www.example.com/test.html");

  final String expectedWithoutFlag =
      "b2e870534ee6fc1abc14feac22dcfd0b268460ac4205d9c3f68a000aab685f4f";
  assertEquals(expectedWithoutFlag, ConsumerUtils.getExternalId(annotation, false));
}
 
Example 9
Source File: DocumentTypeByFilenameTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Configures a pattern whose capture group is four digits and expects the doc type to be
 * {@code "2017"} for a source URI that begins with {@code 20170127}.
 */
@Test
public void testPattern() throws Exception {
  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  final String fourDigitPattern = "(\\d{4}).*";
  processJCas(DocumentTypeByFilename.PARAM_PATTERN, fourDigitPattern);

  assertEquals("2017", annotation.getDocType());
}
 
Example 10
Source File: StructuralHtmlTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code StructuralHtml} with empty-tag output enabled on a document containing a
 * zero-length paragraph and compares the generated HTML (ignoring whitespace) with {@code
 * EXPECTED_EMPTY}.
 */
@Test
public void testOutputEmpty() throws Exception {
  final AnalysisEngine consumer =
      AnalysisEngineFactory.createEngine(
          StructuralHtml.class,
          Html5.PARAM_OUTPUT_FOLDER,
          outputFolder.getPath(),
          StructuralHtml.PARAM_OUTPUT_EMPTY_TAGS,
          true);
  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri("test.txt");

  jCas.setDocumentText("Example document: ''");

  // Zero-length paragraph (begin == end) placed between the two quote characters.
  final Paragraph d = new Paragraph(jCas);
  d.setBegin(19);
  d.setEnd(19);
  d.addToIndexes();

  consumer.process(jCas);

  final File f = new File(outputFolder, "test.txt.html");
  assertTrue(f.exists());

  // Parse the output once and reuse it for both the diagnostic print and the comparison.
  final String html = Jsoup.parse(f, "UTF-8").html();
  System.out.println(html);

  // JUnit expects (expected, actual); whitespace is stripped from both sides to normalise them.
  assertEquals(EXPECTED_EMPTY.replaceAll("\\s*", ""), html.replaceAll("\\s*", ""));
}
 
Example 11
Source File: TemplateFieldJsonReportConsumerTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the shared JCas before each test: sets the document text and source URI, creates a
 * temporary directory, and indexes two paragraphs plus one template field.
 */
@Before
public void setup() throws IOException {
  jCas.setDocumentText(TEXT);

  final String tempPrefix = TemplateFieldJsonReportConsumerTest.class.getSimpleName();
  tempDirectory = Files.createTempDirectory(tempPrefix);
  tempDirectory.toFile().deleteOnExit();

  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri(SOURCEURI);

  // Two depth-1 paragraphs covering [0, 52) and [53, 105).
  final Paragraph firstParagraph = new Paragraph(jCas);
  firstParagraph.setDepth(1);
  firstParagraph.setBegin(0);
  firstParagraph.setEnd(52);
  firstParagraph.addToIndexes();

  final Paragraph secondParagraph = new Paragraph(jCas);
  secondParagraph.setDepth(1);
  secondParagraph.setBegin(53);
  secondParagraph.setEnd(105);
  secondParagraph.addToIndexes();

  // Template field spanning the same range as the second paragraph.
  final TemplateField templateField = new TemplateField(jCas);
  templateField.setBegin(53);
  templateField.setEnd(105);
  templateField.addToIndexes();
}
 
Example 12
Source File: EntityCountTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Sets the source URI on the shared JCas to {@code TEST1_TXT} and indexes one {@code
 * CommsIdentifier} and one {@code Person}, both left at their default offsets.
 */
private void createDocument() {
  final DocumentAnnotation annotation = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  annotation.setSourceUri(TEST1_TXT);

  new CommsIdentifier(jCas).addToIndexes();
  new Person(jCas).addToIndexes();
}
 
Example 13
Source File: AllAnnotationsJsonConsumerTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the shared JCas before each test: sets the document text and source URI, creates a
 * temporary directory, and indexes a paragraph, a person entity and an event that references the
 * entity.
 */
@Before
public void setup() throws IOException {
  jCas.setDocumentText(TEXT);

  final String tempPrefix = AllAnnotationsJsonConsumerTest.class.getSimpleName();
  tempDirectory = Files.createTempDirectory(tempPrefix);
  tempDirectory.toFile().deleteOnExit();

  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri(SOURCEURI);

  // Depth-1 paragraph covering [0, 52).
  final Paragraph paragraph = new Paragraph(jCas);
  paragraph.setDepth(1);
  paragraph.setBegin(0);
  paragraph.setEnd(52);
  paragraph.addToIndexes();

  // Person entity at [70, 73) with value "cat".
  final Person cat = new Person(jCas);
  cat.setValue("cat");
  cat.setBegin(70);
  cat.setEnd(73);
  cat.addToIndexes();

  // Event over [53, 105) with two string arguments and the person as its only entity.
  final Event catDogEvent = new Event(jCas);
  catDogEvent.setBegin(53);
  catDogEvent.setEnd(105);
  catDogEvent.setArguments(new StringArray(jCas, 2));
  catDogEvent.setArguments(0, "cat");
  catDogEvent.setArguments(1, "dog");
  catDogEvent.setEntities(new FSArray(jCas, 1));
  catDogEvent.setEntities(0, cat);
  catDogEvent.addToIndexes();
}
 
Example 14
Source File: AbstractTemplateRecordConsumerTest.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Prepares the shared JCas before each test: sets the document text and source URI, then indexes
 * three metadata entries, two template records with two fields each, and two further template
 * fields whose offsets lie after the second record.
 *
 * <p>Each field's value is taken from the text it covers, so begin and end are always set before
 * the value.
 */
@Before
public void beforeAbstractRecordConsumerTest() throws IOException {
  jCas.setDocumentText(TEXT);

  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri(sourceUri);

  // --- Document metadata: one author and two creators, all zero-length at offset 0 ---
  final Metadata authorMeta = new Metadata(jCas);
  authorMeta.setKey("author");
  authorMeta.setValue("The Author");
  authorMeta.setBegin(0);
  authorMeta.setEnd(0);
  authorMeta.addToIndexes();

  final Metadata firstCreatorMeta = new Metadata(jCas);
  firstCreatorMeta.setKey("creator");
  firstCreatorMeta.setValue("The Creator");
  firstCreatorMeta.setBegin(0);
  firstCreatorMeta.setEnd(0);
  firstCreatorMeta.addToIndexes();

  final Metadata secondCreatorMeta = new Metadata(jCas);
  secondCreatorMeta.setKey("creator");
  secondCreatorMeta.setValue("Baleen");
  secondCreatorMeta.setBegin(0);
  secondCreatorMeta.setEnd(0);
  secondCreatorMeta.addToIndexes();

  // --- Record 1 over [0, 52) with fields at [0, 15) and [16, 31) ---
  final TemplateRecord firstRecord = new TemplateRecord(jCas);
  firstRecord.setName("record1");
  firstRecord.setSource(sourceName);
  firstRecord.setBegin(0);
  firstRecord.setEnd(52);
  firstRecord.addToIndexes();

  final TemplateField firstRecordFieldA = new TemplateField(jCas);
  firstRecordFieldA.setName("record1Field1");
  firstRecordFieldA.setSource(sourceName);
  firstRecordFieldA.setBegin(0);
  firstRecordFieldA.setEnd(15);
  firstRecordFieldA.setValue(firstRecordFieldA.getCoveredText());
  firstRecordFieldA.addToIndexes();

  final TemplateField firstRecordFieldB = new TemplateField(jCas);
  firstRecordFieldB.setName("record1Field2");
  firstRecordFieldB.setSource(sourceName);
  firstRecordFieldB.setBegin(16);
  firstRecordFieldB.setEnd(31);
  firstRecordFieldB.setValue(firstRecordFieldB.getCoveredText());
  firstRecordFieldB.addToIndexes();

  // --- Record 2 over [53, 105) with fields at [53, 68) and [69, 84) ---
  final TemplateRecord secondRecord = new TemplateRecord(jCas);
  secondRecord.setName("record2");
  secondRecord.setSource(sourceName);
  secondRecord.setBegin(53);
  secondRecord.setEnd(105);
  secondRecord.addToIndexes();

  final TemplateField secondRecordFieldA = new TemplateField(jCas);
  secondRecordFieldA.setName("record2Field1");
  secondRecordFieldA.setSource(sourceName);
  secondRecordFieldA.setBegin(53);
  secondRecordFieldA.setEnd(68);
  secondRecordFieldA.setValue(secondRecordFieldA.getCoveredText());
  secondRecordFieldA.addToIndexes();

  final TemplateField secondRecordFieldB = new TemplateField(jCas);
  secondRecordFieldB.setName("record2Field2");
  secondRecordFieldB.setSource(sourceName);
  secondRecordFieldB.setBegin(69);
  secondRecordFieldB.setEnd(84);
  secondRecordFieldB.setValue(secondRecordFieldB.getCoveredText());
  secondRecordFieldB.addToIndexes();

  // --- Fields at [106, 121) and [122, 137), outside the span of either record ---
  final TemplateField looseFieldA = new TemplateField(jCas);
  looseFieldA.setName("noRecordField1");
  looseFieldA.setSource(sourceName);
  looseFieldA.setBegin(106);
  looseFieldA.setEnd(121);
  looseFieldA.setValue(looseFieldA.getCoveredText());
  looseFieldA.addToIndexes();

  final TemplateField looseFieldB = new TemplateField(jCas);
  looseFieldB.setName("noRecordField2");
  looseFieldB.setSource(sourceName);
  looseFieldB.setBegin(122);
  looseFieldB.setEnd(137);
  looseFieldB.setValue(looseFieldB.getCoveredText());
  looseFieldB.addToIndexes();
}
 
Example 15
Source File: MimeReader.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the next queued file as a MIME message and populates the given JCas from it.
 *
 * <p>Pops one path from {@code files}, logs progress, parses the file as a {@link MimeMessage},
 * fills in the document annotation, records every message header and the addressing fields as
 * metadata, and passes the plain-text body followed by a generated header block to {@code
 * extractContent}. Any exception raised while handling the message is logged as a warning and the
 * message is discarded.
 *
 * @param jCas the JCas to populate
 * @throws IOException declared by the signature; exceptions raised inside the try block are
 *     caught and logged rather than propagated
 * @throws CollectionException declared by the signature
 */
@Override
protected void doGetNext(final JCas jCas) throws IOException, CollectionException {
  final Path path = files.pop();
  final File file = path.toFile();

  // Number of files still queued after this one; used for the progress percentage below.
  final int left = files.size();

  getMonitor()
      .info(
          "Processing {} ({} %)",
          file.getAbsolutePath(), String.format("%.2f", 100 * (total - left) / (double) total));

  try (FileInputStream is = new FileInputStream(file)) {
    final Session s = Session.getDefaultInstance(new Properties());
    final MimeMessageParser parser = new MimeMessageParser(new MimeMessage(s, is));
    parser.parse();
    final MimeMessage message = parser.getMimeMessage();

    final DocumentAnnotation da = UimaSupport.getDocumentAnnotation(jCas);
    da.setTimestamp(calculateBestDate(message, file));
    da.setDocType("email");
    da.setDocumentClassification("O");
    // Source is the absolute path with its first rootFolder.length() characters removed
    // (presumably rootFolder is a prefix of every queued path; verify against where files is filled).
    String source = file.getAbsolutePath().substring(rootFolder.length());
    da.setSourceUri(source);
    da.setLanguage("en");

    // Add all headers as metadata, with email prefix
    final Enumeration<Header> allHeaders = message.getAllHeaders();
    while (allHeaders.hasMoreElements()) {
      final Header header = allHeaders.nextElement();
      addMetadata(jCas, "email." + header.getName(), header.getValue());
    }

    addMetadata(jCas, "from", parser.getFrom());
    addMetadata(jCas, "to", parser.getTo());
    addMetadata(jCas, "cc", parser.getCc());
    addMetadata(jCas, "bcc", parser.getBcc());
    addMetadata(jCas, "subject", parser.getSubject());

    // Add fake title
    addMetadata(jCas, "title", parser.getSubject());

    String actualContent = parser.getPlainContent();

    // Treat a missing plain-text body as empty so the length-based offsets below stay valid.
    if (actualContent == null) {
      actualContent = "";
    }

    // TODO: At this point we could create a representation of the addresses, etc in the content
    // eg a table of to, from, and etc
    // then annotate them a commsidentifier, date, person.
    // We could also create relations between sender and receiver

    // Body followed by a separator; the header block is appended after it.
    String content = actualContent + "\n\n---\n\n";

    // The current content length is passed in, i.e. the offset at which the header block will start.
    final String headerBlock = createHeaderBlock(content.length(), jCas, parser);
    content = content + headerBlock;

    // The Text annotation covers only the message body, not the separator or the header block.
    final Text text = new Text(jCas);
    text.setBegin(0);
    text.setEnd(actualContent.length());
    text.addToIndexes();

    // NOTE(review): getBytes() uses the platform default charset — confirm extractContent decodes
    // the stream with the same charset before changing this.
    extractContent(new ByteArrayInputStream(content.getBytes()), source, jCas);
  } catch (final Exception e) {
    // Best effort: a message that cannot be read or parsed is logged and skipped.
    getMonitor().warn("Discarding message", e);
  }
}
 
Example 16
Source File: AbstractTemplateRecordConfigurationCreatingConsumerTest.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Prepares the shared JCas: sets the document text and source URI, creates a temporary directory,
 * and indexes four depth-1 paragraphs and three template field definitions.
 */
public void setup() throws IOException {
  jCas.setDocumentText(TEXT);

  final String tempPrefix =
      AbstractTemplateRecordConfigurationCreatingConsumerTest.class.getSimpleName();
  tempDirectory = Files.createTempDirectory(tempPrefix);
  tempDirectory.toFile().deleteOnExit();

  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri(SOURCEURI);

  // --- Paragraphs: [0, 52), [53, 105), [106, 158), [159, 212) ---
  final Paragraph firstParagraph = new Paragraph(jCas);
  firstParagraph.setDepth(1);
  firstParagraph.setBegin(0);
  firstParagraph.setEnd(52);
  firstParagraph.addToIndexes();

  final Paragraph secondParagraph = new Paragraph(jCas);
  secondParagraph.setDepth(1);
  secondParagraph.setBegin(53);
  secondParagraph.setEnd(105);
  secondParagraph.addToIndexes();

  final Paragraph thirdParagraph = new Paragraph(jCas);
  thirdParagraph.setDepth(1);
  thirdParagraph.setBegin(106);
  thirdParagraph.setEnd(158);
  thirdParagraph.addToIndexes();

  final Paragraph fourthParagraph = new Paragraph(jCas);
  fourthParagraph.setDepth(1);
  fourthParagraph.setBegin(159);
  fourthParagraph.setEnd(212);
  fourthParagraph.addToIndexes();

  // --- Field definitions ---
  final TemplateFieldDefinition firstField = new TemplateFieldDefinition(jCas);
  firstField.setName("field1");
  firstField.setBegin(72);
  firstField.setEnd(75);
  firstField.addToIndexes();

  final TemplateFieldDefinition secondField = new TemplateFieldDefinition(jCas);
  secondField.setName("field2");
  secondField.setBegin(123);
  secondField.setEnd(140);
  secondField.addToIndexes();

  final TemplateFieldDefinition looseField = new TemplateFieldDefinition(jCas);
  looseField.setName("noRecordField");
  looseField.setBegin(17);
  looseField.setEnd(20);
  looseField.addToIndexes();
}
 
Example 17
Source File: MoveSourceFileTest.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that {@code MoveSourceFile} with {@code PARAM_SPLIT} enabled moves the source file into
 * a sub-folder of the destination named after the document type, and records the new location as
 * {@code MOVED_DOCUMENT_LOCATION} metadata.
 *
 * <p>Cleanup runs in a {@code finally} block so that a failing assertion does not leak the
 * temporary folders or files.
 */
@Test
public void testMoveType() throws Exception {
  File destinationFolder = Files.createTempDir();
  File f = new File(sourceFolder, BALEEN_TXT);

  // Where the file would land without splitting ...
  File f2 = new File(destinationFolder, BALEEN_TXT);

  // ... and where it is expected to land for doc type "test".
  File typeFolder = new File(destinationFolder, "test");
  File f3 = new File(typeFolder, BALEEN_TXT);

  try {
    AnalysisEngine consumer =
        AnalysisEngineFactory.createEngine(
            MoveSourceFile.class,
            TypeSystemSingleton.getTypeSystemDescriptionInstance(),
            MoveSourceFile.PARAM_DESTINATION,
            destinationFolder.getPath(),
            MoveSourceFile.PARAM_SPLIT,
            true);

    // Make sure there is a source file to move.
    if (!f.exists()) {
      f.createNewFile();
    }

    assertEquals(false, f2.exists());

    DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
    da.setSourceUri(f.getPath());
    da.setDocType("test");

    consumer.process(jCas);

    assertEquals(false, f.exists());
    assertEquals(false, f2.exists());
    assertEquals(true, f3.exists());

    Metadata md = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
    assertNotNull(md);
    assertEquals(MOVED_DOCUMENT_LOCATION, md.getKey());
    assertEquals(f3.getPath(), md.getValue());
  } finally {
    // Delete innermost entries first: File.delete() refuses to remove a non-empty directory.
    f3.delete();
    typeFolder.delete();
    f2.delete();
    destinationFolder.delete();
  }
}
 
Example 18
Source File: MongoTest.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Processes a document that has document-level metadata but no entities and verifies that exactly
 * one record is stored, with content, language, timestamp, source, type, classification, caveats,
 * releasability and external ID matching what was set on the JCas.
 */
@SuppressWarnings("unchecked")
@Test
public void testNoEntities() throws Exception {
  jCas.setDocumentText(TEXT);
  jCas.setDocumentLanguage("en");

  final long createdAt = System.currentTimeMillis();

  final DocumentAnnotation annotation = getDocumentAnnotation(jCas);
  annotation.setTimestamp(createdAt);
  annotation.setSourceUri("test/no_entities");
  annotation.setDocType("test");
  annotation.setDocumentClassification("OFFICIAL");
  annotation.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("TEST_A", "TEST_B")));
  annotation.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, Arrays.asList("ENG", "SCO", "WAL")));

  ae.process(jCas);

  assertEquals(1, documents.count());
  final Document stored = documents.find().first();

  assertEquals(TEXT, stored.get(Mongo.FIELD_CONTENT));

  // All remaining document-level fields live in one nested sub-document.
  final Document docFields = (Document) stored.get(Mongo.FIELD_DOCUMENT);

  assertEquals("en", docFields.get(Mongo.FIELD_DOCUMENT_LANGUAGE));
  assertEquals(new Date(createdAt), docFields.get(Mongo.FIELD_DOCUMENT_TIMESTAMP));
  assertEquals("test/no_entities", docFields.get(Mongo.FIELD_DOCUMENT_SOURCE));
  assertEquals("test", docFields.get(Mongo.FIELD_DOCUMENT_TYPE));
  assertEquals("OFFICIAL", docFields.get(Mongo.FIELD_DOCUMENT_CLASSIFICATION));

  final Collection<String> storedCaveats =
      (Collection<String>) docFields.get(Mongo.FIELD_DOCUMENT_CAVEATS);
  assertArrayEquals(new String[] {"TEST_A", "TEST_B"}, storedCaveats.toArray());

  final Collection<String> storedReleasability =
      (Collection<String>) docFields.get(Mongo.FIELD_DOCUMENT_RELEASABILITY);
  assertArrayEquals(new String[] {"ENG", "SCO", "WAL"}, storedReleasability.toArray());

  assertEquals(getDocumentAnnotation(jCas).getHash(), stored.get(fields.getExternalId()));
}
 
Example 19
Source File: StructuralHtmlTest.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a document with nested structural annotations (document, section, paragraph, details,
 * aside and two styles), runs {@code StructuralHtml} on it, and compares the generated HTML
 * (ignoring whitespace) with {@code EXPECTED}.
 */
@Test
public void testDocument() throws UIMAException, IOException {
  final AnalysisEngine consumer =
      AnalysisEngineFactory.createEngine(
          StructuralHtml.class, Html5.PARAM_OUTPUT_FOLDER, outputFolder.getPath());
  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri("test.txt");

  final String text =
      "This is a test document, that contains structure. This test was written by Chris. On the on 24 December 2016.";

  jCas.setDocumentText(text);

  final Document doc = new Document(jCas);
  doc.setBegin(0);
  doc.setEnd(text.length());
  doc.addToIndexes();

  final Section section = new Section(jCas);
  section.setDepth(1);
  section.setBegin(0);
  section.setEnd(text.length());
  section.addToIndexes();

  // NOTE(review): the offsets below are all written as (n - 2); presumably the text once had a
  // two-character prefix. The arithmetic is kept as-is so the values stay traceable.
  final Paragraph para = new Paragraph(jCas);
  para.setBegin(2 - 2);
  para.setEnd(50 - 2);
  para.addToIndexes();

  final Details details = new Details(jCas);
  details.setBegin(52 - 2);
  details.setEnd(82 - 2);
  details.addToIndexes();

  final Aside aside = new Aside(jCas);
  aside.setBegin(84 - 2);
  aside.setEnd(110 - 2);
  aside.addToIndexes();

  final Style bold = new Style(jCas);
  bold.setDecoration(UimaTypesUtils.toArray(jCas, Arrays.asList("bold")));
  bold.setBegin(77 - 2);
  bold.setEnd(82 - 2);
  bold.addToIndexes();

  final Style italics = new Style(jCas);
  italics.setDecoration(UimaTypesUtils.toArray(jCas, Arrays.asList("italics")));
  italics.setBegin(94 - 2);
  italics.setEnd(110 - 2);
  italics.addToIndexes();

  consumer.process(jCas);

  final File f = new File(outputFolder, "test.txt.html");
  assertTrue(f.exists());

  // Strip out all the whitespace... just to normalise it.
  // JUnit expects (expected, actual), so the reference HTML goes first.
  assertEquals(
      EXPECTED.replaceAll("\\s*", ""), Jsoup.parse(f, "UTF-8").html().replaceAll("\\s*", ""));
}
 
Example 20
Source File: CsvFolderReader.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Emits the next CSV row as a document.
 *
 * <p>When no buffered rows remain, the next file is taken from {@code queue}, its first line is
 * parsed as the column headings, and the remaining lines are buffered. One buffered row is then
 * parsed: cells whose heading is in {@code textColumn} are joined with blank lines to form the
 * document text, and every other cell becomes a {@code Metadata} annotation keyed by its heading.
 * The source URI is set to {@code <path>#<row number>}.
 *
 * <p>Rows with fewer cells than there are headings are tolerated: the missing trailing cells are
 * skipped instead of raising {@link ArrayIndexOutOfBoundsException}.
 *
 * @param jCas the JCas to populate
 * @throws IOException if the next file cannot be read
 * @throws CollectionException declared by the signature
 */
@Override
public void doGetNext(JCas jCas) throws IOException, CollectionException {
  if (currLines.isEmpty()) {
    // Read next file
    currPath = queue.remove(0);
    currentLine = 0;
    getMonitor().info("Processing file {}", currPath.toString());

    List<String> lines;
    try (Stream<String> ln = Files.lines(currPath)) {
      lines = ln.collect(Collectors.toList());
    }

    // NOTE(review): an empty file, or one containing only a header, still fails on the
    // remove(0) calls here and below — confirm how the caller guards against that.
    String header = lines.remove(0);
    columnHeadings = Arrays.asList(csvParser.parseLine(header));

    currLines.addAll(lines);
  }

  String line = currLines.remove(0);
  String[] cols = csvParser.parseLine(line);
  currentLine++;

  StringJoiner sj = new StringJoiner("\n\n");
  Map<String, String> meta = new HashMap<>();

  // A row may carry fewer cells than the header; only read the cells that actually exist.
  int columnCount = Math.min(columnHeadings.size(), cols.length);
  for (int i = 0; i < columnCount; i++) {
    String heading = columnHeadings.get(i);
    if (inArray(heading, textColumn)) {
      sj.add(cols[i]);
    } else {
      meta.put(heading, cols[i]);
    }
  }

  jCas.setDocumentText(sj.toString());
  for (Map.Entry<String, String> e : meta.entrySet()) {
    Metadata md = new Metadata(jCas);
    md.setKey(e.getKey());
    md.setValue(e.getValue());
    md.addToIndexes();
  }

  DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas);
  doc.setSourceUri(currPath.toString() + "#" + currentLine);
  doc.setTimestamp(System.currentTimeMillis());
}