Java Code Examples for org.apache.uima.jcas.tcas.DocumentAnnotation#setTimestamp()

The following examples show how to use org.apache.uima.jcas.tcas.DocumentAnnotation#setTimestamp(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ElasticsearchTestBase.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Resets {@code jCas} and fills it with a short English document whose document annotation
 * carries a timestamp, source URI, type, classification, caveats and releasability values.
 *
 * @return the epoch-millisecond timestamp that was stored on the document annotation
 */
protected long createNoEntitiesDocument() {
  // Start from a clean CAS before setting text and language
  jCas.reset();
  jCas.setDocumentText("Hello World");
  jCas.setDocumentLanguage("en");

  final long createdAt = System.currentTimeMillis();

  final DocumentAnnotation annotation = getDocumentAnnotation(jCas);
  annotation.setTimestamp(createdAt);
  annotation.setSourceUri("test/no_entities");
  annotation.setDocType("test");
  annotation.setDocumentClassification("OFFICIAL");

  // Protective-marking style fields are stored as UIMA string arrays
  annotation.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("TEST_A", "TEST_B")));
  annotation.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, Arrays.asList("ENG", "SCO", "WAL")));

  return createdAt;
}
 
Example 2
Source File: MboxReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Process a single body part, dispatching on its concrete kind.
 *
 * @param jCas the JCas to populate
 * @param body the body part to process
 * @param sourceUri the source URI recorded on the document annotation (text bodies) or passed on
 *     to the binary / multipart handlers
 * @return the result of {@code processMultipart} for multipart bodies, {@code true} for text and
 *     binary bodies, and {@code false} when the body is none of these
 * @throws IOException declared for the handlers this method delegates to
 */
private boolean processBody(JCas jCas, Body body, String sourceUri) throws IOException {
  if (body instanceof TextBody) {
    // Plain text: extract the text itself first
    processTextBody(jCas, (TextBody) body);

    // Carry the parent's header fields over as metadata
    for (Field headerField : body.getParent().getHeader().getFields()) {
      addMetadata(jCas, headerField.getName(), headerField.getBody());
    }

    // Set up document annotation - this is done by the content extractor in other cases
    DocumentAnnotation annotation = UimaSupport.getDocumentAnnotation(jCas);
    annotation.setSourceUri(sourceUri);
    annotation.setTimestamp(System.currentTimeMillis());
    return true;
  }

  if (body instanceof BinaryBody) {
    processBinaryBody(jCas, (BinaryBody) body, sourceUri);
    return true;
  }

  if (body instanceof Multipart) {
    // Multipart message, so recurse
    return processMultipart(jCas, (Multipart) body, sourceUri);
  }

  // No body processed
  return false;
}
 
Example 3
Source File: AbstractContentExtractor.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Records the source URI and the current time on the document annotation, and adds a metadata
 * entry naming the concrete content extractor class. The stream itself is not read here.
 *
 * @param stream the input stream (unused by this method)
 * @param source the source URI to store on the document annotation
 * @param jCas the JCas to populate
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  final DocumentAnnotation annotation = UimaSupport.getDocumentAnnotation(jCas);
  annotation.setSourceUri(source);
  annotation.setTimestamp(System.currentTimeMillis());

  // Add metadata item to capture which content extractor was used
  final String extractorClassName = this.getClass().getName();
  addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, extractorClassName);
}
 
Example 4
Source File: JCasDeserialiser.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Copies document-level properties from a deserialised map onto the document annotation.
 *
 * <p>String properties fall back to the empty string when their key is absent. The timestamp
 * falls back to {@code 0} when its key is absent or mapped to {@code null}.
 *
 * @param jCas the JCas used to create the UIMA string arrays
 * @param da the document annotation to populate
 * @param map the deserialised properties, keyed by the {@code JsonJCas.DA_*} constants
 */
private void processDocumentAnnotation(
    final JCas jCas, final DocumentAnnotation da, final Map<String, Object> map) {
  da.setDocType((String) map.getOrDefault(JsonJCas.DA_DOCUMENT_TYPE, ""));
  da.setDocumentClassification((String) map.getOrDefault(JsonJCas.DA_CLASSIFICATION, ""));
  da.setLanguage((String) map.getOrDefault(JsonJCas.DA_LANGUAGE, ""));
  da.setSourceUri((String) map.getOrDefault(JsonJCas.DA_SOURCE_URI, ""));

  // getOrDefault only applies the default when the key is absent; a key explicitly mapped to
  // null would come back as null and make longValue() throw a NullPointerException.
  final Object rawTimestamp = map.get(JsonJCas.DA_TIMESTAMP);
  da.setTimestamp(rawTimestamp == null ? 0L : ((Number) rawTimestamp).longValue());

  // Absent keys yield null here, exactly as getOrDefault(key, null) did
  da.setDocumentCaveats(
      UimaTypesUtils.toArray(jCas, (Collection<String>) map.get(JsonJCas.DA_CAVEATS)));
  da.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, (Collection<String>) map.get(JsonJCas.DA_RELEASABILITY)));
}
 
Example 5
Source File: YYYYMMDDAssigner.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Derives the document timestamp from a date embedded in the source URI.
 *
 * <p>When the source URI fully matches {@code pattern}, the {@code year}, {@code month} and
 * {@code day} groups are parsed and, if they form a real calendar date, the timestamp is set to
 * the start of that day in UTC. Documents without a source URI, with a non-matching URI, or with
 * an invalid date are left unchanged.
 *
 * @param jCas the JCas whose document annotation is read and updated
 * @throws AnalysisEngineProcessException declared by the signature; not thrown by this body
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
  final DocumentAnnotation da = getDocumentAnnotation(jCas);
  final String source = da.getSourceUri();
  if (source == null) {
    // No source URI to inspect; matching against null would throw outside the try block below
    return;
  }

  final Matcher matcher = pattern.matcher(source);
  if (matcher.matches()) {
    try {
      final int y = Integer.parseInt(matcher.group("year"));
      final int m = Integer.parseInt(matcher.group("month"));
      final int d = Integer.parseInt(matcher.group("day"));

      if (m >= 1 && m <= 12 && d >= 1 && d <= 31) {
        // LocalDate.of rejects impossible dates (e.g. 31 Feb) by throwing, which lands in the
        // catch below, so only real calendar dates reach setTimestamp.
        final LocalDate date = LocalDate.of(y, m, d);
        final long ts = date.atStartOfDay().atOffset(ZoneOffset.UTC).toInstant().toEpochMilli();

        da.setTimestamp(ts);
      }

    } catch (final Exception e) {
      // Best effort only: the source path does not hold a usable date, so just log it
      getMonitor().warn("Cant parse date from source uri {} ", source, e);
    }
  }
}
 
Example 6
Source File: MimeReader.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Takes the next file from {@code files}, parses it as a MIME message and populates the JCas
 * with its document annotation, header metadata and content.
 *
 * <p>The content handed to {@code extractContent} is the plain-text body, a {@code ---}
 * separator and the block produced by {@code createHeaderBlock}. A {@code Text} annotation is
 * added covering only the plain-text body. Any exception raised while reading or parsing the
 * file is logged and the message is discarded.
 *
 * @param jCas the JCas to populate
 * @throws IOException declared by the signature
 * @throws CollectionException declared by the signature
 */
@Override
protected void doGetNext(final JCas jCas) throws IOException, CollectionException {
  final Path path = files.pop();
  final File file = path.toFile();

  final int left = files.size();

  // Progress is reported as the percentage of files taken so far
  getMonitor()
      .info(
          "Processing {} ({} %)",
          file.getAbsolutePath(), String.format("%.2f", 100 * (total - left) / (double) total));

  try (FileInputStream is = new FileInputStream(file)) {
    final Session s = Session.getDefaultInstance(new Properties());
    final MimeMessageParser parser = new MimeMessageParser(new MimeMessage(s, is));
    parser.parse();
    final MimeMessage message = parser.getMimeMessage();

    final DocumentAnnotation da = UimaSupport.getDocumentAnnotation(jCas);
    da.setTimestamp(calculateBestDate(message, file));
    da.setDocType("email");
    da.setDocumentClassification("O");
    // Source URI is the absolute path with the leading rootFolder-length prefix removed
    String source = file.getAbsolutePath().substring(rootFolder.length());
    da.setSourceUri(source);
    da.setLanguage("en");

    // Add all headers as metadata, with email prefix
    final Enumeration<Header> allHeaders = message.getAllHeaders();
    while (allHeaders.hasMoreElements()) {
      final Header header = allHeaders.nextElement();
      addMetadata(jCas, "email." + header.getName(), header.getValue());
    }

    addMetadata(jCas, "from", parser.getFrom());
    addMetadata(jCas, "to", parser.getTo());
    addMetadata(jCas, "cc", parser.getCc());
    addMetadata(jCas, "bcc", parser.getBcc());
    addMetadata(jCas, "subject", parser.getSubject());

    // Add fake title
    addMetadata(jCas, "title", parser.getSubject());

    String actualContent = parser.getPlainContent();

    if (actualContent == null) {
      actualContent = "";
    }

    // TODO: At this point we could create a representation of the addresses, etc in the content
    // eg a table of to, from, and etc
    // then annotate them a commsidentifier, date, person.
    // We could also create relations between sender and receiver

    String content = actualContent + "\n\n---\n\n";

    final String headerBlock = createHeaderBlock(content.length(), jCas, parser);
    content = content + headerBlock;

    // The Text annotation spans the message body only, not the separator or header block
    final Text text = new Text(jCas);
    text.setBegin(0);
    text.setEnd(actualContent.length());
    text.addToIndexes();

    // Encode explicitly as UTF-8: the no-argument getBytes() uses the platform default charset,
    // which varies between machines and can replace unmappable characters with '?'.
    final byte[] contentBytes = content.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    extractContent(new ByteArrayInputStream(contentBytes), source, jCas);
  } catch (final Exception e) {
    getMonitor().warn("Discarding message", e);
  }
}
 
Example 7
Source File: CsvFolderReader.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Emits one CSV row as a document.
 *
 * <p>When no buffered rows remain, the next path is taken from {@code queue}, its first line is
 * parsed as the column headings and the remaining lines are buffered. The next buffered row is
 * then parsed: values whose heading is accepted by {@code inArray(heading, textColumn)} are
 * joined with blank lines to form the document text, and all other values become
 * {@code Metadata} annotations keyed by heading. The source URI is the file path followed by
 * {@code #} and the 1-based row counter.
 *
 * @param jCas the JCas to populate
 * @throws IOException if the file cannot be opened
 * @throws CollectionException declared by the signature
 */
@Override
public void doGetNext(JCas jCas) throws IOException, CollectionException {
  if (currLines.isEmpty()) {
    // Buffer exhausted: move on to the next file
    currPath = queue.remove(0);
    currentLine = 0;
    getMonitor().info("Processing file {}", currPath.toString());

    List<String> fileLines;
    try (Stream<String> lineStream = Files.lines(currPath)) {
      fileLines = lineStream.collect(Collectors.toList());
    }

    // First line holds the headings; everything after it is data
    columnHeadings = Arrays.asList(csvParser.parseLine(fileLines.remove(0)));
    currLines.addAll(fileLines);
  }

  String[] values = csvParser.parseLine(currLines.remove(0));
  currentLine++;

  StringJoiner documentText = new StringJoiner("\n\n");
  Map<String, String> metadataByHeading = new HashMap<>();

  int column = 0;
  for (String heading : columnHeadings) {
    if (inArray(heading, textColumn)) {
      documentText.add(values[column]);
    } else {
      metadataByHeading.put(heading, values[column]);
    }
    column++;
  }

  jCas.setDocumentText(documentText.toString());
  metadataByHeading.forEach(
      (key, value) -> {
        Metadata metadata = new Metadata(jCas);
        metadata.setKey(key);
        metadata.setValue(value);
        metadata.addToIndexes();
      });

  DocumentAnnotation annotation = UimaSupport.getDocumentAnnotation(jCas);
  annotation.setSourceUri(currPath.toString() + "#" + currentLine);
  annotation.setTimestamp(System.currentTimeMillis());
}
 
Example 8
Source File: JCasTestGraphUtil.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Fills the given JCas with a fixed fixture: document text and document annotation values, two
 * metadata entries, a published id, a reference target, three people (two sharing the reference
 * target), a location, two relations and an event.
 *
 * @param jCas the JCas to populate
 */
public static void populateJcas(final JCas jCas) {
  jCas.setDocumentText(CONTENT);

  // --- Document-level properties ---
  final DocumentAnnotation docAnnotation = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  docAnnotation.setDocumentClassification("CLASS");
  docAnnotation.setDocType("MANUAL");
  docAnnotation.setSourceUri("http://test.com");
  docAnnotation.setLanguage("en");
  docAnnotation.setTimestamp(System.currentTimeMillis());
  docAnnotation.setDocumentCaveats(new StringArray(jCas, 2));
  docAnnotation.setDocumentCaveats(0, "GITHUB");
  docAnnotation.setDocumentCaveats(1, "CAVEAT");

  // --- Metadata: two entries that share the same key ---
  final Metadata firstMetadata = new Metadata(jCas);
  firstMetadata.setKey("test");
  firstMetadata.setValue("1");
  firstMetadata.addToIndexes(jCas);

  final Metadata secondMetadata = new Metadata(jCas);
  secondMetadata.setKey("test");
  secondMetadata.setValue("2");
  secondMetadata.addToIndexes(jCas);

  final PublishedId publishedId = new PublishedId(jCas);
  publishedId.setPublishedIdType("test");
  publishedId.setValue("12");
  publishedId.addToIndexes(jCas);

  // --- Reference target shared by "John Smith" and "He" ---
  final ReferenceTarget sharedReferent = new ReferenceTarget(jCas);
  sharedReferent.setLinking("testLinking");
  sharedReferent.addToIndexes(jCas);

  // --- People ---
  final Person johnSmith = new Person(jCas);
  johnSmith.setBegin(25);
  johnSmith.setEnd(35);
  johnSmith.setGender("Male");
  johnSmith.setValue("John Smith");
  johnSmith.setConfidence(0.9d);
  johnSmith.setReferent(sharedReferent);
  johnSmith.addToIndexes(jCas);

  final Person janeDoe = new Person(jCas);
  janeDoe.setBegin(50);
  janeDoe.setEnd(58);
  janeDoe.setGender("Female");
  janeDoe.setValue("Jane Doe");
  janeDoe.setConfidence(0.8d);
  janeDoe.addToIndexes(jCas);

  final Person pronoun = new Person(jCas);
  pronoun.setBegin(60);
  pronoun.setEnd(62);
  pronoun.setGender("Male");
  pronoun.setValue("He");
  pronoun.setConfidence(0.9d);
  pronoun.setReferent(sharedReferent);
  pronoun.addToIndexes(jCas);

  // --- Location ---
  final Location islands = new Location(jCas);
  islands.setBegin(72);
  islands.setEnd(87);
  islands.setGeoJson(GEO_JSON);
  islands.setValue("Dinagat Islands");
  islands.setConfidence(0.9d);
  islands.addToIndexes(jCas);

  // --- Relations ---
  final Relation relatedTo = new Relation(jCas);
  relatedTo.setBegin(36);
  relatedTo.setEnd(49);
  relatedTo.setValue("is related to");
  relatedTo.setRelationshipType(RELATED_TYPE);
  relatedTo.setSource(johnSmith);
  relatedTo.setTarget(janeDoe);
  relatedTo.addToIndexes(jCas);

  final Relation livesAt = new Relation(jCas);
  livesAt.setBegin(63);
  livesAt.setEnd(71);
  livesAt.setValue("lives at");
  livesAt.setRelationshipType(LIVES_TYPE);
  livesAt.setSource(johnSmith);
  livesAt.setTarget(islands);
  livesAt.addToIndexes(jCas);

  // --- Event linking both named people ---
  final Event meeting = new Event(jCas);
  meeting.setBegin(0);
  meeting.setEnd(10);
  meeting.setValue("test event");
  meeting.setEventType(new StringArray(jCas, 1));
  meeting.setEventType(0, "MEETING");
  meeting.setEntities(new FSArray(jCas, 2));
  meeting.setEntities(0, johnSmith);
  meeting.setEntities(1, janeDoe);
  meeting.setArguments(new StringArray(jCas, 2));
  meeting.setArguments(0, "argument");
  meeting.setArguments(1, "Other");
  meeting.addToIndexes(jCas);
}
 
Example 9
Source File: MongoTest.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Processes a document that has document-level properties but no entity annotations, then checks
 * that exactly one stored document exists and that its content, language, timestamp, source,
 * type, classification, caveats, releasability and external id match what was set.
 */
@SuppressWarnings("unchecked")
@Test
public void testNoEntities() throws Exception {
  jCas.setDocumentText(TEXT);
  jCas.setDocumentLanguage("en");

  final long createdAt = System.currentTimeMillis();

  final DocumentAnnotation annotation = getDocumentAnnotation(jCas);
  annotation.setTimestamp(createdAt);
  annotation.setSourceUri("test/no_entities");
  annotation.setDocType("test");
  annotation.setDocumentClassification("OFFICIAL");
  annotation.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("TEST_A", "TEST_B")));
  annotation.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, Arrays.asList("ENG", "SCO", "WAL")));

  ae.process(jCas);

  assertEquals(1, documents.count());
  final Document stored = documents.find().first();

  assertEquals(TEXT, stored.get(Mongo.FIELD_CONTENT));

  // All remaining document-level values live in the nested document sub-object
  final Document storedProperties = (Document) stored.get(Mongo.FIELD_DOCUMENT);

  assertEquals("en", storedProperties.get(Mongo.FIELD_DOCUMENT_LANGUAGE));
  assertEquals(new Date(createdAt), storedProperties.get(Mongo.FIELD_DOCUMENT_TIMESTAMP));
  assertEquals("test/no_entities", storedProperties.get(Mongo.FIELD_DOCUMENT_SOURCE));
  assertEquals("test", storedProperties.get(Mongo.FIELD_DOCUMENT_TYPE));
  assertEquals("OFFICIAL", storedProperties.get(Mongo.FIELD_DOCUMENT_CLASSIFICATION));

  final Collection<String> storedCaveats =
      (Collection<String>) storedProperties.get(Mongo.FIELD_DOCUMENT_CAVEATS);
  assertArrayEquals(new String[] {"TEST_A", "TEST_B"}, storedCaveats.toArray());

  final Collection<String> storedReleasability =
      (Collection<String>) storedProperties.get(Mongo.FIELD_DOCUMENT_RELEASABILITY);
  assertArrayEquals(new String[] {"ENG", "SCO", "WAL"}, storedReleasability.toArray());

  assertEquals(getDocumentAnnotation(jCas).getHash(), stored.get(fields.getExternalId()));
}