Java Code Examples for org.apache.uima.jcas.tcas.DocumentAnnotation#setTimestamp()
The following examples show how to use
org.apache.uima.jcas.tcas.DocumentAnnotation#setTimestamp().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ElasticsearchTestBase.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Resets the shared JCas and populates it with a short document that carries document-level
 * metadata only; no other annotations are created here.
 *
 * @return the timestamp (milliseconds since the epoch) that was stored on the document annotation
 */
protected long createNoEntitiesDocument() {
  jCas.reset();
  jCas.setDocumentText("Hello World");
  jCas.setDocumentLanguage("en");

  // Capture the time once so the stored value and the returned value are identical.
  final long now = System.currentTimeMillis();

  final DocumentAnnotation annotation = getDocumentAnnotation(jCas);
  annotation.setTimestamp(now);
  annotation.setSourceUri("test/no_entities");
  annotation.setDocType("test");
  annotation.setDocumentClassification("OFFICIAL");
  annotation.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("TEST_A", "TEST_B")));
  annotation.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, Arrays.asList("ENG", "SCO", "WAL")));

  return now;
}
Example 2
Source File: MboxReader.java From baleen with Apache License 2.0 | 5 votes |
/** Process a single body part */ private boolean processBody(JCas jCas, Body body, String sourceUri) throws IOException { if (body instanceof TextBody) { // Process plain text body processTextBody(jCas, (TextBody) body); // Add fields from parent for (Field f : body.getParent().getHeader().getFields()) { addMetadata(jCas, f.getName(), f.getBody()); } // Set up document annotation - this is done by the content extractor in other cases DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(sourceUri); doc.setTimestamp(System.currentTimeMillis()); } else if (body instanceof BinaryBody) { processBinaryBody(jCas, (BinaryBody) body, sourceUri); } else if (body instanceof Multipart) { // Multipart message, so recurse Multipart mp = (Multipart) body; return processMultipart(jCas, mp, sourceUri); } else { // No body processed return false; } return true; }
Example 3
Source File: AbstractContentExtractor.java From baleen with Apache License 2.0 | 5 votes |
@Override public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException { DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(source); doc.setTimestamp(System.currentTimeMillis()); // Add metadata item to capture which content extractor was used addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, this.getClass().getName()); }
Example 4
Source File: JCasDeserialiser.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Copies document-level fields from a deserialised map onto the document annotation.
 *
 * <p>String fields fall back to the empty string when their key is absent. The timestamp falls
 * back to {@code 0} when its key is absent or mapped to {@code null}. Caveats and releasability
 * are passed to {@code UimaTypesUtils.toArray}, which receives {@code null} when the key is
 * absent.
 *
 * @param jCas the JCas used to create the string arrays
 * @param da the document annotation to populate
 * @param map the deserialised field values, keyed by the {@code JsonJCas.DA_*} constants
 * @throws ClassCastException if a value in the map is not of the expected type
 */
private void processDocumentAnnotation(
    final JCas jCas, final DocumentAnnotation da, final Map<String, Object> map) {
  da.setDocType((String) map.getOrDefault(JsonJCas.DA_DOCUMENT_TYPE, ""));
  da.setDocumentClassification((String) map.getOrDefault(JsonJCas.DA_CLASSIFICATION, ""));
  da.setLanguage((String) map.getOrDefault(JsonJCas.DA_LANGUAGE, ""));
  da.setSourceUri((String) map.getOrDefault(JsonJCas.DA_SOURCE_URI, ""));

  // getOrDefault only substitutes the default when the key is absent; a key that is present but
  // mapped to null would make longValue() throw a NullPointerException, so handle null here.
  final Number timestamp = (Number) map.get(JsonJCas.DA_TIMESTAMP);
  da.setTimestamp(timestamp == null ? 0L : timestamp.longValue());

  da.setDocumentCaveats(
      UimaTypesUtils.toArray(jCas, (Collection<String>) map.get(JsonJCas.DA_CAVEATS)));
  da.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, (Collection<String>) map.get(JsonJCas.DA_RELEASABILITY)));
}
Example 5
Source File: YYYYMMDDAssigner.java From baleen with Apache License 2.0 | 5 votes |
@Override protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException { final DocumentAnnotation da = getDocumentAnnotation(jCas); final String source = da.getSourceUri(); final Matcher matcher = pattern.matcher(source); if (matcher.matches()) { try { final int y = Integer.parseInt(matcher.group("year")); final int m = Integer.parseInt(matcher.group("month")); final int d = Integer.parseInt(matcher.group("day")); if (m >= 1 && m <= 12 && d >= 1 && d <= 31) { // This will check if its' actually valid (31 Feb) it's actualy valid date... final LocalDate date = LocalDate.of(y, m, d); final long ts = date.atStartOfDay().atOffset(ZoneOffset.UTC).toInstant().toEpochMilli(); da.setTimestamp(ts); } } catch (final Exception e) { // Do nothing.. not a valid source path... getMonitor().warn("Cant parse date from source uri {} ", source, e); } } }
Example 6
Source File: MimeReader.java From baleen with Apache License 2.0 | 4 votes |
@Override protected void doGetNext(final JCas jCas) throws IOException, CollectionException { final Path path = files.pop(); final File file = path.toFile(); final int left = files.size(); getMonitor() .info( "Processing {} ({} %)", file.getAbsolutePath(), String.format("%.2f", 100 * (total - left) / (double) total)); try (FileInputStream is = new FileInputStream(file)) { final Session s = Session.getDefaultInstance(new Properties()); final MimeMessageParser parser = new MimeMessageParser(new MimeMessage(s, is)); parser.parse(); final MimeMessage message = parser.getMimeMessage(); final DocumentAnnotation da = UimaSupport.getDocumentAnnotation(jCas); da.setTimestamp(calculateBestDate(message, file)); da.setDocType("email"); da.setDocumentClassification("O"); String source = file.getAbsolutePath().substring(rootFolder.length()); da.setSourceUri(source); da.setLanguage("en"); // Add all headers as metadata, with email prefix final Enumeration<Header> allHeaders = message.getAllHeaders(); while (allHeaders.hasMoreElements()) { final Header header = allHeaders.nextElement(); addMetadata(jCas, "email." + header.getName(), header.getValue()); } addMetadata(jCas, "from", parser.getFrom()); addMetadata(jCas, "to", parser.getTo()); addMetadata(jCas, "cc", parser.getCc()); addMetadata(jCas, "bcc", parser.getBcc()); addMetadata(jCas, "subject", parser.getSubject()); // Add fake title addMetadata(jCas, "title", parser.getSubject()); String actualContent = parser.getPlainContent(); if (actualContent == null) { actualContent = ""; } // TODO: At this point we could create a representation of the addresses, etc in the content // eg a table of to, from, and etc // then annotate them a commsidentifier, date, person. 
// We could also create relations between sender and receiver String content = actualContent + "\n\n---\n\n"; final String headerBlock = createHeaderBlock(content.length(), jCas, parser); content = content + headerBlock; final Text text = new Text(jCas); text.setBegin(0); text.setEnd(actualContent.length()); text.addToIndexes(); extractContent(new ByteArrayInputStream(content.getBytes()), source, jCas); } catch (final Exception e) { getMonitor().warn("Discarding message", e); } }
Example 7
Source File: CsvFolderReader.java From baleen with Apache License 2.0 | 4 votes |
@Override public void doGetNext(JCas jCas) throws IOException, CollectionException { if (currLines.isEmpty()) { // Read next file currPath = queue.remove(0); currentLine = 0; getMonitor().info("Processing file {}", currPath.toString()); List<String> lines; try (Stream<String> ln = Files.lines(currPath)) { lines = ln.collect(Collectors.toList()); } String header = lines.remove(0); columnHeadings = Arrays.asList(csvParser.parseLine(header)); currLines.addAll(lines); } String line = currLines.remove(0); String[] cols = csvParser.parseLine(line); currentLine++; StringJoiner sj = new StringJoiner("\n\n"); Map<String, String> meta = new HashMap<>(); for (int i = 0; i < columnHeadings.size(); i++) { if (inArray(columnHeadings.get(i), textColumn)) { sj.add(cols[i]); } else { meta.put(columnHeadings.get(i), cols[i]); } } jCas.setDocumentText(sj.toString()); for (Map.Entry<String, String> e : meta.entrySet()) { Metadata md = new Metadata(jCas); md.setKey(e.getKey()); md.setValue(e.getValue()); md.addToIndexes(); } DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(currPath.toString() + "#" + currentLine); doc.setTimestamp(System.currentTimeMillis()); }
Example 8
Source File: JCasTestGraphUtil.java From baleen with Apache License 2.0 | 4 votes |
/**
 * Fills the given JCas with a fixed test fixture: document text and document-level fields, two
 * metadata entries, a published id, a reference target, three people, a location, two relations
 * and an event.
 *
 * @param jCas the JCas to populate
 */
public static void populateJcas(final JCas jCas) {
  jCas.setDocumentText(CONTENT);

  // --- Document annotation ---
  final DocumentAnnotation docAnnotation = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  docAnnotation.setDocumentClassification("CLASS");
  docAnnotation.setDocType("MANUAL");
  docAnnotation.setSourceUri("http://test.com");
  docAnnotation.setLanguage("en");
  docAnnotation.setTimestamp(System.currentTimeMillis());
  docAnnotation.setDocumentCaveats(new StringArray(jCas, 2));
  docAnnotation.setDocumentCaveats(0, "GITHUB");
  docAnnotation.setDocumentCaveats(1, "CAVEAT");

  // --- Metadata: two entries sharing the same key ---
  final Metadata firstMetadata = new Metadata(jCas);
  firstMetadata.setKey("test");
  firstMetadata.setValue("1");
  firstMetadata.addToIndexes(jCas);

  final Metadata secondMetadata = new Metadata(jCas);
  secondMetadata.setKey("test");
  secondMetadata.setValue("2");
  secondMetadata.addToIndexes(jCas);

  // --- Published id ---
  final PublishedId publishedId = new PublishedId(jCas);
  publishedId.setPublishedIdType("test");
  publishedId.setValue("12");
  publishedId.addToIndexes(jCas);

  // --- Reference target shared by "John Smith" and "He" ---
  ReferenceTarget sharedTarget = new ReferenceTarget(jCas);
  sharedTarget.setLinking("testLinking");
  sharedTarget.addToIndexes(jCas);

  // --- People ---
  final Person johnSmith = new Person(jCas);
  johnSmith.setBegin(25);
  johnSmith.setEnd(35);
  johnSmith.setGender("Male");
  johnSmith.setValue("John Smith");
  johnSmith.setConfidence(0.9d);
  johnSmith.setReferent(sharedTarget);
  johnSmith.addToIndexes(jCas);

  final Person janeDoe = new Person(jCas);
  janeDoe.setBegin(50);
  janeDoe.setEnd(58);
  janeDoe.setGender("Female");
  janeDoe.setValue("Jane Doe");
  janeDoe.setConfidence(0.8d);
  janeDoe.addToIndexes(jCas);

  final Person pronoun = new Person(jCas);
  pronoun.setBegin(60);
  pronoun.setEnd(62);
  pronoun.setGender("Male");
  pronoun.setValue("He");
  pronoun.setConfidence(0.9d);
  pronoun.setReferent(sharedTarget);
  pronoun.addToIndexes(jCas);

  // --- Location ---
  final Location location = new Location(jCas);
  location.setBegin(72);
  location.setEnd(87);
  location.setGeoJson(GEO_JSON);
  location.setValue("Dinagat Islands");
  location.setConfidence(0.9d);
  location.addToIndexes(jCas);

  // --- Relations ---
  final Relation relatedTo = new Relation(jCas);
  relatedTo.setBegin(36);
  relatedTo.setEnd(49);
  relatedTo.setValue("is related to");
  relatedTo.setRelationshipType(RELATED_TYPE);
  relatedTo.setSource(johnSmith);
  relatedTo.setTarget(janeDoe);
  relatedTo.addToIndexes(jCas);

  final Relation livesAt = new Relation(jCas);
  livesAt.setBegin(63);
  livesAt.setEnd(71);
  livesAt.setValue("lives at");
  livesAt.setRelationshipType(LIVES_TYPE);
  livesAt.setSource(johnSmith);
  livesAt.setTarget(location);
  livesAt.addToIndexes(jCas);

  // --- Event involving John Smith and Jane Doe ---
  final Event meeting = new Event(jCas);
  meeting.setBegin(0);
  meeting.setEnd(10);
  meeting.setValue("test event");
  meeting.setEventType(new StringArray(jCas, 1));
  meeting.setEventType(0, "MEETING");
  meeting.setEntities(new FSArray(jCas, 2));
  meeting.setEntities(0, johnSmith);
  meeting.setEntities(1, janeDoe);
  meeting.setArguments(new StringArray(jCas, 2));
  meeting.setArguments(0, "argument");
  meeting.setArguments(1, "Other");
  meeting.addToIndexes(jCas);
}
Example 9
Source File: MongoTest.java From baleen with Apache License 2.0 | 4 votes |
/**
 * Processes a document that has document-level metadata only and checks that exactly one Mongo
 * document is written, with its content, document fields and external id matching what was set
 * on the JCas.
 */
@SuppressWarnings("unchecked")
@Test
public void testNoEntities() throws Exception {
  jCas.setDocumentText(TEXT);
  jCas.setDocumentLanguage("en");

  final long now = System.currentTimeMillis();

  final DocumentAnnotation annotation = getDocumentAnnotation(jCas);
  annotation.setTimestamp(now);
  annotation.setSourceUri("test/no_entities");
  annotation.setDocType("test");
  annotation.setDocumentClassification("OFFICIAL");
  annotation.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("TEST_A", "TEST_B")));
  annotation.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, Arrays.asList("ENG", "SCO", "WAL")));

  ae.process(jCas);

  assertEquals(1, documents.count());

  final Document result = documents.find().first();
  assertEquals(TEXT, result.get(Mongo.FIELD_CONTENT));

  // All document-level fields live in one nested sub-document; look it up once.
  final Document documentFields = (Document) result.get(Mongo.FIELD_DOCUMENT);

  assertEquals("en", documentFields.get(Mongo.FIELD_DOCUMENT_LANGUAGE));
  assertEquals(new Date(now), documentFields.get(Mongo.FIELD_DOCUMENT_TIMESTAMP));
  assertEquals("test/no_entities", documentFields.get(Mongo.FIELD_DOCUMENT_SOURCE));
  assertEquals("test", documentFields.get(Mongo.FIELD_DOCUMENT_TYPE));
  assertEquals("OFFICIAL", documentFields.get(Mongo.FIELD_DOCUMENT_CLASSIFICATION));

  final Collection<String> storedCaveats =
      (Collection<String>) documentFields.get(Mongo.FIELD_DOCUMENT_CAVEATS);
  assertArrayEquals(new String[] {"TEST_A", "TEST_B"}, storedCaveats.toArray());

  final Collection<String> storedReleasability =
      (Collection<String>) documentFields.get(Mongo.FIELD_DOCUMENT_RELEASABILITY);
  assertArrayEquals(new String[] {"ENG", "SCO", "WAL"}, storedReleasability.toArray());

  assertEquals(getDocumentAnnotation(jCas).getHash(), result.get(fields.getExternalId()));
}