org.archive.format.warc.WARCConstants Java Examples

The following examples show how to use org.archive.format.warc.WARCConstants. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: WarcTargetRepository.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the WARC {@code response} record for a fetched page.
 *
 * <p>The record carries the page's final URL, the supplied record id, the HTTP response
 * MIME type, a creation date derived from the page, the bytes returned by
 * {@code createContentBytes(page)}, and ACHE-specific extension headers.
 *
 * @param page the fetched page to convert
 * @param uri  the identifier assigned as the WARC record id
 * @return the populated record
 * @throws IOException declared by the signature; presumably raised while re-creating the
 *                     content bytes (confirm against {@code createContentBytes})
 */
public WARCRecordInfo getWarcRecordInfo(Page page, URI uri) throws IOException {
    WARCRecordInfo warcRecord = new WARCRecordInfo();

    warcRecord.setUrl(page.getFinalUrl());
    warcRecord.setRecordId(uri);
    warcRecord.setType(WARCRecordType.response);
    warcRecord.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);

    // Store the fetch time, rendered by dateFormater (ISO-8601 per the original note)
    Date fetchTime = createFetchTimeDate(page);
    warcRecord.setCreate14DigitDate(dateFormater.get().format(fetchTime));

    // Re-create response body based on content bytes and response headers
    byte[] contentBytes = createContentBytes(page);
    warcRecord.setContentLength(contentBytes.length);
    warcRecord.setContentStream(new ByteArrayInputStream(contentBytes));

    // Store ACHE-specific metadata as non-standard extension header fields
    TargetRelevance targetRelevance = page.getTargetRelevance();
    if (targetRelevance != null) {
        warcRecord.addExtraHeader("ACHE-IsRelevant", String.valueOf(targetRelevance.isRelevant()));
        // Locale.ROOT keeps the decimal separator a '.' regardless of the JVM's default
        // locale, so readers can parse the header value back with Double.valueOf.
        warcRecord.addExtraHeader("ACHE-Relevance",
                String.format(java.util.Locale.ROOT, "%.10f", targetRelevance.getRelevance()));
    }
    warcRecord.addExtraHeader("ACHE-Requested-URL", page.getRequestedUrl());

    return warcRecord;
}
 
Example #2
Source File: WarcTargetRepositoryTest.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a page inserted into a {@code WarcTargetRepository} is written to a
 * {@code crawl_data*} file and can be read back through the repository iterator with its
 * URL, content type and ACHE extension headers intact.
 */
@Test
public void shouldStoreAndIterageOverData() throws IOException {

    // given
    String folder = tempFolder.newFolder().toString();

    Page target = new Page(new URL(url), html, responseHeaders);
    target.setTargetRelevance(TargetRelevance.RELEVANT);
    target.setFetchTime(System.currentTimeMillis());

    WarcTargetRepository repository = new WarcTargetRepository(folder);

    // when
    repository.insert(target);
    repository.close();

    // Assert every precondition explicitly: a conditional check would be skipped silently
    // if the folder were missing, and File.listFiles() may return null or an empty array.
    File testFolder = new File(folder);
    assertTrue(testFolder.isDirectory());
    File[] allFiles = testFolder.listFiles();
    assertTrue(allFiles != null && allFiles.length > 0);
    assertTrue(allFiles[0].getName().startsWith("crawl_data"));

    Iterator<WARCRecord> it = repository.iterator();

    // then: exactly one record was stored
    assertThat(it.hasNext(), is(true));
    WARCRecord page = it.next();
    assertThat(it.hasNext(), is(false));

    assertThat(page.getHeader().getUrl(), is(url));

    assertThat(page.getHeader().getHeaderValue("Content-Type"),
            is(WARCConstants.HTTP_RESPONSE_MIMETYPE));

    assertThat(page.getHeader().getHeaderValue("ACHE-IsRelevant"),
            is(target.getTargetRelevance().isRelevant() + ""));

    assertThat(Double.valueOf(page.getHeader().getHeaderValue("ACHE-Relevance").toString()),
            is(Double.valueOf(target.getTargetRelevance().getRelevance())));
}
 
Example #3
Source File: ExtractingResourceFactoryMapper.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Tells whether the envelope's WARC header metadata declares an HTTP response content type.
 *
 * <p>Compares the {@code WARCConstants.CONTENT_TYPE} field, via {@code childFieldEquals},
 * first against {@code WARCConstants.HTTP_RESPONSE_MIMETYPE} and, only if that does not
 * match, against {@code WARCConstants.HTTP_RESPONSE_MIMETYPE_NS}.
 *
 * @param envelope the metadata envelope to inspect
 * @return {@code true} if either comparison matches
 */
private boolean isHTTPResponseWARCResource(MetaData envelope) {
	if (childFieldEquals(envelope, WARC_HEADER_METADATA,
			WARCConstants.CONTENT_TYPE, WARCConstants.HTTP_RESPONSE_MIMETYPE)) {
		return true;
	}
	// Fall back to the second HTTP response MIME type constant.
	return childFieldEquals(envelope, WARC_HEADER_METADATA,
			WARCConstants.CONTENT_TYPE, WARCConstants.HTTP_RESPONSE_MIMETYPE_NS);
}
 
Example #4
Source File: WARCReaderFactoryTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that every test file can be opened through
 * {@code WARCReaderFactory.get(String, InputStream, boolean)} and that the first record
 * read from each one is a {@code warcinfo} record.
 *
 * @throws IOException if a test file cannot be opened or read
 */
public void testGetStringInputstreamBoolean() throws IOException {
	// Check the test files can be opened:
	for (String file : files) {
		// try-with-resources releases the stream even when reading or an assertion fails,
		// so the loop does not leak one file handle per test file.
		try (FileInputStream is = new FileInputStream(file)) {
			ArchiveReader ar = WARCReaderFactory.get(file, is, true);
			ArchiveRecord r = ar.get();
			String type = (String) r.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
			// Check the first record comes out as a 'warcinfo' record.
			assertEquals(WARCRecordType.warcinfo.name(), type);
		}
	}
}
 
Example #5
Source File: WarcTargetRepository.java    From ache with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a repository for the given directory path.
 *
 * <p>Delegates to the three-argument constructor, converting {@code directory} with
 * {@code Paths.get} and passing {@code WARCConstants.DEFAULT_MAX_WARC_FILE_SIZE} and
 * {@code true}.
 * NOTE(review): the meaning of the trailing {@code true} flag is not visible here; confirm
 * against the delegated constructor.
 *
 * @param directory the directory path, as a string
 * @throws IOException declared by the signature; presumably propagated from the delegated
 *                     constructor
 */
public WarcTargetRepository(String directory) throws IOException {
    this(Paths.get(directory), WARCConstants.DEFAULT_MAX_WARC_FILE_SIZE, true);
}
 
Example #6
Source File: ArcDigitalAssetStoreService.java    From webcurator with Apache License 2.0 4 votes vote down vote up
/**
 * Runs {@code HttpParser.parseHeaders} over the record using
 * {@code WARCConstants.DEFAULT_ENCODING} and discards the result.
 *
 * <p>NOTE(review): presumably this advances the record's read position past the HTTP
 * headers so later reads start at the payload; confirm against {@code HttpParser}.
 *
 * @param record the archive record to read headers from
 * @throws IOException declared by the signature; presumably raised when reading the record fails
 */
private void skipHeaders(ArchiveRecord record) throws IOException {
    // The return value is intentionally ignored; only the call's effect on the record is used.
    HttpParser.parseHeaders(record, WARCConstants.DEFAULT_ENCODING);
}
 
Example #7
Source File: ExtractingResourceFactoryMapper.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether the envelope's WARC header metadata carries the given record type.
 *
 * <p>Compares the {@code WARCConstants.HEADER_KEY_TYPE} field, via {@code childFieldEquals},
 * against {@code type.toString()}.
 *
 * @param envelope the metadata envelope to inspect
 * @param type     the WARC record type to look for
 * @return the result of the {@code childFieldEquals} comparison
 */
private boolean isWARCType(MetaData envelope, WARCRecordType type) {
	final String expectedType = type.toString();
	return childFieldEquals(envelope, WARC_HEADER_METADATA,
			WARCConstants.HEADER_KEY_TYPE, expectedType);
}
 
Example #8
Source File: ExtractingResourceFactoryMapper.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether the envelope's WARC header metadata declares a JSON content type.
 *
 * <p>Compares the {@code WARCConstants.CONTENT_TYPE} field, via {@code childFieldEquals},
 * against the literal {@code application/json}.
 *
 * @param envelope the metadata envelope to inspect
 * @return the result of the {@code childFieldEquals} comparison
 */
private boolean isWARCJSONResource(MetaData envelope) {
	final String jsonMimeType = "application/json";
	return childFieldEquals(envelope, WARC_HEADER_METADATA,
			WARCConstants.CONTENT_TYPE, jsonMimeType);
}
 
Example #9
Source File: ExtractingResourceFactoryMapper.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether the envelope's WARC header metadata declares the DNS payload content type.
 *
 * <p>Compares the {@code WARCConstants.CONTENT_TYPE} field, via {@code childFieldEquals},
 * against {@code PAYLOAD_TYPE_DNS}.
 *
 * @param envelope the metadata envelope to inspect
 * @return the result of the {@code childFieldEquals} comparison
 */
private boolean isDNSResponseWARCResource(MetaData envelope) {
	final boolean hasDnsContentType = childFieldEquals(
			envelope, WARC_HEADER_METADATA, WARCConstants.CONTENT_TYPE, PAYLOAD_TYPE_DNS);
	return hasDnsContentType;
}