org.archive.io.warc.WARCRecord Java Examples

The following examples show how to use org.archive.io.warc.WARCRecord. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Page.java    From ache with Apache License 2.0 6 votes vote down vote up
/**
 * Reads a single CRLF-terminated line from the given WARC record.
 *
 * <p>Bytes are read one at a time and appended as chars until a {@code "\r\n"} sequence
 * is seen, the end of the stream is reached, or an {@link IOException} occurs. The
 * terminating CRLF is not part of the returned value.
 *
 * @param warc the record to read from
 * @return the characters read, without the trailing CRLF; an empty string when the line
 *         is empty or nothing could be read
 */
private String readHeaderLine(WARCRecord warc) {
    StringBuilder sb = new StringBuilder();
    try {
        int previous = '\n';
        int current;
        // Keep the value as an int: read() signals end-of-stream with -1, which can no
        // longer be detected once it has been narrowed to a char (it becomes '\uFFFF').
        while ((current = warc.read()) != -1) {
            if (current == '\n' && previous == '\r') {
                // trim the CR (\r) appended in the previous iteration
                sb.deleteCharAt(sb.length() - 1);
                break;
            }
            sb.append((char) current);
            previous = current;
        }
    } catch (IOException e) {
        // Best effort: fall through and return whatever was read before the failure.
    }
    return sb.toString();
}
 
Example #2
Source File: WarcTargetRepository.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the next record of the wrapped iterator.
 *
 * @return the next record, or {@code null} when the wrapped iterator has no more elements
 */
@Override
public WARCRecord next() {
    return warcRecordsIterator.hasNext() ? warcRecordsIterator.next() : null;
}
 
Example #3
Source File: WarcTargetRepository.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the record produced by {@code readNext()}.
 *
 * @return the record that was read, or {@code null} when reading failed with an
 *         {@link IOException} (the stack trace is printed in that case)
 */
@Override
public WARCRecord next() {
    try {
        return readNext();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
 
Example #4
Source File: WarcTargetRepositoryTest.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Stores one relevant page, closes the repository, and verifies that iterating over it
 * yields exactly one WARC record whose URL, Content-Type and ACHE relevance headers
 * match the inserted page.
 */
@Test
public void shouldStoreAndIterageOverData() throws IOException {
    // given
    String repositoryPath = tempFolder.newFolder().toString();

    Page insertedPage = new Page(new URL(url), html, responseHeaders);
    insertedPage.setTargetRelevance(TargetRelevance.RELEVANT);
    insertedPage.setFetchTime(System.currentTimeMillis());

    WarcTargetRepository warcRepository = new WarcTargetRepository(repositoryPath);

    // when
    warcRepository.insert(insertedPage);
    warcRepository.close();

    // The data file name is only inspected when the repository path is a directory.
    File repositoryDir = new File(repositoryPath);
    if (repositoryDir.isDirectory()) {
        String firstFileName = repositoryDir.listFiles()[0].getName();
        assertTrue(firstFileName.startsWith("crawl_data"));
    }

    Iterator<WARCRecord> records = warcRepository.iterator();

    // then: exactly one record is available
    assertThat(records.hasNext(), is(true));
    WARCRecord storedRecord = records.next();
    assertThat(records.hasNext(), is(false));

    assertThat(storedRecord.getHeader().getUrl(), is(url));

    assertThat(storedRecord.getHeader().getHeaderValue("Content-Type"),
            is(WARCConstants.HTTP_RESPONSE_MIMETYPE));

    // the relevance flag is compared in its string form
    assertThat(storedRecord.getHeader().getHeaderValue("ACHE-IsRelevant"),
            is(String.valueOf(insertedPage.getTargetRelevance().isRelevant())));

    // the relevance score is compared numerically rather than textually
    Double storedRelevance =
            Double.valueOf(storedRecord.getHeader().getHeaderValue("ACHE-Relevance").toString());
    Double expectedRelevance = Double.valueOf(insertedPage.getTargetRelevance().getRelevance());
    assertThat(storedRelevance, is(expectedRelevance));
}
 
Example #5
Source File: WarcTargetRepositoryTest.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Inserts two pages and verifies that the repository iterator returns their records in
 * insertion order and then reports that no further records are available.
 */
@Test
public void testReadingMultipleWarcRecordsUsingIterator() throws Exception {
    // given
    String repositoryPath = tempFolder.newFolder().toString();
    String firstUrl = "http://a.com";
    String secondUrl = "http://b.com";

    Page firstPage = new Page(new URL(firstUrl), html);
    Page secondPage = new Page(new URL(secondUrl), html);

    WarcTargetRepository warcRepository = new WarcTargetRepository(repositoryPath);

    // when
    warcRepository.insert(firstPage);
    warcRepository.insert(secondPage);
    warcRepository.close();

    RepositoryIterator records = warcRepository.iterator();

    // then: first record
    assertTrue(records.hasNext());
    WARCRecord current = records.next();
    assertThat(current.getHeader().getUrl(), is(firstUrl));

    // then: second record
    assertTrue(records.hasNext());
    current = records.next();
    assertThat(current.getHeader().getUrl(), is(secondUrl));

    // then: nothing left
    assertFalse(records.hasNext());
}
 
Example #6
Source File: WarcTargetRepositoryTest.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a page whose response header value contains a non-ASCII character
 * (0x80) can be stored and read back, and that the stored record data still contains
 * both the HTML body and the original header value.
 */
@Test
public void testShouldNotFailWhenThereAreNonASCIICharactersOnHeaders() throws Exception {
    // given
    String folder = tempFolder.newFolder().toString();

    String url1 = "http://a.com";

    Map<String, List<String>> headers = new HashMap<>();
    // A plain char concatenates to the same string as the boxed value did, without
    // going through the deprecated Character(char) constructor.
    char invalidChar = (char) 0x80;
    String headerValue = "inline; filename=\"Invalid_" + invalidChar + "\"";
    headers.put("Content-Disposition", asList(headerValue));

    Page target1 = new Page(new URL(url1), html, headers);

    WarcTargetRepository repository = new WarcTargetRepository(folder);

    // when
    repository.insert(target1);
    repository.close();

    RepositoryIterator respositoryIterator = repository.iterator();

    // then
    assertTrue(respositoryIterator.hasNext());
    WARCRecord record = respositoryIterator.next();
    assertThat(record.getHeader().getUrl(), is(url1));
    String recordData = IOUtils.toString(record);
    assertThat(recordData, containsString(html));
    assertThat(recordData, containsString(headerValue));

    assertFalse(respositoryIterator.hasNext());
}
 
Example #7
Source File: WarcTargetRepositoryTest.java    From ache with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a repository created on a new, empty folder exposes an iterator that
 * reports no elements and returns {@code null} from {@code next()}.
 */
@Test
public void shouldIterateOverEmptyFolder() throws IOException {
    // given: a repository backed by a freshly created folder
    String emptyFolderPath = tempFolder.newFolder().toString();
    WarcTargetRepository emptyRepository = new WarcTargetRepository(emptyFolderPath);

    // when
    Iterator<WARCRecord> records = emptyRepository.iterator();

    // then
    assertThat(records.hasNext(), is(false));
    assertThat(records.next(), is(nullValue()));
}
 
Example #8
Source File: ArcHarvestFileDTO.java    From webcurator with Apache License 2.0 5 votes vote down vote up
/**
 * Create and return the index of the ArcHarvestFile.
 *
 * <p>Opens the archive file named by {@code getName()} inside {@code baseDir}, records
 * whether it is compressed, and walks every record in it. WARC records are indexed only
 * when their record type is {@code response} and their mimetype is not
 * {@code text/dns}; every non-WARC record is indexed as an ARC record.
 *
 * @param baseDir the base directory of the arcs
 * @return the resources collected by the record-indexing helpers, keyed by string
 * @throws IOException thrown if there is an error
 * @throws ParseException declared for the record-indexing helpers, which are not part
 *         of this method (presumably raised when record metadata cannot be parsed)
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
	Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();
	
	File theArchiveFile = new File(baseDir, this.getName());
	ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
	try {
		this.compressed = reader.isCompressed();
		
		Iterator<ArchiveRecord> it = reader.iterator();
		while(it.hasNext()) {
			ArchiveRecord rec = it.next();
			
			if(rec instanceof WARCRecord) {
				String type = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
				if(type.equals(WARCConstants.RESPONSE)) {
					String mime = rec.getHeader().getMimetype();
					if(!mime.equals("text/dns")) {
						indexWARCResponse(rec, results);
					}
				}
			}
			else {
				indexARCRecord(rec, results);
			}
		}
	}
	finally {
		// Release the underlying file even when reading or indexing a record fails.
		reader.close();
	}
	
	return results;
}
 
Example #9
Source File: HeaderedArchiveRecordTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a WARC/0.12 response record in memory, wraps it in a
 * {@link HeaderedArchiveRecord}, and checks that the HTTP header can be skipped and is
 * parsed correctly, that the body is read back unchanged, and that the target URL is
 * exposed through the record header.
 */
public void testParseHttpHeadersInWARC() throws IOException {
    final String url = "http://foo.maths.uq.edu.au/index.html";
    // Older single-line WARC/0.10 form of the same header, kept for reference:
    // final String warcHeader = "WARC/0.10 000000000486 response " +
    // url + " 20070315152520 " +
    // "urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58 " +
    // "application/http; msgtype=response\r\n" +
    // "Checksum: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" +
    // "IP-Address: 80.150.6.184\r\n" +
    // "\r\n";

    // The target URI reuses `url` so the header and the final assertion cannot drift apart.
    final String warcHeader = "WARC/0.12\r\n"
       + "MIME-Version: 1.0\r\n"
       + "WARC-Record-Type: response\r\n"
       + "WARC-Target-URI: " + url + "\r\n"
       + "WARC-Date: 2006-09-19T17:20:24Z\r\n"
       + "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n"
       + "WARC-IP-Address: 80.150.6.184\r\n"
       + "Content-ID: <urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58>\r\n"
       + "Content-Type: application/http; msgtype=response\r\n"
       + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n"
       + "\r\n";

    final String hdr = warcHeader + HTTPHEADER + BODY;

    WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()),
            "READER_IDENTIFIER", 0, false, true);
    HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);

    har.skipHttpHeader();

    byte[] b = new byte[BODY.length()];
    har.read(b);
    String bodyRead = new String(b);
    assertEquals(BODY, bodyRead);
    assertHeaderCorrectlyParsed(har.getContentHeaders());
    // Expected value first, actual second, so a failure message reads correctly.
    assertEquals("failed to retrieve Url from metadata", url,
            har.getHeader().getUrl());
}
 
Example #10
Source File: HeaderedArchiveRecordTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps a minimal WARC/0.12 plain-text record, which carries no HTTP header, in a
 * {@link HeaderedArchiveRecord} created with the strict flag set, and checks that the
 * wrapper reports itself as strict.
 */
public void testNoheaderWARC() throws IOException {
    String payload = "hello world";
    String rawRecord = "WARC/0.12\r\n"
            + "Content-Type: text/plain\r\n"
            + "Content-Length: " + payload.length() + "\r\n"
            + "\r\n"
            + payload;

    ByteArrayInputStream recordStream = new ByteArrayInputStream(rawRecord.getBytes());
    org.archive.io.warc.WARCRecord warcRecord = new org.archive.io.warc.WARCRecord(
            recordStream, "READER_IDENTIFIER", 0, false, true);

    HeaderedArchiveRecord headeredRecord = new HeaderedArchiveRecord(warcRecord, true);
    assertTrue(headeredRecord.isStrict());
}
 
Example #11
Source File: Page.java    From ache with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a page from the contents of a WARC record.
 *
 * <p>The page URL is taken from the {@code ACHE-Requested-URL} header when it is present
 * and differs from the record URL; in that case the record URL is kept as the redirected
 * URL. The fetch time comes from the record date. Response header lines are then read
 * from the record body until an empty line is found, and the remaining bytes become the
 * page content ({@code null} if they cannot be read). The target relevance is restored
 * from the {@code ACHE-Relevance} and {@code ACHE-IsRelevant} headers; when the
 * relevance header is absent the field is left unset instead of failing with a
 * {@link NullPointerException}.
 *
 * @param warc the record to read the page from
 */
public Page(WARCRecord warc) {

    String warcUrl = warc.getHeader().getUrl();

    Map<String, Object> headerFields = warc.getHeader().getHeaderFields();

    String requestUrl = (String) headerFields.get("ACHE-Requested-URL");
    if (requestUrl == null || warcUrl.equals(requestUrl)) {
        this.url = createUrlObj(warcUrl);
    } else {
        this.url = createUrlObj(requestUrl);
        this.redirectedURL = createUrlObj(warcUrl);
    }

    this.fetchTime = Instant.parse(warc.getHeader().getDate()).toEpochMilli();

    this.responseHeaders = new HashMap<>();
    String line;
    // An empty line ends the header section (or the record has no more data).
    while (!(line = this.readHeaderLine(warc)).isEmpty()) {
        int index = line.indexOf(':');
        if (index == -1) {
            // Unexpected header found
            continue;
        }
        String key = line.substring(0, index).trim();
        String value = line.substring(index + 1).trim();
        // Repeated header names accumulate their values in order of appearance.
        this.responseHeaders.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
        if ("Content-Type".equalsIgnoreCase(key)) {
            this.contentType = value;
        }
    }

    try {
        this.content = ByteStreams.toByteArray(warc);
    } catch (IOException e) {
        this.content = null;
    }

    String relevance = (String) headerFields.get("ACHE-Relevance");
    if (relevance != null) {
        // A missing ACHE-IsRelevant header parses as false, as Boolean.valueOf(null) did.
        boolean isRelevant = Boolean.parseBoolean(
                (String) headerFields.get("ACHE-IsRelevant"));
        this.targetRelevance = new TargetRelevance(isRelevant, Double.parseDouble(relevance));
    }
}
 
Example #12
Source File: WarcTargetRepositoryTest.java    From ache with Apache License 2.0 4 votes vote down vote up
/**
 * Inserts two pages, then reads the written WARC file directly with a
 * {@code WARCReader} and verifies that, after the first record in the file, the next two
 * records carry the URLs of the inserted pages in insertion order.
 */
@Test
public void testReadingMultipleWarcRecords() throws Exception {
    // given
    String folder = tempFolder.newFolder().toString();

    String url1 = "http://a.com";
    String url2 = "http://b.com";

    Page target1 = new Page(new URL(url1), html, responseHeaders);
    target1.setFetchTime(System.currentTimeMillis());

    Page target2 = new Page(new URL(url2), html, responseHeaders);
    target2.setFetchTime(System.currentTimeMillis());

    WarcTargetRepository repository = new WarcTargetRepository(folder);

    // when
    repository.insert(target1);
    repository.insert(target2);
    repository.close();

    WARCWriter writer = repository.getWriter();
    WARCReader reader = WARCReaderFactory.get(writer.getFile());

    // then
    // The first record in the file is skipped (the flag name suggests it is the
    // warcinfo record); the two records after it must be the inserted pages.
    boolean readWarcInfoRecord = false;
    boolean readFirst = false;
    boolean readSecond = false;

    try {
        for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
            WARCRecord ar = (WARCRecord) i.next();
            if (!readWarcInfoRecord) {
                readWarcInfoRecord = true;
            } else if (!readFirst) {
                readFirst = true;
                assertThat(ar.getHeader().getUrl(), is(url1));
            } else if (!readSecond) {
                readSecond = true;
                assertThat(ar.getHeader().getUrl(), is(url2));
            }
        }
    } finally {
        // Close the reader even when one of the assertions above fails.
        reader.close();
    }

    // Without these checks the test would pass even if the file held too few records.
    assertTrue("first page record was not read", readFirst);
    assertTrue("second page record was not read", readSecond);
}
 
Example #13
Source File: WarcTargetRepositoryTest.java    From ache with Apache License 2.0 4 votes vote down vote up
/**
 * Inserts two pages into a repository created with a size argument of 400 and verifies
 * that two {@code crawl_data} files are produced and that iterating over the repository
 * returns both inserted URLs, in either order.
 */
@Test
public void testWritingToAWarcFileWithMaxSize() throws Exception {
    // given
    String repositoryPath = tempFolder.newFolder().toString();

    String firstUrl = "http://a.com";
    String secondUrl = "http://b.com";

    Page relevantPage = new Page(new URL(firstUrl), html, responseHeaders);
    Page irrelevantPage = new Page(new URL(secondUrl), html);

    relevantPage.setTargetRelevance(TargetRelevance.RELEVANT);
    irrelevantPage.setTargetRelevance(TargetRelevance.IRRELEVANT);

    WarcTargetRepository warcRepository = new WarcTargetRepository(repositoryPath, 400);

    // when
    warcRepository.insert(relevantPage);
    warcRepository.insert(irrelevantPage);
    warcRepository.close();

    // then: the data was written to two files
    File[] dataFiles = new File(repositoryPath).listFiles();
    assertTrue(dataFiles[0].getName().startsWith("crawl_data"));
    assertThat(dataFiles.length, is(2));
    assertTrue(dataFiles[1].getName().startsWith("crawl_data"));

    // then: each of the two records matches one not-yet-seen URL
    List<String> pendingUrls = new ArrayList<>(asList(firstUrl, secondUrl));

    RepositoryIterator records = warcRepository.iterator();

    for (int remaining = 2; remaining > 0; remaining--) {
        assertTrue(records.hasNext());
        WARCRecord current = records.next();
        String currentUrl = current.getHeader().getUrl();
        assertThat(currentUrl, isIn(pendingUrls));
        pendingUrls.remove(currentUrl);
    }

    assertThat(pendingUrls, empty());
}