org.archive.io.warc.WARCRecord Java Examples
The following examples show how to use
org.archive.io.warc.WARCRecord.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Page.java From ache with Apache License 2.0 | 6 votes |
private String readHeaderLine(WARCRecord warc) { StringBuilder sb = new StringBuilder(); try { char c; char previous = '\n'; do { c = (char) warc.read(); if (c == '\n' && previous == '\r') { // trim the CR (\r) from last iteration sb.deleteCharAt(sb.length() - 1); break; } sb.append((char) c); previous = c; } while (c != -1); return sb.toString(); } catch (IOException e) { return sb.toString(); } }
Example #2
Source File: WarcTargetRepository.java From ache with Apache License 2.0 | 5 votes |
/**
 * Returns the next record from the wrapped iterator.
 *
 * @return the next record, or {@code null} when the wrapped iterator has no
 *         more elements (no exception is raised in that case)
 */
@Override
public WARCRecord next() {
    return warcRecordsIterator.hasNext() ? warcRecordsIterator.next() : null;
}
Example #3
Source File: WarcTargetRepository.java From ache with Apache License 2.0 | 5 votes |
/**
 * Returns the record produced by {@code readNext()}.
 *
 * @return the record that was read, or {@code null} when reading failed with
 *         an {@link IOException} (the stack trace is printed, not rethrown)
 */
@Override
public WARCRecord next() {
    try {
        return readNext();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
Example #4
Source File: WarcTargetRepositoryTest.java From ache with Apache License 2.0 | 5 votes |
@Test public void shouldStoreAndIterageOverData() throws IOException { String folder = tempFolder.newFolder().toString(); Page target = new Page(new URL(url), html, responseHeaders); target.setTargetRelevance(TargetRelevance.RELEVANT); target.setFetchTime(System.currentTimeMillis()); WarcTargetRepository repository = new WarcTargetRepository(folder); // when repository.insert(target); repository.close(); File testFolder = new File(folder); if (testFolder.isDirectory()) { File[] allFiles = testFolder.listFiles(); assertTrue(allFiles[0].getName().startsWith("crawl_data")); } Iterator<WARCRecord> it = repository.iterator(); // then assertThat(it.hasNext(), is(true)); WARCRecord page = it.next(); assertThat(it.hasNext(), is(false)); assertThat(page.getHeader().getUrl(), is(url)); assertThat(page.getHeader().getHeaderValue("Content-Type"), is(WARCConstants.HTTP_RESPONSE_MIMETYPE)); assertThat(page.getHeader().getHeaderValue("ACHE-IsRelevant"), is(target.getTargetRelevance().isRelevant() + "")); assertThat(Double.valueOf(page.getHeader().getHeaderValue("ACHE-Relevance").toString()), is(Double.valueOf(target.getTargetRelevance().getRelevance()))); }
Example #5
Source File: WarcTargetRepositoryTest.java From ache with Apache License 2.0 | 5 votes |
@Test public void testReadingMultipleWarcRecordsUsingIterator() throws Exception { // given String folder = tempFolder.newFolder().toString(); String url1 = "http://a.com"; String url2 = "http://b.com"; Page target1 = new Page(new URL(url1), html); Page target2 = new Page(new URL(url2), html); WarcTargetRepository repository = new WarcTargetRepository(folder); // when repository.insert(target1); repository.insert(target2); repository.close(); RepositoryIterator respositoryIterator = repository.iterator(); // then assertTrue(respositoryIterator.hasNext()); WARCRecord record = respositoryIterator.next(); assertThat(record.getHeader().getUrl(), is(url1)); assertTrue(respositoryIterator.hasNext()); record = respositoryIterator.next(); assertThat(record.getHeader().getUrl(), is(url2)); assertFalse(respositoryIterator.hasNext()); }
Example #6
Source File: WarcTargetRepositoryTest.java From ache with Apache License 2.0 | 5 votes |
@Test public void testShouldNotFailWhenThereAreNonASCIICharactersOnHeaders() throws Exception { // given String folder = tempFolder.newFolder().toString(); String url1 = "http://a.com"; Map<String, List<String>> headers = new HashMap<>(); Character invalidChar = new Character((char) 0x80); String headerValue = "inline; filename=\"Invalid_" + invalidChar + "\""; headers.put("Content-Disposition", asList(headerValue)); Page target1 = new Page(new URL(url1), html, headers); WarcTargetRepository repository = new WarcTargetRepository(folder); // when repository.insert(target1); repository.close(); RepositoryIterator respositoryIterator = repository.iterator(); // then assertTrue(respositoryIterator.hasNext()); WARCRecord record = respositoryIterator.next(); assertThat(record.getHeader().getUrl(), is(url1)); String recordData = IOUtils.toString(record); assertThat(recordData, containsString(html)); assertThat(recordData, containsString(headerValue)); assertFalse(respositoryIterator.hasNext()); }
Example #7
Source File: WarcTargetRepositoryTest.java From ache with Apache License 2.0 | 5 votes |
@Test public void shouldIterateOverEmptyFolder() throws IOException { // given String folder = tempFolder.newFolder().toString(); WarcTargetRepository repository = new WarcTargetRepository(folder); // when Iterator<WARCRecord> it = repository.iterator(); // then assertThat(it.hasNext(), is(false)); assertThat(it.next(), is(nullValue())); }
Example #8
Source File: ArcHarvestFileDTO.java From webcurator with Apache License 2.0 | 5 votes |
/**
 * Create and return the index of the ArcHarvestFile.
 *
 * <p>Opens the archive named by {@code getName()} inside {@code baseDir},
 * records whether it is compressed, and walks every record in it. WARC
 * records are indexed only when their record type is
 * {@code WARCConstants.RESPONSE} and their mimetype is not
 * {@code text/dns}; every non-WARC record is indexed as an ARC record.
 * The reader is always closed, including when indexing a record fails.
 *
 * @param baseDir the base directory of the arcs
 * @return the resources indexed from the archive file
 * @throws IOException thrown if there is an error
 * @throws ParseException propagated from the indexing helpers (presumably
 *         when a record field cannot be parsed; confirm against them)
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
    Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();

    File theArchiveFile = new File(baseDir, this.getName());
    ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
    try {
        this.compressed = reader.isCompressed();

        Iterator<ArchiveRecord> it = reader.iterator();
        while (it.hasNext()) {
            ArchiveRecord rec = it.next();

            if (!(rec instanceof WARCRecord)) {
                indexARCRecord(rec, results);
                continue;
            }

            String type = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
            if (type.equals(WARCConstants.RESPONSE)) {
                String mime = rec.getHeader().getMimetype();
                if (!mime.equals("text/dns")) {
                    indexWARCResponse(rec, results);
                }
            }
        }
    } finally {
        // Release the underlying file handle even when a record fails to index.
        reader.close();
    }

    return results;
}
Example #9
Source File: HeaderedArchiveRecordTest.java From webarchive-commons with Apache License 2.0 | 5 votes |
public void testParseHttpHeadersInWARC() throws IOException { final String url = "http://foo.maths.uq.edu.au/index.html"; // final String warcHeader = "WARC/0.10 000000000486 response " + // url + " 20070315152520 " + // "urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58 " + // "application/http; msgtype=response\r\n" + // "Checksum: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" + // "IP-Address: 80.150.6.184\r\n" + // "\r\n"; final String warcHeader = "WARC/0.12\r\n" + "MIME-Version: 1.0\r\n" + "WARC-Record-Type: response\r\n" + "WARC-Target-URI: http://foo.maths.uq.edu.au/index.html\r\n" + "WARC-Date: 2006-09-19T17:20:24Z\r\n" + "WARC-Digest: sha1:IT6YEX5WHKK57GOEHV2YHTTXEP5KPM6A\r\n" + "WARC-IP-Address: 80.150.6.184\r\n" + "Content-ID: <urn:uuid:d8b342a8-dba4-4d7f-a551-1d8184f2ff58>\r\n" + "Content-Type: application/http; msgtype=response\r\n" + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n" + "\r\n"; final String hdr = warcHeader + HTTPHEADER + BODY; WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()), "READER_IDENTIFIER", 0, false, true); HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true); har.skipHttpHeader(); byte[] b = new byte[BODY.length()]; har.read(b); String bodyRead = new String(b); assertEquals(BODY, bodyRead); assertHeaderCorrectlyParsed(har.getContentHeaders()); assertEquals("failed to retrieve Url from metadata", har.getHeader() .getUrl(), url); }
Example #10
Source File: HeaderedArchiveRecordTest.java From webarchive-commons with Apache License 2.0 | 5 votes |
/**
 * Wraps a minimal text/plain WARC/0.12 record (no HTTP header in its
 * payload) in a {@code HeaderedArchiveRecord} created with the strict flag
 * and checks that the wrapper reports itself as strict.
 */
public void testNoheaderWARC() throws IOException {
    String payload = "hello world";
    String rawRecord = "WARC/0.12\r\nContent-Type: text/plain\r\n"
            + "Content-Length: " + payload.length() + "\r\n\r\n" + payload;

    ByteArrayInputStream input = new ByteArrayInputStream(rawRecord.getBytes());
    org.archive.io.warc.WARCRecord record =
            new org.archive.io.warc.WARCRecord(input, "READER_IDENTIFIER", 0, false, true);

    HeaderedArchiveRecord har = new HeaderedArchiveRecord(record, true);
    assertTrue(har.isStrict());
}
Example #11
Source File: Page.java From ache with Apache License 2.0 | 4 votes |
/**
 * Builds a page from a WARC record.
 *
 * <p>When the record carries an {@code ACHE-Requested-URL} header that
 * differs from the record URL, the requested URL becomes the page URL and
 * the record URL is kept as the redirected URL; otherwise the record URL is
 * the page URL. The fetch time comes from the record date. Header lines are
 * then read from the record until an empty line, and whatever remains in
 * the stream becomes the content ({@code null} if reading it fails).
 * Relevance is taken from the {@code ACHE-Relevance} and
 * {@code ACHE-IsRelevant} header fields.
 *
 * @param warc the record to read the page from
 */
public Page(WARCRecord warc) {
    String warcUrl = warc.getHeader().getUrl();
    Map<String, Object> headerFields = warc.getHeader().getHeaderFields();

    String requestUrl = (String) headerFields.get("ACHE-Requested-URL");
    boolean wasRedirected = requestUrl != null && !warcUrl.equals(requestUrl);
    if (wasRedirected) {
        this.url = createUrlObj(requestUrl);
        this.redirectedURL = createUrlObj(warcUrl);
    } else {
        this.url = createUrlObj(warcUrl);
    }

    this.fetchTime = Instant.parse(warc.getHeader().getDate()).toEpochMilli();

    // The response headers precede the body; an empty line terminates them.
    this.responseHeaders = new HashMap<>();
    for (String line = this.readHeaderLine(warc);
            !line.isEmpty();
            line = this.readHeaderLine(warc)) {
        int colon = line.indexOf(":");
        if (colon < 0) {
            // Unexpected header found
            continue;
        }
        String value = line.substring(colon + 1).trim();
        String key = line.substring(0, colon).trim();
        this.responseHeaders.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
        if ("Content-Type".equalsIgnoreCase(key)) {
            this.contentType = value;
        }
    }

    // Everything left in the stream after the headers is the page content.
    try {
        this.content = ByteStreams.toByteArray(warc);
    } catch (IOException e) {
        this.content = null;
    }

    double relevance = Double.parseDouble((String) headerFields.get("ACHE-Relevance"));
    boolean isRelevant = Boolean.parseBoolean((String) headerFields.get("ACHE-IsRelevant"));
    this.targetRelevance = new TargetRelevance(isRelevant, relevance);
}
Example #12
Source File: WarcTargetRepositoryTest.java From ache with Apache License 2.0 | 4 votes |
@Test public void testReadingMultipleWarcRecords() throws Exception { String folder = tempFolder.newFolder().toString(); String url1 = "http://a.com"; String url2 = "http://b.com"; Page target1 = new Page(new URL(url1), html, responseHeaders); target1.setFetchTime(System.currentTimeMillis()); Page target2 = new Page(new URL(url2), html, responseHeaders); target2.setFetchTime(System.currentTimeMillis()); WarcTargetRepository repository = new WarcTargetRepository(folder); // when repository.insert(target1); repository.insert(target2); repository.close(); WARCWriter writer = repository.getWriter(); WARCReader reader = WARCReaderFactory.get(writer.getFile()); // Get to second record. Get its offset for later use. boolean readWarcInfoRecord = false; boolean readFirst = false; boolean readSecond = false; for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) { WARCRecord ar = (WARCRecord) i.next(); if (!readWarcInfoRecord) { readWarcInfoRecord = true; } else if (!readFirst) { readFirst = true; assertThat(ar.getHeader().getUrl(), is(url1)); continue; } else if (!readSecond) { url = ar.getHeader().getUrl(); assertThat(ar.getHeader().getUrl(), is(url2)); readSecond = true; } } reader.close(); }
Example #13
Source File: WarcTargetRepositoryTest.java From ache with Apache License 2.0 | 4 votes |
@Test public void testWritingToAWarcFileWithMaxSize() throws Exception { // given String folder = tempFolder.newFolder().toString(); String url1 = "http://a.com"; String url2 = "http://b.com"; Page target1 = new Page(new URL(url1), html, responseHeaders); Page target2 = new Page(new URL(url2), html); target1.setTargetRelevance(TargetRelevance.RELEVANT); target2.setTargetRelevance(TargetRelevance.IRRELEVANT); WarcTargetRepository repository = new WarcTargetRepository(folder, 400); // when repository.insert(target1); repository.insert(target2); repository.close(); // then File[] allFiles = new File(folder).listFiles(); assertTrue(allFiles[0].getName().startsWith("crawl_data")); assertThat(allFiles.length, is(2)); assertTrue(allFiles[1].getName().startsWith("crawl_data")); List<String> allUrls = new ArrayList<>(asList(url1, url2)); RepositoryIterator respositoryIterator = repository.iterator(); assertTrue(respositoryIterator.hasNext()); WARCRecord record = respositoryIterator.next(); assertThat(record.getHeader().getUrl(), isIn(allUrls)); allUrls.remove(record.getHeader().getUrl()); assertTrue(respositoryIterator.hasNext()); record = respositoryIterator.next(); assertThat(record.getHeader().getUrl(), isIn(allUrls)); allUrls.remove(record.getHeader().getUrl()); assertThat(allUrls, empty()); }