org.archive.io.warc.WARCReaderFactory Java Examples
The following examples show how to use
org.archive.io.warc.WARCReaderFactory.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: S3ReaderTest.java From cc-warc-examples with MIT License | 5 votes |
public static void main(String[] args) throws IOException, S3ServiceException { // We're accessing a publicly available bucket so don't need to fill in our credentials S3Service s3s = new RestS3Service(null); // Let's grab a file out of the CommonCrawl S3 bucket String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz"; S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null); // The file name identifies the ArchiveReader and indicates if it should be decompressed ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true); // Once we have an ArchiveReader, we can work through each of the records it contains int i = 0; for(ArchiveRecord r : ar) { // The header file contains information such as the type of record, size, creation time, and URL System.out.println("Header: " + r.getHeader()); System.out.println("URL: " + r.getHeader().getUrl()); System.out.println(); // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream // Create a byte array that is as long as all the record's stated length byte[] rawData = new byte[r.available()]; r.read(rawData); // Note: potential optimization would be to have a large buffer only allocated once // Why don't we convert it to a string and print the start of it? Let's hope it's text! String content = new String(rawData); System.out.println(content.substring(0, Math.min(500, content.length()))); System.out.println((content.length() > 500 ? "..." : "")); // Pretty printing to make the output more readable System.out.println("=-=-=-=-=-=-=-=-="); if (i++ > 4) break; } }
Example #2
Source File: WARCReaderTest.java From cc-warc-examples with MIT License | 5 votes |
/** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { // Set up a local compressed WARC file for reading String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz"; FileInputStream is = new FileInputStream(fn); // The file name identifies the ArchiveReader and indicates if it should be decompressed ArchiveReader ar = WARCReaderFactory.get(fn, is, true); // Once we have an ArchiveReader, we can work through each of the records it contains int i = 0; for(ArchiveRecord r : ar) { // The header file contains information such as the type of record, size, creation time, and URL System.out.println(r.getHeader()); System.out.println(r.getHeader().getUrl()); System.out.println(); // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream // Create a byte array that is as long as the record's stated length byte[] rawData = IOUtils.toByteArray(r, r.available()); // Why don't we convert it to a string and print the start of it? Let's hope it's text! String content = new String(rawData); System.out.println(content.substring(0, Math.min(500, content.length()))); System.out.println((content.length() > 500 ? "..." : "")); // Pretty printing to make the output more readable System.out.println("=-=-=-=-=-=-=-=-="); if (i++ > 4) break; } }
Example #3
Source File: WARCFileRecordReader.java From cc-warc-examples with MIT License | 5 votes |
/**
 * Opens the file behind the given split and creates an archive reader over it.
 *
 * <p>Stores the opened stream in {@code fsin}, the file name in {@code arPath} and the
 * reader obtained from {@code WARCReaderFactory} in {@code ar}.
 *
 * @param inputSplit the split to read; cast to {@code FileSplit}
 * @param context supplies the configuration used to resolve the file system
 * @throws IOException if the file system or file cannot be opened, or the reader cannot be created
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    final FileSplit fileSplit = (FileSplit) inputSplit;
    final Configuration jobConf = context.getConfiguration();
    final Path warcPath = fileSplit.getPath();

    // Resolve the file system from the path itself, then open the raw stream.
    fsin = warcPath.getFileSystem(jobConf).open(warcPath);

    final String warcName = warcPath.getName();
    arPath = warcName;
    ar = WARCReaderFactory.get(warcName, fsin, true);
}
Example #4
Source File: ArchiveReaderFactory.java From webarchive-commons with Apache License 2.0 | 5 votes |
/**
 * Creates a reader for the given archive file, choosing the ARC or WARC factory by the
 * file name's suffix.
 *
 * @param f the archive file to open
 * @param offset offset passed through to the selected factory
 * @return the reader produced by {@code ARCReaderFactory} or {@code WARCReaderFactory}
 * @throws IOException if the name has neither an ARC nor a WARC suffix, or the factory fails
 */
protected ArchiveReader getArchiveReader(final File f, final long offset) throws IOException {
    final String fileName = f.getName();

    if (ARCReaderFactory.isARCSuffix(fileName)) {
        return ARCReaderFactory.get(f, true, offset);
    }
    if (WARCReaderFactory.isWARCSuffix(fileName)) {
        return WARCReaderFactory.get(f, offset);
    }

    throw new IOException("Unknown file extension (Not ARC nor WARC): " + fileName);
}
Example #5
Source File: ArchiveReaderFactory.java From webarchive-commons with Apache License 2.0 | 5 votes |
/**
 * Creates a reader over the given stream, choosing the ARC or WARC factory by the suffix
 * of {@code id}.
 *
 * @param id identifier whose suffix selects the archive format
 * @param is stream to read the archive from
 * @param atFirstRecord passed through unchanged to the selected factory
 * @return the reader produced by {@code ARCReaderFactory} or {@code WARCReaderFactory}
 * @throws IOException if {@code id} has neither an ARC nor a WARC suffix, or the factory fails
 */
protected ArchiveReader getArchiveReader(final String id, final InputStream is,
        final boolean atFirstRecord) throws IOException {
    if (ARCReaderFactory.isARCSuffix(id)) {
        return ARCReaderFactory.get(id, is, atFirstRecord);
    }
    if (WARCReaderFactory.isWARCSuffix(id)) {
        return WARCReaderFactory.get(id, is, atFirstRecord);
    }

    throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
}
Example #6
Source File: WarcTargetRepository.java From ache with Apache License 2.0 | 4 votes |
/**
 * Opens a WARC reader over the file at the given path.
 *
 * @param filePath path of the WARC file; converted with {@code toFile()}
 * @return the reader returned by {@code WARCReaderFactory.get}
 * @throws IOException if the factory cannot open the file
 */
private WARCReader openFile(Path filePath) throws IOException {
    return WARCReaderFactory.get(
            filePath.toFile());
}
Example #7
Source File: WarcTargetRepositoryTest.java From ache with Apache License 2.0 | 4 votes |
@Test public void testReadingMultipleWarcRecords() throws Exception { String folder = tempFolder.newFolder().toString(); String url1 = "http://a.com"; String url2 = "http://b.com"; Page target1 = new Page(new URL(url1), html, responseHeaders); target1.setFetchTime(System.currentTimeMillis()); Page target2 = new Page(new URL(url2), html, responseHeaders); target2.setFetchTime(System.currentTimeMillis()); WarcTargetRepository repository = new WarcTargetRepository(folder); // when repository.insert(target1); repository.insert(target2); repository.close(); WARCWriter writer = repository.getWriter(); WARCReader reader = WARCReaderFactory.get(writer.getFile()); // Get to second record. Get its offset for later use. boolean readWarcInfoRecord = false; boolean readFirst = false; boolean readSecond = false; for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) { WARCRecord ar = (WARCRecord) i.next(); if (!readWarcInfoRecord) { readWarcInfoRecord = true; } else if (!readFirst) { readFirst = true; assertThat(ar.getHeader().getUrl(), is(url1)); continue; } else if (!readSecond) { url = ar.getHeader().getUrl(); assertThat(ar.getHeader().getUrl(), is(url2)); readSecond = true; } } reader.close(); }