org.archive.io.ArchiveReader Java Examples
The following examples show how to use org.archive.io.ArchiveReader.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ARCReaderFactoryTest.java From webarchive-commons with Apache License 2.0 | 6 votes |
private void offsetResourceTest( File testfile, long offset, String uri ) throws Exception { RandomAccessFile raf = new RandomAccessFile(testfile, "r"); raf.seek(offset); InputStream is = new FileInputStream(raf.getFD()); String fPath = testfile.getAbsolutePath(); ArchiveReader reader = ARCReaderFactory.get(fPath, is, false); // This one works: //ArchiveReader reader = ARCReaderFactory.get(testfile, offset); ArchiveRecord record = reader.get(); final String url = record.getHeader().getUrl(); assertEquals("URL of record is not as expected.", uri, url); final long position = record.getPosition(); final long recordLength = record.getHeader().getLength(); assertTrue("Position " + position + " is after end of record " + recordLength, position <= recordLength); // Clean up: if( raf != null ) raf.close(); }
Example #2
Source File: ARCReaderFactory.java From webarchive-commons with Apache License 2.0 | 6 votes |
/**
 * Creates a reader for {@code arcFile} starting at {@code offset}.
 *
 * <p>A file detected as compressed gets a {@code CompressedARCReader}. Any other file
 * must pass the extension and magic-number check before an
 * {@code UncompressedARCReader} is created for it.
 *
 * @param arcFile        the file to open
 * @param skipSuffixTest passed through to {@code testCompressedARCFile}
 * @param offset         passed through to the reader constructor
 * @return a reader over {@code arcFile}
 * @throws IOException if the file is not compressed and fails the ARC check
 */
protected ArchiveReader getArchiveReader(final File arcFile,
        final boolean skipSuffixTest, final long offset)
        throws IOException {
    final boolean isCompressed = testCompressedARCFile(arcFile, skipSuffixTest);
    if (isCompressed) {
        return (ARCReader) ARCReaderFactory.factory.new CompressedARCReader(arcFile, offset);
    }

    // Only uncompressed input is validated against the ARC extension and magic number.
    final boolean looksLikeArc = FileUtils.isReadableWithExtensionAndMagic(
            arcFile, ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER);
    if (!looksLikeArc) {
        throw new IOException(arcFile.getAbsolutePath()
                + " is not an Internet Archive ARC file.");
    }
    return (ARCReader) ARCReaderFactory.factory.new UncompressedARCReader(arcFile, offset);
}
Example #3
Source File: ArcHarvestFileDTO.java From webcurator with Apache License 2.0 | 5 votes |
/**
 * Create and return the index of the ArcHarvestFile.
 *
 * <p>Also records on this object whether the archive is compressed. For WARC records
 * only those of type {@code WARCConstants.RESPONSE} whose mimetype is not
 * {@code text/dns} are indexed; every non-WARC record is indexed as an ARC record.
 *
 * @param baseDir the base directory of the arcs
 * @return the index built from the archive's records
 * @throws IOException thrown if there is an error
 * @throws ParseException propagated from the record indexing helpers
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
    Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();

    File theArchiveFile = new File(baseDir, this.getName());
    ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
    // Close the reader even when iteration or indexing throws; it used to leak on failure.
    try {
        this.compressed = reader.isCompressed();

        Iterator<ArchiveRecord> it = reader.iterator();
        while (it.hasNext()) {
            ArchiveRecord rec = it.next();

            if (rec instanceof WARCRecord) {
                String type = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
                if (type.equals(WARCConstants.RESPONSE)) {
                    String mime = rec.getHeader().getMimetype();
                    if (!mime.equals("text/dns")) {
                        indexWARCResponse(rec, results);
                    }
                }
            } else {
                indexARCRecord(rec, results);
            }
        }
    } finally {
        reader.close();
    }

    return results;
}
Example #4
Source File: WARCReaderFactoryTest.java From webarchive-commons with Apache License 2.0 | 5 votes |
public void testGetStringInputstreamBoolean() throws IOException { // Check the test files can be opened: for( String file : files ) { FileInputStream is = new FileInputStream(file); ArchiveReader ar = WARCReaderFactory.get(file, is, true); ArchiveRecord r = ar.get(); String type = (String) r.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE); // Check the first record comes out as a 'warcinfo' record. assertEquals(WARCRecordType.warcinfo.name(), type); } }
Example #5
Source File: WARCReaderFactory.java From webarchive-commons with Apache License 2.0 | 5 votes |
protected ArchiveReader getArchiveReader(final String f, final InputStream is, final boolean atFirstRecord) throws IOException { // Check if it's compressed, based on file extension. if( f.endsWith(".gz") ) { return new CompressedWARCReader(f, is, atFirstRecord); } else { return new UncompressedWARCReader(f, is); } }
Example #6
Source File: WARCReaderFactory.java From webarchive-commons with Apache License 2.0 | 5 votes |
/**
 * Creates a WARC reader for {@code f} starting at {@code offset}.
 *
 * <p>A file detected as compressed gets a {@code CompressedWARCReader}. Any other file
 * must pass the extension and magic check before an {@code UncompressedWARCReader}
 * is created for it.
 *
 * @param f      the file to open
 * @param offset passed through to the reader constructor
 * @return a reader over {@code f}
 * @throws IOException if the file is not compressed and fails the WARC check
 */
protected ArchiveReader getArchiveReader(final File f, final long offset)
        throws IOException {
    final boolean isCompressed = testCompressedWARCFile(f);
    if (isCompressed) {
        return (WARCReader) WARCReaderFactory.factory.new CompressedWARCReader(f, offset);
    }

    // Only uncompressed input is validated against the WARC extension and magic.
    final boolean looksLikeWarc = FileUtils.isReadableWithExtensionAndMagic(
            f, DOT_WARC_FILE_EXTENSION, WARC_MAGIC);
    if (!looksLikeWarc) {
        throw new IOException(f.getAbsolutePath() + " is not a WARC file.");
    }
    return (WARCReader) WARCReaderFactory.factory.new UncompressedWARCReader(f, offset);
}
Example #7
Source File: WordCounterMap.java From cc-warc-examples with MIT License | 5 votes |
@Override public void map(Text key, ArchiveReader value, Context context) throws IOException { for (ArchiveRecord r : value) { try { if (r.getHeader().getMimetype().equals("text/plain")) { context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1); LOG.debug(r.getHeader().getUrl() + " -- " + r.available()); // Convenience function that reads the full message into a raw byte array byte[] rawData = IOUtils.toByteArray(r, r.available()); String content = new String(rawData); // Grab each word from the document tokenizer = new StringTokenizer(content); if (!tokenizer.hasMoreTokens()) { context.getCounter(MAPPERCOUNTER.EMPTY_PAGE_TEXT).increment(1); } else { while (tokenizer.hasMoreTokens()) { outKey.set(tokenizer.nextToken()); context.write(outKey, outVal); } } } else { context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1); } } catch (Exception ex) { LOG.error("Caught Exception", ex); context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1); } } }
Example #8
Source File: TagCounterMap.java From cc-warc-examples with MIT License | 5 votes |
@Override public void map(Text key, ArchiveReader value, Context context) throws IOException { // Compile the regular expression once as it will be used continuously patternTag = Pattern.compile(HTML_TAG_PATTERN); for (ArchiveRecord r : value) { try { LOG.debug(r.getHeader().getUrl() + " -- " + r.available()); // We're only interested in processing the responses, not requests or metadata if (r.getHeader().getMimetype().equals("application/http; msgtype=response")) { // Convenience function that reads the full message into a raw byte array byte[] rawData = IOUtils.toByteArray(r, r.available()); String content = new String(rawData); // The HTTP header gives us valuable information about what was received during the request String headerText = content.substring(0, content.indexOf("\r\n\r\n")); // In our task, we're only interested in text/html, so we can be a little lax // TODO: Proper HTTP header parsing + don't trust headers if (headerText.contains("Content-Type: text/html")) { context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1); // Only extract the body of the HTTP response when necessary // Due to the way strings work in Java, we don't use any more memory than before String body = content.substring(content.indexOf("\r\n\r\n") + 4); // Process all the matched HTML tags found in the body of the document matcherTag = patternTag.matcher(body); while (matcherTag.find()) { String tagName = matcherTag.group(1); outKey.set(tagName.toLowerCase()); context.write(outKey, outVal); } } } } catch (Exception ex) { LOG.error("Caught Exception", ex); context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1); } } }
Example #9
Source File: WARCReaderTest.java From cc-warc-examples with MIT License | 5 votes |
/** * @param args * @throws IOException */ public static void main(String[] args) throws IOException { // Set up a local compressed WARC file for reading String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz"; FileInputStream is = new FileInputStream(fn); // The file name identifies the ArchiveReader and indicates if it should be decompressed ArchiveReader ar = WARCReaderFactory.get(fn, is, true); // Once we have an ArchiveReader, we can work through each of the records it contains int i = 0; for(ArchiveRecord r : ar) { // The header file contains information such as the type of record, size, creation time, and URL System.out.println(r.getHeader()); System.out.println(r.getHeader().getUrl()); System.out.println(); // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream // Create a byte array that is as long as the record's stated length byte[] rawData = IOUtils.toByteArray(r, r.available()); // Why don't we convert it to a string and print the start of it? Let's hope it's text! String content = new String(rawData); System.out.println(content.substring(0, Math.min(500, content.length()))); System.out.println((content.length() > 500 ? "..." : "")); // Pretty printing to make the output more readable System.out.println("=-=-=-=-=-=-=-=-="); if (i++ > 4) break; } }
Example #10
Source File: S3ReaderTest.java From cc-warc-examples with MIT License | 5 votes |
public static void main(String[] args) throws IOException, S3ServiceException { // We're accessing a publicly available bucket so don't need to fill in our credentials S3Service s3s = new RestS3Service(null); // Let's grab a file out of the CommonCrawl S3 bucket String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz"; S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null); // The file name identifies the ArchiveReader and indicates if it should be decompressed ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true); // Once we have an ArchiveReader, we can work through each of the records it contains int i = 0; for(ArchiveRecord r : ar) { // The header file contains information such as the type of record, size, creation time, and URL System.out.println("Header: " + r.getHeader()); System.out.println("URL: " + r.getHeader().getUrl()); System.out.println(); // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream // Create a byte array that is as long as all the record's stated length byte[] rawData = new byte[r.available()]; r.read(rawData); // Note: potential optimization would be to have a large buffer only allocated once // Why don't we convert it to a string and print the start of it? Let's hope it's text! String content = new String(rawData); System.out.println(content.substring(0, Math.min(500, content.length()))); System.out.println((content.length() > 500 ? "..." : "")); // Pretty printing to make the output more readable System.out.println("=-=-=-=-=-=-=-=-="); if (i++ > 4) break; } }
Example #11
Source File: WARCFileRecordReader.java From cc-warc-examples with MIT License | 4 votes |
@Override public ArchiveReader getCurrentValue() throws IOException, InterruptedException { // We only ever have one value to give -- the output of the compressed file return ar; }
Example #12
Source File: WARCFileInputFormat.java From cc-warc-examples with MIT License | 4 votes |
/**
 * Supplies a new {@code WARCFileRecordReader}; neither argument is consulted here.
 *
 * @param split   the input split (not used in this method)
 * @param context the task attempt context (not used in this method)
 * @return a freshly constructed {@code WARCFileRecordReader}
 * @throws IOException          declared by the overridden method
 * @throws InterruptedException declared by the overridden method
 */
@Override
public RecordReader<Text, ArchiveReader> createRecordReader(
        InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    final RecordReader<Text, ArchiveReader> recordReader = new WARCFileRecordReader();
    return recordReader;
}
Example #13
Source File: ARCReaderFactory.java From webarchive-commons with Apache License 2.0 | 4 votes |
/**
 * Two-argument convenience form: delegates to the three-argument overload with the
 * {@code boolean} argument fixed to {@code true}.
 *
 * @param f      the file to open
 * @param offset passed through to the three-argument overload
 * @return the reader produced by the three-argument overload
 * @throws IOException if the delegate fails
 */
protected ArchiveReader getArchiveReader(final File f, final long offset)
        throws IOException {
    final boolean skipSuffixTest = true;
    return getArchiveReader(f, skipSuffixTest, offset);
}
Example #14
Source File: ARCReaderFactory.java From webarchive-commons with Apache License 2.0 | 4 votes |
/**
 * Static entry point that forwards to {@code getArchiveReader} on the shared
 * {@code ARCReaderFactory.factory} instance.
 *
 * @param s             forwarded unchanged as the first argument
 * @param is            stream carrying the archive content
 * @param atFirstRecord forwarded unchanged to the factory
 * @return the reader created by the factory
 * @throws IOException if the factory fails to create the reader
 */
public static ArchiveReader get(final String s, final InputStream is,
        final boolean atFirstRecord)
        throws IOException {
    final ArchiveReader reader =
            ARCReaderFactory.factory.getArchiveReader(s, is, atFirstRecord);
    return reader;
}
Example #15
Source File: WARCReader.java From webarchive-commons with Apache License 2.0 | 4 votes |
/**
 * Not implemented for this reader; every call throws.
 *
 * @param f ignored
 * @return never returns normally
 * @throws NotImplementedException always
 */
@Override
public ArchiveReader getDeleteFileOnCloseReader(final File f) {
    // Placeholder until an implementation exists.
    throw new NotImplementedException("TODO");
}
Example #16
Source File: WARCReaderFactory.java From webarchive-commons with Apache License 2.0 | 4 votes |
/**
 * Static entry point that forwards to {@code getArchiveReader} on the shared
 * {@code WARCReaderFactory.factory} instance.
 *
 * @param s             forwarded unchanged as the first argument
 * @param is            stream carrying the archive content
 * @param atFirstRecord forwarded unchanged to the factory
 * @return the reader created by the factory
 * @throws IOException if the factory fails to create the reader
 */
public static ArchiveReader get(final String s, final InputStream is,
        final boolean atFirstRecord)
        throws IOException {
    final ArchiveReader reader =
            WARCReaderFactory.factory.getArchiveReader(s, is, atFirstRecord);
    return reader;
}
Example #17
Source File: ArcHarvestFileDTO.java From webcurator with Apache License 2.0 | 4 votes |
/**
 * Opens the archive named by {@code getName()} under {@code baseDir} and reports
 * whether its reader says it is compressed.
 *
 * @return the value of {@code isCompressed()} on the archive's reader
 * @throws IOException if the archive cannot be opened or closed
 */
public boolean checkIsCompressed() throws IOException {
    ArchiveReader reader = ArchiveReaderFactory.get(new File(baseDir, this.getName()));
    // Close the reader on every path, not only when isCompressed() returns normally.
    try {
        return reader.isCompressed();
    } finally {
        reader.close();
    }
}