org.archive.io.ArchiveRecord Java Examples

Source File: WARCReaderFactory.java From webarchive-commons with Apache License 2.0

6 votes

public Iterator<ArchiveRecord> iterator() {
    // Anonymous ArchiveRecordIterator whose iteration is driven by the
    // member iterator of the underlying GZIPMembersInputStream: each call
    // to innerNext() takes the next gzip member and wraps it in a record.
    return new ArchiveRecordIterator() {
        private GZIPMembersInputStream membersStream =
            (GZIPMembersInputStream) getIn();

        private Iterator<GZIPMembersInputStream> members =
            membersStream.memberIterator();

        protected boolean innerHasNext() {
            return members.hasNext();
        }

        protected ArchiveRecord innerNext() throws IOException {
            // Advance to the next member first; the start/end offsets are
            // queried from the stream only after this call.
            InputStream member = members.next();
            return createArchiveRecord(
                member,
                Math.max(
                    membersStream.getCurrentMemberStart(),
                    membersStream.getCurrentMemberEnd()));
        }
    };
}

Source File: ArchiveReaderFactoryTest.java From webarchive-commons with Apache License 2.0

6 votes

/**
 * Opens an ARC file (produced by ARCWriterTest.createARCFile) through
 * ArchiveReaderFactory using its absolute path as a String, and checks that
 * every record reports a non-blank mimetype.
 * @throws IOException
 */
public void testGetPath() throws IOException {
    File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
    ArchiveReader reader = null;
    try {
        String path = arc.getAbsoluteFile().getAbsolutePath();
        reader = ArchiveReaderFactory.get(path);
        Iterator<ArchiveRecord> records = reader.iterator();
        while (records.hasNext()) {
            ArchiveRecord record = records.next();
            String mimetype = record.getHeader().getMimetype();
            assertTrue("mime unread", StringUtils.isNotBlank(mimetype));
        }
    } finally {
        // Always release the reader, even when an assertion fails.
        if (reader != null) {
            reader.close();
        }
    }
}

Source File: ArchiveReaderFactoryTest.java From webarchive-commons with Apache License 2.0

6 votes

/**
 * Opens an ARC file (produced by ARCWriterTest.createARCFile) through
 * ArchiveReaderFactory using a File handle, and checks that every record
 * reports a non-blank mimetype.
 * @throws IOException
 */
public void testGetFile() throws IOException {
    File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
    ArchiveReader reader = null;
    try {
        File absolute = arc.getAbsoluteFile();
        reader = ArchiveReaderFactory.get(absolute);
        Iterator<ArchiveRecord> records = reader.iterator();
        while (records.hasNext()) {
            ArchiveRecord record = records.next();
            String mimetype = record.getHeader().getMimetype();
            assertTrue("mime unread", StringUtils.isNotBlank(mimetype));
        }
    } finally {
        // Always release the reader, even when an assertion fails.
        if (reader != null) {
            reader.close();
        }
    }
}

Source File: ArchiveReaderFactoryTest.java From webarchive-commons with Apache License 2.0

6 votes

/**
 * Opens an ARC file (produced by ARCWriterTest.createARCFile) through
 * ArchiveReaderFactory using a file: URL, and checks that every record
 * reports a non-blank mimetype.
 * @throws IOException
 */
public void testGetFileURL() throws IOException {
    File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
    ArchiveReader reader = null;
    try {
        URL location = new URL("file:////" + arc.getAbsolutePath());
        reader = ArchiveReaderFactory.get(location);
        Iterator<ArchiveRecord> records = reader.iterator();
        while (records.hasNext()) {
            ArchiveRecord record = records.next();
            String mimetype = record.getHeader().getMimetype();
            assertTrue("mime unread", StringUtils.isNotBlank(mimetype));
        }
    } finally {
        // Always release the reader, even when an assertion fails.
        if (reader != null) {
            reader.close();
        }
    }
}

Source File: WARCWriterTest.java From webarchive-commons with Apache License 2.0

6 votes

/**
 * Reads a handful of bytes from the second record of a one-record archive
 * into a buffer at a non-zero offset. This used to fail because of bad math
 * when locating the read position in the buffer.
 *
 * @throws Exception if the archive cannot be written or read
 */
public void testArcRecordOffsetReads() throws Exception {
	// Get an archive with one record.
	WriterPoolMember w =
		createWithOneRecord("testArcRecordInBufferStream", true);
	w.close();
	// Get reader on said archive; closed in the finally block below so a
	// failing assertion does not leak the underlying file handle.
	WARCReader r = WARCReaderFactory.get(w.getFile());
	try {
		final Iterator<ArchiveRecord> i = r.iterator();
		// Skip first meta record.
		i.next();
		i.hasNext();
		// Now we're at first and only record.
		ArchiveRecord ar = (WARCRecord) i.next();
		// Try getting some set of bytes out of it at an odd offset.
		final byte[] buffer = new byte[17];
		final int maxRead = 4;
		int totalRead = 0;
		while (totalRead < maxRead) {
			int read = ar.read(buffer, 13 + totalRead, maxRead - totalRead);
			// read() signals end-of-stream with -1; that value must never be
			// folded into the running byte count.
			assertTrue("Unexpected end of record after " + totalRead
				+ " byte(s)", read != -1);
			totalRead = totalRead + read;
			assertTrue(totalRead > 0);
		}
	} finally {
		r.close();
	}
}

Source File: ARCReaderFactoryTest.java From webarchive-commons with Apache License 2.0

6 votes

/**
 * Opens {@code testfile} positioned at {@code offset}, reads one record
 * through ARCReaderFactory.get(String, InputStream, boolean) and checks the
 * record's URL and that its position does not exceed its declared length.
 *
 * @param testfile archive file to read from
 * @param offset byte offset to seek to before reading
 * @param uri URL the record at that offset is expected to carry
 * @throws Exception on any I/O or assertion failure
 */
private void offsetResourceTest( File testfile, long offset, String uri ) throws Exception {
    RandomAccessFile raf = new RandomAccessFile(testfile, "r");
    try {
        raf.seek(offset);
        // The stream shares the file descriptor (and so the seek position)
        // with the RandomAccessFile.
        InputStream is = new FileInputStream(raf.getFD());
        String fPath = testfile.getAbsolutePath();
        // Note: ARCReaderFactory.get(testfile, offset) is the variant
        // known to work; this test exercises the stream-based one.
        ArchiveReader reader = ARCReaderFactory.get(fPath, is, false);
        ArchiveRecord record = reader.get();

        final String url = record.getHeader().getUrl();
        assertEquals("URL of record is not as expected.", uri, url);

        final long position = record.getPosition();
        final long recordLength = record.getHeader().getLength();
        assertTrue("Position " + position + " is after end of record " + recordLength, position <= recordLength);
    } finally {
        // Clean up even when an assertion above fails.
        raf.close();
    }
}

Source File: ARCReaderFactory.java From webarchive-commons with Apache License 2.0

6 votes

/**
 * Drains the input to its end after a record has been read. Leading
 * LINE_SEPARATOR bytes are consumed silently; if anything else follows,
 * the remaining bytes are counted and reported, either as an IOException
 * (strict mode) or as a warning on stderr.
 *
 * @param rec record whose header is included in the report
 * @throws IOException on read failure, or in strict mode when trailing
 *         bytes are found
 */
protected void gotoEOR(ArchiveRecord rec) throws IOException {
    // Swallow the run of line separators that terminates the record.
    int next;
    do {
        next = getIn().read();
    } while (next == LINE_SEPARATOR);
    if (next == -1) {
        return;
    }
    // The byte just read is itself the first unexpected one.
    long trailing = 1;
    while (getIn().read() > -1) {
        trailing++;
    }
    // Report the number of unexpected bytes at the end of this record.
    ArchiveRecordHeader header = null;
    if (getCurrentRecord() != null) {
        header = rec.getHeader();
    }
    long memberStart =
        ((GZIPMembersInputStream) getIn()).getCurrentMemberStart();
    String headerText = (header == null) ? "" : header.toString();
    String message = "Record STARTING at " + memberStart
        + " has " + trailing + " trailing byte(s): " + headerText;
    if (isStrict()) {
        throw new IOException(message);
    }
    logStdErr(Level.WARNING, message);
}

Source File: ARCReaderFactory.java From webarchive-commons with Apache License 2.0

6 votes

public Iterator<ArchiveRecord> iterator() {
    // Anonymous ArchiveRecordIterator that delegates to the member iterator
    // of the underlying GZIPMembersInputStream; every gzip member handed
    // out by that iterator is turned into one archive record.
    return new ArchiveRecordIterator() {
        private GZIPMembersInputStream gzipStream =
            (GZIPMembersInputStream) getIn();

        private Iterator<GZIPMembersInputStream> memberIter =
            gzipStream.memberIterator();

        protected boolean innerHasNext() {
            return memberIter.hasNext();
        }

        protected ArchiveRecord innerNext() throws IOException {
            // Step to the next member before asking the stream for its
            // current member offsets.
            InputStream memberStream = memberIter.next();
            return createArchiveRecord(
                memberStream,
                Math.max(
                    gzipStream.getCurrentMemberStart(),
                    gzipStream.getCurrentMemberEnd()));
        }
    };
}

Source File: ARCReader.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Re-writes every record of this ARC to stdout through an ARCWriter.
 * The content of the first record is used as the writer's metadata; all
 * later records are copied through the writer.
 *
 * @param compress passed to the writer settings
 * @throws IOException on read or write failure
 * @throws java.text.ParseException if a record date cannot be parsed
 */
public void dump(final boolean compress)
throws IOException, java.text.ParseException {
    // No point digesting if we're doing a dump.
    setDigest(false);
    ARCWriter out = null;
    Iterator<ArchiveRecord> records = iterator();
    while (records.hasNext()) {
        ARCRecord record = (ARCRecord) records.next();
        ARCRecordMetaData meta = record.getMetaData();

        if (out != null) {
            // Every record after the first is copied through the writer.
            out.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
                (int) meta.getLength(), record);
            continue;
        }

        // First record: slurp its content and use it as the writer metadata.
        // Byte-at-a-time is slow but done only once at top of ARC.
        ByteArrayOutputStream firstBody =
            new ByteArrayOutputStream(record.available());
        while (record.available() > 0) {
            firstBody.write(record.read());
        }
        List<String> metadata = new ArrayList<String>();
        metadata.add(firstBody.toString(WriterPoolMember.UTF8));
        // Assume getArc returns full path to file.  ARCWriter
        // or new File will complain if it is otherwise.
        List<File> noOutputDirs = new ArrayList<File>();
        WriterPoolSettingsData settings = new WriterPoolSettingsData(
            "", "", -1L, compress, noOutputDirs, metadata);
        out = new ARCWriter(new AtomicInteger(), System.out,
            new File(meta.getArc()), settings);
    }
}

Source File: ARCReader.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Skip over any trailing new lines at end of the record so we're lined up
 * ready to read the next.
 *
 * <p>Reads from the input while it reports bytes available. LINE_SEPARATOR
 * bytes are consumed. On the first other byte: if the stream supports
 * mark/reset, the stream is reset to the mark taken just before that byte
 * and the method returns; otherwise an IOException is thrown.
 *
 * @param record record whose header fields are appended to the error message
 * @throws IOException on read failure, or when a non-separator byte is read
 *         from a stream that does not support mark/reset
 */
protected void gotoEOR(ArchiveRecord record) throws IOException {
    // Nothing buffered/available: nothing to skip.
    if (getIn().available() <= 0) {
        return;
    }
    
    // Remove any trailing LINE_SEPARATOR
    int c = -1;
    while (getIn().available() > 0) {
        // Remember the position before each read so a single overread byte
        // can be pushed back below.
        if (getIn().markSupported()) {
            getIn().mark(1);
        }
        c = getIn().read();
        // A -1 read is ignored here; the loop condition decides whether to
        // keep going.
        if (c != -1) {
            if (c == LINE_SEPARATOR) {
                continue;
            }
            if (getIn().markSupported()) {
                // We've overread.  We're probably in next record.  There is
                // no way of telling for sure. It may be dross at end of
                // current record. Backup.
                    getIn().reset();
                break;
            }
            // Cannot push the byte back: report it, including the header
            // fields when a current record is set.
            ArchiveRecordHeader h = (getCurrentRecord() != null)?
                record.getHeader(): null;
            throw new IOException("Read " + (char)c +
                " when only " + LINE_SEPARATOR + " expected. " + 
                getReaderIdentifier() + ((h != null)?
                    h.getHeaderFields().toString(): ""));
        }
    }
}

Source File: WARCReader.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Consumes the record terminator so the input is positioned at the start
 * of the next record. The record itself must already be fully read.
 * @param record record that has just been read to its end
 * @throws IOException if the record still has bytes available, or on a
 *         failure while reading the terminator characters
 */
protected void gotoEOR(ArchiveRecord record) throws IOException {
    final boolean exhausted = record.available() == 0;
    if (!exhausted) {
        throw new IOException(
            "Record should be exhausted before coming in here");
    }

    // Records end in 2*CRLF: read the CR/LF pair twice.
    for (int pair = 0; pair < 2; pair++) {
        readExpectedChar(getIn(), CRLF.charAt(0));
        readExpectedChar(getIn(), CRLF.charAt(1));
    }
}

Source File: WARCReader.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Prints every record to stdout: the header's string form, then the
 * record's own dump, then a blank line. The {@code compress} flag is not
 * used by this implementation.
 */
@Override
public void dump(boolean compress)
throws IOException, java.text.ParseException {
    Iterator<ArchiveRecord> records = iterator();
    while (records.hasNext()) {
        ArchiveRecord record = records.next();
        String headerText = record.getHeader().toString();
        System.out.println(headerText);
        record.dump();
        System.out.println();
    }
}

Source File: WARCReaderFactory.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Drains the input to its end, counting the bytes read. More than four
 * leftover bytes are reported on stderr.
 *
 * @param rec record named in the stderr report
 * @throws IOException on read failure
 */
protected void gotoEOR(ArchiveRecord rec) throws IOException {
    long drained = 0;
    for (int b = getIn().read(); b > -1; b = getIn().read()) {
        drained++;
    }
    if (drained > 4) {
        System.err.println("unexpected extra data after record "+rec);
    }
}

Source File: ARCWriterTest.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Walks all records of the reader, closing each one, and asserts that
 * every record except the first has a URL starting with SOME_URL.
 *
 * @param r reader to iterate
 * @return number of records seen
 * @throws IOException on read failure
 */
protected int iterateRecords(ARCReader r)
throws IOException {
    int seen = 0;
    Iterator<ArchiveRecord> records = r.iterator();
    while (records.hasNext()) {
        ARCRecord record = (ARCRecord) records.next();
        record.close();
        // The first record is exempt from the URL check.
        boolean pastFirst = seen != 0;
        if (pastFirst) {
            String recordUrl = record.getMetaData().getUrl();
            assertTrue("Unexpected URL " + recordUrl,
                recordUrl.startsWith(SOME_URL));
        }
        seen++;
    }
    return seen;
}

Source File: ARCWriterTest.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Test a particular style of using the reader iterator: an explicit
 * hasNext()/next() loop that closes each record as it goes. (Should
 * possibly be on a reader-centric test class, but the best setup
 * functionality is here.)
 *
 * @throws IOException
 */
public void testReadIterator() throws IOException {
    final int recordCount = 3;
    File arcFile = writeRecords("writeRecord", true,
        DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
    ARCReader reader = ARCReaderFactory.get(arcFile);
    try {
        Iterator<ArchiveRecord> it = reader.iterator();
        while (it.hasNext()) {
            ArchiveRecord next = it.next();
            next.close();
        }
    } finally {
        // Release the reader even when iteration throws.
        reader.close();
    }
}

Source File: ARCWriterTest.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Returns the second record of the reader, i.e. the one following the
 * leading ARC meta record.
 *
 * @param r reader positioned at its start
 * @return the record after the meta record
 */
protected ARCRecord getSingleRecord(ARCReader r) {
    final Iterator<ArchiveRecord> records = r.iterator();
    // Discard the first ARC meta record.
    records.next();
    // Result intentionally unused; the call is kept as in the original flow.
    records.hasNext();
    // Now we're at first and only record in ARC.
    ArchiveRecord only = records.next();
    return (ARCRecord) only;
}

Source File: WordCounterMap.java From cc-warc-examples with MIT License

5 votes

/**
 * Emits one (word, outVal) pair per whitespace-delimited token of every
 * "text/plain" record in the archive, and maintains counters for records
 * processed, empty pages, non-plain-text records and exceptions.
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	for (ArchiveRecord record : value) {
		try {
			boolean plainText = record.getHeader().getMimetype().equals("text/plain");
			if (!plainText) {
				context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1);
				continue;
			}

			context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
			LOG.debug(record.getHeader().getUrl() + " -- " + record.available());
			// Read the full message into a raw byte array
			byte[] rawData = IOUtils.toByteArray(record, record.available());
			String content = new String(rawData);
			// Split the document into words
			tokenizer = new StringTokenizer(content);
			if (tokenizer.hasMoreTokens()) {
				do {
					outKey.set(tokenizer.nextToken());
					context.write(outKey, outVal);
				} while (tokenizer.hasMoreTokens());
			} else {
				context.getCounter(MAPPERCOUNTER.EMPTY_PAGE_TEXT).increment(1);
			}
		}
		catch (Exception ex) {
			// A failing record is logged and counted; processing continues
			// with the next record.
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}

Source File: WARCReaderFactoryTest.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Checks that each test file can be opened through
 * WARCReaderFactory.get(String, InputStream, boolean) and that its first
 * record is a 'warcinfo' record.
 *
 * @throws IOException if a file cannot be opened or read
 */
public void testGetStringInputstreamBoolean() throws IOException {
	// Check the test files can be opened:
	for( String file : files ) {
		FileInputStream is = new FileInputStream(file);
		try {
			ArchiveReader ar = WARCReaderFactory.get(file, is, true);
			try {
				ArchiveRecord r = ar.get();
				String type = (String) r.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
				// Check the first record comes out as a 'warcinfo' record.
				assertEquals(WARCRecordType.warcinfo.name(), type);
			} finally {
				ar.close();
			}
		} finally {
			// Also covers the case where the factory itself throws before a
			// reader exists; closing an already-closed stream is harmless.
			is.close();
		}
	}
}

Source File: WARCWriterTest.java From webarchive-commons with Apache License 2.0

5 votes

/**
 * Walks all records of the reader, closing each one, and asserts that
 * every record except the first has a URL equal to SOME_URL.
 *
 * @param r reader to iterate
 * @return number of records seen
 * @throws IOException on read failure
 */
protected int iterateRecords(WARCReader r)
throws IOException {
    int seen = 0;
    Iterator<ArchiveRecord> records = r.iterator();
    while (records.hasNext()) {
        ArchiveRecord record = records.next();
        record.close();
        // The first record is exempt from the URL check.
        boolean pastFirst = seen != 0;
        if (pastFirst) {
            String recordUrl = record.getHeader().getUrl();
            assertTrue("Unexpected URL " + recordUrl,
                recordUrl.equals(SOME_URL));
        }
        seen++;
    }
    return seen;
}

Source File: TagCounterMap.java From cc-warc-examples with MIT License

5 votes

/**
 * Emits one (lower-cased tag name, outVal) pair for every HTML tag matched
 * in the body of each HTTP response record whose HTTP header mentions
 * "Content-Type: text/html". Records that fail to process are logged and
 * counted under EXCEPTIONS.
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	// The pattern is compiled once per call to map() and reused for every
	// record in this archive.
	patternTag = Pattern.compile(HTML_TAG_PATTERN);

	for (ArchiveRecord r : value) {
		try {
			LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
			// We're only interested in processing the responses, not requests or metadata
			if (r.getHeader().getMimetype().equals("application/http; msgtype=response")) {
				// Convenience function that reads the full message into a raw byte array
				byte[] rawData = IOUtils.toByteArray(r, r.available());
				String content = new String(rawData);
				// Locate the blank line separating HTTP header and body once.
				// If it is missing (-1), substring() below throws and the
				// record is counted under EXCEPTIONS by the catch block.
				int headerEnd = content.indexOf("\r\n\r\n");
				// The HTTP header gives us valuable information about what was received during the request
				String headerText = content.substring(0, headerEnd);

				// In our task, we're only interested in text/html, so we can be a little lax
				// TODO: Proper HTTP header parsing + don't trust headers
				if (headerText.contains("Content-Type: text/html")) {
					context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
					// Only extract the body of the HTTP response when necessary
					String body = content.substring(headerEnd + 4);
					// Process all the matched HTML tags found in the body of the document
					matcherTag = patternTag.matcher(body);
					while (matcherTag.find()) {
						String tagName = matcherTag.group(1);
						// Locale.ROOT keeps the key independent of the JVM's
						// default locale (e.g. Turkish dotless-i lower-casing).
						outKey.set(tagName.toLowerCase(java.util.Locale.ROOT));
						context.write(outKey, outVal);
					}
				}
			}
		}
		catch (Exception ex) {
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}

Source File: WARCReaderTest.java From cc-warc-examples with MIT License

5 votes

/**
 * Reads a local compressed WARC file and prints the header, URL and the
 * first 500 characters of the content of its leading records.
 * @param args unused
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
	// Local compressed WARC file to read
	String warcPath = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	FileInputStream warcStream = new FileInputStream(warcPath);
	// The file name identifies the ArchiveReader and indicates if it should be decompressed
	ArchiveReader archive = WARCReaderFactory.get(warcPath, warcStream, true);

	// Work through the records the archive contains
	int shown = 0;
	for (ArchiveRecord record : archive) {
		// The header contains information such as the type of record, size, creation time, and URL
		System.out.println(record.getHeader());
		System.out.println(record.getHeader().getUrl());
		System.out.println();

		// The ArchiveRecord is itself an InputStream: read as many bytes
		// as it currently reports available
		byte[] rawData = IOUtils.toByteArray(record, record.available());

		// Print the start of the content as text, hoping it is text
		String content = new String(rawData);
		int previewEnd = Math.min(500, content.length());
		System.out.println(content.substring(0, previewEnd));
		String ellipsis = content.length() > 500 ? "..." : "";
		System.out.println(ellipsis);

		// Separator to make the output more readable
		System.out.println("=-=-=-=-=-=-=-=-=");
		if (shown++ > 4) {
			break;
		}
	}
}

Source File: S3ReaderTest.java From cc-warc-examples with MIT License

5 votes

/**
 * Fetches a WARC file from a public S3 bucket and prints the header, URL
 * and the first 500 characters of the content of its leading records.
 *
 * @param args unused
 * @throws IOException if the object stream cannot be read
 * @throws S3ServiceException if the S3 object cannot be retrieved
 */
public static void main(String[] args) throws IOException, S3ServiceException {
	// We're accessing a publicly available bucket so don't need to fill in our credentials
	S3Service s3s = new RestS3Service(null);

	// Let's grab a file out of the CommonCrawl S3 bucket
	String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);

	// The file name identifies the ArchiveReader and indicates if it should be decompressed
	ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
	try {
		// Once we have an ArchiveReader, we can work through each of the records it contains
		int i = 0;
		for(ArchiveRecord r : ar) {
			// The header contains information such as the type of record, size, creation time, and URL
			System.out.println("Header: " + r.getHeader());
			System.out.println("URL: " + r.getHeader().getUrl());
			System.out.println();

			// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
			// Create a byte array sized to what the record reports as available
			byte[] rawData = new byte[r.available()];
			// A single read() may return fewer bytes than requested, so keep
			// reading until the buffer is full or the record ends.
			int filled = 0;
			while (filled < rawData.length) {
				int n = r.read(rawData, filled, rawData.length - filled);
				if (n <= 0) {
					break;
				}
				filled += n;
			}
			// Note: potential optimization would be to have a large buffer only allocated once

			// Why don't we convert it to a string and print the start of it? Let's hope it's text!
			// Only the bytes actually read are decoded.
			String content = new String(rawData, 0, filled);
			System.out.println(content.substring(0, Math.min(500, content.length())));
			System.out.println((content.length() > 500 ? "..." : ""));

			// Pretty printing to make the output more readable
			System.out.println("=-=-=-=-=-=-=-=-=");
			if (i++ > 4) break;
		}
	} finally {
		// Release the reader (and the stream it wraps) when done.
		ar.close();
	}
}

Source File: ArcHarvestFileDTO.java From webcurator with Apache License 2.0

5 votes

/**
 * Create and return the index of the ArcHarvestFile.
 *
 * <p>WARC records are indexed only when they are of type RESPONSE and their
 * mimetype is not "text/dns"; every non-WARC record is indexed as an ARC
 * record. Also records on this object whether the archive is compressed.
 *
 * @param baseDir the base directory of the arcs
 * @return the indexed resources, as filled in by indexWARCResponse and
 *         indexARCRecord
 * @throws IOException thrown if there is an error
 * @throws ParseException
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
	Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();

	File theArchiveFile = new File(baseDir, this.getName());
	ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
	try {
		this.compressed = reader.isCompressed();

		Iterator<ArchiveRecord> it = reader.iterator();
		while(it.hasNext()) {
			ArchiveRecord rec = it.next();

			if(rec instanceof WARCRecord) {
				String type = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
				if(type.equals(WARCConstants.RESPONSE)) {
					String mime = rec.getHeader().getMimetype();
					if(!mime.equals("text/dns")) {
						indexWARCResponse(rec, results);
					}
				}
			}
			else {
				indexARCRecord(rec, results);
			}
		}
	} finally {
		// Close the reader even when indexing a record throws, so the
		// archive file handle is not leaked.
		reader.close();
	}

	return results;
}

Source File: WarcTargetRepositoryTest.java From ache with Apache License 2.0

4 votes

/**
 * Inserts two pages into a WarcTargetRepository, then reads the resulting
 * WARC file back and checks that the second and third records carry the
 * URLs of the two pages, in insertion order.
 */
@Test
public void testReadingMultipleWarcRecords() throws Exception {
    String folder = tempFolder.newFolder().toString();

    String url1 = "http://a.com";
    String url2 = "http://b.com";

    Page target1 = new Page(new URL(url1), html, responseHeaders);
    target1.setFetchTime(System.currentTimeMillis());

    Page target2 = new Page(new URL(url2), html, responseHeaders);
    target2.setFetchTime(System.currentTimeMillis());

    WarcTargetRepository repository = new WarcTargetRepository(folder);

    // when
    repository.insert(target1);
    repository.insert(target2);
    repository.close();

    WARCWriter writer = repository.getWriter();
    WARCReader reader = WARCReaderFactory.get(writer.getFile());

    // then: record 0 is skipped (treated as the warcinfo record), records 1
    // and 2 must match the inserted pages, anything after that is ignored.
    int position = 0;
    Iterator<ArchiveRecord> records = reader.iterator();
    while (records.hasNext()) {
        WARCRecord record = (WARCRecord) records.next();
        switch (position) {
            case 0:
                break;
            case 1:
                assertThat(record.getHeader().getUrl(), is(url1));
                break;
            case 2:
                // NOTE(review): 'url' is not declared in this method;
                // presumably a field of the test class. Confirm it is needed.
                url = record.getHeader().getUrl();
                assertThat(record.getHeader().getUrl(), is(url2));
                break;
            default:
                break;
        }
        position++;
    }
    reader.close();
}

org.archive.io.ArchiveRecord Java Examples