org.apache.nutch.protocol.ProtocolException Java Examples

The following examples show how to use org.apache.nutch.protocol.ProtocolException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestZipParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches every entry of <code>sampleFiles</code> through the protocol chosen
 * for its <code>file:</code> URL, parses the fetched content with the
 * "parse-zip" extension and asserts that the extracted text equals
 * <code>expectedText</code>.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    // Build the file: URL of the current sample.
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // Fetch the raw content.
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    // Parse it and compare the extracted text.
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
    final String extractedText = parse.getText();
    assertTrue(extractedText.equals(expectedText));
  }
}
 
Example #2
Source File: TestPdfParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * For each sample file: creates a fresh configuration, fetches the file via
 * its <code>file:</code> URL, parses it with the "parse-tika" extension and
 * asserts that <code>expectedText</code> occurs in the extracted text.
 */
public void testIt() throws ProtocolException, ParseException {
  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // A new configuration is created on every iteration.
    final Configuration conf = NutchConfiguration.create();
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    // NOTE(review): the strict "> 0" also rejects a match at offset 0 -
    // confirm that this is intended.
    assertTrue(parse.getText().indexOf(expectedText) > 0);
  }
}
 
Example #3
Source File: TestProtocolFile.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches the given sample file through its <code>file:</code> URL and
 * verifies that the fetch succeeded and that both the content type of the
 * returned content and its <code>Response.CONTENT_TYPE</code> metadata entry
 * equal <code>expectedMimeType</code>.
 *
 * @param testTextFile name of the sample file, relative to <code>sampleDir</code>
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  final String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(urlString);

  final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
  final ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString), datum);
  assertNotNull(output);

  // The fetch must report success; the message carries the actual status.
  final String statusMessage = "Status code: [" + output.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + output.getStatus().getArgs() + "]";
  assertEquals(statusMessage, ProtocolStatus.SUCCESS, output.getStatus().getCode());

  // Content type exposed directly on the content object.
  assertNotNull(output.getContent());
  assertNotNull(output.getContent().getContentType());
  assertEquals(expectedMimeType, output.getContent().getContentType());

  // Content type exposed through the metadata.
  assertNotNull(output.getContent().getMetadata());
  assertEquals(expectedMimeType,
      output.getContent().getMetadata().get(Response.CONTENT_TYPE));
}
 
Example #4
Source File: TestProtocolFile.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Checks the <code>Response.CONTENT_TYPE</code> handling for one sample file:
 * the file is fetched via its <code>file:</code> URL, the protocol status must
 * be <code>ProtocolStatus.SUCCESS</code>, and the content type reported by the
 * content and by its metadata must both equal <code>expectedMimeType</code>.
 *
 * @param testTextFile name of the sample file, relative to <code>sampleDir</code>
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  final String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(urlString);

  final Protocol fileProtocol = new ProtocolFactory(conf).getProtocol(urlString);
  final ProtocolOutput output =
      fileProtocol.getProtocolOutput(new Text(urlString), datum);
  assertNotNull(output);

  // Status check, with the actual code and args in the failure message.
  final String failureMessage = "Status code: [" + output.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + output.getStatus().getArgs() + "]";
  assertEquals(failureMessage, ProtocolStatus.SUCCESS, output.getStatus().getCode());

  // Direct content type.
  assertNotNull(output.getContent());
  assertNotNull(output.getContent().getContentType());
  assertEquals(expectedMimeType, output.getContent().getContentType());

  // Content type stored in the metadata.
  assertNotNull(output.getContent().getMetadata());
  assertEquals(expectedMimeType,
      output.getContent().getMetadata().get(Response.CONTENT_TYPE));
}
 
Example #5
Source File: TestSWFParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches and parses every sample file, collapses each run of whitespace in
 * the extracted text to a single space, trims it, and asserts that the result
 * equals the entry of <code>sampleTexts</code> at the same index.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    final Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    // Normalise whitespace before comparing.
    final String rawText = parse.getText();
    final String normalized = rawText.replaceAll("[ \t\r\n]+", " ").trim();
    assertTrue(sampleTexts[idx].equals(normalized));
  }
}
 
Example #6
Source File: TestRTFParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Currently a no-op: the entire body is commented out (marked as temporarily
 * disabled, see Tika-748), so this test passes without exercising anything.
 * The disabled code below fetched the RTF sample, parsed it with the
 * "parse-tika" extension and compared the trimmed text with a fixed sentence.
 */
public void testIt() throws ProtocolException, ParseException {
  /* Temporarily disabled - see Tika-748

	String urlString;
	Protocol protocol;
	Content content;
	Parse parse;

	Configuration conf = NutchConfiguration.create();
	urlString = "file:" + sampleDir + fileSeparator + rtfFile;
	protocol = new ProtocolFactory(conf).getProtocol(urlString);
	content = protocol.getProtocolOutput(new Text(urlString),
			new CrawlDatum()).getContent();
	parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
			.get(content.getUrl());
	String text = parse.getText();
	assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

	String title = parse.getData().getTitle();
	Metadata meta = parse.getData().getParseMeta();

	// METADATA extraction is not yet supported in Tika
	// assertEquals("test rft document", title);
	// assertEquals("tests", meta.get(DublinCore.SUBJECT));
 */
}
 
Example #7
Source File: TestExtParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Prepares the fixture: writes <code>expectedText</code> into a fresh
 * temporary file, records that file's URL in <code>urlString</code> and
 * fetches it through the protocol selected for that URL, storing the result
 * in <code>content</code>.
 *
 * The file is created in the directory named by the "test.data" system
 * property when that property is set, otherwise in the default temporary
 * directory.
 *
 * @throws ProtocolException declared for the protocol lookup and fetch
 * @throws IOException if the temporary file cannot be created or written
 */
protected void setUp() throws ProtocolException, IOException {
  // prepare a temp file with expectedText as its content
  // This system property is defined in ./src/plugin/build-plugin.xml
  String path = System.getProperty("test.data");
  if (path != null) {
    File tempDir = new File(path);
    if (!tempDir.exists()) {
      // mkdirs() also creates missing parent directories, which mkdir() would
      // silently fail on
      tempDir.mkdirs();
    }
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt",tempDir);
  } else {
    // otherwise in java.io.tmpdir
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt");
  }
  urlString = tempFile.toURL().toString();

  // close the stream even when the write fails, so the file handle is not leaked
  FileOutputStream fos = new FileOutputStream(tempFile);
  try {
    fos.write(expectedText.getBytes());
  } finally {
    fos.close();
  }

  // get nutch content
  Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
  content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
}
 
Example #8
Source File: TestSWFParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Runs each sample file through fetch and parse, then compares the
 * whitespace-normalised extracted text (runs of blanks, tabs and line breaks
 * collapsed to one space, then trimmed) with the matching entry of
 * <code>sampleTexts</code>.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int pos = 0; pos < sampleFiles.length; pos++) {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[pos];

    // Fetch.
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    // Parse.
    final Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    // Compare.
    final String collapsed = parse.getText().replaceAll("[ \t\r\n]+", " ");
    final String text = collapsed.trim();
    assertTrue(sampleTexts[pos].equals(text));
  }
}
 
Example #9
Source File: TestRTFParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Intentionally empty at present: everything inside the method is wrapped in
 * a block comment (temporarily disabled, see Tika-748), so no assertion runs.
 * The commented-out code fetched the RTF sample file, parsed it with the
 * "parse-tika" extension and checked the trimmed text against a fixed
 * sentence.
 */
public void testIt() throws ProtocolException, ParseException {
  /* Temporarily disabled - see Tika-748

	String urlString;
	Protocol protocol;
	Content content;
	Parse parse;

	Configuration conf = NutchConfiguration.create();
	urlString = "file:" + sampleDir + fileSeparator + rtfFile;
	protocol = new ProtocolFactory(conf).getProtocol(urlString);
	content = protocol.getProtocolOutput(new Text(urlString),
			new CrawlDatum()).getContent();
	parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
			.get(content.getUrl());
	String text = parse.getText();
	assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

	String title = parse.getData().getTitle();
	Metadata meta = parse.getData().getParseMeta();

	// METADATA extraction is not yet supported in Tika
	// assertEquals("test rft document", title);
	// assertEquals("tests", meta.get(DublinCore.SUBJECT));
 */
}
 
Example #10
Source File: TestImageMetadata.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * For each sample file: fetches it via its <code>file:</code> URL, parses it
 * with the "parse-tika" extension and asserts that the parse metadata reports
 * a "width" of "121" and a "height" of "48".
 */
public void testIt() throws ProtocolException, ParseException {
  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // A new configuration is created on every iteration.
    final Configuration conf = NutchConfiguration.create();
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    // Expected dimensions.
    assertEquals("121", parse.getData().getMeta("width"));
    assertEquals("48", parse.getData().getMeta("height"));
  }
}
 
Example #11
Source File: TestExtParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the test fixture. A temporary file containing
 * <code>expectedText</code> is created (inside the directory given by the
 * "test.data" system property if set, else in the default temporary
 * directory), its URL is stored in <code>urlString</code>, and the file is
 * fetched through the matching protocol so that <code>content</code> holds
 * the fetched content.
 *
 * @throws ProtocolException declared for the protocol lookup and fetch
 * @throws IOException if the temporary file cannot be created or written
 */
protected void setUp() throws ProtocolException, IOException {
  // prepare a temp file with expectedText as its content
  // This system property is defined in ./src/plugin/build-plugin.xml
  String path = System.getProperty("test.data");
  if (path != null) {
    File tempDir = new File(path);
    if (!tempDir.exists()) {
      // create missing parent directories too; plain mkdir() fails silently
      // when a parent is absent
      tempDir.mkdirs();
    }
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt",tempDir);
  } else {
    // otherwise in java.io.tmpdir
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt");
  }
  urlString = tempFile.toURL().toString();

  // the finally block guarantees the stream is closed even if write() throws
  FileOutputStream fos = new FileOutputStream(tempFile);
  try {
    fos.write(expectedText.getBytes());
  } finally {
    fos.close();
  }

  // get nutch content
  Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
  content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
}
 
Example #12
Source File: TestZipParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies the "parse-zip" extension: each sample file is fetched through its
 * <code>file:</code> URL and parsed, and the extracted text must equal
 * <code>expectedText</code>.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int pos = 0; pos < sampleFiles.length; pos++) {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[pos];

    // Fetch the sample.
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

    // Parse it and check the text.
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
    final String actualText = parse.getText();
    assertTrue(actualText.equals(expectedText));
  }
}
 
Example #13
Source File: TestPdfParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches each sample file via its <code>file:</code> URL, parses it with the
 * "parse-tika" extension and asserts that <code>expectedText</code> is found
 * in the extracted text.
 */
public void testIt() throws ProtocolException, ParseException {
  for (int pos = 0; pos < sampleFiles.length; pos++) {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[pos];

    // The configuration is re-created for every sample.
    final Configuration conf = NutchConfiguration.create();
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    // NOTE(review): "> 0" means a match at the very start of the text fails
    // too - confirm that this is intended.
    final int matchOffset = parse.getText().indexOf(expectedText);
    assertTrue(matchOffset > 0);
  }
}
 
Example #14
Source File: TestMSWordParser.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches the named sample file through its <code>file:</code> URL, parses it
 * with the "parse-tika" extension and returns the extracted text.
 *
 * @param fileName name of the file, relative to <code>sampleDir</code>
 * @return the text reported by the parse
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
  final String urlString = "file:" + sampleDir + fileSeparator + fileName;

  // Fetch the raw content.
  final Protocol fetcher = new ProtocolFactory(conf).getProtocol(urlString);
  final Content content =
      fetcher.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

  // Parse and hand back the text in one go.
  return new ParseUtil(conf)
      .parseByExtensionId("parse-tika", content)
      .get(content.getUrl())
      .getText();
}
 
Example #15
Source File: TestMSWordParser.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that every <code>.doc</code> file found in <code>sampleDir</code>
 * yields non-empty text from <code>getTextContent</code>. Files with other
 * extensions are skipped.
 */
public void testOpeningDocs() throws ProtocolException, ParseException {
  String[] filenames = new File(sampleDir).list();
  // File.list() returns null when the path is not a directory or cannot be
  // read; fail with a clear message instead of a NullPointerException
  assertTrue("cannot list sample directory " + sampleDir, filenames != null);
  for (int i = 0; i < filenames.length; i++) {
    if (!filenames[i].endsWith(".doc")) {
      continue;
    }
    assertTrue("cann't read content of " + filenames[i], getTextContent(filenames[i]).length() > 0);
  }
}
 
Example #16
Source File: TestMSWordParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the text extracted from a sample file: the file is fetched via its
 * <code>file:</code> URL and parsed with the "parse-tika" extension.
 *
 * @param fileName name of the file, relative to <code>sampleDir</code>
 * @return the text reported by the parse
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
  final String urlString = "file:" + sampleDir + fileSeparator + fileName;

  final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
  final Content fetched =
      protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

  final Parse parsed =
      new ParseUtil(conf).parseByExtensionId("parse-tika", fetched).get(fetched.getUrl());
  final String text = parsed.getText();
  return text;
}
 
Example #17
Source File: RobotRulesParser.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Asks the robot rules obtained for <code>url</code> whether the URL's path
 * is allowed. A missing or empty path is treated as "/".
 *
 * @param http passed through to <code>getRobotRulesSet</code>
 * @param url the URL whose path is checked
 * @return the answer of the rule set for that path
 */
public boolean isAllowed(HttpBase http, URL url)
    throws ProtocolException, IOException {
  final String urlPath = url.getPath();

  // Default to the root path when the URL carries none.
  final String path = (urlPath == null || urlPath.length() == 0) ? "/" : urlPath;

  return getRobotRulesSet(http, url).isAllowed(path);
}
 
Example #18
Source File: TestMSWordParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that each file in <code>sampleDir</code> whose name ends with
 * <code>.doc</code> produces non-empty text via <code>getTextContent</code>;
 * all other files are ignored.
 */
public void testOpeningDocs() throws ProtocolException, ParseException {
  String[] filenames = new File(sampleDir).list();
  // list() yields null if sampleDir is not a directory or an I/O error occurs;
  // report that explicitly rather than crashing with a NullPointerException
  assertTrue("cannot list sample directory " + sampleDir, filenames != null);
  for (int i = 0; i < filenames.length; i++) {
    if (!filenames[i].endsWith(".doc")) {
      continue;
    }
    assertTrue("cann't read content of " + filenames[i], getTextContent(filenames[i]).length() > 0);
  }
}
 
Example #19
Source File: TestFeedParser.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * <p>
 * Parses every sample feed with the "parse-tika" extension and checks its
 * outlinks:
 * </p>
 *
 * <ul>
 * <li>exactly 2 outlinks are read from the sample file</li>
 * <li>the outlinks include http://www-scf.usc.edu/~mattmann/ and
 * http://www.nutch.org/</li>
 * </ul>
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    // Only 2 outlinks are expected: unlike the original parse-rss, tika
    // ignores the URL and description of the channel. URLs in the sample:
    //   http://test.channel.com
    //   http://www-scf.usc.edu/~mattmann/
    //   http://www.nutch.org
    final Outlink[] outlinks = parse.getData().getOutlinks();

    assertTrue("There aren't 2 outlinks read!", outlinks.length == 2);

    // Make sure both expected targets are among the outlinks.
    boolean sawMattmann = false;
    boolean sawNutch = false;
    for (Outlink outlink : outlinks) {
      sawMattmann |= outlink.getToUrl().equals("http://www-scf.usc.edu/~mattmann/");
      sawNutch |= outlink.getToUrl().equals("http://www.nutch.org/");
    }

    if (!(sawMattmann && sawNutch)) {
      fail("Outlinks read from sample rss file are not correct!");
    }
  }
}
 
Example #20
Source File: Http.java    From nutch-selenium with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the response for <code>url</code> by constructing an
 * <code>HttpResponse</code> from this instance, the URL, the page and the
 * current configuration. The <code>redirect</code> argument is not used here.
 */
@Override
protected Response getResponse(URL url, WebPage page, boolean redirect)
        throws ProtocolException, IOException {
    final Response response = new HttpResponse(this, url, page, getConf());
    return response;
}
 
Example #21
Source File: TestMSWordParser.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Asserts that the text extracted from every sample file starts with
 * <code>expectedText</code>; the extracted text is included in the failure
 * message.
 */
public void testIt() throws ProtocolException, ParseException {
  for (String sampleFile : sampleFiles) {
    final String extracted = getTextContent(sampleFile);
    final boolean hasExpectedPrefix = extracted.startsWith(expectedText);
    assertTrue("text found : '"+extracted+"'", hasExpectedPrefix);
  }
}
 
Example #22
Source File: Http.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the response for <code>url</code> by constructing an
 * <code>HttpResponse</code> from this instance, the URL and the crawl datum.
 * The <code>redirect</code> argument is not used here.
 */
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
    throws ProtocolException, IOException {
    final Response response = new HttpResponse(this, url, datum);
    return response;
}
 
Example #23
Source File: HttpBase.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Produces the <code>Response</code> for a URL; the behaviour is supplied by
 * concrete subclasses.
 *
 * @param url the URL to obtain a response for
 * @param datum the crawl datum accompanying the URL
 * @param followRedirects presumably whether redirects should be followed -
 *        the exact semantics are up to the implementation (TODO confirm)
 */
protected abstract Response getResponse(
    URL url, CrawlDatum datum, boolean followRedirects)
    throws ProtocolException, IOException;
 
Example #24
Source File: Http.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Returns a new <code>HttpResponse</code> built from this instance, the given
 * URL and the crawl datum. The <code>redirect</code> flag is ignored by this
 * implementation.
 */
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
  throws ProtocolException, IOException {
  final Response httpResponse = new HttpResponse(this, url, datum);
  return httpResponse;
}
 
Example #25
Source File: TestProtocolFile.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Runs the <code>setContentType</code> check once for every entry of
 * <code>testTextFiles</code>.
 */
public void testSetContentType() throws ProtocolException {
  for (String sampleName : testTextFiles) {
    // Each sample is verified independently; the first failure aborts the test.
    setContentType(sampleName);
  }
}
 
Example #26
Source File: TestMSWordParser.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Extracts the text of every sample file and asserts that it begins with
 * <code>expectedText</code>. On failure the message shows the text that was
 * actually found.
 */
public void testIt() throws ProtocolException, ParseException {
  for (String sampleFile : sampleFiles) {
    final String text = getTextContent(sampleFile);
    final String failureMessage = "text found : '"+text+"'";
    assertTrue(failureMessage, text.startsWith(expectedText));
  }
}
 
Example #27
Source File: TestFeedParser.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * <p>
 * Fetches each sample feed, parses it with the "parse-tika" extension and
 * makes two checks on the resulting outlinks:
 * </p>
 *
 * <ul>
 * <li>there are exactly 2 outlinks</li>
 * <li>both http://www-scf.usc.edu/~mattmann/ and http://www.nutch.org/ are
 * among them</li>
 * </ul>
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int pos = 0; pos < sampleFiles.length; pos++) {
    final String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[pos];

    // Fetch and parse the feed.
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    final Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    // Only 2 outlinks are expected: unlike the original parse-rss, tika
    // ignores the URL and description of the channel. URLs in the sample:
    //   http://test.channel.com
    //   http://www-scf.usc.edu/~mattmann/
    //   http://www.nutch.org
    final Outlink[] found = parse.getData().getOutlinks();

    assertTrue("There aren't 2 outlinks read!", found.length == 2);

    // Both expected targets must be present.
    boolean hasMattmannLink = false;
    boolean hasNutchLink = false;
    for (Outlink link : found) {
      hasMattmannLink |= link.getToUrl().equals("http://www-scf.usc.edu/~mattmann/");
      hasNutchLink |= link.getToUrl().equals("http://www.nutch.org/");
    }

    if (!(hasMattmannLink && hasNutchLink)) {
      fail("Outlinks read from sample rss file are not correct!");
    }
  }
}
 
Example #28
Source File: RobotRulesParser.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the crawl delay reported by the robot rules obtained for
 * <code>url</code>.
 *
 * @param http passed through to <code>getRobotRulesSet</code>
 * @param url the URL whose rule set is consulted
 * @return the crawl delay of that rule set
 */
public long getCrawlDelay(HttpBase http, URL url)
    throws ProtocolException, IOException {
  final long crawlDelay = getRobotRulesSet(http, url).getCrawlDelay();
  return crawlDelay;
}
 
Example #29
Source File: HttpBase.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Obtains the <code>Response</code> for the given URL. This declaration is
 * abstract, so subclasses decide how the response is produced.
 *
 * @param url the URL to obtain a response for
 * @param datum the crawl datum accompanying the URL
 * @param followRedirects presumably controls whether redirects are followed -
 *        verify against the implementations
 */
protected abstract Response getResponse(
    URL url, CrawlDatum datum, boolean followRedirects)
    throws ProtocolException, IOException;
 
Example #30
Source File: Http.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Builds an <code>HttpResponse</code> from this instance, the URL and the
 * crawl datum and returns it. The <code>redirect</code> parameter is not
 * consulted here.
 */
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
  throws ProtocolException, IOException {
  final Response result = new HttpResponse(this, url, datum);
  return result;
}