org.apache.nutch.protocol.ProtocolException Java Examples

Source File: TestZipParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Fetches every sample file through a <code>file:</code> URL, parses it with
 * the <code>parse-zip</code> extension and checks that the extracted text
 * equals <code>expectedText</code>.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // fetch the raw content of the sample file
    final Protocol fetcher = new ProtocolFactory(conf).getProtocol(location);
    final Content fetched = fetcher
        .getProtocolOutput(new Text(location), new CrawlDatum())
        .getContent();

    // parse it; the parse entry is looked up by the content's own URL
    final Parse parsed = new ParseUtil(conf)
        .parseByExtensionId("parse-zip", fetched)
        .get(fetched.getUrl());

    final boolean textMatches = parsed.getText().equals(expectedText);
    assertTrue(textMatches);
  }
}

Source File: TestPdfParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Fetches every sample file through a <code>file:</code> URL, parses it with
 * the <code>parse-tika</code> extension and checks that the extracted text
 * contains <code>expectedText</code>.
 */
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;

  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    // a fresh configuration is created for every sample file
    Configuration conf = NutchConfiguration.create();
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    // indexOf() returns -1 only when the text is absent; 0 is a valid match
    // at the very start of the parsed text and must not be reported as a failure.
    int index = parse.getText().indexOf(expectedText);
    assertTrue(index >= 0);
  }
}

Source File: TestProtocolFile.java From anthelion with Apache License 2.0

6 votes

/**
 * Fetches the given sample file through a <code>file:</code> URL and verifies
 * that the status code is <code>ProtocolStatus.SUCCESS</code> and that
 * <code>expectedMimeType</code> is reported both as the content type and in
 * the <code>Response.CONTENT_TYPE</code> metadata field.
 *
 * @param testTextFile name of the sample file inside <code>sampleDir</code>
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  final String location = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(location);

  final ProtocolOutput fetched = new ProtocolFactory(conf)
      .getProtocol(location)
      .getProtocolOutput(new Text(location), datum);
  assertNotNull(fetched);

  // the failure message carries the actual code and args for diagnosis
  final String statusMessage = "Status code: [" + fetched.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + fetched.getStatus().getArgs() + "]";
  assertEquals(statusMessage, ProtocolStatus.SUCCESS, fetched.getStatus().getCode());

  // content type as exposed directly on the content
  assertNotNull(fetched.getContent());
  assertNotNull(fetched.getContent().getContentType());
  assertEquals(expectedMimeType, fetched.getContent().getContentType());

  // content type as exposed through the metadata
  assertNotNull(fetched.getContent().getMetadata());
  assertEquals(
      expectedMimeType,
      fetched.getContent().getMetadata().get(Response.CONTENT_TYPE));
}

Source File: TestProtocolFile.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Fetches the given sample file through a <code>file:</code> URL and verifies
 * that the status code is <code>ProtocolStatus.SUCCESS</code> and that
 * <code>expectedMimeType</code> is reported both as the content type and in
 * the <code>Response.CONTENT_TYPE</code> metadata field.
 *
 * @param testTextFile name of the sample file inside <code>sampleDir</code>
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  final String location = "file:" + sampleDir + fileSeparator + testTextFile;
  assertNotNull(location);

  final ProtocolOutput fetched = new ProtocolFactory(conf)
      .getProtocol(location)
      .getProtocolOutput(new Text(location), datum);
  assertNotNull(fetched);

  // the failure message carries the actual code and args for diagnosis
  final String statusMessage = "Status code: [" + fetched.getStatus().getCode()
      + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: ["
      + fetched.getStatus().getArgs() + "]";
  assertEquals(statusMessage, ProtocolStatus.SUCCESS, fetched.getStatus().getCode());

  // content type as exposed directly on the content
  assertNotNull(fetched.getContent());
  assertNotNull(fetched.getContent().getContentType());
  assertEquals(expectedMimeType, fetched.getContent().getContentType());

  // content type as exposed through the metadata
  assertNotNull(fetched.getContent().getMetadata());
  assertEquals(
      expectedMimeType,
      fetched.getContent().getMetadata().get(Response.CONTENT_TYPE));
}

Source File: TestSWFParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Fetches every sample file through a <code>file:</code> URL, parses it via
 * <code>ParseUtil.parse</code> and compares the whitespace-normalized text
 * with the entry of <code>sampleTexts</code> at the same index.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    final Protocol fetcher = new ProtocolFactory(conf).getProtocol(location);
    final Content fetched = fetcher
        .getProtocolOutput(new Text(location), new CrawlDatum())
        .getContent();
    final Parse parsed = new ParseUtil(conf).parse(fetched).get(fetched.getUrl());

    // collapse every run of blanks, tabs and line breaks into a single space
    final String normalized = parsed.getText().replaceAll("[ \t\r\n]+", " ").trim();
    assertTrue(sampleTexts[idx].equals(normalized));
  }
}

Source File: TestRTFParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Currently a no-op: the whole test body is commented out (the inline note
 * refers to Tika-748), so this method executes no statements and makes no
 * assertions.
 */
public void testIt() throws ProtocolException, ParseException {
  /* Temporarily disabled - see Tika-748

	String urlString;
	Protocol protocol;
	Content content;
	Parse parse;

	Configuration conf = NutchConfiguration.create();
	urlString = "file:" + sampleDir + fileSeparator + rtfFile;
	protocol = new ProtocolFactory(conf).getProtocol(urlString);
	content = protocol.getProtocolOutput(new Text(urlString),
			new CrawlDatum()).getContent();
	parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
			.get(content.getUrl());
	String text = parse.getText();
	assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

	String title = parse.getData().getTitle();
	Metadata meta = parse.getData().getParseMeta();

	// METADATA extraction is not yet supported in Tika
	// assertEquals("test rft document", title);
	// assertEquals("tests", meta.get(DublinCore.SUBJECT));
 */
}

Source File: TestExtParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Prepares the fixture: writes <code>expectedText</code> into a new temporary
 * file, stores that file's URL in <code>urlString</code> and fetches the file
 * into <code>content</code>.
 *
 * @throws IOException if the temporary file cannot be created or written
 */
protected void setUp() throws ProtocolException, IOException {
  // prepare a temp file with expectedText as its content
  // This system property is defined in ./src/plugin/build-plugin.xml
  String path = System.getProperty("test.data");
  if (path != null) {
    File tempDir = new File(path);
    if (!tempDir.exists()) {
      tempDir.mkdir();
    }
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt", tempDir);
  } else {
    // otherwise in java.io.tmpdir
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
  }
  // NOTE(review): File.toURL() is deprecated because it does not escape
  // characters that are illegal in URLs; kept as-is because switching to
  // toURI().toURL() would change urlString for paths containing such characters.
  urlString = tempFile.toURL().toString();

  // close the stream even when write() throws, so the file handle is not leaked
  FileOutputStream fos = new FileOutputStream(tempFile);
  try {
    fos.write(expectedText.getBytes());
  } finally {
    fos.close();
  }

  // get nutch content
  Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
  content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
}

Source File: TestSWFParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Fetches every sample file through a <code>file:</code> URL, parses it via
 * <code>ParseUtil.parse</code> and compares the whitespace-normalized text
 * with the entry of <code>sampleTexts</code> at the same index.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    final Protocol fetcher = new ProtocolFactory(conf).getProtocol(location);
    final Content fetched = fetcher
        .getProtocolOutput(new Text(location), new CrawlDatum())
        .getContent();
    final Parse parsed = new ParseUtil(conf).parse(fetched).get(fetched.getUrl());

    // collapse every run of blanks, tabs and line breaks into a single space
    final String normalized = parsed.getText().replaceAll("[ \t\r\n]+", " ").trim();
    assertTrue(sampleTexts[idx].equals(normalized));
  }
}

Source File: TestRTFParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Currently a no-op: the whole test body is commented out (the inline note
 * refers to Tika-748), so this method executes no statements and makes no
 * assertions.
 */
public void testIt() throws ProtocolException, ParseException {
  /* Temporarily disabled - see Tika-748

	String urlString;
	Protocol protocol;
	Content content;
	Parse parse;

	Configuration conf = NutchConfiguration.create();
	urlString = "file:" + sampleDir + fileSeparator + rtfFile;
	protocol = new ProtocolFactory(conf).getProtocol(urlString);
	content = protocol.getProtocolOutput(new Text(urlString),
			new CrawlDatum()).getContent();
	parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
			.get(content.getUrl());
	String text = parse.getText();
	assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

	String title = parse.getData().getTitle();
	Metadata meta = parse.getData().getParseMeta();

	// METADATA extraction is not yet supported in Tika
	// assertEquals("test rft document", title);
	// assertEquals("tests", meta.get(DublinCore.SUBJECT));
 */
}

Source File: TestImageMetadata.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Fetches every sample file through a <code>file:</code> URL, parses it with
 * the <code>parse-tika</code> extension and checks that the parse metadata
 * reports a <code>width</code> of "121" and a <code>height</code> of "48".
 */
public void testIt() throws ProtocolException, ParseException {
  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // a fresh configuration is created for every sample file
    final Configuration conf = NutchConfiguration.create();
    final Protocol fetcher = new ProtocolFactory(conf).getProtocol(location);
    final Content fetched = fetcher
        .getProtocolOutput(new Text(location), new CrawlDatum())
        .getContent();
    final Parse parsed = new ParseUtil(conf)
        .parseByExtensionId("parse-tika", fetched)
        .get(fetched.getUrl());

    assertEquals("121", parsed.getData().getMeta("width"));
    assertEquals("48", parsed.getData().getMeta("height"));
  }
}

Source File: TestExtParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Prepares the fixture: writes <code>expectedText</code> into a new temporary
 * file, stores that file's URL in <code>urlString</code> and fetches the file
 * into <code>content</code>.
 *
 * @throws IOException if the temporary file cannot be created or written
 */
protected void setUp() throws ProtocolException, IOException {
  // prepare a temp file with expectedText as its content
  // This system property is defined in ./src/plugin/build-plugin.xml
  String path = System.getProperty("test.data");
  if (path != null) {
    File tempDir = new File(path);
    if (!tempDir.exists()) {
      tempDir.mkdir();
    }
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt", tempDir);
  } else {
    // otherwise in java.io.tmpdir
    tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt");
  }
  // NOTE(review): File.toURL() is deprecated because it does not escape
  // characters that are illegal in URLs; kept as-is because switching to
  // toURI().toURL() would change urlString for paths containing such characters.
  urlString = tempFile.toURL().toString();

  // close the stream even when write() throws, so the file handle is not leaked
  FileOutputStream fos = new FileOutputStream(tempFile);
  try {
    fos.write(expectedText.getBytes());
  } finally {
    fos.close();
  }

  // get nutch content
  Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
  content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
}

Source File: TestZipParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Fetches every sample file through a <code>file:</code> URL, parses it with
 * the <code>parse-zip</code> extension and checks that the extracted text
 * equals <code>expectedText</code>.
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // fetch the raw content of the sample file
    final Protocol fetcher = new ProtocolFactory(conf).getProtocol(location);
    final Content fetched = fetcher
        .getProtocolOutput(new Text(location), new CrawlDatum())
        .getContent();

    // parse it; the parse entry is looked up by the content's own URL
    final Parse parsed = new ParseUtil(conf)
        .parseByExtensionId("parse-zip", fetched)
        .get(fetched.getUrl());

    final boolean textMatches = parsed.getText().equals(expectedText);
    assertTrue(textMatches);
  }
}

Source File: TestPdfParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Fetches every sample file through a <code>file:</code> URL, parses it with
 * the <code>parse-tika</code> extension and checks that the extracted text
 * contains <code>expectedText</code>.
 */
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;

  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    // a fresh configuration is created for every sample file
    Configuration conf = NutchConfiguration.create();
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    // indexOf() returns -1 only when the text is absent; 0 is a valid match
    // at the very start of the parsed text and must not be reported as a failure.
    int index = parse.getText().indexOf(expectedText);
    assertTrue(index >= 0);
  }
}

Source File: TestMSWordParser.java From anthelion with Apache License 2.0

5 votes

/**
 * Fetches the named sample file through a <code>file:</code> URL, parses it
 * with the <code>parse-tika</code> extension and returns the parsed text.
 *
 * @param fileName name of the file inside <code>sampleDir</code>
 * @return the text of the parse looked up by the content's URL
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
  final String location = "file:" + sampleDir + fileSeparator + fileName;

  final Content fetched = new ProtocolFactory(conf)
      .getProtocol(location)
      .getProtocolOutput(new Text(location), new CrawlDatum())
      .getContent();

  return new ParseUtil(conf)
      .parseByExtensionId("parse-tika", fetched)
      .get(fetched.getUrl())
      .getText();
}

Source File: TestMSWordParser.java From anthelion with Apache License 2.0

5 votes

/**
 * Checks that every <code>.doc</code> file found in <code>sampleDir</code>
 * yields non-empty text from <code>getTextContent</code>.
 */
public void testOpeningDocs() throws ProtocolException, ParseException {
  String[] filenames = new File(sampleDir).list();
  // File.list() returns null when the path is not a directory or cannot be
  // read; fail with a clear message instead of a NullPointerException below.
  assertTrue("cannot list sample directory " + sampleDir, filenames != null);

  for (String filename : filenames) {
    if (!filename.endsWith(".doc")) {
      continue;
    }
    assertTrue("cann't read content of " + filename, getTextContent(filename).length() > 0);
  }
}

Source File: TestMSWordParser.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Fetches the named sample file through a <code>file:</code> URL, parses it
 * with the <code>parse-tika</code> extension and returns the parsed text.
 *
 * @param fileName name of the file inside <code>sampleDir</code>
 * @return the text of the parse looked up by the content's URL
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
  final String location = "file:" + sampleDir + fileSeparator + fileName;

  final Content fetched = new ProtocolFactory(conf)
      .getProtocol(location)
      .getProtocolOutput(new Text(location), new CrawlDatum())
      .getContent();

  return new ParseUtil(conf)
      .parseByExtensionId("parse-tika", fetched)
      .get(fetched.getUrl())
      .getText();
}

Source File: RobotRulesParser.java From anthelion with Apache License 2.0

5 votes

/**
 * Asks the robot rules set obtained for <code>http</code> and <code>url</code>
 * whether the URL's path is allowed. A null or empty path is checked as "/".
 *
 * @param http passed through to <code>getRobotRulesSet</code>
 * @param url  the URL whose path is checked
 * @return the result of <code>isAllowed</code> on the rules set
 */
public boolean isAllowed(HttpBase http, URL url)
    throws ProtocolException, IOException {
  final String rawPath = url.getPath();
  final String checkedPath = (rawPath == null || rawPath.isEmpty()) ? "/" : rawPath;
  return getRobotRulesSet(http, url).isAllowed(checkedPath);
}

Source File: TestMSWordParser.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Checks that every <code>.doc</code> file found in <code>sampleDir</code>
 * yields non-empty text from <code>getTextContent</code>.
 */
public void testOpeningDocs() throws ProtocolException, ParseException {
  String[] filenames = new File(sampleDir).list();
  // File.list() returns null when the path is not a directory or cannot be
  // read; fail with a clear message instead of a NullPointerException below.
  assertTrue("cannot list sample directory " + sampleDir, filenames != null);

  for (String filename : filenames) {
    if (!filename.endsWith(".doc")) {
      continue;
    }
    assertTrue("cann't read content of " + filename, getTextContent(filename).length() > 0);
  }
}

Source File: TestFeedParser.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * <p>
 * The test method: tests out the following 2 asserts:
 * </p>
 * 
 * <ul>
 * <li>There are 2 outlinks read from the sample rss file</li>
 * <li>The 2 outlinks read are in fact the correct outlinks from the sample
 * file</li>
 * </ul>
 */
public void testIt() throws ProtocolException, ParseException {
	String urlString;
	Protocol protocol;
	Content content;
	Parse parse;

	Configuration conf = NutchConfiguration.create();
	for (int i = 0; i < sampleFiles.length; i++) {
		urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

		protocol = new ProtocolFactory(conf).getProtocol(urlString);
		content = protocol.getProtocolOutput(new Text(urlString),
				new CrawlDatum()).getContent();
		parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
				content).get(content.getUrl());

		// check that there are 2 outlinks:
		// unlike the original parse-rss
		// tika ignores the URL and description of the channel

		// http://test.channel.com
		// http://www-scf.usc.edu/~mattmann/
		// http://www.nutch.org
		// (only the last two are asserted below)

		ParseData theParseData = parse.getData();

		Outlink[] theOutlinks = theParseData.getOutlinks();

		assertTrue("There aren't 2 outlinks read!",
				theOutlinks.length == 2);

		// now check to make sure that those are the two outlinks
		boolean hasLink1 = false, hasLink2 = false;

		for (Outlink outlink : theOutlinks) {
			if (outlink.getToUrl().equals(
					"http://www-scf.usc.edu/~mattmann/")) {
				hasLink1 = true;
			}

			if (outlink.getToUrl().equals("http://www.nutch.org/")) {
				hasLink2 = true;
			}
		}

		if (!hasLink1 || !hasLink2) {
			fail("Outlinks read from sample rss file are not correct!");
		}
	}
}

Source File: Http.java From nutch-selenium with Apache License 2.0

4 votes

/**
 * Creates a new <code>HttpResponse</code> from this instance, the given URL
 * and page, and the configuration returned by <code>getConf()</code>.
 *
 * @param url      passed to the <code>HttpResponse</code> constructor
 * @param page     passed to the <code>HttpResponse</code> constructor
 * @param redirect not used by this implementation
 * @return the newly created response
 */
@Override
protected Response getResponse(URL url, WebPage page, boolean redirect)
        throws ProtocolException, IOException {
    return new HttpResponse(this, url, page, getConf());
}

Source File: TestMSWordParser.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Checks that the text returned by <code>getTextContent</code> for every
 * sample file starts with <code>expectedText</code>.
 */
public void testIt() throws ProtocolException, ParseException {
  for (String sampleFile : sampleFiles) {
    final String extracted = getTextContent(sampleFile);
    final String failureMessage = "text found : '" + extracted + "'";
    assertTrue(failureMessage, extracted.startsWith(expectedText));
  }
}

Source File: Http.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Creates a new <code>HttpResponse</code> from this instance, the given URL
 * and the given datum.
 *
 * @param url      passed to the <code>HttpResponse</code> constructor
 * @param datum    passed to the <code>HttpResponse</code> constructor
 * @param redirect not used by this implementation
 * @return the newly created response
 */
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) throws ProtocolException, IOException {
    return new HttpResponse(this, url, datum);
}

Source File: HttpBase.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Produces the <code>Response</code> for the given URL; the behaviour is
 * supplied by the concrete subclass.
 *
 * @param url             the URL to obtain a response for
 * @param datum           the crawl datum accompanying the URL
 * @param followRedirects presumably whether redirects should be followed --
 *                        TODO confirm against the implementations
 * @return the response created by the subclass
 */
protected abstract Response getResponse(URL url,
                                      CrawlDatum datum,
                                      boolean followRedirects)
throws ProtocolException, IOException;

Source File: Http.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Creates a new <code>HttpResponse</code> from this instance, the given URL
 * and the given datum.
 *
 * @param url      passed to the <code>HttpResponse</code> constructor
 * @param datum    passed to the <code>HttpResponse</code> constructor
 * @param redirect not used by this implementation
 * @return the newly created response
 */
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
  throws ProtocolException, IOException {
  return new HttpResponse(this, url, datum);
}

Source File: TestProtocolFile.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Calls <code>setContentType</code> once for every entry of
 * <code>testTextFiles</code>.
 */
public void testSetContentType() throws ProtocolException {
  for (String testTextFile : testTextFiles) {
    setContentType(testTextFile);
  }
}

Source File: TestMSWordParser.java From anthelion with Apache License 2.0

4 votes

/**
 * Checks that the text returned by <code>getTextContent</code> for every
 * sample file starts with <code>expectedText</code>.
 */
public void testIt() throws ProtocolException, ParseException {
  for (String sampleFile : sampleFiles) {
    final String extracted = getTextContent(sampleFile);
    final String failureMessage = "text found : '" + extracted + "'";
    assertTrue(failureMessage, extracted.startsWith(expectedText));
  }
}

Source File: TestFeedParser.java From anthelion with Apache License 2.0

4 votes

/**
 * <p>
 * The test method: tests out the following 2 asserts:
 * </p>
 * 
 * <ul>
 * <li>There are 2 outlinks read from the sample rss file</li>
 * <li>The 2 outlinks read are in fact the correct outlinks from the sample
 * file</li>
 * </ul>
 */
public void testIt() throws ProtocolException, ParseException {
	String urlString;
	Protocol protocol;
	Content content;
	Parse parse;

	Configuration conf = NutchConfiguration.create();
	for (int i = 0; i < sampleFiles.length; i++) {
		urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

		protocol = new ProtocolFactory(conf).getProtocol(urlString);
		content = protocol.getProtocolOutput(new Text(urlString),
				new CrawlDatum()).getContent();
		parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
				content).get(content.getUrl());

		// check that there are 2 outlinks:
		// unlike the original parse-rss
		// tika ignores the URL and description of the channel

		// http://test.channel.com
		// http://www-scf.usc.edu/~mattmann/
		// http://www.nutch.org
		// (only the last two are asserted below)

		ParseData theParseData = parse.getData();

		Outlink[] theOutlinks = theParseData.getOutlinks();

		assertTrue("There aren't 2 outlinks read!",
				theOutlinks.length == 2);

		// now check to make sure that those are the two outlinks
		boolean hasLink1 = false, hasLink2 = false;

		for (Outlink outlink : theOutlinks) {
			if (outlink.getToUrl().equals(
					"http://www-scf.usc.edu/~mattmann/")) {
				hasLink1 = true;
			}

			if (outlink.getToUrl().equals("http://www.nutch.org/")) {
				hasLink2 = true;
			}
		}

		if (!hasLink1 || !hasLink2) {
			fail("Outlinks read from sample rss file are not correct!");
		}
	}
}

Source File: RobotRulesParser.java From anthelion with Apache License 2.0

4 votes

/**
 * Returns the crawl delay reported by the robot rules set obtained for
 * <code>http</code> and <code>url</code>.
 *
 * @param http passed through to <code>getRobotRulesSet</code>
 * @param url  passed through to <code>getRobotRulesSet</code>
 * @return the value of <code>getCrawlDelay()</code> on that rules set
 *         (unit not visible here -- TODO confirm)
 */
public long getCrawlDelay(HttpBase http, URL url)
    throws ProtocolException, IOException {
  return getRobotRulesSet(http, url).getCrawlDelay();
}

Source File: HttpBase.java From anthelion with Apache License 2.0

4 votes

/**
 * Produces the <code>Response</code> for the given URL; the behaviour is
 * supplied by the concrete subclass.
 *
 * @param url             the URL to obtain a response for
 * @param datum           the crawl datum accompanying the URL
 * @param followRedirects presumably whether redirects should be followed --
 *                        TODO confirm against the implementations
 * @return the response created by the subclass
 */
protected abstract Response getResponse(URL url,
                                      CrawlDatum datum,
                                      boolean followRedirects)
throws ProtocolException, IOException;

Source File: Http.java From anthelion with Apache License 2.0

4 votes

/**
 * Creates a new <code>HttpResponse</code> from this instance, the given URL
 * and the given datum.
 *
 * @param url      passed to the <code>HttpResponse</code> constructor
 * @param datum    passed to the <code>HttpResponse</code> constructor
 * @param redirect not used by this implementation
 * @return the newly created response
 */
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
  throws ProtocolException, IOException {
  return new HttpResponse(this, url, datum);
}

org.apache.nutch.protocol.ProtocolException Java Examples