org.apache.nutch.util.NutchConfiguration Java Examples
The following examples show how to use
org.apache.nutch.util.NutchConfiguration.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestExtParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public void testIt() throws ParseException { String contentType; // now test only on linux platform if (!System.getProperty("os.name").equalsIgnoreCase("linux")) { System.err.println("Current OS is "+System.getProperty("os.name")+"."); System.err.println("No test is run on OS other than linux."); return; } Configuration conf = NutchConfiguration.create(); // loop alternately, total 10*2 times of invoking external command for (int i=0; i<10; i++) { // check external parser that does 'cat' contentType = "application/vnd.nutch.example.cat"; content.setContentType(contentType); parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl()); assertEquals(expectedText,parse.getText()); // check external parser that does 'md5sum' contentType = "application/vnd.nutch.example.md5sum"; content.setContentType(contentType); parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl()); assertTrue(parse.getText().startsWith(expectedMD5sum)); } }
Example #2
Source File: TestDomainBlacklistURLFilter.java From anthelion with Apache License 2.0 | 6 votes |
public void testFilter() throws Exception {
  // Build the filter from the sample hosts blacklist file.
  String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
  DomainBlacklistURLFilter domainBlacklistFilter =
      new DomainBlacklistURLFilter(domainBlacklistFile);
  domainBlacklistFilter.setConf(NutchConfiguration.create());

  // URLs that must be rejected (filter(...) returns null).
  String[] rejected = {
      "http://lucene.apache.org",
      "http://hadoop.apache.org",
      "http://www.apache.org",
      "http://www.foobar.net",
      "http://www.foobas.net",
      "http://www.yahoo.com",
      "http://www.foobar.be"
  };
  // URLs that must pass through (filter(...) returns a non-null value).
  String[] accepted = {
      "http://www.google.com",
      "http://mail.yahoo.com",
      "http://www.adobe.com"
  };

  for (String url : rejected) {
    assertNull(domainBlacklistFilter.filter(url));
  }
  for (String url : accepted) {
    assertNotNull(domainBlacklistFilter.filter(url));
  }
}
Example #3
Source File: LinkReader.java From nutchpy with Apache License 2.0 | 6 votes |
public static long count(String path) throws IOException { //read rows between start and stop Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); Path file = new Path(path); System.out.println(file); SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf); Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf); //skip rows long i = 0; while(reader.next(key, value)) { i += 1; } return i; }
Example #4
Source File: TestMetatagParser.java From anthelion with Apache License 2.0 | 6 votes |
public void testIt() { Configuration conf = NutchConfiguration.create(); String urlString = "file:" + sampleDir + fileSeparator + sampleFile; try { Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); // check that we get the same values Metadata parseMeta = parse.getData().getParseMeta(); assertEquals(description, parseMeta.get("metatag.description")); assertEquals(keywords, parseMeta.get("metatag.keywords")); } catch (Exception e) { e.printStackTrace(); fail(e.toString()); } }
Example #5
Source File: WdcParser.java From anthelion with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception { // LOG.setLevel(Level.FINE); String name = args[0]; String url = "file:" + name; File file = new File(name); byte[] bytes = new byte[(int) file.length()]; DataInputStream in = new DataInputStream(new FileInputStream(file)); in.readFully(bytes); Configuration conf = NutchConfiguration.create(); WdcParser parser = new WdcParser(); parser.setConf(conf); Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url); System.out.println("data: " + parse.getData()); System.out.println("text: " + parse.getText()); String contains = parse.getData().getMeta(META_CONTAINS_SEM); System.out.println("contains: " + contains); }
Example #6
Source File: TestSubcollection.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**Test filtering logic * * @throws Exception */ public void testFilter() throws Exception { Subcollection sc=new Subcollection(NutchConfiguration.create()); sc.setWhiteList("www.nutch.org\nwww.apache.org"); sc.setBlackList("jpg\nwww.apache.org/zecret/"); //matches whitelist assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html")); //matches blacklist assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html")); assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg")); //no match assertEquals(null, sc.filter("http://www.google.com/")); }
Example #7
Source File: TestExtParser.java From anthelion with Apache License 2.0 | 6 votes |
protected void setUp() throws ProtocolException, IOException { // prepare a temp file with expectedText as its content // This system property is defined in ./src/plugin/build-plugin.xml String path = System.getProperty("test.data"); if (path != null) { File tempDir = new File(path); if (!tempDir.exists()) tempDir.mkdir(); tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt",tempDir); } else { // otherwise in java.io.tmpdir tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt"); } urlString = tempFile.toURL().toString(); FileOutputStream fos = new FileOutputStream(tempFile); fos.write(expectedText.getBytes()); fos.close(); // get nutch content Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); protocol = null; }
Example #8
Source File: TestRegexURLNormalizer.java From anthelion with Apache License 2.0 | 6 votes |
public TestRegexURLNormalizer(String name) throws IOException {
  super(name);
  normalizer = new RegexURLNormalizer();
  conf = NutchConfiguration.create();
  normalizer.setConf(conf);

  // Collect every per-scope config file named regex-normalize-<scope>.xml.
  File[] configs = new File(sampleDir).listFiles(new FileFilter() {
    public boolean accept(File f) {
      return f.getName().endsWith(".xml")
          && f.getName().startsWith("regex-normalize-");
    }
  });
  for (int i = 0; i < configs.length; i++) {
    try {
      FileReader reader = new FileReader(configs[i]);
      try {
        String cname = configs[i].getName();
        // Strip the "regex-normalize-" prefix (16 chars) and ".xml" suffix
        // to recover the scope name.
        cname = cname.substring(16, cname.indexOf(".xml"));
        normalizer.setConfiguration(reader, cname);
        NormalizedURL[] urls = readTestFile(cname);
        testData.put(cname, urls);
      } finally {
        // Fix: the FileReader was previously leaked for every config file.
        // NOTE(review): assumes setConfiguration() consumes the reader
        // eagerly — confirm against RegexURLNormalizer.
        reader.close();
      }
    } catch (Exception e) {
      LOG.warn("Could load config from '" + configs[i] + "': " + e.toString());
    }
  }
}
Example #9
Source File: TestHTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/** Test parsing of language identifiers from html. */
public void testMetaHTMLParsing() {
  try {
    ParseUtil parser = new ParseUtil(NutchConfiguration.create());

    // Loop through the test documents and validate the detected language.
    for (int idx = 0; idx < docs.length; idx++) {
      Content content = getContent(docs[idx]);
      Parse parse = parser.parse(content).get(content.getUrl());
      String detected =
          (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE);
      assertEquals(metalanguages[idx], detected);
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }
}
Example #10
Source File: TestPdfParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public void testIt() throws ProtocolException, ParseException {
  // Fix: the Configuration (and the objects derived from it) was recreated
  // on every loop iteration; it is loop-invariant, so create it once.
  Configuration conf = NutchConfiguration.create();
  ProtocolFactory factory = new ProtocolFactory(conf);
  ParseUtil parseUtil = new ParseUtil(conf);

  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Protocol protocol = factory.getProtocol(urlString);
    Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse =
        parseUtil.parseByExtensionId("parse-tika", content).get(content.getUrl());

    // The extracted PDF text must contain the expected snippet.
    int index = parse.getText().indexOf(expectedText);
    assertTrue(index > 0);
  }
}
Example #11
Source File: TestHTMLLanguageParser.java From anthelion with Apache License 2.0 | 6 votes |
/** Test parsing of language identifiers from html. */
public void testMetaHTMLParsing() {
  try {
    ParseUtil parser = new ParseUtil(NutchConfiguration.create());

    // Walk the test documents, comparing each detected language against
    // the expected value in metalanguages.
    int t = 0;
    while (t < docs.length) {
      Content content = getContent(docs[t]);
      Parse parse = parser.parse(content).get(content.getUrl());
      Object lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
      assertEquals(metalanguages[t], (String) lang);
      t++;
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }
}
Example #12
Source File: TestCCParseFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Parses the given HTML file and asserts that the Creative Commons parse
 * filter extracted the expected license metadata.
 *
 * @param file     local HTML file to feed through the parser
 * @param url      URL to associate with the content
 * @param license  expected "License-Url" metadata value
 * @param location expected "License-Location" metadata value
 * @param type     expected "Work-Type" metadata value
 * @throws Exception on I/O or parse failure
 */
public void pageTest(File file, String url, String license, String location,
    String type) throws Exception {
  String contentType = "text/html";

  // Read the whole sample file into memory.
  // Fix: the input stream was leaked whenever read() threw; close it
  // reliably via try-with-resources.
  ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
  try (InputStream in = new FileInputStream(file)) {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  }
  byte[] bytes = out.toByteArray();

  Configuration conf = NutchConfiguration.create();
  Content content = new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

  // The CC parse filter should surface the license details in parse metadata.
  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}
Example #13
Source File: SuffixURLFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Command-line filter check: reads URLs from stdin and prints whether each
 * is accepted or rejected. An optional first argument names a suffix file;
 * otherwise the filter is configured from the default Nutch configuration.
 */
public static void main(String args[]) throws IOException {
  SuffixURLFilter filter;
  if (args.length >= 1)
    filter = new SuffixURLFilter(new FileReader(args[0]));
  else {
    filter = new SuffixURLFilter();
    filter.setConf(NutchConfiguration.create());
  }

  BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
  String line;
  while ((line = in.readLine()) != null) {
    String out = filter.filter(line);
    if (out != null) {
      System.out.println("ACCEPTED " + out);
    } else {
      // Fix: previously printed "REJECTED " + out, which is always
      // "REJECTED null" on this branch; print the rejected input instead.
      System.out.println("REJECTED " + line);
    }
  }
}
Example #14
Source File: IndexManager.java From spacewalk with GNU General Public License v2.0 | 5 votes |
/**
 * Initializes the per-locale documentation segments used for doc summaries.
 * Failures are logged and tolerated (the map entry is left null) so that a
 * missing Nutch install degrades to empty summaries instead of failing.
 *
 * @return always {@code true}; errors are swallowed by design
 */
private boolean initDocSummary() {
  /**
   * NOTE: NutchConfiguration is expecting "nutch-default.xml" and "nutch-site.xml"
   * to be available in the CLASSPATH
   */
  try {
    nutchConf = NutchConfiguration.create();
    nutchAnalyzerFactory = new AnalyzerFactory(nutchConf);
    FileSystem fs = FileSystem.get(nutchConf);
    docSegments = new TreeMap<String, FetchedSegments>(String.CASE_INSENSITIVE_ORDER);
    for (String key : docLocaleLookUp.keySet()) {
      String segmentsDir = indexWorkDir + File.separator +
          getDocIndexPath(key) + File.separator + "segments";
      FetchedSegments segments = new FetchedSegments(fs, segmentsDir, nutchConf);
      if (segments == null) {
        log.info("Unable to create docSegments for language: " + key);
        docSegments.put(key, null);
        // Fix: without this continue the code fell through and
        // dereferenced the null 'segments' below.
        continue;
      }
      String[] segNames = segments.getSegmentNames();
      if (segNames == null || segNames.length == 0) {
        log.info("Unable to find any segments for language: " + key);
        docSegments.put(key, null);
        // Fix: previously fell through and overwrote the null marker with
        // an empty FetchedSegments instance.
        continue;
      }
      log.info("Adding Documentation segments for language: " + key);
      docSegments.put(key, segments);
    }
  } catch (Exception e) {
    log.error("ignoring exception - most likely Nutch isn't present, so" +
        " doc summaries will be empty");
    e.printStackTrace();
  }
  return true;
}
Example #15
Source File: NodeReader.java From anthelion with Apache License 2.0 | 5 votes |
/** * Runs the NodeReader tool. The command line arguments must contain a * webgraphdb path and a url. The url must match the normalized url that is * contained in the NodeDb of the WebGraph. */ public static void main(String[] args) throws Exception { Options options = new Options(); Option helpOpts = OptionBuilder.withArgName("help").withDescription( "show this help message").create("help"); Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg() .withDescription("the webgraphdb to use").create("webgraphdb"); Option urlOpts = OptionBuilder.withArgName("url").hasOptionalArg() .withDescription("the url to dump").create("url"); options.addOption(helpOpts); options.addOption(webGraphOpts); options.addOption(urlOpts); CommandLineParser parser = new GnuParser(); try { // command line must take a webgraphdb and a url CommandLine line = parser.parse(options, args); if (line.hasOption("help") || !line.hasOption("webgraphdb") || !line.hasOption("url")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("WebGraphReader", options); return; } // dump the values to system out and return String webGraphDb = line.getOptionValue("webgraphdb"); String url = line.getOptionValue("url"); NodeReader reader = new NodeReader(NutchConfiguration.create()); reader.dumpUrl(new Path(webGraphDb), url); return; } catch (Exception e) { e.printStackTrace(); return; } }
Example #16
Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/** Test behaviour when the NutchDocument handed to the filters is null. */
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  IndexingFilters filters = new IndexingFilters(conf);
  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  NutchDocument doc = filters.filter(null,
      new ParseImpl("text", parseData),
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  // A null input document must come back as null, not an empty document.
  assertNull(doc);
}
Example #17
Source File: LoopReader.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Runs the LoopReader tool. For this tool to work the loops job must have
 * already been run on the corresponding WebGraph.
 */
public static void main(String[] args) throws Exception {
  // Build the option table for the command line.
  Options options = new Options();
  Option helpOpts = OptionBuilder.withArgName("help")
      .withDescription("show this help message").create("help");
  Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
      .withDescription("the webgraphdb to use").create("webgraphdb");
  Option urlOpts = OptionBuilder.withArgName("url").hasOptionalArg()
      .withDescription("the url to dump").create("url");
  options.addOption(helpOpts);
  options.addOption(webGraphOpts);
  options.addOption(urlOpts);

  CommandLineParser parser = new GnuParser();
  try {
    CommandLine line = parser.parse(options, args);

    // -webgraphdb and -url are both required; fall back to the help text.
    if (line.hasOption("help")
        || !line.hasOption("webgraphdb")
        || !line.hasOption("url")) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("WebGraphReader", options);
      return;
    }

    // Dump the loop information for the given url.
    String webGraphDb = line.getOptionValue("webgraphdb");
    String url = line.getOptionValue("url");
    LoopReader reader = new LoopReader(NutchConfiguration.create());
    reader.dumpUrl(new Path(webGraphDb), url);
    return;
  } catch (Exception e) {
    e.printStackTrace();
    return;
  }
}
Example #18
Source File: ParseData.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Command-line tool: reads record {@code recno} from a segment's ParseData
 * directory and prints it.
 */
public static void main(String argv[]) throws Exception {
  String usage = "ParseData (-local | -dfs <namenode:port>) recno segment";

  if (argv.length < 3) {
    System.out.println("usage:" + usage);
    return;
  }
  Options opts = new Options();
  Configuration conf = NutchConfiguration.create();

  GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
  String[] remainingArgs = parser.getRemainingArgs();
  FileSystem fs = FileSystem.get(conf);

  try {
    int recno = Integer.parseInt(remainingArgs[0]);
    String segment = remainingArgs[1];

    Path file = new Path(segment, DIR_NAME);
    System.out.println("Reading from file: " + file);

    ArrayFile.Reader parses = new ArrayFile.Reader(fs, file.toString(), conf);
    try {
      ParseData parseDatum = new ParseData();
      parses.get(recno, parseDatum);
      System.out.println("Retrieved " + recno + " from file " + file);
      System.out.println(parseDatum);
    } finally {
      // Fix: the reader was leaked whenever get() threw; close it
      // unconditionally.
      parses.close();
    }
  } finally {
    fs.close();
  }
}
Example #19
Source File: LinkReader.java From nutchpy with Apache License 2.0 | 5 votes |
public static List head(int nrows, String path) throws IOException { // reads the entire contents of the file List<HashMap> rows=new ArrayList<HashMap>(); Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); Path file = new Path(path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf); Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); LinkDatum value = new LinkDatum(); int i = 0; while(reader.next(key, value)) { if (i == nrows) { break; } i += 1; try { HashMap<String, String> t_row = getLinksRow(key,value); rows.add(t_row); } catch (Exception e) { } } return rows; }
Example #20
Source File: TestOOParser.java From anthelion with Apache License 2.0 | 5 votes |
public void testIt() throws ProtocolException, ParseException { String urlString; Content content; Parse parse; Configuration conf = NutchConfiguration.create(); Protocol protocol; ProtocolFactory factory = new ProtocolFactory(conf); System.out.println("Expected : "+expectedText); for (int i=0; i<sampleFiles.length; i++) { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; if (sampleFiles[i].startsWith("ootest")==false) continue; protocol = factory.getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); // simply test for the presence of a text - the ordering of the elements may differ from what was expected // in the previous tests assertTrue(text!=null && text.length() > 0); System.out.println("Found "+sampleFiles[i]+": "+text); } }
Example #21
Source File: TestParserFactory.java From anthelion with Apache License 2.0 | 5 votes |
/** Inits the Test Case with the test parse-plugin file. */
protected void setUp() throws Exception {
  conf = NutchConfiguration.create();
  // Point the factory at the test plugin mapping and enable all plugins.
  conf.set("parse.plugin.file",
      "org/apache/nutch/parse/parse-plugin-test.xml");
  conf.set("plugin.includes", ".*");
  parserFactory = new ParserFactory(conf);
}
Example #22
Source File: LinkDumper.java From anthelion with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception { if (args == null || args.length < 2) { System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>"); return; } // open the readers for the linkdump directory Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); Path webGraphDb = new Path(args[0]); String url = args[1]; MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path( webGraphDb, DUMP_DIR), conf); // get the link nodes for the url Text key = new Text(url); LinkNodes nodes = new LinkNodes(); MapFileOutputFormat.getEntry(readers, new HashPartitioner<Text, LinkNodes>(), key, nodes); // print out the link nodes LinkNode[] linkNodesAr = nodes.getLinks(); System.out.println(url + ":"); for (LinkNode node : linkNodesAr) { System.out.println(" " + node.getUrl() + " - " + node.getNode().toString()); } // close the readers FSUtils.closeReaders(readers); }
Example #23
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public void testContentDispositionTitle() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  // Simulate a response carrying a Content-Disposition filename header.
  Metadata metadata = new Metadata();
  metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
  NutchDocument doc = filter.filter(new NutchDocument(),
      new ParseImpl("text", parseData),
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  // The filename from the header should become the indexed title.
  assertEquals("content-disposition not detected", "filename.ext",
      doc.getFieldValue("title"));
}
Example #24
Source File: IndexManager.java From uyuni with GNU General Public License v2.0 | 5 votes |
/**
 * Initializes the per-locale documentation segments used for doc summaries.
 * Failures are logged and tolerated (the map entry is left null) so that a
 * missing Nutch install degrades to empty summaries instead of failing.
 *
 * @return always {@code true}; errors are swallowed by design
 */
private boolean initDocSummary() {
  /**
   * NOTE: NutchConfiguration is expecting "nutch-default.xml" and "nutch-site.xml"
   * to be available in the CLASSPATH
   */
  try {
    nutchConf = NutchConfiguration.create();
    nutchAnalyzerFactory = new AnalyzerFactory(nutchConf);
    FileSystem fs = FileSystem.get(nutchConf);
    docSegments = new TreeMap<String, FetchedSegments>(String.CASE_INSENSITIVE_ORDER);
    for (String key : docLocaleLookUp.keySet()) {
      String segmentsDir = indexWorkDir + File.separator +
          getDocIndexPath(key) + File.separator + "segments";
      FetchedSegments segments = new FetchedSegments(fs, segmentsDir, nutchConf);
      if (segments == null) {
        log.info("Unable to create docSegments for language: " + key);
        docSegments.put(key, null);
        // Fix: without this continue the code fell through and
        // dereferenced the null 'segments' below.
        continue;
      }
      String[] segNames = segments.getSegmentNames();
      if (segNames == null || segNames.length == 0) {
        log.info("Unable to find any segments for language: " + key);
        docSegments.put(key, null);
        // Fix: previously fell through and overwrote the null marker with
        // an empty FetchedSegments instance.
        continue;
      }
      log.info("Adding Documentation segments for language: " + key);
      docSegments.put(key, segments);
    }
  } catch (Exception e) {
    log.error("ignoring exception - most likely Nutch isn't present, so" +
        " doc summaries will be empty");
    e.printStackTrace();
  }
  return true;
}
Example #25
Source File: URLNormalizerChecker.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Command-line entry point: checks one URL normalizer (via -normalizer) or
 * all of them, within the given -scope (defaulting to SCOPE_DEFAULT).
 */
public static void main(String[] args) throws Exception {
  String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]"
      + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";

  // Parse the optional -normalizer and -scope flags.
  String normalizerName = null;
  String scope = URLNormalizers.SCOPE_DEFAULT;
  int i = 0;
  while (i < args.length) {
    if ("-normalizer".equals(args[i])) {
      normalizerName = args[++i];
    } else if ("-scope".equals(args[i])) {
      scope = args[++i];
    } else {
      // Unknown flag: print usage and bail out with a failure code.
      System.err.println(usage);
      System.exit(-1);
    }
    i++;
  }

  URLNormalizerChecker checker =
      new URLNormalizerChecker(NutchConfiguration.create());
  // Check one named normalizer, or all of them when none was specified.
  if (normalizerName == null) {
    checker.checkAll(scope);
  } else {
    checker.checkOne(normalizerName, scope);
  }
  System.exit(0);
}
Example #26
Source File: LinkReader.java From nutchpy with Apache License 2.0 | 5 votes |
public static List read(String path) throws IOException { // reads the entire contents of the file List<HashMap> rows=new ArrayList<HashMap>(); Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); Path file = new Path(path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf); Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); LinkDatum value = new LinkDatum(); while(reader.next(key, value)) { try { HashMap<String, String> t_row = getLinksRow(key,value); rows.add(t_row); } catch (Exception e) { } } return rows; }
Example #27
Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
protected void setUp() throws Exception {
  // Build the fixture objects shared by the StaticFieldIndexer tests.
  conf = NutchConfiguration.create();
  filter = new StaticFieldIndexer();
  parse = new ParseImpl();
  crawlDatum = new CrawlDatum();
  inlinks = new Inlinks();
  url = new Text("http://nutch.apache.org/index.html");
}
Example #28
Source File: TestLinkDbMerger.java From anthelion with Apache License 2.0 | 5 votes |
public void setUp() throws Exception {
  // Seed the two input linkdbs and the expected merge result.
  init1.put(url10, urls10);
  init1.put(url11, urls11);
  init2.put(url20, urls20);
  init2.put(url21, urls21);
  expected.put(url10, urls10_expected);
  expected.put(url11, urls11_expected);
  expected.put(url20, urls20_expected);
  expected.put(url21, urls21_expected);

  // Create a randomized scratch directory for this run.
  conf = NutchConfiguration.create();
  fs = FileSystem.get(conf);
  int suffix = new java.util.Random().nextInt();
  testDir = new Path("build/test/test-linkdb-" + suffix);
  fs.mkdirs(testDir);
}
Example #29
Source File: TestPassURLNormalizer.java From anthelion with Apache License 2.0 | 5 votes |
public void testPassURLNormalizer() {
  Configuration conf = NutchConfiguration.create();
  PassURLNormalizer normalizer = new PassURLNormalizer();
  normalizer.setConf(conf);

  // The pass-through normalizer must return the URL untouched, even when
  // it contains dot-segments a real normalizer would rewrite.
  String url = "http://www.example.com/test/..//";
  String result = null;
  try {
    result = normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT);
  } catch (MalformedURLException mue) {
    fail(mue.toString());
  }
  assertEquals(url, result);
}
Example #30
Source File: TestOOParser.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public void testIt() throws ProtocolException, ParseException { String urlString; Content content; Parse parse; Configuration conf = NutchConfiguration.create(); Protocol protocol; ProtocolFactory factory = new ProtocolFactory(conf); System.out.println("Expected : "+expectedText); for (int i=0; i<sampleFiles.length; i++) { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; if (sampleFiles[i].startsWith("ootest")==false) continue; protocol = factory.getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); // simply test for the presence of a text - the ordering of the elements may differ from what was expected // in the previous tests assertTrue(text!=null && text.length() > 0); System.out.println("Found "+sampleFiles[i]+": "+text); } }