org.apache.nutch.util.NutchConfiguration Java Examples
The following examples show how to use
org.apache.nutch.util.NutchConfiguration.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestExtParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public void testIt() throws ParseException { String contentType; // now test only on linux platform if (!System.getProperty("os.name").equalsIgnoreCase("linux")) { System.err.println("Current OS is "+System.getProperty("os.name")+"."); System.err.println("No test is run on OS other than linux."); return; } Configuration conf = NutchConfiguration.create(); // loop alternately, total 10*2 times of invoking external command for (int i=0; i<10; i++) { // check external parser that does 'cat' contentType = "application/vnd.nutch.example.cat"; content.setContentType(contentType); parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl()); assertEquals(expectedText,parse.getText()); // check external parser that does 'md5sum' contentType = "application/vnd.nutch.example.md5sum"; content.setContentType(contentType); parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl()); assertTrue(parse.getText().startsWith(expectedMD5sum)); } }
Example #2
Source File: TestDomainBlacklistURLFilter.java From anthelion with Apache License 2.0 | 6 votes |
public void testFilter() throws Exception {
  // Build the filter from the sample hosts blacklist file.
  String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
  DomainBlacklistURLFilter domainBlacklistFilter =
      new DomainBlacklistURLFilter(domainBlacklistFile);
  domainBlacklistFilter.setConf(NutchConfiguration.create());

  // URLs that must be rejected (filter(...) returns null).
  String[] rejected = {
      "http://lucene.apache.org",
      "http://hadoop.apache.org",
      "http://www.apache.org",
      "http://www.foobar.net",
      "http://www.foobas.net",
      "http://www.yahoo.com",
      "http://www.foobar.be"
  };
  // URLs that must pass through (filter(...) returns a non-null value).
  String[] accepted = {
      "http://www.google.com",
      "http://mail.yahoo.com",
      "http://www.adobe.com"
  };

  for (String url : rejected) {
    assertNull(domainBlacklistFilter.filter(url));
  }
  for (String url : accepted) {
    assertNotNull(domainBlacklistFilter.filter(url));
  }
}
Example #3
Source File: LinkReader.java From nutchpy with Apache License 2.0 | 6 votes |
public static long count(String path) throws IOException { //read rows between start and stop Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); Path file = new Path(path); System.out.println(file); SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf); Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf); //skip rows long i = 0; while(reader.next(key, value)) { i += 1; } return i; }
Example #4
Source File: TestMetatagParser.java From anthelion with Apache License 2.0 | 6 votes |
public void testIt() { Configuration conf = NutchConfiguration.create(); String urlString = "file:" + sampleDir + fileSeparator + sampleFile; try { Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); // check that we get the same values Metadata parseMeta = parse.getData().getParseMeta(); assertEquals(description, parseMeta.get("metatag.description")); assertEquals(keywords, parseMeta.get("metatag.keywords")); } catch (Exception e) { e.printStackTrace(); fail(e.toString()); } }
Example #5
Source File: WdcParser.java From anthelion with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception { // LOG.setLevel(Level.FINE); String name = args[0]; String url = "file:" + name; File file = new File(name); byte[] bytes = new byte[(int) file.length()]; DataInputStream in = new DataInputStream(new FileInputStream(file)); in.readFully(bytes); Configuration conf = NutchConfiguration.create(); WdcParser parser = new WdcParser(); parser.setConf(conf); Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url); System.out.println("data: " + parse.getData()); System.out.println("text: " + parse.getText()); String contains = parse.getData().getMeta(META_CONTAINS_SEM); System.out.println("contains: " + contains); }
Example #6
Source File: TestSubcollection.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**Test filtering logic * * @throws Exception */ public void testFilter() throws Exception { Subcollection sc=new Subcollection(NutchConfiguration.create()); sc.setWhiteList("www.nutch.org\nwww.apache.org"); sc.setBlackList("jpg\nwww.apache.org/zecret/"); //matches whitelist assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html")); //matches blacklist assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html")); assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg")); //no match assertEquals(null, sc.filter("http://www.google.com/")); }
Example #7
Source File: TestExtParser.java From anthelion with Apache License 2.0 | 6 votes |
protected void setUp() throws ProtocolException, IOException { // prepare a temp file with expectedText as its content // This system property is defined in ./src/plugin/build-plugin.xml String path = System.getProperty("test.data"); if (path != null) { File tempDir = new File(path); if (!tempDir.exists()) tempDir.mkdir(); tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt",tempDir); } else { // otherwise in java.io.tmpdir tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt"); } urlString = tempFile.toURL().toString(); FileOutputStream fos = new FileOutputStream(tempFile); fos.write(expectedText.getBytes()); fos.close(); // get nutch content Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); protocol = null; }
Example #8
Source File: TestRegexURLNormalizer.java From anthelion with Apache License 2.0 | 6 votes |
public TestRegexURLNormalizer(String name) throws IOException {
  super(name);
  normalizer = new RegexURLNormalizer();
  conf = NutchConfiguration.create();
  normalizer.setConf(conf);

  // Collect every per-scope config file named regex-normalize-<scope>.xml.
  File[] configs = new File(sampleDir).listFiles(new FileFilter() {
    public boolean accept(File f) {
      return f.getName().endsWith(".xml")
          && f.getName().startsWith("regex-normalize-");
    }
  });
  for (int i = 0; i < configs.length; i++) {
    try {
      FileReader reader = new FileReader(configs[i]);
      try {
        String cname = configs[i].getName();
        // Strip the "regex-normalize-" prefix (16 chars) and ".xml" suffix
        // to recover the scope name.
        cname = cname.substring(16, cname.indexOf(".xml"));
        normalizer.setConfiguration(reader, cname);
        NormalizedURL[] urls = readTestFile(cname);
        testData.put(cname, urls);
      } finally {
        // Fix: the FileReader was previously leaked for every config file.
        // NOTE(review): assumes setConfiguration() consumes the reader
        // eagerly — confirm against RegexURLNormalizer.
        reader.close();
      }
    } catch (Exception e) {
      LOG.warn("Could load config from '" + configs[i] + "': " + e.toString());
    }
  }
}
Example #9
Source File: TestHTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/** Test parsing of language identifiers from html. */
public void testMetaHTMLParsing() {
  try {
    ParseUtil parser = new ParseUtil(NutchConfiguration.create());

    // Loop through the test documents and validate the detected language.
    for (int idx = 0; idx < docs.length; idx++) {
      Content content = getContent(docs[idx]);
      Parse parse = parser.parse(content).get(content.getUrl());
      String detected =
          (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE);
      assertEquals(metalanguages[idx], detected);
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }
}
Example #10
Source File: TestPdfParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public void testIt() throws ProtocolException, ParseException {
  // Fix: the Configuration (and the objects derived from it) was recreated
  // on every loop iteration; it is loop-invariant, so create it once.
  Configuration conf = NutchConfiguration.create();
  ProtocolFactory factory = new ProtocolFactory(conf);
  ParseUtil parseUtil = new ParseUtil(conf);

  for (int i = 0; i < sampleFiles.length; i++) {
    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Protocol protocol = factory.getProtocol(urlString);
    Content content =
        protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    Parse parse =
        parseUtil.parseByExtensionId("parse-tika", content).get(content.getUrl());

    // The extracted PDF text must contain the expected snippet.
    int index = parse.getText().indexOf(expectedText);
    assertTrue(index > 0);
  }
}
Example #11
Source File: TestHTMLLanguageParser.java From anthelion with Apache License 2.0 | 6 votes |
/** Test parsing of language identifiers from html. */
public void testMetaHTMLParsing() {
  try {
    ParseUtil parser = new ParseUtil(NutchConfiguration.create());

    // Walk the test documents, comparing each detected language against
    // the expected value in metalanguages.
    int t = 0;
    while (t < docs.length) {
      Content content = getContent(docs[t]);
      Parse parse = parser.parse(content).get(content.getUrl());
      Object lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
      assertEquals(metalanguages[t], (String) lang);
      t++;
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }
}
Example #12
Source File: TestCCParseFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Parses the given HTML file and asserts that the Creative Commons parse
 * filter extracted the expected license metadata.
 *
 * @param file     local HTML file to feed through the parser
 * @param url      URL to associate with the content
 * @param license  expected "License-Url" metadata value
 * @param location expected "License-Location" metadata value
 * @param type     expected "Work-Type" metadata value
 * @throws Exception on I/O or parse failure
 */
public void pageTest(File file, String url, String license, String location,
    String type) throws Exception {
  String contentType = "text/html";

  // Read the whole sample file into memory.
  // Fix: the input stream was leaked whenever read() threw; close it
  // reliably via try-with-resources.
  ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
  try (InputStream in = new FileInputStream(file)) {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  }
  byte[] bytes = out.toByteArray();

  Configuration conf = NutchConfiguration.create();
  Content content = new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

  // The CC parse filter should surface the license details in parse metadata.
  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}
Example #13
Source File: SuffixURLFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Command-line filter check: reads URLs from stdin and prints whether each
 * is accepted or rejected. An optional first argument names a suffix file;
 * otherwise the filter is configured from the default Nutch configuration.
 */
public static void main(String args[]) throws IOException {
  SuffixURLFilter filter;
  if (args.length >= 1)
    filter = new SuffixURLFilter(new FileReader(args[0]));
  else {
    filter = new SuffixURLFilter();
    filter.setConf(NutchConfiguration.create());
  }

  BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
  String line;
  while ((line = in.readLine()) != null) {
    String out = filter.filter(line);
    if (out != null) {
      System.out.println("ACCEPTED " + out);
    } else {
      // Fix: previously printed "REJECTED " + out, which is always
      // "REJECTED null" on this branch; print the rejected input instead.
      System.out.println("REJECTED " + line);
    }
  }
}
Example #14
Source File: IndexManager.java From spacewalk with GNU General Public License v2.0 | 5 votes |
/**
 * Initializes the per-locale documentation segments used for doc summaries.
 * Failures are logged and tolerated (the map entry is left null) so that a
 * missing Nutch install degrades to empty summaries instead of failing.
 *
 * @return always {@code true}; errors are swallowed by design
 */
private boolean initDocSummary() {
  /**
   * NOTE: NutchConfiguration is expecting "nutch-default.xml" and "nutch-site.xml"
   * to be available in the CLASSPATH
   */
  try {
    nutchConf = NutchConfiguration.create();
    nutchAnalyzerFactory = new AnalyzerFactory(nutchConf);
    FileSystem fs = FileSystem.get(nutchConf);
    docSegments = new TreeMap<String, FetchedSegments>(String.CASE_INSENSITIVE_ORDER);
    for (String key : docLocaleLookUp.keySet()) {
      String segmentsDir = indexWorkDir + File.separator +
          getDocIndexPath(key) + File.separator + "segments";
      FetchedSegments segments = new FetchedSegments(fs, segmentsDir, nutchConf);
      if (segments == null) {
        log.info("Unable to create docSegments for language: " + key);
        docSegments.put(key, null);
        // Fix: without this continue the code fell through and
        // dereferenced the null 'segments' below.
        continue;
      }
      String[] segNames = segments.getSegmentNames();
      if (segNames == null || segNames.length == 0) {
        log.info("Unable to find any segments for language: " + key);
        docSegments.put(key, null);
        // Fix: previously fell through and overwrote the null marker with
        // an empty FetchedSegments instance.
        continue;
      }
      log.info("Adding Documentation segments for language: " + key);
      docSegments.put(key, segments);
    }
  } catch (Exception e) {
    log.error("ignoring exception - most likely Nutch isn't present, so" +
        " doc summaries will be empty");
    e.printStackTrace();
  }
  return true;
}
Example #15
Source File: NodeReader.java From anthelion with Apache License 2.0 | 5 votes |
/** * Runs the NodeReader tool. The command line arguments must contain a * webgraphdb path and a url. The url must match the normalized url that is * contained in the NodeDb of the WebGraph. */ public static void main(String[] args) throws Exception { Options options = new Options(); Option helpOpts = OptionBuilder.withArgName("help").withDescription( "show this help message").create("help"); Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg() .withDescription("the webgraphdb to use").create("webgraphdb"); Option urlOpts = OptionBuilder.withArgName("url").hasOptionalArg() .withDescription("the url to dump").create("url"); options.addOption(helpOpts); options.addOption(webGraphOpts); options.addOption(urlOpts); CommandLineParser parser = new GnuParser(); try { // command line must take a webgraphdb and a url CommandLine line = parser.parse(options, args); if (line.hasOption("help") || !line.hasOption("webgraphdb") || !line.hasOption("url")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("WebGraphReader", options); return; } // dump the values to system out and return String webGraphDb = line.getOptionValue("webgraphdb"); String url = line.getOptionValue("url"); NodeReader reader = new NodeReader(NutchConfiguration.create()); reader.dumpUrl(new Path(webGraphDb), url); return; } catch (Exception e) { e.printStackTrace(); return; } }
Example #16
Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/** Test behaviour when the NutchDocument handed to the filters is null. */
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  IndexingFilters filters = new IndexingFilters(conf);
  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  NutchDocument doc = filters.filter(null,
      new ParseImpl("text", parseData),
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  // A null input document must come back as null, not an empty document.
  assertNull(doc);
}
Example #17
Source File: LoopReader.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Runs the LoopReader tool. For this tool to work the loops job must have
 * already been run on the corresponding WebGraph.
 */
public static void main(String[] args) throws Exception {
  // Build the option table for the command line.
  Options options = new Options();
  Option helpOpts = OptionBuilder.withArgName("help")
      .withDescription("show this help message").create("help");
  Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
      .withDescription("the webgraphdb to use").create("webgraphdb");
  Option urlOpts = OptionBuilder.withArgName("url").hasOptionalArg()
      .withDescription("the url to dump").create("url");
  options.addOption(helpOpts);
  options.addOption(webGraphOpts);
  options.addOption(urlOpts);

  CommandLineParser parser = new GnuParser();
  try {
    CommandLine line = parser.parse(options, args);

    // -webgraphdb and -url are both required; fall back to the help text.
    if (line.hasOption("help")
        || !line.hasOption("webgraphdb")
        || !line.hasOption("url")) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("WebGraphReader", options);
      return;
    }

    // Dump the loop information for the given url.
    String webGraphDb = line.getOptionValue("webgraphdb");
    String url = line.getOptionValue("url");
    LoopReader reader = new LoopReader(NutchConfiguration.create());
    reader.dumpUrl(new Path(webGraphDb), url);
    return;
  } catch (Exception e) {
    e.printStackTrace();
    return;
  }
}
Example #18
Source File: ParseData.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Command-line tool: reads record {@code recno} from a segment's ParseData
 * directory and prints it.
 */
public static void main(String argv[]) throws Exception {
  String usage = "ParseData (-local | -dfs <namenode:port>) recno segment";

  if (argv.length < 3) {
    System.out.println("usage:" + usage);
    return;
  }
  Options opts = new Options();
  Configuration conf = NutchConfiguration.create();

  GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
  String[] remainingArgs = parser.getRemainingArgs();
  FileSystem fs = FileSystem.get(conf);

  try {
    int recno = Integer.parseInt(remainingArgs[0]);
    String segment = remainingArgs[1];

    Path file = new Path(segment, DIR_NAME);
    System.out.println("Reading from file: " + file);

    ArrayFile.Reader parses = new ArrayFile.Reader(fs, file.toString(), conf);
    try {
      ParseData parseDatum = new ParseData();
      parses.get(recno, parseDatum);
      System.out.println("Retrieved " + recno + " from file " + file);
      System.out.println(parseDatum);
    } finally {
      // Fix: the reader was leaked whenever get() threw; close it
      // unconditionally.
      parses.close();
    }
  } finally {
    fs.close();
  }
}
Example #19
Source File: LinkReader.java From nutchpy with Apache License 2.0 | 5 votes |
public static List head(int nrows, String path) throws IOException { // reads the entire contents of the file List<HashMap> rows=new ArrayList<HashMap>(); Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); Path file = new Path(path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf); Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); LinkDatum value = new LinkDatum(); int i = 0; while(reader.next(key, value)) { if (i == nrows) { break; } i += 1; try { HashMap<String, String> t_row = getLinksRow(key,value); rows.add(t_row); } catch (Exception e) { } } return rows; }
Example #20
Source File: TestOOParser.java From anthelion with Apache License 2.0 | 5 votes |
public void testIt() throws ProtocolException, ParseException { String urlString; Content content; Parse parse; Configuration conf = NutchConfiguration.create(); Protocol protocol; ProtocolFactory factory = new ProtocolFactory(conf); System.out.println("Expected : "+expectedText); for (int i=0; i<sampleFiles.length; i++) { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; if (sampleFiles[i].startsWith("ootest")==false) continue; protocol = factory.getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); // simply test for the presence of a text - the ordering of the elements may differ from what was expected // in the previous tests assertTrue(text!=null && text.length() > 0); System.out.println("Found "+sampleFiles[i]+": "+text); } }
Example #21
Source File: TestParserFactory.java From anthelion with Apache License 2.0 | 5 votes |
/** Inits the Test Case with the test parse-plugin file. */
protected void setUp() throws Exception {
  conf = NutchConfiguration.create();
  // Point the factory at the test plugin mapping and enable all plugins.
  conf.set("parse.plugin.file",
      "org/apache/nutch/parse/parse-plugin-test.xml");
  conf.set("plugin.includes", ".*");
  parserFactory = new ParserFactory(conf);
}
Example #22
Source File: LinkDumper.java From anthelion with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception { if (args == null || args.length < 2) { System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>"); return; } // open the readers for the linkdump directory Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); Path webGraphDb = new Path(args[0]); String url = args[1]; MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path( webGraphDb, DUMP_DIR), conf); // get the link nodes for the url Text key = new Text(url); LinkNodes nodes = new LinkNodes(); MapFileOutputFormat.getEntry(readers, new HashPartitioner<Text, LinkNodes>(), key, nodes); // print out the link nodes LinkNode[] linkNodesAr = nodes.getLinks(); System.out.println(url + ":"); for (LinkNode node : linkNodesAr) { System.out.println(" " + node.getUrl() + " - " + node.getNode().toString()); } // close the readers FSUtils.closeReaders(readers); }
Example #23
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public void testContentDispositionTitle() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  // Simulate a response carrying a Content-Disposition filename header.
  Metadata metadata = new Metadata();
  metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  ParseData parseData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
  NutchDocument doc = filter.filter(new NutchDocument(),
      new ParseImpl("text", parseData),
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  // The filename from the header should become the indexed title.
  assertEquals("content-disposition not detected", "filename.ext",
      doc.getFieldValue("title"));
}
Example #24
Source File: IndexManager.java From uyuni with GNU General Public License v2.0 | 5 votes |
/**
 * Initializes the per-locale documentation segments used for doc summaries.
 * Failures are logged and tolerated (the map entry is left null) so that a
 * missing Nutch install degrades to empty summaries instead of failing.
 *
 * @return always {@code true}; errors are swallowed by design
 */
private boolean initDocSummary() {
  /**
   * NOTE: NutchConfiguration is expecting "nutch-default.xml" and "nutch-site.xml"
   * to be available in the CLASSPATH
   */
  try {
    nutchConf = NutchConfiguration.create();
    nutchAnalyzerFactory = new AnalyzerFactory(nutchConf);
    FileSystem fs = FileSystem.get(nutchConf);
    docSegments = new TreeMap<String, FetchedSegments>(String.CASE_INSENSITIVE_ORDER);
    for (String key : docLocaleLookUp.keySet()) {
      String segmentsDir = indexWorkDir + File.separator +
          getDocIndexPath(key) + File.separator + "segments";
      FetchedSegments segments = new FetchedSegments(fs, segmentsDir, nutchConf);
      if (segments == null) {
        log.info("Unable to create docSegments for language: " + key);
        docSegments.put(key, null);
        // Fix: without this continue the code fell through and
        // dereferenced the null 'segments' below.
        continue;
      }
      String[] segNames = segments.getSegmentNames();
      if (segNames == null || segNames.length == 0) {
        log.info("Unable to find any segments for language: " + key);
        docSegments.put(key, null);
        // Fix: previously fell through and overwrote the null marker with
        // an empty FetchedSegments instance.
        continue;
      }
      log.info("Adding Documentation segments for language: " + key);
      docSegments.put(key, segments);
    }
  } catch (Exception e) {
    log.error("ignoring exception - most likely Nutch isn't present, so" +
        " doc summaries will be empty");
    e.printStackTrace();
  }
  return true;
}
Example #25
Source File: URLNormalizerChecker.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Command-line entry point: checks one URL normalizer (via -normalizer) or
 * all of them, within the given -scope (defaulting to SCOPE_DEFAULT).
 */
public static void main(String[] args) throws Exception {
  String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]"
      + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";

  // Parse the optional -normalizer and -scope flags.
  String normalizerName = null;
  String scope = URLNormalizers.SCOPE_DEFAULT;
  int i = 0;
  while (i < args.length) {
    if ("-normalizer".equals(args[i])) {
      normalizerName = args[++i];
    } else if ("-scope".equals(args[i])) {
      scope = args[++i];
    } else {
      // Unknown flag: print usage and bail out with a failure code.
      System.err.println(usage);
      System.exit(-1);
    }
    i++;
  }

  URLNormalizerChecker checker =
      new URLNormalizerChecker(NutchConfiguration.create());
  // Check one named normalizer, or all of them when none was specified.
  if (normalizerName == null) {
    checker.checkAll(scope);
  } else {
    checker.checkOne(normalizerName, scope);
  }
  System.exit(0);
}
Example #26
Source File: LinkReader.java From nutchpy with Apache License 2.0 | 5 votes |
public static List read(String path) throws IOException { // reads the entire contents of the file List<HashMap> rows=new ArrayList<HashMap>(); Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.get(conf); Path file = new Path(path); SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf); Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); LinkDatum value = new LinkDatum(); while(reader.next(key, value)) { try { HashMap<String, String> t_row = getLinksRow(key,value); rows.add(t_row); } catch (Exception e) { } } return rows; }
Example #27
Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
protected void setUp() throws Exception {
  // Build the fixture objects shared by the StaticFieldIndexer tests.
  conf = NutchConfiguration.create();
  filter = new StaticFieldIndexer();
  parse = new ParseImpl();
  crawlDatum = new CrawlDatum();
  inlinks = new Inlinks();
  url = new Text("http://nutch.apache.org/index.html");
}
Example #28
Source File: TestLinkDbMerger.java From anthelion with Apache License 2.0 | 5 votes |
public void setUp() throws Exception {
  // Seed the two input linkdbs and the expected merge result.
  init1.put(url10, urls10);
  init1.put(url11, urls11);
  init2.put(url20, urls20);
  init2.put(url21, urls21);
  expected.put(url10, urls10_expected);
  expected.put(url11, urls11_expected);
  expected.put(url20, urls20_expected);
  expected.put(url21, urls21_expected);

  // Create a randomized scratch directory for this run.
  conf = NutchConfiguration.create();
  fs = FileSystem.get(conf);
  int suffix = new java.util.Random().nextInt();
  testDir = new Path("build/test/test-linkdb-" + suffix);
  fs.mkdirs(testDir);
}
Example #29
Source File: TestPassURLNormalizer.java From anthelion with Apache License 2.0 | 5 votes |
public void testPassURLNormalizer() {
  Configuration conf = NutchConfiguration.create();
  PassURLNormalizer normalizer = new PassURLNormalizer();
  normalizer.setConf(conf);

  // The pass-through normalizer must return the URL untouched, even when
  // it contains dot-segments a real normalizer would rewrite.
  String url = "http://www.example.com/test/..//";
  String result = null;
  try {
    result = normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT);
  } catch (MalformedURLException mue) {
    fail(mue.toString());
  }
  assertEquals(url, result);
}
Example #30
Source File: TestOOParser.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public void testIt() throws ProtocolException, ParseException { String urlString; Content content; Parse parse; Configuration conf = NutchConfiguration.create(); Protocol protocol; ProtocolFactory factory = new ProtocolFactory(conf); System.out.println("Expected : "+expectedText); for (int i=0; i<sampleFiles.length; i++) { urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; if (sampleFiles[i].startsWith("ootest")==false) continue; protocol = factory.getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); // simply test for the presence of a text - the ordering of the elements may differ from what was expected // in the previous tests assertTrue(text!=null && text.length() > 0); System.out.println("Found "+sampleFiles[i]+": "+text); } }