org.apache.hadoop.mapred.lib.HashPartitioner Java Examples
The following examples show how to use org.apache.hadoop.mapred.lib.HashPartitioner. Each example lists the open-source project and source file it was taken from.
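For context, HashPartitioner is the default partitioner of the old org.apache.hadoop.mapred API: it spreads map output keys across reducers purely by hash code and needs no configuration of its own. Its logic is essentially the following (paraphrased from the Hadoop source; the class and method names match the examples below):

public class HashPartitioner<K2, V2> implements Partitioner<K2, V2> {

  public void configure(JobConf job) {}

  /** Use {@link Object#hashCode()} to partition; the mask keeps the result non-negative. */
  public int getPartition(K2 key, V2 value, int numReduceTasks) {
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }
}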
Example #1
Source File: SortValidator.java From hadoop-gpu with Apache License 2.0
public void configure(JobConf job) {
  // 'key' == sortInput for sort-input; key == sortOutput for sort-output
  key = deduceInputFile(job);

  if (key == sortOutput) {
    partitioner = new HashPartitioner<WritableComparable, Writable>();

    // Figure the 'current' partition and no. of reduces of the 'sort'
    try {
      URI inputURI = new URI(job.get("map.input.file"));
      String inputFile = inputURI.getPath();
      partition = Integer.valueOf(
                    inputFile.substring(inputFile.lastIndexOf("part") + 5)
                  ).intValue();
      noSortReducers = job.getInt("sortvalidate.sort.reduce.tasks", -1);
    } catch (Exception e) {
      System.err.println("Caught: " + e);
      System.exit(-1);
    }
  }
}
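The snippet above only records which part file is being read (partition) and how many reducers the sort job used (noSortReducers). The later per-record check that SortValidator performs boils down to something like the following simplified sketch (not the exact project code), using the fields set in configure():

// Simplified sketch: confirm the key really belongs to the partition this
// 'part-*' file was written by, using the same HashPartitioner the sort used.
int expected = partitioner.getPartition(key, value, noSortReducers);
if (expected != partition) {
  throw new IOException("Partitions do not match for record: " + key);
}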
Example #2
Source File: SortValidator.java From big-c with Apache License 2.0
public void configure(JobConf job) {
  // 'key' == sortInput for sort-input; key == sortOutput for sort-output
  key = deduceInputFile(job);

  if (key == sortOutput) {
    partitioner = new HashPartitioner<WritableComparable, Writable>();

    // Figure the 'current' partition and no. of reduces of the 'sort'
    try {
      URI inputURI = new URI(job.get(JobContext.MAP_INPUT_FILE));
      String inputFile = inputURI.getPath();
      // part file is of the form part-r-xxxxx
      partition = Integer.valueOf(inputFile.substring(
                    inputFile.lastIndexOf("part") + 7)).intValue();
      noSortReducers = job.getInt(SORT_REDUCES, -1);
    } catch (Exception e) {
      System.err.println("Caught: " + e);
      System.exit(-1);
    }
  }
}
Example #3
Source File: NodeReader.java From nutch-htmlunit with Apache License 2.0
/**
 * Prints the content of the Node represented by the url to system out.
 *
 * @param webGraphDb The webgraph from which to get the node.
 * @param url The url of the node.
 *
 * @throws IOException If an error occurs while getting the node.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {

  fs = FileSystem.get(getConf());
  nodeReaders = MapFileOutputFormat.getReaders(fs,
      new Path(webGraphDb, WebGraph.NODE_DIR), getConf());

  // open the readers, get the node, print out the info, and close the readers
  Text key = new Text(url);
  Node node = new Node();
  MapFileOutputFormat.getEntry(nodeReaders,
      new HashPartitioner<Text, Node>(), key, node);
  System.out.println(url + ":");
  System.out.println(" inlink score: " + node.getInlinkScore());
  System.out.println(" outlink score: " + node.getOutlinkScore());
  System.out.println(" num inlinks: " + node.getNumInlinks());
  System.out.println(" num outlinks: " + node.getNumOutlinks());
  FSUtils.closeReaders(nodeReaders);
}
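The Nutch readers in this and the following examples rely on MapFileOutputFormat.getEntry to route the lookup through the same HashPartitioner that wrote the data, so only one part file is consulted. A hedged sketch of that lookup, written in the style of the example above and assuming the reader array is ordered by partition number as MapFileOutputFormat.getReaders returns it:

// Sketch of the lookup performed by MapFileOutputFormat.getEntry:
// hash the key to a partition, then query only that reader.
public static Node lookupNode(MapFile.Reader[] readers, String url)
    throws IOException {
  Text key = new Text(url);
  Node value = new Node();
  int part = new HashPartitioner<Text, Node>()
      .getPartition(key, value, readers.length);
  return (Node) readers[part].get(key, value); // null if the url is absent
}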
Example #4
Source File: LoopReader.java From anthelion with Apache License 2.0
/**
 * Prints loopset for a single url. The loopset information will show any
 * outlink url that eventually forms a link cycle.
 *
 * @param webGraphDb The WebGraph to check for loops
 * @param url The url to check.
 *
 * @throws IOException If an error occurs while printing loopset information.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {

  // open the readers
  fs = FileSystem.get(getConf());
  loopReaders = MapFileOutputFormat.getReaders(fs,
      new Path(webGraphDb, Loops.LOOPS_DIR), getConf());

  // get the loopset for a given url, if any
  Text key = new Text(url);
  LoopSet loop = new LoopSet();
  MapFileOutputFormat.getEntry(loopReaders,
      new HashPartitioner<Text, LoopSet>(), key, loop);

  // print out each loop url in the set
  System.out.println(url + ":");
  for (String loopUrl : loop.getLoopSet()) {
    System.out.println("  " + loopUrl);
  }

  // close the readers
  FSUtils.closeReaders(loopReaders);
}
Example #5
Source File: NodeReader.java From anthelion with Apache License 2.0
/**
 * Prints the content of the Node represented by the url to system out.
 *
 * @param webGraphDb The webgraph from which to get the node.
 * @param url The url of the node.
 *
 * @throws IOException If an error occurs while getting the node.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {

  fs = FileSystem.get(getConf());
  nodeReaders = MapFileOutputFormat.getReaders(fs,
      new Path(webGraphDb, WebGraph.NODE_DIR), getConf());

  // open the readers, get the node, print out the info, and close the readers
  Text key = new Text(url);
  Node node = new Node();
  MapFileOutputFormat.getEntry(nodeReaders,
      new HashPartitioner<Text, Node>(), key, node);
  System.out.println(url + ":");
  System.out.println(" inlink score: " + node.getInlinkScore());
  System.out.println(" outlink score: " + node.getOutlinkScore());
  System.out.println(" num inlinks: " + node.getNumInlinks());
  System.out.println(" num outlinks: " + node.getNumOutlinks());
  FSUtils.closeReaders(nodeReaders);
}
Example #6
Source File: SortValidator.java From hadoop with Apache License 2.0
public void configure(JobConf job) {
  // 'key' == sortInput for sort-input; key == sortOutput for sort-output
  key = deduceInputFile(job);

  if (key == sortOutput) {
    partitioner = new HashPartitioner<WritableComparable, Writable>();

    // Figure the 'current' partition and no. of reduces of the 'sort'
    try {
      URI inputURI = new URI(job.get(JobContext.MAP_INPUT_FILE));
      String inputFile = inputURI.getPath();
      // part file is of the form part-r-xxxxx
      partition = Integer.valueOf(inputFile.substring(
                    inputFile.lastIndexOf("part") + 7)).intValue();
      noSortReducers = job.getInt(SORT_REDUCES, -1);
    } catch (Exception e) {
      System.err.println("Caught: " + e);
      System.exit(-1);
    }
  }
}
Example #7
Source File: LoopReader.java From nutch-htmlunit with Apache License 2.0
/**
 * Prints loopset for a single url. The loopset information will show any
 * outlink url that eventually forms a link cycle.
 *
 * @param webGraphDb The WebGraph to check for loops
 * @param url The url to check.
 *
 * @throws IOException If an error occurs while printing loopset information.
 */
public void dumpUrl(Path webGraphDb, String url) throws IOException {

  // open the readers
  fs = FileSystem.get(getConf());
  loopReaders = MapFileOutputFormat.getReaders(fs,
      new Path(webGraphDb, Loops.LOOPS_DIR), getConf());

  // get the loopset for a given url, if any
  Text key = new Text(url);
  LoopSet loop = new LoopSet();
  MapFileOutputFormat.getEntry(loopReaders,
      new HashPartitioner<Text, LoopSet>(), key, loop);

  // print out each loop url in the set
  System.out.println(url + ":");
  for (String loopUrl : loop.getLoopSet()) {
    System.out.println("  " + loopUrl);
  }

  // close the readers
  FSUtils.closeReaders(loopReaders);
}
Example #8
Source File: SortValidator.java From RDFS with Apache License 2.0
public void configure(JobConf job) {
  // 'key' == sortInput for sort-input; key == sortOutput for sort-output
  key = deduceInputFile(job);

  if (key == sortOutput) {
    partitioner = new HashPartitioner<WritableComparable, Writable>();

    // Figure the 'current' partition and no. of reduces of the 'sort'
    try {
      URI inputURI = new URI(job.get("map.input.file"));
      String inputFile = inputURI.getPath();
      partition = Integer.valueOf(
                    inputFile.substring(inputFile.lastIndexOf("part") + 5)
                  ).intValue();
      noSortReducers = job.getInt("sortvalidate.sort.reduce.tasks", -1);
    } catch (Exception e) {
      System.err.println("Caught: " + e);
      System.exit(-1);
    }
  }
}
Example #9
Source File: LinkDumper.java From nutch-htmlunit with Apache License 2.0
public static void main(String[] args) throws Exception {

  if (args == null || args.length < 2) {
    System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
    return;
  }

  // open the readers for the linkdump directory
  Configuration conf = NutchConfiguration.create();
  FileSystem fs = FileSystem.get(conf);
  Path webGraphDb = new Path(args[0]);
  String url = args[1];
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs,
      new Path(webGraphDb, DUMP_DIR), conf);

  // get the link nodes for the url
  Text key = new Text(url);
  LinkNodes nodes = new LinkNodes();
  MapFileOutputFormat.getEntry(readers,
      new HashPartitioner<Text, LinkNodes>(), key, nodes);

  // print out the link nodes
  LinkNode[] linkNodesAr = nodes.getLinks();
  System.out.println(url + ":");
  for (LinkNode node : linkNodesAr) {
    System.out.println("  " + node.getUrl() + " - "
        + node.getNode().toString());
  }

  // close the readers
  FSUtils.closeReaders(readers);
}
Example #10
Source File: CrawlDbReader.java From anthelion with Apache License 2.0
public CrawlDatum get(String crawlDb, String url, Configuration config)
    throws IOException {
  Text key = new Text(url);
  CrawlDatum val = new CrawlDatum();
  openReaders(crawlDb, config);
  CrawlDatum res = (CrawlDatum) MapFileOutputFormat.getEntry(readers,
      new HashPartitioner<Text, CrawlDatum>(), key, val);
  return res;
}
Example #11
Source File: LinkDumper.java From anthelion with Apache License 2.0
public static void main(String[] args) throws Exception {

  if (args == null || args.length < 2) {
    System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
    return;
  }

  // open the readers for the linkdump directory
  Configuration conf = NutchConfiguration.create();
  FileSystem fs = FileSystem.get(conf);
  Path webGraphDb = new Path(args[0]);
  String url = args[1];
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs,
      new Path(webGraphDb, DUMP_DIR), conf);

  // get the link nodes for the url
  Text key = new Text(url);
  LinkNodes nodes = new LinkNodes();
  MapFileOutputFormat.getEntry(readers,
      new HashPartitioner<Text, LinkNodes>(), key, nodes);

  // print out the link nodes
  LinkNode[] linkNodesAr = nodes.getLinks();
  System.out.println(url + ":");
  for (LinkNode node : linkNodesAr) {
    System.out.println("  " + node.getUrl() + " - "
        + node.getNode().toString());
  }

  // close the readers
  FSUtils.closeReaders(readers);
}
Example #12
Source File: CrawlDbReader.java From nutch-htmlunit with Apache License 2.0
public CrawlDatum get(String crawlDb, String url, Configuration config)
    throws IOException {
  Text key = new Text(url);
  CrawlDatum val = new CrawlDatum();
  openReaders(crawlDb, config);
  CrawlDatum res = (CrawlDatum) MapFileOutputFormat.getEntry(readers,
      new HashPartitioner<Text, CrawlDatum>(), key, val);
  return res;
}
Example #13
Source File: Submitter.java From RDFS with Apache License 2.0
/**
 * Get the user's original partitioner.
 * @param conf the configuration to look in
 * @return the class that the user submitted
 */
static Class<? extends Partitioner> getJavaPartitioner(JobConf conf) {
  return conf.getClass("hadoop.pipes.partitioner",
                       HashPartitioner.class,
                       Partitioner.class);
}
Example #14
Source File: Submitter.java From hadoop-gpu with Apache License 2.0
/**
 * Get the user's original partitioner.
 * @param conf the configuration to look in
 * @return the class that the user submitted
 */
static Class<? extends Partitioner> getJavaPartitioner(JobConf conf) {
  return conf.getClass("hadoop.pipes.partitioner",
                       HashPartitioner.class,
                       Partitioner.class);
}
Example #15
Source File: Submitter.java From big-c with Apache License 2.0
/**
 * Get the user's original partitioner.
 * @param conf the configuration to look in
 * @return the class that the user submitted
 */
static Class<? extends Partitioner> getJavaPartitioner(JobConf conf) {
  return conf.getClass(Submitter.PARTITIONER,
                       HashPartitioner.class,
                       Partitioner.class);
}
Example #16
Source File: Submitter.java From hadoop with Apache License 2.0
/**
 * Get the user's original partitioner.
 * @param conf the configuration to look in
 * @return the class that the user submitted
 */
static Class<? extends Partitioner> getJavaPartitioner(JobConf conf) {
  return conf.getClass(Submitter.PARTITIONER,
                       HashPartitioner.class,
                       Partitioner.class);
}
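getJavaPartitioner only reads the configuration; a pipes job that wants a custom Java-side partitioner records it under the same key. A minimal sketch, where MyPartitioner is a hypothetical stand-in for your own Partitioner implementation:

JobConf conf = new JobConf();
// Older Submitter versions read the literal key "hadoop.pipes.partitioner";
// newer ones expose the same key as the constant Submitter.PARTITIONER.
conf.setClass("hadoop.pipes.partitioner", MyPartitioner.class, Partitioner.class);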
Example #17
Source File: JobConf.java From RDFS with Apache License 2.0
/**
 * Get the {@link Partitioner} used to partition {@link Mapper}-outputs
 * to be sent to the {@link Reducer}s.
 *
 * @return the {@link Partitioner} used to partition map-outputs.
 */
public Class<? extends Partitioner> getPartitionerClass() {
  return getClass("mapred.partitioner.class",
                  HashPartitioner.class,
                  Partitioner.class);
}
Example #18
Source File: JobConf.java From big-c with Apache License 2.0
/**
 * Get the {@link Partitioner} used to partition {@link Mapper}-outputs
 * to be sent to the {@link Reducer}s.
 *
 * @return the {@link Partitioner} used to partition map-outputs.
 */
public Class<? extends Partitioner> getPartitionerClass() {
  return getClass("mapred.partitioner.class",
                  HashPartitioner.class,
                  Partitioner.class);
}
Example #19
Source File: JobConf.java From hadoop with Apache License 2.0
/**
 * Get the {@link Partitioner} used to partition {@link Mapper}-outputs
 * to be sent to the {@link Reducer}s.
 *
 * @return the {@link Partitioner} used to partition map-outputs.
 */
public Class<? extends Partitioner> getPartitionerClass() {
  return getClass("mapred.partitioner.class",
                  HashPartitioner.class,
                  Partitioner.class);
}
Example #20
Source File: JobConf.java From hadoop-gpu with Apache License 2.0
/**
 * Get the {@link Partitioner} used to partition {@link Mapper}-outputs
 * to be sent to the {@link Reducer}s.
 *
 * @return the {@link Partitioner} used to partition map-outputs.
 */
public Class<? extends Partitioner> getPartitionerClass() {
  return getClass("mapred.partitioner.class",
                  HashPartitioner.class,
                  Partitioner.class);
}
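getPartitionerClass falls back to HashPartitioner whenever the job did not set its own class. The usual way to override the default on the old API is the matching setter, sketched below with a hypothetical MyPartitioner:

JobConf conf = new JobConf();
conf.setPartitionerClass(MyPartitioner.class); // stored under mapred.partitioner.class
conf.setNumReduceTasks(4);                     // the count passed to getPartition()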