Java Code Examples for org.apache.hadoop.mapred.JobConf#setOutputFormat()
The following examples show how to use org.apache.hadoop.mapred.JobConf#setOutputFormat().
Each example comes from an open-source project; the source file, originating project, and license are noted above each listing.
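As a quick orientation before the project examples, here is a minimal, self-contained sketch of where the call fits in a classic mapred job driver. It is illustrative only: the class name and the argument-supplied paths are placeholders, and the job relies on the default identity mapper and reducer.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetOutputFormatDemo {

  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SetOutputFormatDemo.class);
    job.setJobName("setOutputFormat-demo");

    // Placeholder input and output locations.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormat(TextInputFormat.class);
    // The method this page documents: selects the OutputFormat
    // implementation that writes the job's final key/value pairs.
    job.setOutputFormat(TextOutputFormat.class);

    // The default identity mapper/reducer pass through the
    // LongWritable offsets and Text lines from TextInputFormat.
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    JobClient.runJob(job);
  }
}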
Example 1
Source File: SolrClean.java From anthelion with Apache License 2.0
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrClean: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(SolrDeleter.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
Example 2
Source File: TeraSort.java From hadoop-book with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString()
      + "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
Example 3
Source File: ReadDataJob.java From tracing-framework with BSD 3-Clause "New" or "Revised" License
public void configure(JobConf job) {
  // Set the mapper and reducers
  job.setMapperClass(TestMapper.class);
  // job.setReducerClass(TestReducer.class);

  // Set the output types of the mapper and reducer
  // job.setMapOutputKeyClass(IntWritable.class);
  // job.setMapOutputValueClass(NullWritable.class);
  // job.setOutputKeyClass(NullWritable.class);
  // job.setOutputValueClass(NullWritable.class);

  // Make sure this jar is included
  job.setJarByClass(TestMapper.class);

  // Specify the input and output data formats
  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);

  // Turn off speculative execution
  job.setMapSpeculativeExecution(false);
  job.setReduceSpeculativeExecution(false);

  // Add the job input path
  FileInputFormat.addInputPath(job, new Path(this.input_filename));
}
Example 4
Source File: DataFsck.java From RDFS with Apache License 2.0
private JobConf createJobConf() {
  JobConf jobConf = new JobConf(getConf());
  String jobName = NAME + " " + dateForm.format(new Date(System.currentTimeMillis()));
  jobConf.setJobName(jobName);

  jobConf.setMapSpeculativeExecution(false);
  jobConf.setJarByClass(DataFsck.class);
  jobConf.setInputFormat(DataFsckInputFormat.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  jobConf.setOutputKeyClass(Text.class);
  jobConf.setOutputValueClass(Text.class);
  jobConf.setMapperClass(DataFsckMapper.class);
  jobConf.setNumReduceTasks(0);
  return jobConf;
}
Example 5
Source File: TeraGen.java From hadoop-gpu with Apache License 2.0
/**
 * @param args the cli arguments
 */
public int run(String[] args) throws IOException {
  JobConf job = (JobConf) getConf();
  setNumberOfRows(job, Long.parseLong(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraGen");
  job.setJarByClass(TeraGen.class);
  job.setMapperClass(SortGenMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(RangeInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  JobClient.runJob(job);
  return 0;
}
Example 6
Source File: SleepJob.java From RDFS with Apache License 2.0
public JobConf setupJobConf(int numMapper, int numReducer,
    long mapSleepTime, int mapSleepCount,
    long reduceSleepTime, int reduceSleepCount,
    boolean doSpeculation, List<String> slowMaps,
    List<String> slowReduces, int slowRatio,
    int countersPerTask, List<String> hosts,
    int hostsPerSplit, boolean setup) {
  JobConf job = new JobConf(getConf(), SleepJob.class);
  job.setNumMapTasks(numMapper);
  job.setNumReduceTasks(numReducer);
  job.setMapperClass(SleepJob.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(SleepJob.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setJobSetupCleanupNeeded(setup);
  job.setInputFormat(SleepInputFormat.class);
  job.setPartitionerClass(SleepJob.class);
  job.setJobName("Sleep job");
  FileInputFormat.addInputPath(job, new Path("ignored"));
  job.setLong("sleep.job.map.sleep.time", mapSleepTime);
  job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
  job.setInt("sleep.job.map.sleep.count", mapSleepCount);
  job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
  job.setSpeculativeExecution(doSpeculation);
  job.setInt(SLOW_RATIO, slowRatio);
  job.setStrings(SLOW_MAPS, slowMaps.toArray(new String[slowMaps.size()]));
  job.setStrings(SLOW_REDUCES, slowReduces.toArray(new String[slowReduces.size()]));
  job.setInt("sleep.job.counters.per.task", countersPerTask);
  job.setStrings(HOSTS_FOR_LOCALITY, hosts.toArray(new String[hosts.size()]));
  job.setInt(HOSTS_PER_SPLIT, hostsPerSplit);
  return job;
}
Example 7
Source File: LinkRank.java From anthelion with Apache License 2.0
/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 *
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {
  // configure the initializer
  JobConf initializer = new NutchJob(getConf());
  initializer.setJobName("LinkAnalysis Initializer");
  FileInputFormat.addInputPath(initializer, nodeDb);
  FileOutputFormat.setOutputPath(initializer, output);
  initializer.setInputFormat(SequenceFileInputFormat.class);
  initializer.setMapperClass(Initializer.class);
  initializer.setMapOutputKeyClass(Text.class);
  initializer.setMapOutputValueClass(Node.class);
  initializer.setOutputKeyClass(Text.class);
  initializer.setOutputValueClass(Node.class);
  initializer.setOutputFormat(MapFileOutputFormat.class);
  initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the initializer
  LOG.info("Starting initialization job");
  try {
    JobClient.runJob(initializer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished initialization job.");
}
Example 8
Source File: TeraSort.java From RDFS with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString()
      + "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  long endTime = System.currentTimeMillis();
  System.out.println((float) (endTime - startTime) / 1000);
  LOG.info("done");
  return 0;
}
Example 9
Source File: IndexerMapReduce.java From anthelion with Apache License 2.0
public static void initMRJob(Path crawlDb, Path linkDb,
    Collection<Path> segments, JobConf job) {

  LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
  if (linkDb != null)
    LOG.info("IndexerMapReduce: linkdb: " + linkDb);

  for (final Path segment : segments) {
    LOG.info("IndexerMapReduces: adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
  }

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));

  if (linkDb != null)
    FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(IndexerMapReduce.class);
  job.setReducerClass(IndexerMapReduce.class);

  job.setOutputFormat(IndexerOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NutchWritable.class);
  job.setOutputValueClass(NutchWritable.class);
}
Example 10
Source File: LazyOutputFormat.java From big-c with Apache License 2.0
/**
 * Set the underlying output format for LazyOutputFormat.
 * @param job the {@link JobConf} to modify
 * @param theClass the underlying class
 */
@SuppressWarnings("unchecked")
public static void setOutputFormatClass(JobConf job,
    Class<? extends OutputFormat> theClass) {
  job.setOutputFormat(LazyOutputFormat.class);
  job.setClass("mapreduce.output.lazyoutputformat.outputformat",
      theClass, OutputFormat.class);
}
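A hedged usage sketch for the helper above (the wrapped format and the driver method are illustrative choices, not taken from the big-c source):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.LazyOutputFormat;

public class LazyOutputDemo {
  // Illustrative wiring: wrap TextOutputFormat in LazyOutputFormat so
  // output files are created only when a task actually emits a record.
  public static void useLazyOutput(JobConf job) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  }
}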
Example 11
Source File: Submitter.java From big-c with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass(Submitter.INPUT_FORMAT,
        conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }

  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    // set default gdb commands for map and reduce task
    String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
    setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length + 1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing executable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
Example 12
Source File: TradeNetworthHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeNetworthHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeNetworthHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeNetworthRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeNetworthOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 13
Source File: TradeSecurityHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeSecurityHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeSecurityHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeSecurityRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeSecurityOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 14
Source File: TradeBuyOrdersHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeBuyOrdersHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeBuyOrdersHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeBuyOrdersRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeBuyOrdersOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 15
Source File: TradeTxHistoryHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeTxHistoryHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeTxHistoryHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeTxHistoryRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeTxHistoryOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 16
Source File: TopBusyAirportGemfirexd.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();

  JobConf conf = new JobConf(getConf());
  conf.setJobName("Busy Airport Count");

  Path outputPath = new Path(args[0]);
  Path intermediateOutputPath = new Path(args[0] + "_int");
  String hdfsHomeDir = args[1];
  String tableName = args[2];

  outputPath.getFileSystem(conf).delete(outputPath, true);
  intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true);

  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(SampleMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(IntWritable.class);

  conf.setReducerClass(SampleReducer.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  FileOutputFormat.setOutputPath(conf, intermediateOutputPath);

  int rc = JobClient.runJob(conf).isSuccessful() ? 0 : 1;
  if (rc == 0) {
    JobConf topConf = new JobConf(getConf());
    topConf.setJobName("Top Busy Airport");

    String hdfsFS = topConf.get("fs.defaultFS");
    URI hdfsUri = URI.create(hdfsFS);

    // Assume that the SqlFire locator is running alongside the namenode
    topConf.set(RowOutputFormat.OUTPUT_URL,
        "jdbc:gemfirexd://" + hdfsUri.getHost() + ":1527");
    //topConf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
    //topConf.set(GfxdOutputFormat.OUTPUT_TABLE, "BUSY_AIRPORT");
    topConf.set(RowOutputFormat.OUTPUT_TABLE, "APP.BUSY_AIRPORT");

    // Only run a single reducer
    topConf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(topConf, intermediateOutputPath);

    topConf.setInputFormat(TextInputFormat.class);
    topConf.setMapperClass(TopBusyAirportMapper.class);
    topConf.setMapOutputKeyClass(Text.class);
    topConf.setMapOutputValueClass(StringIntPair.class);

    topConf.setReducerClass(TopBusyAirportReducer.class);
    topConf.setOutputKeyClass(Key.class);
    topConf.setOutputValueClass(BusyAirportModel.class);
    topConf.setOutputFormat(RowOutputFormat.class);

    rc = JobClient.runJob(topConf).isSuccessful() ? 0 : 1;
  }
  return rc;
}
Example 17
Source File: TradeSellOrdersHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeSellOrdersHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeSellOrdersHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeSellOrdersRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeSellOrdersOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 18
Source File: NodeDumper.java From nutch-htmlunit with Apache License 2.0
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
    boolean asEff, NameType nameType, AggrType aggrType,
    boolean asSequenceFile) throws Exception {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));

  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);

  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);

  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
Example 19
Source File: FreeGenerator.java From anthelion with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
    System.err.println("\tinputDir\tinput directory containing one or more input files.");
    System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
    System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter\trun current URLFilters on input URLs");
    System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    return -1;
  }
  boolean filter = false;
  boolean normalize = false;
  if (args.length > 2) {
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FreeGenerator: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setBoolean(FILTER_KEY, filter);
  job.setBoolean(NORMALIZE_KEY, normalize);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormat(TextInputFormat.class);
  job.setMapperClass(FG.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.class);
  String segName = Generator.generateSegmentName();
  job.setNumReduceTasks(job.getNumMapTasks());
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1],
      new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
  try {
    JobClient.runJob(job);
  } catch (Exception e) {
    LOG.error("FAILED: " + StringUtils.stringifyException(e));
    return -1;
  }
  long end = System.currentTimeMillis();
  LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
  return 0;
}
Example 20
Source File: DBOutputFormat.java From RDFS with Apache License 2.0
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job The job
 * @param tableName The table to insert data into
 * @param fieldNames The field names in the table. If unknown, supply the
 *                   appropriate number of nulls.
 */
public static void setOutput(JobConf job, String tableName,
    String... fieldNames) {
  job.setOutputFormat(DBOutputFormat.class);
  job.setReduceSpeculativeExecution(false);

  DBConfiguration dbConf = new DBConfiguration(job);

  dbConf.setOutputTableName(tableName);
  dbConf.setOutputFieldNames(fieldNames);
}
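To show how this helper is typically wired, here is a small hypothetical sketch. It assumes the stock org.apache.hadoop.mapred.lib.db package (the example above is from the RDFS fork), and the JDBC driver, URL, credentials, table, and column names are invented for illustration:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.db.DBConfiguration;
import org.apache.hadoop.mapred.lib.db.DBOutputFormat;

public class DbOutputDemo {
  public static void configureOutput(JobConf job) {
    // Hypothetical connection settings.
    DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost/exampledb", "user", "password");
    // setOutput() calls job.setOutputFormat(DBOutputFormat.class) and
    // records the target table and columns; reduce output is then written
    // as INSERTs into exampledb.employees(id, name).
    DBOutputFormat.setOutput(job, "employees", "id", "name");
  }
}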