Java Code Examples for org.apache.hadoop.mapred.JobConf#setMapOutputKeyClass()
The following examples show how to use
org.apache.hadoop.mapred.JobConf#setMapOutputKeyClass() .
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SolrClean.java From anthelion with Apache License 2.0 | 7 votes |
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("SolrClean: starting at " + sdf.format(start)); JobConf job = new NutchJob(getConf()); FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME)); job.setBoolean("noCommit", noCommit); job.set(SolrConstants.SERVER_URL, solrUrl); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(NullOutputFormat.class); job.setMapOutputKeyClass(ByteWritable.class); job.setMapOutputValueClass(Text.class); job.setMapperClass(DBFilter.class); job.setReducerClass(SolrDeleter.class); JobClient.runJob(job); long end = System.currentTimeMillis(); LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
Example 2
Source File: DistCp.java From RDFS with Apache License 2.0 | 6 votes |
private static JobConf createJobConf(Configuration conf, boolean useFastCopy) { Class<? extends InputFormat> inputFormat = (useFastCopy) ? FastCopyInputFormat.class : CopyInputFormat.class; JobConf jobconf = new JobConf(conf, DistCp.class); jobconf.setJobName(NAME); // turn off speculative execution, because DFS doesn't handle // multiple writers to the same file. jobconf.setReduceSpeculativeExecution(false); jobconf.setMapOutputKeyClass(FilePairComparable.class); jobconf.setMapOutputValueClass(Text.class); jobconf.setOutputKeyClass(FilePairComparable.class); jobconf.setOutputValueClass(Text.class); jobconf.setInputFormat(inputFormat); jobconf.setMapperClass(CopyFilesTask.class); jobconf.setReducerClass(CopyFilesTask.class); jobconf.setNumReduceTasks(conf.getInt(MAX_REDUCE_LABEL, 1)); // Prevent the reducer from starting until all maps are done. jobconf.setInt("mapred.job.rushreduce.reduce.threshold", 0); jobconf.setFloat("mapred.reduce.slowstart.completed.maps", 1.0f); return jobconf; }
Example 3
Source File: ExternalJoin.java From aerospike-hadoop with Apache License 2.0 | 5 votes |
public int run(final String[] args) throws Exception { log.info("run starting"); final Configuration conf = getConf(); JobConf job = new JobConf(conf, ExternalJoin.class); job.setJobName("AerospikeExternalJoin"); job.setMapperClass(Map.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(LongWritable.class); // job.setCombinerClass(Reduce.class); // Reduce changes format. job.setReducerClass(Reduce.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Session.class); job.setOutputFormat(SessionOutputFormat.class); for (int ii = 0; ii < args.length; ++ii) FileInputFormat.addInputPath(job, new Path(args[ii])); JobClient.runJob(job); log.info("finished"); return 0; }
Example 4
Source File: GenerateProfiles.java From aerospike-hadoop with Apache License 2.0 | 5 votes |
public int run(final String[] args) throws Exception { log.info("run starting"); final Configuration conf = getConf(); JobConf job = new JobConf(conf, GenerateProfiles.class); job.setJobName("AerospikeGenerateProfiles"); job.setMapperClass(Map.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(IntWritable.class); // job.setCombinerClass(Reduce.class); // Reduce changes format. job.setReducerClass(Reduce.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Profile.class); job.setOutputFormat(ProfileOutputFormat.class); for (int ii = 0; ii < args.length; ++ii) FileInputFormat.addInputPath(job, new Path(args[ii])); JobClient.runJob(job); log.info("finished"); return 0; }
Example 5
Source File: BusyAirports.java From gemfirexd-oss with Apache License 2.0 | 5 votes |
public int run(String[] args) throws Exception { GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("Busy Airport Count"); Path outputPath = new Path(args[0]); String hdfsHomeDir = args[1]; String tableName = args[2]; outputPath.getFileSystem(conf).delete(outputPath, true); conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(SampleMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(IntWritable.class); conf.setReducerClass(SampleReducer.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(conf, outputPath); JobClient.runJob(conf); return 0; }
Example 6
Source File: SleepJob.java From RDFS with Apache License 2.0 | 5 votes |
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount, long reduceSleepTime, int reduceSleepCount, boolean doSpeculation, List<String> slowMaps, List<String> slowReduces, int slowRatio, int countersPerTask, List<String> hosts, int hostsPerSplit, boolean setup) { JobConf job = new JobConf(getConf(), SleepJob.class); job.setNumMapTasks(numMapper); job.setNumReduceTasks(numReducer); job.setMapperClass(SleepJob.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(NullWritable.class); job.setReducerClass(SleepJob.class); job.setOutputFormat(NullOutputFormat.class); job.setJobSetupCleanupNeeded(setup); job.setInputFormat(SleepInputFormat.class); job.setPartitionerClass(SleepJob.class); job.setJobName("Sleep job"); FileInputFormat.addInputPath(job, new Path("ignored")); job.setLong("sleep.job.map.sleep.time", mapSleepTime); job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime); job.setInt("sleep.job.map.sleep.count", mapSleepCount); job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount); job.setSpeculativeExecution(doSpeculation); job.setInt(SLOW_RATIO, slowRatio); job.setStrings(SLOW_MAPS, slowMaps.toArray(new String[slowMaps.size()])); job.setStrings(SLOW_REDUCES, slowMaps.toArray(new String[slowReduces.size()])); job.setInt("sleep.job.counters.per.task", countersPerTask); job.setStrings(HOSTS_FOR_LOCALITY, hosts.toArray(new String[hosts.size()])); job.setInt(HOSTS_PER_SPLIT, hostsPerSplit); return job; }
Example 7
Source File: TopBusyAirport.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("Busy Airport Count"); Path outputPath = new Path(args[0]); Path intermediateOutputPath = new Path(args[0] + "_int"); String hdfsHomeDir = args[1]; String tableName = args[2]; outputPath.getFileSystem(conf).delete(outputPath, true); intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true); conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(SampleMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(IntWritable.class); conf.setReducerClass(SampleReducer.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(conf, intermediateOutputPath); int rc = JobClient.runJob(conf).isSuccessful() ? 0 : 1; if (rc == 0) { JobConf topConf = new JobConf(getConf()); topConf.setJobName("Top Busy Airport"); // Only run a single reducer topConf.setNumReduceTasks(1); FileInputFormat.setInputPaths(topConf, intermediateOutputPath); topConf.setInputFormat(TextInputFormat.class); topConf.setMapperClass(TopBusyAirportMapper.class); topConf.setMapOutputKeyClass(Text.class); topConf.setMapOutputValueClass(StringIntPair.class); topConf.setReducerClass(TopBusyAirportReducer.class); topConf.setOutputKeyClass(Text.class); topConf.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(topConf, outputPath); rc = JobClient.runJob(topConf).isSuccessful() ? 0 : 1; } return rc; }
Example 8
Source File: TradesHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("TradesHdfsDataVerifier"); String hdfsHomeDir = args[0]; String url = args[1]; String tableName = args[2]; System.out.println("TradesHdfsDataVerifier.run() invoked with " + " hdfsHomeDir = " + hdfsHomeDir + " url = " + url + " tableName = " + tableName); // Job-specific params conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(HdfsDataMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(TradesRow.class); conf.setReducerClass(HdfsDataReducer.class); conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS"); //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP"); conf.set(RowOutputFormat.OUTPUT_URL, url); conf.setOutputFormat(RowOutputFormat.class); conf.setOutputKeyClass(Key.class); conf.setOutputValueClass(TradeOutputObject.class); StringBuffer aStr = new StringBuffer(); aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " "); aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " "); aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " "); aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " "); System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString()); FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis())); JobClient.runJob(conf); return 0; }
Example 9
Source File: DataJoinJob.java From big-c with Apache License 2.0 | 4 votes |
public static JobConf createDataJoinJob(String args[]) throws IOException { String inputDir = args[0]; String outputDir = args[1]; Class inputFormat = SequenceFileInputFormat.class; if (args[2].compareToIgnoreCase("text") != 0) { System.out.println("Using SequenceFileInputFormat: " + args[2]); } else { System.out.println("Using TextInputFormat: " + args[2]); inputFormat = TextInputFormat.class; } int numOfReducers = Integer.parseInt(args[3]); Class mapper = getClassByName(args[4]); Class reducer = getClassByName(args[5]); Class mapoutputValueClass = getClassByName(args[6]); Class outputFormat = TextOutputFormat.class; Class outputValueClass = Text.class; if (args[7].compareToIgnoreCase("text") != 0) { System.out.println("Using SequenceFileOutputFormat: " + args[7]); outputFormat = SequenceFileOutputFormat.class; outputValueClass = getClassByName(args[7]); } else { System.out.println("Using TextOutputFormat: " + args[7]); } long maxNumOfValuesPerGroup = 100; String jobName = ""; if (args.length > 8) { maxNumOfValuesPerGroup = Long.parseLong(args[8]); } if (args.length > 9) { jobName = args[9]; } Configuration defaults = new Configuration(); JobConf job = new JobConf(defaults, DataJoinJob.class); job.setJobName("DataJoinJob: " + jobName); FileSystem fs = FileSystem.get(defaults); fs.delete(new Path(outputDir), true); FileInputFormat.setInputPaths(job, inputDir); job.setInputFormat(inputFormat); job.setMapperClass(mapper); FileOutputFormat.setOutputPath(job, new Path(outputDir)); job.setOutputFormat(outputFormat); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(mapoutputValueClass); job.setOutputKeyClass(Text.class); job.setOutputValueClass(outputValueClass); job.setReducerClass(reducer); job.setNumMapTasks(1); job.setNumReduceTasks(numOfReducers); job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup); return job; }
Example 10
Source File: TradeCustomersHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("TradeCustomersHdfsDataVerifier"); String hdfsHomeDir = args[0]; String url = args[1]; String tableName = args[2]; System.out.println("TradeCustomersHdfsDataVerifier.run() invoked with " + " hdfsHomeDir = " + hdfsHomeDir + " url = " + url + " tableName = " + tableName); // Job-specific params conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(HdfsDataMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(TradeCustomersRow.class); conf.setReducerClass(HdfsDataReducer.class); conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS"); //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP"); conf.set(RowOutputFormat.OUTPUT_URL, url); conf.setOutputFormat(RowOutputFormat.class); conf.setOutputKeyClass(Key.class); conf.setOutputValueClass(TradeCustomerOutputObject.class); StringBuffer aStr = new StringBuffer(); aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " "); aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " "); aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " "); aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " "); System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString()); FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis())); JobClient.runJob(conf); return 0; }
Example 11
Source File: TradeNetworthHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("TradeSecurityHdfsDataVerifier"); String hdfsHomeDir = args[0]; String url = args[1]; String tableName = args[2]; System.out.println("TradeSecurityHdfsDataVerifier.run() invoked with " + " hdfsHomeDir = " + hdfsHomeDir + " url = " + url + " tableName = " + tableName); // Job-specific params conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(HdfsDataMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(TradeNetworthRow.class); conf.setReducerClass(HdfsDataReducer.class); conf.set(RowOutputFormat.OUTPUT_TABLE, tableName +"_HDFS"); //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP"); conf.set(RowOutputFormat.OUTPUT_URL, url); conf.setOutputFormat(RowOutputFormat.class); conf.setOutputKeyClass(Key.class); conf.setOutputValueClass(TradeNetworthOutputObject.class); StringBuffer aStr = new StringBuffer(); aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " "); aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " "); aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " "); aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " "); System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString()); FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis())); JobClient.runJob(conf); return 0; }
Example 12
Source File: TradeSecurityHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("TradeSecurityHdfsDataVerifier"); String hdfsHomeDir = args[0]; String url = args[1]; String tableName = args[2]; System.out.println("TradeSecurityHdfsDataVerifier.run() invoked with " + " hdfsHomeDir = " + hdfsHomeDir + " url = " + url + " tableName = " + tableName); // Job-specific params conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(HdfsDataMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(TradeSecurityRow.class); conf.setReducerClass(HdfsDataReducer.class); conf.set(RowOutputFormat.OUTPUT_TABLE,tableName + "_HDFS"); conf.set(RowOutputFormat.OUTPUT_URL, url); conf.setOutputFormat(RowOutputFormat.class); conf.setOutputKeyClass(Key.class); conf.setOutputValueClass(TradeSecurityOutputObject.class); StringBuffer aStr = new StringBuffer(); aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " "); aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " "); aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " "); aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " "); System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString()); FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis())); JobClient.runJob(conf); return 0; }
Example 13
Source File: IndexUpdater.java From hadoop-gpu with Apache License 2.0 | 4 votes |
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath, int numMapTasks, Shard[] shards) throws IOException { // set the starting generation for each shard // when a reduce task fails, a new reduce task // has to know where to re-start setShardGeneration(conf, shards); // iconf.set sets properties in conf IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf); Shard.setIndexShards(iconf, shards); // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size // (max buffer size = 1/2 * "io.sort.mb"). // Here we half-en "io.sort.mb" because we use the other half memory to // build an intermediate form/index in Combiner. iconf.setIOSortMB(iconf.getIOSortMB() / 2); // create the job configuration JobConf jobConf = new JobConf(conf, IndexUpdater.class); jobConf.setJobName(this.getClass().getName() + "_" + System.currentTimeMillis()); // provided by application FileInputFormat.setInputPaths(jobConf, inputPaths); FileOutputFormat.setOutputPath(jobConf, outputPath); jobConf.setNumMapTasks(numMapTasks); // already set shards jobConf.setNumReduceTasks(shards.length); jobConf.setInputFormat(iconf.getIndexInputFormatClass()); Path[] inputs = FileInputFormat.getInputPaths(jobConf); StringBuilder buffer = new StringBuilder(inputs[0].toString()); for (int i = 1; i < inputs.length; i++) { buffer.append(","); buffer.append(inputs[i].toString()); } LOG.info("mapred.input.dir = " + buffer.toString()); LOG.info("mapred.output.dir = " + FileOutputFormat.getOutputPath(jobConf).toString()); LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks()); LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks()); LOG.info(shards.length + " shards = " + iconf.getIndexShards()); // better if we don't create the input format instance LOG.info("mapred.input.format.class = " + jobConf.getInputFormat().getClass().getName()); // set by the system jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass()); jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass()); jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass()); jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass()); jobConf.setMapperClass(IndexUpdateMapper.class); jobConf.setPartitionerClass(IndexUpdatePartitioner.class); jobConf.setCombinerClass(IndexUpdateCombiner.class); jobConf.setReducerClass(IndexUpdateReducer.class); jobConf.setOutputFormat(IndexUpdateOutputFormat.class); return jobConf; }
Example 14
Source File: VerifyHdfsDataUsingMR.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { // todo@lhughes -- why do we need this? GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("hdfsMapReduce"); String hdfsHomeDir = args[0]; String url = args[1]; String tableName = args[2]; System.out.println("VerifyHdfsData.run() invoked with " + " hdfsHomeDir = " + hdfsHomeDir + " url = " + url + " tableName = " + tableName); // Job-specific params conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(HdfsDataMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(MyRow.class); conf.setReducerClass(HdfsDataReducer.class); conf.set(RowOutputFormat.OUTPUT_TABLE, "TRADE.HDFS_CUSTOMERS"); //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP"); conf.set(RowOutputFormat.OUTPUT_URL, url); conf.setOutputFormat(RowOutputFormat.class); conf.setOutputKeyClass(Key.class); conf.setOutputValueClass(DataObject.class); StringBuffer aStr = new StringBuffer(); aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " "); aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " "); aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " "); aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " "); System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString()); // not planning to use this, but I get an NPE without it FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis())); JobClient.runJob(conf); return 0; }
Example 15
Source File: TradesHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("TradesHdfsDataVerifier"); String hdfsHomeDir = args[0]; String url = args[1]; String tableName = args[2]; System.out.println("TradesHdfsDataVerifier.run() invoked with " + " hdfsHomeDir = " + hdfsHomeDir + " url = " + url + " tableName = " + tableName); // Job-specific params conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(HdfsDataMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(TradesRow.class); conf.setReducerClass(HdfsDataReducer.class); conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS"); //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP"); conf.set(RowOutputFormat.OUTPUT_URL, url); conf.setOutputFormat(RowOutputFormat.class); conf.setOutputKeyClass(Key.class); conf.setOutputValueClass(TradeOutputObject.class); StringBuffer aStr = new StringBuffer(); aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " "); aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " "); aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " "); aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " "); System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString()); FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis())); JobClient.runJob(conf); return 0; }
Example 16
Source File: TradeCustomersHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("TradeCustomersHdfsDataVerifier"); String hdfsHomeDir = args[0]; String url = args[1]; String tableName = args[2]; System.out.println("TradeCustomersHdfsDataVerifier.run() invoked with " + " hdfsHomeDir = " + hdfsHomeDir + " url = " + url + " tableName = " + tableName); // Job-specific params conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(HdfsDataMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(TradeCustomersRow.class); conf.setReducerClass(HdfsDataReducer.class); conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS"); //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP"); conf.set(RowOutputFormat.OUTPUT_URL, url); conf.setOutputFormat(RowOutputFormat.class); conf.setOutputKeyClass(Key.class); conf.setOutputValueClass(TradeCustomerOutputObject.class); StringBuffer aStr = new StringBuffer(); aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " "); aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " "); aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " "); aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " "); System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString()); FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis())); JobClient.runJob(conf); return 0; }
Example 17
Source File: VerifyHdfsDataUsingMR.java From gemfirexd-oss with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { // todo@lhughes -- why do we need this? GfxdDataSerializable.initTypes(); JobConf conf = new JobConf(getConf()); conf.setJobName("hdfsMapReduce"); String hdfsHomeDir = args[0]; String url = args[1]; String tableName = args[2]; System.out.println("VerifyHdfsData.run() invoked with " + " hdfsHomeDir = " + hdfsHomeDir + " url = " + url + " tableName = " + tableName); // Job-specific params conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir); conf.set(RowInputFormat.INPUT_TABLE, tableName); conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false); conf.setInputFormat(RowInputFormat.class); conf.setMapperClass(HdfsDataMapper.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(MyRow.class); conf.setReducerClass(HdfsDataReducer.class); conf.set(RowOutputFormat.OUTPUT_TABLE, "TRADE.HDFS_CUSTOMERS"); //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP"); conf.set(RowOutputFormat.OUTPUT_URL, url); conf.setOutputFormat(RowOutputFormat.class); conf.setOutputKeyClass(Key.class); conf.setOutputValueClass(DataObject.class); StringBuffer aStr = new StringBuffer(); aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " "); aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " "); aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " "); aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " "); System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString()); // not planning to use this, but I get an NPE without it FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis())); JobClient.runJob(conf); return 0; }
Example 18
Source File: CrawlDBScanner.java From anthelion with Apache License 2.0 | 4 votes |
private void scan(Path crawlDb, Path outputPath, String regex, String status, boolean text) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("CrawlDB scanner: starting at " + sdf.format(start)); JobConf job = new NutchJob(getConf()); job.setJobName("Scan : " + crawlDb + " for URLS matching : " + regex); job.set("CrawlDBScanner.regex", regex); if (status != null) job.set("CrawlDBScanner.status", status); FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(CrawlDBScanner.class); job.setReducerClass(CrawlDBScanner.class); FileOutputFormat.setOutputPath(job, outputPath); // if we want a text dump of the entries // in order to check something - better to use the text format and avoid // compression if (text) { job.set("mapred.output.compress", "false"); job.setOutputFormat(TextOutputFormat.class); } // otherwise what we will actually create is a mini-crawlDB which can be // then used // for debugging else { job.setOutputFormat(MapFileOutputFormat.class); } job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(CrawlDatum.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); try { JobClient.runJob(job); } catch (IOException e) { throw e; } long end = System.currentTimeMillis(); LOG.info("CrawlDb scanner: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
Example 19
Source File: LinkRank.java From anthelion with Apache License 2.0 | 4 votes |
/** * Runs the counter job. The counter job determines the number of links in the * webgraph. This is used during analysis. * * @param fs The job file system. * @param webGraphDb The web graph database to use. * * @return The number of nodes in the web graph. * @throws IOException If an error occurs while running the counter job. */ private int runCounter(FileSystem fs, Path webGraphDb) throws IOException { // configure the counter job Path numLinksPath = new Path(webGraphDb, NUM_NODES); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); JobConf counter = new NutchJob(getConf()); counter.setJobName("LinkRank Counter"); FileInputFormat.addInputPath(counter, nodeDb); FileOutputFormat.setOutputPath(counter, numLinksPath); counter.setInputFormat(SequenceFileInputFormat.class); counter.setMapperClass(Counter.class); counter.setCombinerClass(Counter.class); counter.setReducerClass(Counter.class); counter.setMapOutputKeyClass(Text.class); counter.setMapOutputValueClass(LongWritable.class); counter.setOutputKeyClass(Text.class); counter.setOutputValueClass(LongWritable.class); counter.setNumReduceTasks(1); counter.setOutputFormat(TextOutputFormat.class); counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); // run the counter job, outputs to a single reduce task and file LOG.info("Starting link counter job"); try { JobClient.runJob(counter); } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } LOG.info("Finished link counter job"); // read the first (and only) line from the file which should be the // number of links in the web graph LOG.info("Reading numlinks temp file"); FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000")); BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks)); String numLinksLine = buffer.readLine(); readLinks.close(); // check if there are links to process, if none, webgraph might be empty if (numLinksLine == null || numLinksLine.length() == 0) { fs.delete(numLinksPath, true); throw new IOException("No links to process, is the webgraph empty?"); } // delete temp file and convert and return the number of links as an int LOG.info("Deleting numlinks temp file"); fs.delete(numLinksPath, true); String numLinks = numLinksLine.split("\\s+")[1]; return Integer.parseInt(numLinks); }
Example 20
Source File: ChainMapper.java From hadoop-gpu with Apache License 2.0 | 3 votes |
/** * Adds a Mapper class to the chain job's JobConf. * <p/> * It has to be specified how key and values are passed from one element of * the chain to the next, by value or by reference. If a Mapper leverages the * assumed semantics that the key and values are not modified by the collector * 'by value' must be used. If the Mapper does not expect this semantics, as * an optimization to avoid serialization and deserialization 'by reference' * can be used. * <p/> * For the added Mapper the configuration given for it, * <code>mapperConf</code>, have precedence over the job's JobConf. This * precedence is in effect when the task is running. * <p/> * IMPORTANT: There is no need to specify the output key/value classes for the * ChainMapper, this is done by the addMapper for the last mapper in the chain * <p/> * * @param job job's JobConf to add the Mapper class. * @param klass the Mapper class to add. * @param inputKeyClass mapper input key class. * @param inputValueClass mapper input value class. * @param outputKeyClass mapper output key class. * @param outputValueClass mapper output value class. * @param byValue indicates if key/values should be passed by value * to the next Mapper in the chain, if any. * @param mapperConf a JobConf with the configuration for the Mapper * class. It is recommended to use a JobConf without default values using the * <code>JobConf(boolean loadDefaults)</code> constructor with FALSE. */ public static <K1, V1, K2, V2> void addMapper(JobConf job, Class<? extends Mapper<K1, V1, K2, V2>> klass, Class<? extends K1> inputKeyClass, Class<? extends V1> inputValueClass, Class<? extends K2> outputKeyClass, Class<? extends V2> outputValueClass, boolean byValue, JobConf mapperConf) { job.setMapperClass(ChainMapper.class); job.setMapOutputKeyClass(outputKeyClass); job.setMapOutputValueClass(outputValueClass); Chain.addMapper(true, job, klass, inputKeyClass, inputValueClass, outputKeyClass, outputValueClass, byValue, mapperConf); }