Java Code Examples for org.apache.hadoop.mapred.JobConf#setOutputFormat()
The following examples show how to use org.apache.hadoop.mapred.JobConf#setOutputFormat().
Each example comes from an open-source project; the source file, originating project, and license are noted above each listing.
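As a quick orientation before the project examples, here is a minimal, self-contained sketch of where the call fits in a classic mapred job driver. It is illustrative only: the class name and the argument-supplied paths are placeholders, and the job relies on the default identity mapper and reducer.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetOutputFormatDemo {

  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SetOutputFormatDemo.class);
    job.setJobName("setOutputFormat-demo");

    // Placeholder input and output locations.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormat(TextInputFormat.class);
    // The method this page documents: selects the OutputFormat
    // implementation that writes the job's final key/value pairs.
    job.setOutputFormat(TextOutputFormat.class);

    // The default identity mapper/reducer pass through the
    // LongWritable offsets and Text lines from TextInputFormat.
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    JobClient.runJob(job);
  }
}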
Example 1
Source File: SolrClean.java From anthelion with Apache License 2.0
public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("SolrClean: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());

  FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
  job.setBoolean("noCommit", noCommit);
  job.set(SolrConstants.SERVER_URL, solrUrl);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setMapOutputKeyClass(ByteWritable.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapperClass(DBFilter.class);
  job.setReducerClass(SolrDeleter.class);

  JobClient.runJob(job);

  long end = System.currentTimeMillis();
  LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
Example 2
Source File: TeraSort.java From hadoop-book with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString()
      + "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
Example 3
Source File: ReadDataJob.java From tracing-framework with BSD 3-Clause "New" or "Revised" License
public void configure(JobConf job) {
  // Set the mapper and reducers
  job.setMapperClass(TestMapper.class);
  // job.setReducerClass(TestReducer.class);

  // Set the output types of the mapper and reducer
  // job.setMapOutputKeyClass(IntWritable.class);
  // job.setMapOutputValueClass(NullWritable.class);
  // job.setOutputKeyClass(NullWritable.class);
  // job.setOutputValueClass(NullWritable.class);

  // Make sure this jar is included
  job.setJarByClass(TestMapper.class);

  // Specify the input and output data formats
  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);

  // Turn off speculative execution
  job.setMapSpeculativeExecution(false);
  job.setReduceSpeculativeExecution(false);

  // Add the job input path
  FileInputFormat.addInputPath(job, new Path(this.input_filename));
}
Example 4
Source File: DataFsck.java From RDFS with Apache License 2.0
private JobConf createJobConf() {
  JobConf jobConf = new JobConf(getConf());
  String jobName = NAME + " " + dateForm.format(new Date(System.currentTimeMillis()));
  jobConf.setJobName(jobName);

  jobConf.setMapSpeculativeExecution(false);
  jobConf.setJarByClass(DataFsck.class);
  jobConf.setInputFormat(DataFsckInputFormat.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  jobConf.setOutputKeyClass(Text.class);
  jobConf.setOutputValueClass(Text.class);
  jobConf.setMapperClass(DataFsckMapper.class);
  jobConf.setNumReduceTasks(0);
  return jobConf;
}
Example 5
Source File: TeraGen.java From hadoop-gpu with Apache License 2.0
/**
 * @param args the cli arguments
 */
public int run(String[] args) throws IOException {
  JobConf job = (JobConf) getConf();
  setNumberOfRows(job, Long.parseLong(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraGen");
  job.setJarByClass(TeraGen.class);
  job.setMapperClass(SortGenMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(RangeInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  JobClient.runJob(job);
  return 0;
}
Example 6
Source File: SleepJob.java From RDFS with Apache License 2.0
public JobConf setupJobConf(int numMapper, int numReducer,
    long mapSleepTime, int mapSleepCount,
    long reduceSleepTime, int reduceSleepCount,
    boolean doSpeculation, List<String> slowMaps,
    List<String> slowReduces, int slowRatio,
    int countersPerTask, List<String> hosts,
    int hostsPerSplit, boolean setup) {
  JobConf job = new JobConf(getConf(), SleepJob.class);
  job.setNumMapTasks(numMapper);
  job.setNumReduceTasks(numReducer);
  job.setMapperClass(SleepJob.class);
  job.setMapOutputKeyClass(IntWritable.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setReducerClass(SleepJob.class);
  job.setOutputFormat(NullOutputFormat.class);
  job.setJobSetupCleanupNeeded(setup);
  job.setInputFormat(SleepInputFormat.class);
  job.setPartitionerClass(SleepJob.class);
  job.setJobName("Sleep job");
  FileInputFormat.addInputPath(job, new Path("ignored"));
  job.setLong("sleep.job.map.sleep.time", mapSleepTime);
  job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
  job.setInt("sleep.job.map.sleep.count", mapSleepCount);
  job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
  job.setSpeculativeExecution(doSpeculation);
  job.setInt(SLOW_RATIO, slowRatio);
  job.setStrings(SLOW_MAPS, slowMaps.toArray(new String[slowMaps.size()]));
  job.setStrings(SLOW_REDUCES, slowReduces.toArray(new String[slowReduces.size()]));
  job.setInt("sleep.job.counters.per.task", countersPerTask);
  job.setStrings(HOSTS_FOR_LOCALITY, hosts.toArray(new String[hosts.size()]));
  job.setInt(HOSTS_PER_SPLIT, hostsPerSplit);
  return job;
}
Example 7
Source File: LinkRank.java From anthelion with Apache License 2.0
/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 *
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {
  // configure the initializer
  JobConf initializer = new NutchJob(getConf());
  initializer.setJobName("LinkAnalysis Initializer");
  FileInputFormat.addInputPath(initializer, nodeDb);
  FileOutputFormat.setOutputPath(initializer, output);
  initializer.setInputFormat(SequenceFileInputFormat.class);
  initializer.setMapperClass(Initializer.class);
  initializer.setMapOutputKeyClass(Text.class);
  initializer.setMapOutputValueClass(Node.class);
  initializer.setOutputKeyClass(Text.class);
  initializer.setOutputValueClass(Node.class);
  initializer.setOutputFormat(MapFileOutputFormat.class);
  initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the initializer
  LOG.info("Starting initialization job");
  try {
    JobClient.runJob(initializer);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished initialization job.");
}
Example 8
Source File: TeraSort.java From RDFS with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString()
      + "#" + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  long startTime = System.currentTimeMillis();
  JobClient.runJob(job);
  long endTime = System.currentTimeMillis();
  System.out.println((float) (endTime - startTime) / 1000);
  LOG.info("done");
  return 0;
}
Example 9
Source File: IndexerMapReduce.java From anthelion with Apache License 2.0
public static void initMRJob(Path crawlDb, Path linkDb,
    Collection<Path> segments, JobConf job) {

  LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
  if (linkDb != null)
    LOG.info("IndexerMapReduce: linkdb: " + linkDb);

  for (final Path segment : segments) {
    LOG.info("IndexerMapReduces: adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
  }

  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));

  if (linkDb != null)
    FileInputFormat.addInputPath(job, new Path(linkDb, LinkDb.CURRENT_NAME));

  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(IndexerMapReduce.class);
  job.setReducerClass(IndexerMapReduce.class);

  job.setOutputFormat(IndexerOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setMapOutputValueClass(NutchWritable.class);
  job.setOutputValueClass(NutchWritable.class);
}
Example 10
Source File: LazyOutputFormat.java From big-c with Apache License 2.0
/**
 * Set the underlying output format for LazyOutputFormat.
 * @param job the {@link JobConf} to modify
 * @param theClass the underlying class
 */
@SuppressWarnings("unchecked")
public static void setOutputFormatClass(JobConf job,
    Class<? extends OutputFormat> theClass) {
  job.setOutputFormat(LazyOutputFormat.class);
  job.setClass("mapreduce.output.lazyoutputformat.outputformat",
      theClass, OutputFormat.class);
}
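A hedged usage sketch for the helper above (the wrapped format and the driver method are illustrative choices, not taken from the big-c source):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.LazyOutputFormat;

public class LazyOutputDemo {
  // Illustrative wiring: wrap TextOutputFormat in LazyOutputFormat so
  // output files are created only when a task actually emits a record.
  public static void useLazyOutput(JobConf job) {
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
  }
}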
Example 11
Source File: Submitter.java From big-c with Apache License 2.0
private static void setupPipesJob(JobConf conf) throws IOException {
  // default map output types to Text
  if (!getIsJavaMapper(conf)) {
    conf.setMapRunnerClass(PipesMapRunner.class);
    // Save the user's partitioner and hook in ours.
    setJavaPartitioner(conf, conf.getPartitionerClass());
    conf.setPartitionerClass(PipesPartitioner.class);
  }
  if (!getIsJavaReducer(conf)) {
    conf.setReducerClass(PipesReducer.class);
    if (!getIsJavaRecordWriter(conf)) {
      conf.setOutputFormat(NullOutputFormat.class);
    }
  }
  String textClassname = Text.class.getName();
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
  setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

  // Use PipesNonJavaInputFormat if necessary to handle progress reporting
  // from C++ RecordReaders ...
  if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
    conf.setClass(Submitter.INPUT_FORMAT,
        conf.getInputFormat().getClass(), InputFormat.class);
    conf.setInputFormat(PipesNonJavaInputFormat.class);
  }

  String exec = getExecutable(conf);
  if (exec == null) {
    throw new IllegalArgumentException("No application program defined.");
  }
  // add default debug script only when executable is expressed as
  // <path>#<executable>
  if (exec.contains("#")) {
    // set default gdb commands for map and reduce task
    String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
    setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
    setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
  }
  URI[] fileCache = DistributedCache.getCacheFiles(conf);
  if (fileCache == null) {
    fileCache = new URI[1];
  } else {
    URI[] tmp = new URI[fileCache.length + 1];
    System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
    fileCache = tmp;
  }
  try {
    fileCache[0] = new URI(exec);
  } catch (URISyntaxException e) {
    IOException ie = new IOException("Problem parsing executable URI " + exec);
    ie.initCause(e);
    throw ie;
  }
  DistributedCache.setCacheFiles(fileCache, conf);
}
Example 12
Source File: TradeNetworthHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeNetworthHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeNetworthHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeNetworthRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeNetworthOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 13
Source File: TradeSecurityHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeSecurityHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeSecurityHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeSecurityRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeSecurityOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 14
Source File: TradeBuyOrdersHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeBuyOrdersHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeBuyOrdersHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeBuyOrdersRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeBuyOrdersOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 15
Source File: TradeTxHistoryHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeTxHistoryHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeTxHistoryHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeTxHistoryRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeTxHistoryOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 16
Source File: TopBusyAirportGemfirexd.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();

  JobConf conf = new JobConf(getConf());
  conf.setJobName("Busy Airport Count");

  Path outputPath = new Path(args[0]);
  Path intermediateOutputPath = new Path(args[0] + "_int");
  String hdfsHomeDir = args[1];
  String tableName = args[2];

  outputPath.getFileSystem(conf).delete(outputPath, true);
  intermediateOutputPath.getFileSystem(conf).delete(intermediateOutputPath, true);

  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(SampleMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(IntWritable.class);

  conf.setReducerClass(SampleReducer.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  FileOutputFormat.setOutputPath(conf, intermediateOutputPath);

  int rc = JobClient.runJob(conf).isSuccessful() ? 0 : 1;
  if (rc == 0) {
    JobConf topConf = new JobConf(getConf());
    topConf.setJobName("Top Busy Airport");

    String hdfsFS = topConf.get("fs.defaultFS");
    URI hdfsUri = URI.create(hdfsFS);

    // Assume that the SqlFire locator is running alongside the namenode
    topConf.set(RowOutputFormat.OUTPUT_URL,
        "jdbc:gemfirexd://" + hdfsUri.getHost() + ":1527");
    //topConf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
    //topConf.set(GfxdOutputFormat.OUTPUT_TABLE, "BUSY_AIRPORT");
    topConf.set(RowOutputFormat.OUTPUT_TABLE, "APP.BUSY_AIRPORT");

    // Only run a single reducer
    topConf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(topConf, intermediateOutputPath);

    topConf.setInputFormat(TextInputFormat.class);
    topConf.setMapperClass(TopBusyAirportMapper.class);
    topConf.setMapOutputKeyClass(Text.class);
    topConf.setMapOutputValueClass(StringIntPair.class);

    topConf.setReducerClass(TopBusyAirportReducer.class);
    topConf.setOutputKeyClass(Key.class);
    topConf.setOutputValueClass(BusyAirportModel.class);
    topConf.setOutputFormat(RowOutputFormat.class);

    rc = JobClient.runJob(topConf).isSuccessful() ? 0 : 1;
  }
  return rc;
}
Example 17
Source File: TradeSellOrdersHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeSellOrdersHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeSellOrdersHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeSellOrdersRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeSellOrdersOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));
  JobClient.runJob(conf);
  return 0;
}
Example 18
Source File: NodeDumper.java From nutch-htmlunit with Apache License 2.0
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
    boolean asEff, NameType nameType, AggrType aggrType,
    boolean asSequenceFile) throws Exception {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));

  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);

  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);

  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
Example 19
Source File: FreeGenerator.java From anthelion with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
    System.err.println("\tinputDir\tinput directory containing one or more input files.");
    System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
    System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter\trun current URLFilters on input URLs");
    System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    return -1;
  }
  boolean filter = false;
  boolean normalize = false;
  if (args.length > 2) {
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FreeGenerator: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setBoolean(FILTER_KEY, filter);
  job.setBoolean(NORMALIZE_KEY, normalize);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormat(TextInputFormat.class);
  job.setMapperClass(FG.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.class);
  String segName = Generator.generateSegmentName();
  job.setNumReduceTasks(job.getNumMapTasks());
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1],
      new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
  try {
    JobClient.runJob(job);
  } catch (Exception e) {
    LOG.error("FAILED: " + StringUtils.stringifyException(e));
    return -1;
  }
  long end = System.currentTimeMillis();
  LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
  return 0;
}
Example 20
Source File: DBOutputFormat.java From RDFS with Apache License 2.0
/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job The job
 * @param tableName The table to insert data into
 * @param fieldNames The field names in the table. If unknown, supply the
 *                   appropriate number of nulls.
 */
public static void setOutput(JobConf job, String tableName,
    String... fieldNames) {
  job.setOutputFormat(DBOutputFormat.class);
  job.setReduceSpeculativeExecution(false);

  DBConfiguration dbConf = new DBConfiguration(job);

  dbConf.setOutputTableName(tableName);
  dbConf.setOutputFieldNames(fieldNames);
}
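To show how this helper is typically wired, here is a small hypothetical sketch. It assumes the stock org.apache.hadoop.mapred.lib.db package (the example above is from the RDFS fork), and the JDBC driver, URL, credentials, table, and column names are invented for illustration:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.db.DBConfiguration;
import org.apache.hadoop.mapred.lib.db.DBOutputFormat;

public class DbOutputDemo {
  public static void configureOutput(JobConf job) {
    // Hypothetical connection settings.
    DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost/exampledb", "user", "password");
    // setOutput() calls job.setOutputFormat(DBOutputFormat.class) and
    // records the target table and columns; reduce output is then written
    // as INSERTs into exampledb.employees(id, name).
    DBOutputFormat.setOutput(job, "employees", "id", "name");
  }
}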