Java Code Examples for org.apache.hadoop.mapred.JobConf#setJobName()
The following examples show how to use org.apache.hadoop.mapred.JobConf#setJobName().
The originating project and source file are noted above each example.
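Before the project examples, here is a minimal, self-contained sketch of the call in isolation. The class name, paths, and map-only setup are illustrative placeholders (not drawn from any project below); only the setJobName call itself is the API this page documents.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

// Minimal sketch (hypothetical class and paths): name a job before submitting it.
// The name set here is what JobClient and the JobTracker UI display for the job.
public class SetJobNameExample {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SetJobNameExample.class);
    job.setJobName("set-job-name-example"); // the method this page documents
    // Identity, map-only pass-through using the default TextInputFormat key/value types.
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    JobClient.runJob(job);
  }
}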
Example 1
Source File: TeraGen.java From hadoop-gpu with Apache License 2.0
/**
 * @param args the cli arguments
 */
public int run(String[] args) throws IOException {
  JobConf job = (JobConf) getConf();
  setNumberOfRows(job, Long.parseLong(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraGen");
  job.setJarByClass(TeraGen.class);
  job.setMapperClass(SortGenMapper.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(RangeInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  JobClient.runJob(job);
  return 0;
}
Example 2
Source File: RowCounter.java From hbase with Apache License 2.0
/**
 * @param args
 * @return the JobConf
 * @throws IOException
 */
public JobConf createSubmittableJob(String[] args) throws IOException {
  JobConf c = new JobConf(getConf(), getClass());
  c.setJobName(NAME);
  // Columns are space delimited
  StringBuilder sb = new StringBuilder();
  final int columnoffset = 2;
  for (int i = columnoffset; i < args.length; i++) {
    if (i > columnoffset) {
      sb.append(" ");
    }
    sb.append(args[i]);
  }
  // Second argument is the table name.
  TableMapReduceUtil.initTableMapJob(args[1], sb.toString(),
      RowCounterMapper.class, ImmutableBytesWritable.class, Result.class, c);
  c.setNumReduceTasks(0);
  // First arg is the output directory.
  FileOutputFormat.setOutputPath(c, new Path(args[0]));
  return c;
}
Example 3
Source File: TeraSort.java From hadoop-book with Apache License 2.0
public int run(String[] args) throws Exception {
  LOG.info("starting");
  JobConf job = (JobConf) getConf();
  Path inputDir = new Path(args[0]);
  inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
  Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
  URI partitionUri = new URI(partitionFile.toString() + "#"
      + TeraInputFormat.PARTITION_FILENAME);
  TeraInputFormat.setInputPaths(job, new Path(args[0]));
  FileOutputFormat.setOutputPath(job, new Path(args[1]));
  job.setJobName("TeraSort");
  job.setJarByClass(TeraSort.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setInputFormat(TeraInputFormat.class);
  job.setOutputFormat(TeraOutputFormat.class);
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TeraInputFormat.writePartitionFile(job, partitionFile);
  DistributedCache.addCacheFile(partitionUri, job);
  DistributedCache.createSymlink(job);
  job.setInt("dfs.replication", 1);
  TeraOutputFormat.setFinalSync(job, true);
  JobClient.runJob(job);
  LOG.info("done");
  return 0;
}
Example 4
Source File: GenerateProfiles.java From aerospike-hadoop with Apache License 2.0
public int run(final String[] args) throws Exception {
  log.info("run starting");

  final Configuration conf = getConf();

  JobConf job = new JobConf(conf, GenerateProfiles.class);
  job.setJobName("AerospikeGenerateProfiles");

  job.setMapperClass(Map.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(IntWritable.class);
  // job.setCombinerClass(Reduce.class); // Reduce changes format.
  job.setReducerClass(Reduce.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Profile.class);
  job.setOutputFormat(ProfileOutputFormat.class);

  for (int ii = 0; ii < args.length; ++ii)
    FileInputFormat.addInputPath(job, new Path(args[ii]));

  JobClient.runJob(job);

  log.info("finished");
  return 0;
}
Example 5
Source File: DistCh.java From hadoop-gpu with Apache License 2.0
private static JobConf createJobConf(Configuration conf) {
  JobConf jobconf = new JobConf(conf, DistCh.class);
  jobconf.setJobName(NAME);
  jobconf.setMapSpeculativeExecution(false);

  jobconf.setInputFormat(ChangeInputFormat.class);
  jobconf.setOutputKeyClass(Text.class);
  jobconf.setOutputValueClass(Text.class);

  jobconf.setMapperClass(ChangeFilesMapper.class);
  jobconf.setNumReduceTasks(0);
  return jobconf;
}
Example 6
Source File: ExternalJoin.java From aerospike-hadoop with Apache License 2.0
public int run(final String[] args) throws Exception {
  log.info("run starting");

  final Configuration conf = getConf();

  JobConf job = new JobConf(conf, ExternalJoin.class);
  job.setJobName("AerospikeExternalJoin");

  job.setMapperClass(Map.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(LongWritable.class);
  // job.setCombinerClass(Reduce.class); // Reduce changes format.
  job.setReducerClass(Reduce.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Session.class);
  job.setOutputFormat(SessionOutputFormat.class);

  for (int ii = 0; ii < args.length; ++ii)
    FileInputFormat.addInputPath(job, new Path(args[ii]));

  JobClient.runJob(job);

  log.info("finished");
  return 0;
}
Example 7
Source File: CopyFromS3.java From emr-sample-apps with Apache License 2.0
/**
 * This method constructs the JobConf to be used to run the map reduce job to
 * download the files from S3. This is a potentially expensive method since it
 * makes multiple calls to S3 to get a listing of all the input data. Clients
 * are encouraged to cache the returned JobConf reference and not call this
 * method multiple times unless necessary.
 *
 * @return the JobConf to be used to run the map reduce job to download the
 *         files from S3.
 */
public JobConf getJobConf() throws IOException, ParseException {
  JobConf conf = new JobConf(CopyFromS3.class);
  conf.setJobName("CopyFromS3");
  conf.setOutputKeyClass(NullWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapperClass(S3CopyMapper.class);
  // We configure a reducer, even though we don't use it right now.
  // The idea is that, in the future we may.
  conf.setReducerClass(HDFSWriterReducer.class);
  conf.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(conf, new Path(tempFile));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  conf.setOutputFormat(TextOutputFormat.class);
  conf.setCompressMapOutput(true);

  JobClient jobClient = new JobClient(conf);

  FileSystem inputFS = FileSystem.get(URI.create(inputPathPrefix), conf);
  DatePathFilter datePathFilter = new DatePathFilter(startDate, endDate);
  List<Path> filePaths = getFilePaths(inputFS, new Path(inputPathPrefix),
      datePathFilter, jobClient.getDefaultMaps());

  // Write the file names to a temporary index file to be used
  // as input to the map tasks.
  FileSystem outputFS = FileSystem.get(URI.create(tempFile), conf);
  FSDataOutputStream outputStream = outputFS.create(new Path(tempFile), true);
  try {
    for (Path path : filePaths) {
      outputStream.writeBytes(path.toString() + "\n");
    }
  } finally {
    outputStream.close();
  }

  conf.setNumMapTasks(Math.min(filePaths.size(), jobClient.getDefaultMaps()));

  return conf;
}
Example 8
Source File: NodeDumper.java From anthelion with Apache License 2.0
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 * @param topN
 * @param output
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
    boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile)
    throws Exception {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);
  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);
  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
Example 9
Source File: TradeSellOrdersHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeSellOrdersHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeSellOrdersHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeSellOrdersRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeSellOrdersOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));

  JobClient.runJob(conf);
  return 0;
}
Example 10
Source File: ResultMergeRemoteSpark.java From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs,
    long rlen, long clen, int blen) {
  String jobname = "ParFor-RMSP";
  long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

  SparkExecutionContext sec = (SparkExecutionContext) _ec;
  boolean withCompare = (compare != null);
  RDDObject ret = null;

  //determine degree of parallelism
  int numRed = determineNumReducers(rlen, clen, blen, _numReducers);

  //sanity check for empty src files
  if( inputs == null || inputs.length == 0 )
    throw new DMLRuntimeException("Execute merge should never be called with no inputs.");

  try {
    //note: initial implementation via union over all result rdds discarded due to
    //stack overflow errors with many parfor tasks, and thus many rdds

    //Step 1: construct input rdd from all result files of parfor workers
    //a) construct job conf with all files
    InputOutputInfo ii = InputOutputInfo.get(DataType.MATRIX, FileFormat.BINARY);
    JobConf job = new JobConf("test");
    job.setJobName(jobname);
    job.setInputFormat(ii.inputFormatClass);
    Path[] paths = new Path[inputs.length];
    for(int i = 0; i < paths.length; i++) {
      //ensure input exists on hdfs (e.g., if in-memory or RDD)
      inputs[i].exportData();
      paths[i] = new Path(inputs[i].getFileName());
      //update rdd handle to allow lazy evaluation by guarding
      //against cleanup of temporary result files
      setRDDHandleForMerge(inputs[i], sec);
    }
    FileInputFormat.setInputPaths(job, paths);

    //b) create rdd from input files w/ deep copy of keys and blocks
    JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext()
        .hadoopRDD(job, ii.inputFormatClass, ii.keyClass, ii.valueClass)
        .mapPartitionsToPair(new CopyMatrixBlockPairFunction(true), true);

    //Step 2a: merge with compare
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    if( withCompare ) {
      JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd =
          (JavaPairRDD<MatrixIndexes, MatrixBlock>)
          sec.getRDDHandleForMatrixObject(compare, FileFormat.BINARY);

      //merge values which differ from compare values
      ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare(_isAccum);
      out = rdd.groupByKey(numRed)  //group all result blocks per key
          .join(compareRdd)         //join compare block and result blocks
          .mapToPair(cfun);         //merge result blocks w/ compare
    }
    //Step 2b: merge without compare
    else {
      //direct merge in any order (disjointness guaranteed)
      out = _isAccum ?
          RDDAggregateUtils.sumByKeyStable(rdd, false) :
          RDDAggregateUtils.mergeByKey(rdd, false);
    }

    //Step 3: create output rdd handle w/ lineage
    ret = new RDDObject(out);
    for(int i = 0; i < paths.length; i++)
      ret.addLineageChild(inputs[i].getRDDHandle());
    if( withCompare )
      ret.addLineageChild(compare.getRDDHandle());
  }
  catch( Exception ex ) {
    throw new DMLRuntimeException(ex);
  }

  //maintain statistics
  Statistics.incrementNoOfCompiledSPInst();
  Statistics.incrementNoOfExecutedSPInst();
  if( DMLScript.STATISTICS ) {
    Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
  }

  return ret;
}
Example 11
Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0
/**
 * Sets up various standard settings in the JobConf. You probably don't want to mess with this.
 *
 * @return A configured JobConf.
 * @throws IOException
 * @throws URISyntaxException
 */
protected JobConf createJobConf() throws IOException, URISyntaxException {
  JobConf conf = new JobConf();
  conf.setJobName(getJobId());
  conf.setInputFormat(AvroInputFormat.class);
  conf.setOutputFormat(AvroOutputFormat.class);

  AvroOutputFormat.setDeflateLevel(conf, 9);

  String hadoop_ugi = _config.getString("hadoop.job.ugi", null);
  if (hadoop_ugi != null) {
    conf.set("hadoop.job.ugi", hadoop_ugi);
  }
  if (_config.getBoolean("is.local", false)) {
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "file:///");
    conf.set("mapred.local.dir", "/tmp/map-red");

    _log.info("Running locally, no hadoop jar set.");
  }

  // set JVM options if present
  if (_config.containsKey("mapred.child.java.opts")) {
    conf.set("mapred.child.java.opts", _config.getString("mapred.child.java.opts"));
    _log.info("mapred.child.java.opts set to " + _config.getString("mapred.child.java.opts"));
  }

  if (_config.containsKey(INPUT_PATHS)) {
    List<String> inputPathnames = _config.getStringList(INPUT_PATHS);
    for (String pathname : inputPathnames) {
      AvroUtils.addAllSubPaths(conf, new Path(pathname));
    }
    AvroJob.setInputSchema(conf, AvroUtils.getAvroInputSchema(conf));
  }

  if (_config.containsKey(OUTPUT_PATH)) {
    Path path = new Path(_config.get(OUTPUT_PATH));
    AvroOutputFormat.setOutputPath(conf, path);

    if (_config.getBoolean("force.output.overwrite", false)) {
      FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
      fs.delete(FileOutputFormat.getOutputPath(conf), true);
    }
  }

  // set all hadoop configs
  for (String key : _config.keySet()) {
    String lowerCase = key.toLowerCase();
    if (lowerCase.startsWith(HADOOP_PREFIX)) {
      String newKey = key.substring(HADOOP_PREFIX.length());
      conf.set(newKey, _config.get(key));
    }
  }
  return conf;
}
Example 12
Source File: TradeTxHistoryHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeTxHistoryHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeTxHistoryHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeTxHistoryRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeTxHistoryOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));

  JobClient.runJob(conf);
  return 0;
}
Example 13
Source File: TradeCustomersHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeCustomersHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeCustomersHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeCustomersRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeCustomerOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));

  JobClient.runJob(conf);
  return 0;
}
Example 14
Source File: PiEstimator.java From hadoop-gpu with Apache License 2.0
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf)
    throws IOException {
  //setup job conf
  jobConf.setJobName(PiEstimator.class.getSimpleName());

  jobConf.setInputFormat(SequenceFileInputFormat.class);

  jobConf.setOutputKeyClass(BooleanWritable.class);
  jobConf.setOutputValueClass(LongWritable.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);

  jobConf.setMapperClass(PiMapper.class);
  jobConf.setNumMapTasks(numMaps);

  jobConf.setReducerClass(PiReducer.class);
  jobConf.setNumReduceTasks(1);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobConf.setSpeculativeExecution(false);

  //setup input/output directories
  final Path inDir = new Path(TMP_DIR, "in");
  final Path outDir = new Path(TMP_DIR, "out");
  FileInputFormat.setInputPaths(jobConf, inDir);
  FileOutputFormat.setOutputPath(jobConf, outDir);

  final FileSystem fs = FileSystem.get(jobConf);
  if (fs.exists(TMP_DIR)) {
    throw new IOException("Tmp directory " + fs.makeQualified(TMP_DIR)
        + " already exists. Please remove it first.");
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Cannot create input directory " + inDir);
  }

  try {
    //generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
      final Path file = new Path(inDir, "part" + i);
      final LongWritable offset = new LongWritable(i * numPoints);
      final LongWritable size = new LongWritable(numPoints);
      final SequenceFile.Writer writer = SequenceFile.createWriter(
          fs, jobConf, file,
          LongWritable.class, LongWritable.class, CompressionType.NONE);
      try {
        writer.append(offset, size);
      } finally {
        writer.close();
      }
      System.out.println("Wrote input for Map #" + i);
    }

    //start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    JobClient.runJob(jobConf);
    final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Job Finished in " + duration + " seconds");

    //read outputs
    Path inFile = new Path(outDir, "reduce-out");
    LongWritable numInside = new LongWritable();
    LongWritable numOutside = new LongWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
    try {
      reader.next(numInside, numOutside);
    } finally {
      reader.close();
    }

    //compute estimated value
    return BigDecimal.valueOf(4).setScale(20)
        .multiply(BigDecimal.valueOf(numInside.get()))
        .divide(BigDecimal.valueOf(numMaps))
        .divide(BigDecimal.valueOf(numPoints));
  } finally {
    fs.delete(TMP_DIR, true);
  }
}
Example 15
Source File: RandomWriter.java From hadoop-book with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job. It
 * runs 10 maps/node and each node writes 1 gig of data to a DFS file. The
 * reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  Path outDir = new Path(args[0]);
  JobConf job = new JobConf(getConf());

  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);

  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);

  job.setInputFormat(RandomInputFormat.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(IdentityReducer.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
  long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map",
      1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomwrite.total_bytes",
      numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
  }

  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");

  // reducer NONE
  job.setNumReduceTasks(0);

  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took "
      + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  return 0;
}
Example 16
Source File: TradeBuyOrdersHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeBuyOrdersHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeBuyOrdersHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeBuyOrdersRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  //conf.set(GfxdOutputFormat.OUTPUT_SCHEMA, "APP");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeBuyOrdersOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));

  JobClient.runJob(conf);
  return 0;
}
Example 17
Source File: NodeDumper.java From nutch-htmlunit with Apache License 2.0
/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 * @param topN
 * @param output
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
    boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile)
    throws Exception {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);
  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);
  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: "
      + TimingUtil.elapsedTime(start, end));
}
Example 18
Source File: DFSGeneralTest.java From RDFS with Apache License 2.0
@Override
public int run(String[] args) throws IOException {
  if (args.length < 1) {
    printUsage();
  }
  testtype = args[0];
  if (!Arrays.asList(testtypes).contains(testtype)) {
    System.err.println(testtype + " is not a supported test type");
    printUsage();
  }

  // running the Writting
  fsConfig = new Configuration(getConf());
  dfs_output = getUniqueName(DFS_OUTPUT + testtype);
  dfs_input = getUniqueName(DFS_INPUT + testtype);
  input = getUniqueName(INPUT + testtype);
  output = getUniqueName(OUTPUT + testtype);
  workdir = input;

  cleanUpDirs(fsConfig);
  FileSystem fs = FileSystem.get(fsConfig);

  JobConf conf = new JobConf(fsConfig, DFSGeneralTest.class);
  conf.setJobName(getUniqueName("gentest-" + testtype));
  conf.set(TEST_TYPE_KEY, testtype);
  String[] newArgs = initializeGeneralConf(args, conf);
  if (testtype.equals(GenWriterThread.TEST_TYPE)) {
    initializeGenWriterJob(newArgs, conf);
  } else if (testtype.equals(DatanodeBenThread.TEST_TYPE)) {
    initializeDatanodeBenJob(newArgs, conf);
  } else {
    printUsage();
  }
  updateJobConf(conf, new Path(dfs_input), new Path(dfs_output, "results"));

  long startTime = System.currentTimeMillis();
  JobClient.runJob(conf);
  long endTime = System.currentTimeMillis();
  printResult(fs, new Path(output, "results"), startTime, endTime);
  verifyFiles(fs);

  // Delete all related files
  if (cleanup)
    cleanUpDirs(fsConfig);
  return 0;
}
Example 19
Source File: TradeSecurityHdfsDataVerifier.java From gemfirexd-oss with Apache License 2.0
public int run(String[] args) throws Exception {
  GfxdDataSerializable.initTypes();
  JobConf conf = new JobConf(getConf());
  conf.setJobName("TradeSecurityHdfsDataVerifier");

  String hdfsHomeDir = args[0];
  String url = args[1];
  String tableName = args[2];

  System.out.println("TradeSecurityHdfsDataVerifier.run() invoked with "
      + " hdfsHomeDir = " + hdfsHomeDir
      + " url = " + url
      + " tableName = " + tableName);

  // Job-specific params
  conf.set(RowInputFormat.HOME_DIR, hdfsHomeDir);
  conf.set(RowInputFormat.INPUT_TABLE, tableName);
  conf.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);

  conf.setInputFormat(RowInputFormat.class);
  conf.setMapperClass(HdfsDataMapper.class);
  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(TradeSecurityRow.class);

  conf.setReducerClass(HdfsDataReducer.class);
  conf.set(RowOutputFormat.OUTPUT_TABLE, tableName + "_HDFS");
  conf.set(RowOutputFormat.OUTPUT_URL, url);
  conf.setOutputFormat(RowOutputFormat.class);
  conf.setOutputKeyClass(Key.class);
  conf.setOutputValueClass(TradeSecurityOutputObject.class);

  StringBuffer aStr = new StringBuffer();
  aStr.append("HOME_DIR = " + conf.get(RowInputFormat.HOME_DIR) + " ");
  aStr.append("INPUT_TABLE = " + conf.get(RowInputFormat.INPUT_TABLE) + " ");
  aStr.append("OUTPUT_TABLE = " + conf.get(RowOutputFormat.OUTPUT_TABLE) + " ");
  aStr.append("OUTPUT_URL = " + conf.get(RowOutputFormat.OUTPUT_URL) + " ");
  System.out.println("VerifyHdfsData running with the following conf: " + aStr.toString());

  FileOutputFormat.setOutputPath(conf, new Path("" + System.currentTimeMillis()));

  JobClient.runJob(conf);
  return 0;
}
Example 20
Source File: ZephyrDriver.java From zephyr with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  JobConf job = new JobConf(super.getConf(), ZephyrDriver.class);
  job.setJobName(config.getJobName());

  Path in = new Path(config.getInputPath());
  Path out = new Path(config.getOutputPath());
  FileInputFormat.setInputPaths(job, in);
  FileOutputFormat.setOutputPath(job, out);

  job.set("zephyr.job.uuid", UUIDHelper.generateUUID());

  job.setInputFormat(config.getInputFormat().getClass());
  job.setMapperClass(config.getMapper().getClass());
  job.setOutputFormat(ZephyrOutputFormat.class);

  for (Map.Entry<String, String> entry : config.getConfigMap().entrySet()) {
    job.set(entry.getKey(), entry.getValue());
  }

  job.set("zephyr.feed.xml", this.jobConfigFile);

  job.setNumReduceTasks(0);
  JobClient.runJob(job);
  return 0;
}