Java Code Examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass()
The following examples show how to use org.apache.hadoop.mapreduce.Job#setInputFormatClass().
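Before the project examples, here is a minimal sketch of the typical driver pattern, assuming a hypothetical MyMapper class and placeholder input/output arguments; the setInputFormatClass(...) call tells the job which InputFormat to use for computing splits and creating record readers.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MinimalInputFormatDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "minimal setInputFormatClass example");
        job.setJarByClass(MinimalInputFormatDriver.class);
        job.setMapperClass(MyMapper.class);              // hypothetical mapper, not from the examples below
        job.setNumReduceTasks(0);                        // map-only, to keep the sketch short
        job.setInputFormatClass(TextInputFormat.class);  // the method documented on this page
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}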
Example 1
Source File: Step5.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    // TODO Auto-generated method stub
    Configuration conf1 = new Configuration();

    Job job1 = new Job(conf1, "wiki job five");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step5.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper5.class);
    job1.setMapOutputKeyClass(VarLongWritable.class);
    job1.setMapOutputValueClass(VectorWritable.class);
    job1.setCombinerClass(WiKiCombiner5.class);
    job1.setReducerClass(WiKiReducer5.class);
    job1.setOutputKeyClass(VarLongWritable.class);
    job1.setOutputValueClass(RecommendedItemsWritable.class);
    // job1.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    FileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));

    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
Example 2
Source File: BinaryLoader.java From marklogic-contentpump with Apache License 2.0
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: ContentLoader configFile inputDir");
        System.exit(2);
    }

    Job job = Job.getInstance(conf);
    job.setJarByClass(BinaryLoader.class);
    job.setInputFormatClass(BinaryInputFormat.class);
    job.setMapperClass(ContentMapper.class);
    job.setMapOutputKeyClass(DocumentURI.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(ContentOutputFormat.class);

    BinaryInputFormat.setInputPaths(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 3
Source File: WordDistributionStatisticsCollector.java From dkpro-c4corpus with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(WordDistributionStatisticsCollector.class);
    job.setJobName(WordDistributionStatisticsCollector.class.getName());

    // mapper
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // reducer
    job.setReducerClass(SumReducer.class);

    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 4
Source File: TestReflectInputOutputFormat.java From parquet-mr with Apache License 2.0
@Before
public void createParquetFile() throws Exception {
    // set up readers and writers not in MR
    conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
    AvroReadSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);
    AvroWriteSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);

    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);

    {
        final Job job = new Job(conf, "write");

        // input not really used
        TextInputFormat.addInputPath(job, inputPath);
        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(TestReflectInputOutputFormat.MyMapper.class);
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(AvroParquetOutputFormat.class);
        AvroParquetOutputFormat.setOutputPath(job, parquetPath);
        AvroParquetOutputFormat.setSchema(job, CAR_SCHEMA);
        AvroParquetOutputFormat.setAvroDataSupplier(job, ReflectDataSupplier.class);

        waitForJob(job);
    }
}
Example 5
Source File: WARCRecordCounter.java From dkpro-c4corpus with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    System.out.println("Other args: " + Arrays.toString(otherArgs));

    Job job = Job.getInstance(conf);
    job.setJarByClass(WARCRecordCounter.class);
    job.setJobName(WARCRecordCounter.class.getName());

    // mapper
    job.setMapperClass(ResponseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // combiner + reducer
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 6
Source File: CompactionOrcJobConfigurator.java From incubator-gobblin with Apache License 2.0
protected void configureMapper(Job job) {
    job.setInputFormatClass(OrcValueCombineFileInputFormat.class);
    job.setMapperClass(OrcValueMapper.class);
    job.setMapOutputKeyClass(OrcKey.class);
    job.setMapOutputValueClass(OrcValue.class);
    job.setGroupingComparatorClass(OrcKeyComparator.class);
    job.setSortComparatorClass(OrcKeyComparator.class);
}
Example 7
Source File: Main.java From hiped2 with Apache License 2.0
public static double calcPageRank(Path inputPath, Path outputPath, int numNodes) throws Exception {
    Configuration conf = new Configuration();
    conf.setInt(Reduce.CONF_NUM_NODES_GRAPH, numNodes);

    Job job = new Job(conf);
    job.setJarByClass(Main.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    if (!job.waitForCompletion(true)) {
        throw new Exception("Job failed");
    }

    long summedConvergence = job.getCounters().findCounter(Reduce.Counter.CONV_DELTAS).getValue();
    double convergence =
        ((double) summedConvergence / Reduce.CONVERGENCE_SCALING_FACTOR) / (double) numNodes;

    System.out.println("======================================");
    System.out.println("=  Num nodes:           " + numNodes);
    System.out.println("=  Summed convergence:  " + summedConvergence);
    System.out.println("=  Convergence:         " + convergence);
    System.out.println("======================================");

    return convergence;
}
Example 8
Source File: JMatrixMultiplicationStep2.java From RecommendationEngine with MIT License
public static void run() throws IOException, ClassNotFoundException, InterruptedException {
    String inputPath = ItemBasedCFDriver.path.get("step8InputPath");
    String outputPath = ItemBasedCFDriver.path.get("step8OutputPath");

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    HDFS hdfs = new HDFS(conf);
    hdfs.rmr(outputPath);

    job.setMapperClass(Step2_Mapper.class);
    job.setReducerClass(Step2_Reducer.class);
    job.setJarByClass(JMatrixMultiplicationStep2.class);
    job.setNumReduceTasks(ItemBasedCFDriver.ReducerNumber);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);
}
Example 9
Source File: DataDrivenDBInputFormat.java From hadoop with Apache License 2.0
/**
 * setInput() takes a custom query and a separate "bounding query" to use
 * instead of the custom "count query" used by DBInputFormat.
 */
public static void setInput(Job job,
        Class<? extends DBWritable> inputClass,
        String inputQuery, String inputBoundingQuery) {
    DBInputFormat.setInput(job, inputClass, inputQuery, "");
    job.getConfiguration().set(DBConfiguration.INPUT_BOUNDING_QUERY, inputBoundingQuery);
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
}
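A caller would typically pair the record query with a MIN/MAX bounding query over the split column, roughly as sketched below. The table, column, and EmployeeRecord names are placeholders rather than anything from the source above, and DBConfiguration.configureDB(...) is assumed to have set the JDBC driver and connection URL beforehand; "$CONDITIONS" is the token that DataDrivenDBInputFormat replaces with the per-split range predicate.

// Hypothetical usage sketch -- table "employees", column "id" and EmployeeRecord
// (a DBWritable implementation) are placeholders.
DBConfiguration.configureDB(job.getConfiguration(),
    "com.mysql.jdbc.Driver", "jdbc:mysql://localhost/mydb", "user", "password");
DataDrivenDBInputFormat.setInput(job, EmployeeRecord.class,
    "SELECT id, name FROM employees WHERE $CONDITIONS",
    "SELECT MIN(id), MAX(id) FROM employees");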
Example 10
Source File: AvroConversionBaseCreator.java From datacollector with Apache License 2.0
@Override
public Job call() throws Exception {
    // We're explicitly disabling speculative execution
    conf.set("mapreduce.map.speculative", "false");
    conf.set("mapreduce.map.maxattempts", "1");
    conf.set("mapreduce.job.user.classpath.first", "true");
    conf.set("mapreduce.task.classpath.user.precedence", "true");
    conf.set("mapreduce.task.classpath.first", "true");

    addNecessaryJarsToJob(conf);

    Job job = Job.getInstance(conf);

    // IO formats
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(NullOutputFormat.class);

    // Mapper & job output
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // It's map only job
    job.setNumReduceTasks(0);

    // General configuration
    job.setJarByClass(getClass());

    return job;
}
Example 11
Source File: PhoenixMapReduceUtil.java From phoenix with Apache License 2.0
/**
 *
 * @param job
 * @param inputClass DBWritable class
 * @param tableName Input table name
 * @param conditions Condition clause to be added to the WHERE clause.
 * @param fieldNames fields being projected for the SELECT query.
 */
public static void setInput(final Job job, final Class<? extends DBWritable> inputClass,
        final String tableName, final String conditions, final String... fieldNames) {
    job.setInputFormatClass(PhoenixInputFormat.class);
    final Configuration configuration = job.getConfiguration();
    PhoenixConfigurationUtil.setInputTableName(configuration, tableName);
    PhoenixConfigurationUtil.setSelectColumnNames(configuration, fieldNames);
    PhoenixConfigurationUtil.setInputClass(configuration, inputClass);
    PhoenixConfigurationUtil.setSchemaType(configuration, SchemaType.TABLE);
}
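A driver might call this helper roughly as follows; the STOCK table, WHERE condition, column names, and StockWritable class are illustrative placeholders, not anything defined in the snippet above.

// Hypothetical usage sketch -- StockWritable, the table and the columns are placeholders.
Job job = Job.getInstance(HBaseConfiguration.create(), "phoenix-input-example");
PhoenixMapReduceUtil.setInput(job, StockWritable.class, "STOCK",
    "RECORDING_YEAR = 2009", "STOCK_NAME", "RECORDING_YEAR", "RECORDINGS_QUARTER");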
Example 12
Source File: KafkaMRInput.java From kylin with Apache License 2.0
@Override
public void configureJob(Job job) {
    job.setInputFormatClass(SequenceFileInputFormat.class);
    String jobId = job.getConfiguration().get(BatchConstants.ARG_CUBING_JOB_ID);
    IJoinedFlatTableDesc flatHiveTableDesc = new CubeJoinedFlatTableDesc(cubeSegment);
    String inputPath = JoinedFlatTable.getTableDir(flatHiveTableDesc,
            JobBuilderSupport.getJobWorkingDir(conf, jobId));
    try {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
Example 13
Source File: JsonDataValidationExecutor.java From jumbune with GNU Lesser General Public License v3.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    StringBuilder sb = new StringBuilder();
    for (int j = 2; j < otherArgs.length; j++) {
        sb.append(otherArgs[j]);
    }

    LOGGER.debug("Arguments[ " + otherArgs.length + "]" + "and values respectively [" + otherArgs[0] + "], "
            + otherArgs[1] + ", [" + otherArgs[2] + "]" + ", [" + otherArgs[3] + "]," + otherArgs[4]);

    String inputpath = otherArgs[0];
    String outputpath = "/tmp/jumbune/dvjsonreport" + new Date().getTime();
    String json = otherArgs[1];
    String nullCondition = otherArgs[2];
    String regex = otherArgs[3];
    String dvDir = otherArgs[4];

    if (regex.isEmpty()) {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, regex);
    }

    if (nullCondition.isEmpty()) {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, nullCondition);
    }

    conf.set(JsonDataVaildationConstants.SLAVE_DIR, dvDir);
    conf.set(JsonDataVaildationConstants.JSON_ARGUMENT, json);

    FileSystem fs = FileSystem.get(conf);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "JSONDataValidation");
    job.setJarByClass(JsonDataValidationExecutor.class);
    job.setInputFormatClass(JsonFileInputFormat.class);
    job.setMapperClass(JsonDataValidationMapper.class);
    job.setPartitionerClass(JsonDataValidationPartitioner.class);
    job.setReducerClass(JsonDataValidationReducer.class);
    job.setNumReduceTasks(5);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(FileKeyViolationBean.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TotalReducerViolationBean.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    Path[] inputPaths = FileUtil.getAllJsonNestedFilePath(job, inputpath);
    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, new Path(outputpath));

    if (fs.exists(new Path(outputpath))) {
        fs.delete(new Path(outputpath), true);
    }

    job.waitForCompletion(true);

    Map<String, JsonViolationReport> jsonMap = readDataFromHdfs(conf, outputpath);
    final Gson gson = new Gson();
    final String jsonReport = gson.toJson(jsonMap);

    LOGGER.info("Completed DataValidation");
    LOGGER.info(JsonDataVaildationConstants.JSON_DV_REPORT + jsonReport);
}
Example 14
Source File: TopKPhaseJob.java From incubator-pinot with Apache License 2.0
public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName(name);
    job.setJarByClass(TopKPhaseJob.class);

    Configuration configuration = job.getConfiguration();
    FileSystem fs = FileSystem.get(configuration);

    // Properties
    LOGGER.info("Properties {}", props);

    // Input Path
    String inputPathDir = getAndSetConfiguration(configuration, TOPK_PHASE_INPUT_PATH);
    LOGGER.info("Input path dir: " + inputPathDir);
    for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
        LOGGER.info("Adding input:" + inputPath);
        Path input = new Path(inputPath);
        FileInputFormat.addInputPath(job, input);
    }

    // Output path
    Path outputPath = new Path(getAndSetConfiguration(configuration, TOPK_PHASE_OUTPUT_PATH));
    LOGGER.info("Output path dir: " + outputPath.toString());
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // Schema
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
    LOGGER.info("Schema : {}", avroSchema.toString(true));

    // ThirdEyeConfig
    String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty(
        props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty);
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
        props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
        props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
    job.getConfiguration().set(TOPK_PHASE_THIRDEYE_CONFIG.toString(),
        OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

    // Map config
    job.setMapperClass(TopKPhaseMapper.class);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // Combiner
    job.setCombinerClass(TopKPhaseCombiner.class);

    // Reduce config
    job.setReducerClass(TopKPhaseReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(1);

    job.waitForCompletion(true);
    return job;
}
Example 15
Source File: DomainStatistics.java From anthelion with Apache License 2.0
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]");
        return 1;
    }
    String inputDir = args[0];
    String outputDir = args[1];
    int numOfReducers = 1;

    if (args.length > 3) {
        numOfReducers = Integer.parseInt(args[3]);
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DomainStatistics: starting at " + sdf.format(start));

    int mode = 0;
    String jobName = "DomainStatistics";
    if (args[2].equals("host")) {
        jobName = "Host statistics";
        mode = MODE_HOST;
    } else if (args[2].equals("domain")) {
        jobName = "Domain statistics";
        mode = MODE_DOMAIN;
    } else if (args[2].equals("suffix")) {
        jobName = "Suffix statistics";
        mode = MODE_SUFFIX;
    } else if (args[2].equals("tld")) {
        jobName = "TLD statistics";
        mode = MODE_TLD;
    }

    Configuration conf = getConf();
    conf.setInt("domain.statistics.mode", mode);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = new Job(conf, jobName);
    job.setJarByClass(DomainStatistics.class);

    String[] inputDirsSpecs = inputDir.split(",");
    for (int i = 0; i < inputDirsSpecs.length; i++) {
        FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i]));
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(DomainStatisticsMapper.class);
    job.setReducerClass(DomainStatisticsReducer.class);
    job.setCombinerClass(DomainStatisticsCombiner.class);
    job.setNumReduceTasks(numOfReducers);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw e;
    }

    long end = System.currentTimeMillis();
    LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));

    return 0;
}
Example 16
Source File: MapReduceExercise.java From mongodb-hadoop-workshop with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    if (args.length < 3) {
        System.err.println("Usage: MapReduceExercise "
            + "[mongodb input uri] "
            + "[mongodb output uri] "
            + "update=[true or false]");
        System.err.println("Example: MapReduceExercise "
            + "mongodb://127.0.0.1:27017/movielens.ratings "
            + "mongodb://127.0.0.1:27017/movielens.ratings.stats update=false");
        System.err.println("Example: MapReduceExercise "
            + "mongodb://127.0.0.1:27017/movielens.ratings "
            + "mongodb://127.0.0.1:27017/movielens.movies update=true");
        System.exit(-1);
    }

    Class outputValueClass = BSONWritable.class;
    Class reducerClass = Reduce.class;

    if (args[2].equals("update=true")) {
        outputValueClass = MongoUpdateWritable.class;
        reducerClass = ReduceUpdater.class;
    }

    Configuration conf = new Configuration();

    // Set MongoDB-specific configuration items
    conf.setClass("mongo.job.mapper", Map.class, Mapper.class);
    conf.setClass("mongo.job.reducer", reducerClass, Reducer.class);

    conf.setClass("mongo.job.mapper.output.key", IntWritable.class, Object.class);
    conf.setClass("mongo.job.mapper.output.value", DoubleWritable.class, Object.class);

    conf.setClass("mongo.job.output.key", NullWritable.class, Object.class);
    conf.setClass("mongo.job.output.value", outputValueClass, Object.class);

    conf.set("mongo.input.uri", args[0]);
    conf.set("mongo.output.uri", args[1]);

    Job job = Job.getInstance(conf);

    // Set Hadoop-specific job parameters
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(outputValueClass);

    job.setMapperClass(Map.class);
    job.setReducerClass(reducerClass);

    job.setJarByClass(MapReduceExercise.class);

    job.submit();
}
Example 17
Source File: HalyardBulkExport.java From Halyard with Apache License 2.0
@Override
protected int run(CommandLine cmd) throws Exception {
    if (!cmd.getArgList().isEmpty()) {
        throw new HalyardExport.ExportException("Unknown arguments: " + cmd.getArgList().toString());
    }
    String source = cmd.getOptionValue('s');
    String queryFiles = cmd.getOptionValue('q');
    String target = cmd.getOptionValue('t');
    if (!target.contains("{0}")) {
        throw new HalyardExport.ExportException(
            "Bulk export target must contain '{0}' to be replaced by stripped filename of the actual SPARQL query.");
    }
    getConf().set(SOURCE, source);
    getConf().set(TARGET, target);
    String driver = cmd.getOptionValue('c');
    if (driver != null) {
        getConf().set(JDBC_DRIVER, driver);
    }
    String props[] = cmd.getOptionValues('p');
    if (props != null) {
        for (int i = 0; i < props.length; i++) {
            props[i] = Base64.encodeBase64String(props[i].getBytes(StandardCharsets.UTF_8));
        }
        getConf().setStrings(JDBC_PROPERTIES, props);
    }
    if (cmd.hasOption('i')) {
        getConf().set(HalyardBulkUpdate.ELASTIC_INDEX_URL, cmd.getOptionValue('i'));
    }
    TableMapReduceUtil.addDependencyJars(getConf(),
        HalyardExport.class,
        NTriplesUtil.class,
        Rio.class,
        AbstractRDFHandler.class,
        RDFFormat.class,
        RDFParser.class,
        HTable.class,
        HBaseConfiguration.class,
        AuthenticationProtos.class,
        Trace.class,
        Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    String cp = cmd.getOptionValue('l');
    if (cp != null) {
        String jars[] = cp.split(":");
        StringBuilder newCp = new StringBuilder();
        for (int i = 0; i < jars.length; i++) {
            if (i > 0) newCp.append(':');
            newCp.append(addTmpFile(jars[i])); // append classpath entries to tmpfiles and trim paths from the classpath
        }
        getConf().set(JDBC_CLASSPATH, newCp.toString());
    }
    Job job = Job.getInstance(getConf(), "HalyardBulkExport " + source + " -> " + target);
    job.setJarByClass(HalyardBulkExport.class);
    job.setMaxMapAttempts(1);
    job.setMapperClass(BulkExportMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Void.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(QueryInputFormat.class);
    QueryInputFormat.setQueriesFromDirRecursive(job.getConfiguration(), queryFiles, false, 0);
    job.setOutputFormatClass(NullOutputFormat.class);
    TableMapReduceUtil.initCredentials(job);
    if (job.waitForCompletion(true)) {
        LOG.info("Bulk Export Completed..");
        return 0;
    }
    return -1;
}
Example 18
Source File: BlurOutputFormatMiniClusterTest.java From incubator-retired-blur with Apache License 2.0
@Test
public void testBlurOutputFormat() throws IOException, InterruptedException, ClassNotFoundException,
        BlurException, TException {
    fileSystem.delete(inDir, true);
    String tableName = "testBlurOutputFormat";
    writeRecordsFile("in/part1", 1, 1, 1, 1, "cf1");
    writeRecordsFile("in/part2", 1, 1, 2, 1, "cf1");

    Job job = Job.getInstance(conf, "blur index");
    job.setJarByClass(BlurOutputFormatMiniClusterTest.class);
    job.setMapperClass(CsvBlurMapper.class);
    job.setInputFormatClass(TextInputFormat.class);

    FileInputFormat.addInputPath(job, new Path(TEST_ROOT_DIR + "/in"));
    String tableUri = new Path(TEST_ROOT_DIR + "/blur/" + tableName)
        .makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory()).toString();
    CsvBlurMapper.addColumns(job, "cf1", "col");

    TableDescriptor tableDescriptor = new TableDescriptor();
    tableDescriptor.setShardCount(1);
    tableDescriptor.setTableUri(tableUri);
    tableDescriptor.setName(tableName);

    Iface client = getClient();
    client.createTable(tableDescriptor);

    BlurOutputFormat.setupJob(job, tableDescriptor);
    Path output = new Path(TEST_ROOT_DIR + "/out");
    BlurOutputFormat.setOutputPath(job, output);

    Path tablePath = new Path(tableUri);
    Path shardPath = new Path(tablePath, ShardUtil.getShardName(0));
    FileStatus[] listStatus = fileSystem.listStatus(shardPath);
    System.out.println("======" + listStatus.length);
    for (FileStatus fileStatus : listStatus) {
        System.out.println(fileStatus.getPath());
    }
    assertEquals(3, listStatus.length);

    assertTrue(job.waitForCompletion(true));
    Counters ctrs = job.getCounters();
    System.out.println("Counters: " + ctrs);

    client.loadData(tableName, output.toString());

    while (true) {
        TableStats tableStats = client.tableStats(tableName);
        System.out.println(tableStats);
        if (tableStats.getRowCount() > 0) {
            break;
        }
        Thread.sleep(100);
    }

    assertTrue(fileSystem.exists(tablePath));
    assertFalse(fileSystem.isFile(tablePath));

    FileStatus[] listStatusAfter = fileSystem.listStatus(shardPath);
    assertEquals(12, listStatusAfter.length);
}
Example 19
Source File: HadoopSegmentCreationJob.java From incubator-pinot with Apache License 2.0
public void run() throws Exception {
    _logger.info("Starting {}", getClass().getSimpleName());

    // Initialize all directories
    _outputDirFileSystem = FileSystem.get(new Path(_outputDir).toUri(), getConf());
    JobPreparationHelper.mkdirs(_outputDirFileSystem, new Path(_outputDir), _defaultPermissionsMask);
    JobPreparationHelper.mkdirs(_outputDirFileSystem, new Path(_stagingDir), _defaultPermissionsMask);
    Path stagingInputDir = new Path(_stagingDir, "input");
    JobPreparationHelper.mkdirs(_outputDirFileSystem, stagingInputDir, _defaultPermissionsMask);

    // Gather all data files
    List<Path> dataFilePaths = getDataFilePaths(_inputPattern);
    int numDataFiles = dataFilePaths.size();
    if (numDataFiles == 0) {
        String errorMessage = "No data file founded with pattern: " + _inputPattern;
        _logger.error(errorMessage);
        throw new RuntimeException(errorMessage);
    } else {
        _logger.info("Creating segments with data files: {}", dataFilePaths);
        for (int i = 0; i < numDataFiles; i++) {
            Path dataFilePath = dataFilePaths.get(i);
            try (DataOutputStream dataOutputStream =
                    _outputDirFileSystem.create(new Path(stagingInputDir, Integer.toString(i)))) {
                dataOutputStream.write(StringUtil.encodeUtf8(dataFilePath.toString() + " " + i));
                dataOutputStream.flush();
            }
        }
    }

    // Set up the job
    Job job = Job.getInstance(getConf());
    job.setJarByClass(getClass());
    job.setJobName(getClass().getName());

    Configuration jobConf = job.getConfiguration();
    String hadoopTokenFileLocation = System.getenv("HADOOP_TOKEN_FILE_LOCATION");
    if (hadoopTokenFileLocation != null) {
        jobConf.set("mapreduce.job.credentials.binary", hadoopTokenFileLocation);
    }
    jobConf.setInt(JobContext.NUM_MAPS, numDataFiles);

    // Set table config and schema
    TableConfig tableConfig = getTableConfig();
    if (tableConfig != null) {
        validateTableConfig(tableConfig);
        jobConf.set(JobConfigConstants.TABLE_CONFIG, tableConfig.toJsonString());
    }
    jobConf.set(JobConfigConstants.SCHEMA, getSchema().toSingleLineJsonString());

    // Set additional configurations
    for (Map.Entry<Object, Object> entry : _properties.entrySet()) {
        jobConf.set(entry.getKey().toString(), entry.getValue().toString());
    }

    job.setMapperClass(getMapperClass());
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, stagingInputDir);
    FileOutputFormat.setOutputPath(job, new Path(_stagingDir, "output"));

    addDepsJarToDistributedCache(job);
    addAdditionalJobProperties(job);

    // Submit the job
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed: " + job);
    }

    moveSegmentsToOutputDir();

    // Delete the staging directory
    _logger.info("Deleting the staging directory: {}", _stagingDir);
    _outputDirFileSystem.delete(new Path(_stagingDir), true);
}
Example 20
Source File: DBInputFormat.java From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job The map-reduce job
 * @param inputClass the class object implementing DBWritable, which is the
 * Java object holding tuple fields.
 * @param inputQuery the input query to select fields. Example :
 * "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
 * @param inputCountQuery the input query that returns
 * the number of records in the table.
 * Example : "SELECT COUNT(f1) FROM Mytable"
 * @see #setInput(Job, Class, String, String, String, String...)
 */
public static void setInput(Job job,
        Class<? extends DBWritable> inputClass,
        String inputQuery, String inputCountQuery) {
    job.setInputFormatClass(DBInputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputQuery(inputQuery);
    dbConf.setInputCountQuery(inputCountQuery);
}
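Putting the javadoc's example queries together, a driver could wire this up roughly as follows; MyTableWritable and the JDBC connection settings are placeholders, and the connection is assumed to have been configured through DBConfiguration beforehand.

// Hypothetical usage sketch -- MyTableWritable and the connection settings are placeholders.
DBConfiguration.configureDB(job.getConfiguration(),
    "com.mysql.jdbc.Driver", "jdbc:mysql://localhost/mydb", "user", "password");
DBInputFormat.setInput(job, MyTableWritable.class,
    "SELECT f1, f2, f3 FROM Mytable ORDER BY f1",
    "SELECT COUNT(f1) FROM Mytable");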