org.apache.hadoop.mapreduce.lib.output.MultipleOutputs Java Examples
The following examples show how to use
org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.
Each example links back to the project and source file it was taken from, so you can inspect the surrounding code and related API usage in context.
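All of the examples below follow the same lifecycle: the driver registers each named output with MultipleOutputs.addNamedOutput(...), the mapper or reducer creates a MultipleOutputs instance in its setup method, writes records through it, and closes it in cleanup so the extra record writers are flushed. Here is a minimal sketch of that pattern (the reducer class, the "rare" named output, and the threshold are illustrative, not taken from any project below):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class RareWordReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    private MultipleOutputs<Text, LongWritable> mos;

    @Override
    protected void setup(Context context) {
        // One instance per task, created once in setup.
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable v : values) {
            sum += v.get();
        }
        if (sum < 5) {
            // Goes to the "rare" named output; the driver must have called
            // MultipleOutputs.addNamedOutput(job, "rare", TextOutputFormat.class,
            //         Text.class, LongWritable.class);
            mos.write("rare", key, new LongWritable(sum));
        } else {
            context.write(key, new LongWritable(sum));
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Mandatory: flushes and closes the record writers behind each named output.
        mos.close();
    }
}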
Example #1
Source File: ConvergeCuboidDataReducer.java From kylin-on-parquet-v2 with Apache License 2.0

@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    mos = new MultipleOutputs(context);

    String cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME);
    String segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeSegment cubeSegment = cube.getSegmentById(segmentID);
    CubeSegment oldSegment = cube.getOriginalSegmentToOptimize(cubeSegment);

    this.enableSharding = oldSegment.isEnableSharding();
    this.baseCuboid = cube.getCuboidScheduler().getBaseCuboidId();
}
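The matching doCleanup method is not part of this snippet, but a MultipleOutputs created in setup like this must be closed there (mos.close()); otherwise records buffered by its per-output writers may never reach the output files.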
Example #2
Source File: ConvergeCuboidDataReducer.java From kylin with Apache License 2.0

@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    mos = new MultipleOutputs(context);

    String cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME);
    String segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeSegment cubeSegment = cube.getSegmentById(segmentID);
    CubeSegment oldSegment = cube.getOriginalSegmentToOptimize(cubeSegment);

    this.enableSharding = oldSegment.isEnableSharding();
    this.baseCuboid = cube.getCuboidScheduler().getBaseCuboidId();
}
Example #3
Source File: WARCWriterReducerClass.java From dkpro-c4corpus with Apache License 2.0

/**
 * Writes a single WARCWritable to the output with a specific output file prefix.
 *
 * @param warcWritable    warc record
 * @param multipleOutputs output
 * @throws IOException          exception
 * @throws InterruptedException exception
 */
// TODO move somewhere else?
public static void writeSingleWARCWritableToOutput(WARCWritable warcWritable,
        MultipleOutputs<NullWritable, WARCWritable> multipleOutputs)
        throws IOException, InterruptedException {
    WARCRecord.Header header = warcWritable.getRecord().getHeader();
    String license = header.getField(WARCRecord.WARCRecordFieldConstants.LICENSE);
    String language = header.getField(WARCRecord.WARCRecordFieldConstants.LANGUAGE);
    String noBoilerplate = header.getField(WARCRecord.WARCRecordFieldConstants.NO_BOILERPLATE);
    String minimalHtml = header.getField(WARCRecord.WARCRecordFieldConstants.MINIMAL_HTML);

    // set the file name prefix
    String fileName = createOutputFilePrefix(license, language, noBoilerplate, minimalHtml);

    // bottleneck of single reducer for all "Lic_none_Lang_en" pages (majority of Web)
    // if ("en".equals(language) && LicenseDetector.NO_LICENCE.equals(license)) {
    //     long simHash = Long
    //             .valueOf(header.getField(WARCRecord.WARCRecordFieldConstants.SIMHASH));
    //     int binNumber = getBinNumberFromSimHash(simHash);
    //     fileName = createOutputFilePrefix(license, language, noBoilerplate);
    // }

    multipleOutputs.write(NullWritable.get(), warcWritable, fileName);
}
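Note that this example uses the write(key, value, baseOutputPath) overload rather than a registered named output: the computed prefix becomes the base name of the output file, so a prefix such as Lic_none_Lang_en ends up as part files like Lic_none_Lang_en-r-00000, and a prefix containing "/" would create subdirectories under the job output path.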
Example #4
Source File: AbstractReasoningTool.java From rya with Apache License 2.0

/**
 * Set up a MapReduce job to output human-readable text.
 */
protected void configureTextOutput(String destination) {
    Path outPath;
    outPath = MRReasoningUtils.getOutputPath(job.getConfiguration(), destination);
    TextOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
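The LazyOutputFormat call here is the standard companion to MultipleOutputs: because all real records flow through the named outputs, the default output would otherwise materialize as empty part files, and LazyOutputFormat defers creating them until something is actually written.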
Example #5
Source File: HadoopMultipleOutputFormat.java From kylin with Apache License 2.0

@Override
public void open(int taskNumber, int numTasks) throws IOException {
    super.open(taskNumber, numTasks);

    synchronized (OPEN_MULTIPLE_MUTEX) {
        try {
            TaskInputOutputContext taskInputOutputContext = new ReduceContextImpl(configuration,
                    context.getTaskAttemptID(), new InputIterator(), new GenericCounter(), new GenericCounter(),
                    recordWriter, outputCommitter, new DummyReporter(), null,
                    BytesWritable.class, BytesWritable.class);
            this.writer = new MultipleOutputs(taskInputOutputContext);
        } catch (InterruptedException e) {
            throw new IOException("Could not create MultipleOutputs.", e);
        }
    }
}
Example #6
Source File: FinalResponseReducer.java From incubator-retired-pirk with Apache License 2.0

@Override
public void setup(Context ctx) throws IOException, InterruptedException {
    super.setup(ctx);

    mos = new MultipleOutputs<>(ctx);

    FileSystem fs = FileSystem.newInstance(ctx.getConfiguration());
    storage = new HadoopFileSystemStore(fs);
    String queryDir = ctx.getConfiguration().get("pirMR.queryInputDir");
    Query query = storage.recall(queryDir, Query.class);
    QueryInfo queryInfo = query.getQueryInfo();

    outputFile = ctx.getConfiguration().get("pirMR.outputFile");

    response = new Response(queryInfo);
}
Example #7
Source File: FeatureDataMapper.java From data-polygamy with BSD 3-Clause "New" or "Revised" License

@Override
public void setup(Context context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] fileSplitTokens = fileSplit.getPath().getParent().toString().split("/");
    dataset = fileSplitTokens[fileSplitTokens.length - 1];

    out = new MultipleOutputs<Text, Text>(context);
}
Example #8
Source File: FactDistinctColumnsJob.java From kylin with Apache License 2.0

private void setupReducer(Path output, CubeSegment cubeSeg) throws IOException {
    FactDistinctColumnsReducerMapping reducerMapping = new FactDistinctColumnsReducerMapping(
            cubeSeg.getCubeInstance());
    int numberOfReducers = reducerMapping.getTotalReducerNum();
    logger.info("{} has reducers {}.", this.getClass().getName(), numberOfReducers);
    if (numberOfReducers > 250) {
        throw new IllegalArgumentException(
                "The max reducer number for FactDistinctColumnsJob is 250, but now it is " + numberOfReducers
                        + ", decrease 'kylin.engine.mr.uhc-reducer-count'");
    }

    job.setReducerClass(FactDistinctColumnsReducer.class);
    job.setPartitionerClass(FactDistinctColumnPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    // make each reducer output to respective dir
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, ArrayPrimitiveWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS, SequenceFileOutputFormat.class,
            LongWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class,
            NullWritable.class, LongWritable.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
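The reducer side of this job then selects one of these named outputs per record. A hedged fragment of what such a write could look like (colName and colValue are illustrative placeholders, not copied from FactDistinctColumnsReducer):

// Illustrative only: route a record to the "column" named output registered
// above; a trailing "/" in the base path puts each column's part files
// into its own subdirectory under the job output path.
mos.write(BatchConstants.CFG_OUTPUT_COLUMN, NullWritable.get(), colValue,
        colName + "/");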
Example #9
Source File: BuildGlobalHiveDictPartBuildJob.java From kylin with Apache License 2.0

private void setOutput(Job job, String[] dicColsArr, String outputBase) {
    // make each reducer output to respective dir
    // eg: /user/kylin/tmp/kylin/globaldic_test/kylin-188c9f9d_dabb_944e_9f20_99dc95be66e6/kylin_sales_cube_mr/dict_column=KYLIN_SALES_SELLER_ID/part_sort
    for (int i = 0; i < dicColsArr.length; i++) {
        MultipleOutputs.addNamedOutput(job, i + "", TextOutputFormat.class, LongWritable.class, Text.class);
    }
    Path outputPath = new Path(outputBase);
    FileOutputFormat.setOutputPath(job, outputPath);
}
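The loop registers the named outputs as "0", "1", ... rather than the column names themselves, presumably because MultipleOutputs only accepts alphanumeric named-output names; a column such as KYLIN_SALES_SELLER_ID would be rejected because of its underscores.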
Example #10
Source File: BuildGlobalHiveDictTotalBuildJob.java From kylin with Apache License 2.0

private void setOutput(Job job, String[] dicColsArr, String outputBase) {
    // make each reducer output to respective dir
    // /user/prod_kylin/tmp/kylin2/globaldic_test/kylin-188c9f9d_dabb_944e_9f20_99dc95be66e6/bs_order_scene_day_new_cube_clone/dict_column=DM_ES_REPORT_ORDER_VIEW0420_DRIVER_ID/part_sort
    for (int i = 0; i < dicColsArr.length; i++) {
        MultipleOutputs.addNamedOutput(job, i + "", TextOutputFormat.class, Text.class, LongWritable.class);
    }
    Path outputPath = new Path(outputBase);
    FileOutputFormat.setOutputPath(job, outputPath);
}
Example #11
Source File: BuildGlobalHiveDictPartBuildReducer.java From kylin with Apache License 2.0

@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    mos = new MultipleOutputs(context);
    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    dicCols = config.getMrHiveDictColumnsExcludeRefColumns();
}
Example #12
Source File: UHCDictionaryReducer.java From kylin with Apache License 2.0

@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

    int taskId = context.getTaskAttemptID().getTaskID().getId();
    col = uhcColumns.get(taskId);
    logger.info("column name: " + col.getIdentity());

    if (cube.getDescriptor().getShardByColumns().contains(col)) {
        // for ShardByColumns
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
    } else {
        // for GlobalDictionaryColumns
        String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
        DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
        String builderClass = cubeDesc.getDictionaryBuilderClass(col);
        builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
        builder.init(dictionaryInfo, 0, hdfsDir);
    }
}
Example #13
Source File: ScalarFunctionDataMapper.java From data-polygamy with BSD 3-Clause "New" or "Revised" License

@Override
public void setup(Context context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] fileSplitTokens = fileSplit.getPath().getParent().toString().split("/");
    dataset = fileSplitTokens[fileSplitTokens.length - 1];

    out = new MultipleOutputs<Text, Text>(context);
}
Example #14
Source File: AggregationReducer.java From data-polygamy with BSD 3-Clause "New" or "Revised" License

@Override
public void setup(Context context) throws IOException, InterruptedException {
    String[] datasetNames = context.getConfiguration().get("dataset-name", "").split(",");
    String[] datasetIds = context.getConfiguration().get("dataset-id", "").split(",");
    for (int i = 0; i < datasetNames.length; i++)
        idToDataset.put(Integer.parseInt(datasetIds[i]), datasetNames[i]);

    out = new MultipleOutputs<SpatioTemporalWritable, FloatArrayWritable>(context);
    //out = new MultipleOutputs<Text,Text>(context);
}
Example #15
Source File: Task.java From WIFIProbe with Apache License 2.0

private boolean analyze(final String inputFilePath,
                        final String outputFilePath,
                        final Long startTime) throws Exception {
    Configuration conf = new Configuration();
    conf.setLong(Holistic.START_TIME, startTime);
    conf.setLong(Holistic.EXECUTE_TIME, executeHourTime);

    Job jobAnalyze = Job.getInstance(conf, "analyze");
    jobAnalyze.setJarByClass(Holistic.class);

    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.NEW_OLD_CUSTOMER,
            TextOutputFormat.class, KeyWrapper.class, Text.class);
    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.CUSTOMER_FLOW_KEY,
            TextOutputFormat.class, KeyWrapper.class, Text.class);
    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.CYCLE,
            TextOutputFormat.class, KeyWrapper.class, Text.class);
    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.IN_STORE_HOUR,
            TextOutputFormat.class, KeyWrapper.class, Text.class);

    jobAnalyze.setMapperClass(AnalysisMapper.class);
    jobAnalyze.setReducerClass(AnalysisReducer.class);
    jobAnalyze.setCombinerClass(AnalysisCombiner.class);

    jobAnalyze.setOutputKeyClass(LongWritable.class);
    jobAnalyze.setOutputValueClass(Text.class);

    jobAnalyze.setMapOutputKeyClass(KeyWrapper.class);
    jobAnalyze.setMapOutputValueClass(ValueWrapper.class);

    FileInputFormat.addInputPath(jobAnalyze, new Path(inputFilePath));
    FileOutputFormat.setOutputPath(jobAnalyze, new Path(outputFilePath));

    return jobAnalyze.waitForCompletion(true);
}
Example #16
Source File: ColumnMultReducer.java From incubator-retired-pirk with Apache License 2.0

@Override
public void setup(Context ctx) throws IOException, InterruptedException {
    super.setup(ctx);

    outputValue = new Text();
    mos = new MultipleOutputs<>(ctx);

    FileSystem fs = FileSystem.newInstance(ctx.getConfiguration());
    String queryDir = ctx.getConfiguration().get("pirMR.queryInputDir");
    query = new HadoopFileSystemStore(fs).recall(queryDir, Query.class);
}
Example #17
Source File: MultipleOutputs.java From hadoop with Apache License 2.0

/**
 * Creates and initializes multiple outputs support,
 * it should be instantiated in the Mapper/Reducer setup method.
 *
 * @param context the TaskInputOutputContext object
 */
public MultipleOutputs(
    TaskInputOutputContext<?, ?, KEYOUT, VALUEOUT> context) {
  this.context = context;
  namedOutputs = Collections.unmodifiableSet(
    new HashSet<String>(MultipleOutputs.getNamedOutputsList(context)));
  recordWriters = new HashMap<String, RecordWriter<?, ?>>();
  countersEnabled = getCountersEnabled(context);
}
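Here getNamedOutputsList(context) recovers the names registered by addNamedOutput from the job configuration; holding them in an unmodifiable set is what lets write(...) reject an unregistered named output with an IllegalArgumentException at task time rather than failing later.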
Example #18
Source File: UHCDictionaryJob.java From kylin with Apache License 2.0

private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
Example #19
Source File: UHCDictionaryJob.java From kylin-on-parquet-v2 with Apache License 2.0

private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
Example #20
Source File: AbstractReasoningTool.java From rya with Apache License 2.0

/**
 * Set up the MapReduce job to output a schema (TBox).
 */
protected void configureSchemaOutput() {
    Path outPath = MRReasoningUtils.getSchemaPath(job.getConfiguration());
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SchemaWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, "schemaobj",
        SequenceFileOutputFormat.class, NullWritable.class, SchemaWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
Example #21
Source File: AbstractReasoningTool.java From rya with Apache License 2.0

/**
 * Set up a MapReduce job to output newly derived triples.
 * @param intermediate True if this is intermediate data. Outputs
 *      to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration
            + MRReasoningUtils.TEMP_SUFFIX);
    } else {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        SequenceFileOutputFormat.class, Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
}
Example #22
Source File: ForwardChain.java From rya with Apache License 2.0

@Override
protected void setup(Context context) {
    debugOut = new MultipleOutputs<>(context);
    Configuration conf = context.getConfiguration();
    if (schema == null) {
        schema = MRReasoningUtils.loadSchema(context.getConfiguration());
    }
    debug = MRReasoningUtils.debug(conf);
}
Example #23
Source File: ForwardChain.java From rya with Apache License 2.0

@Override
public void setup(Context context) {
    mout = new MultipleOutputs<>(context);
    Configuration conf = context.getConfiguration();
    if (schema == null) {
        schema = MRReasoningUtils.loadSchema(conf);
    }
    debug = MRReasoningUtils.debug(conf);
}
Example #24
Source File: DuplicateElimination.java From rya with Apache License 2.0

@Override
public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    debug = MRReasoningUtils.debug(conf);
    if (debug) {
        debugOut = new MultipleOutputs<>(context);
    }
}
Example #25
Source File: DuplicateElimination.java From rya with Apache License 2.0

@Override
public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    mout = new MultipleOutputs<>(context);
    current = MRReasoningUtils.getCurrentIteration(conf);
    debug = MRReasoningUtils.debug(conf);
}
Example #26
Source File: DataSourceCompReducer.java From jumbune with GNU Lesser General Public License v3.0

@SuppressWarnings({ "rawtypes", "unchecked" })
protected void setup(Reducer.Context context) {
    Configuration conf = context.getConfiguration();
    Type type = new TypeToken<Map<String, String>>() {
    }.getType();
    filesMap = gson.fromJson(conf.get("filesMap"), type);
    validationInfo = gson.fromJson(conf.get("validationInfoJson"), DataSourceCompValidationInfo.class);
    multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
}
Example #27
Source File: DataSourceCompMapper.java From jumbune with GNU Lesser General Public License v3.0

@Override
@SuppressWarnings({ "unchecked", "rawtypes" })
protected void setup(Mapper.Context context) {
    Gson gson = new Gson();
    Configuration conf = context.getConfiguration();
    Type type = new TypeToken<Map<String, String>>() {
    }.getType();
    mapperInfo = gson.fromJson(conf.get("mapperInfoJson"), DataSourceCompMapperInfo.class);
    filesMap = gson.fromJson(conf.get("filesMap"), type);
    multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
}
Example #28
Source File: MultipleOutputsJob.java From hiped2 with Apache License 2.0

/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder().setArgs(args).addOptions(IOOptions.values()).build();
    int result = cli.runCmd();

    if (result != 0) {
        return result;
    }

    Path input = new Path(cli.getArgValueAsString(IOOptions.INPUT));
    Path output = new Path(cli.getArgValueAsString(IOOptions.OUTPUT));

    Configuration conf = super.getConf();

    Job job = new Job(conf);
    job.setJarByClass(MultipleOutputsJob.class);
    job.setMapperClass(Map.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setNumReduceTasks(0);

    MultipleOutputs.addNamedOutput(job, "partition",
        TextOutputFormat.class, Text.class, Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
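The Map class referenced above is not included in this listing; a hedged sketch of what a map-only writer for the "partition" named output could look like (the key/value handling here is illustrative, not taken from hiped2):

// Hypothetical sketch only: MultipleOutputsJob's actual Map class is not shown.
public static class Map extends Mapper<LongWritable, Text, Text, Text> {

    private MultipleOutputs<Text, Text> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Every record goes to the "partition" named output registered in run().
        mos.write("partition", new Text(key.toString()), value);
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }
}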
Example #29
Source File: BasicJobChaining.java From hadoop-map-reduce-patterns with Apache License 2.0

protected void setup(Context context) throws IOException, InterruptedException {
    average = getAveragePostsPerUser(context.getConfiguration());
    mos = new MultipleOutputs<Text, Text>(context);

    try {
        Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());

        if (files == null || files.length == 0) {
            throw new RuntimeException("User information is not set in DistributedCache");
        }

        // Read all files in the DistributedCache
        for (Path p : files) {
            BufferedReader rdr = new BufferedReader(new InputStreamReader(
                    new GZIPInputStream(new FileInputStream(new File(p.toString())))));

            String line;
            // For each record in the user file
            while ((line = rdr.readLine()) != null) {
                // Get the user ID and reputation
                Map<String, String> parsed = MRDPUtils.transformXmlToMap(line);
                String userId = parsed.get("Id");
                String reputation = parsed.get("Reputation");

                if (userId != null && reputation != null) {
                    // Map the user ID to the reputation
                    userIdToReputation.put(userId, reputation);
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Example #30
Source File: BinningTags.java From hadoop-map-reduce-patterns with Apache License 2.0

@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: BinningTags <in> <out>");
        ToolRunner.printGenericCommandUsage(System.err);
        System.exit(2);
    }
    Job job = new Job(conf, "Binning Tags");
    job.setJarByClass(BinningTags.class);

    // Configure the MultipleOutputs by adding an output called "bins"
    // With the proper output format and mapper key/value pairs
    MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class,
            Text.class, NullWritable.class);

    // Enable the counters for the job
    // If there are a significant number of different named outputs, this
    // should be disabled
    MultipleOutputs.setCountersEnabled(job, true);

    // Map-only job
    job.setNumReduceTasks(0);

    job.setMapperClass(BinningMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
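The BinningMapper itself is not shown here; in this pattern each record is written to the single "bins" named output with a per-bin base path, along the lines of the following illustrative fragment (the tag names and base paths are assumptions, not copied from the project):

// Illustrative fragment of a BinningMapper.map(...) body: one named output,
// several base paths; each distinct base path becomes its own set of part
// files, e.g. hadoop-tag-m-00000.
if (tags.contains("hadoop")) {
    mos.write("bins", value, NullWritable.get(), "hadoop-tag");
}
if (tags.contains("pig")) {
    mos.write("bins", value, NullWritable.get(), "pig-tag");
}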