org.apache.hadoop.mapreduce.lib.output.MultipleOutputs Java Examples
The following examples show how to use
org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.
Each example links back to the project and source file it was taken from, so you can inspect the surrounding code and related API usage in context.
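All of the examples below follow the same lifecycle: the driver registers each named output with MultipleOutputs.addNamedOutput(...), the mapper or reducer creates a MultipleOutputs instance in its setup method, writes records through it, and closes it in cleanup so the extra record writers are flushed. Here is a minimal sketch of that pattern (the reducer class, the "rare" named output, and the threshold are illustrative, not taken from any project below):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class RareWordReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    private MultipleOutputs<Text, LongWritable> mos;

    @Override
    protected void setup(Context context) {
        // One instance per task, created once in setup.
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable v : values) {
            sum += v.get();
        }
        if (sum < 5) {
            // Goes to the "rare" named output; the driver must have called
            // MultipleOutputs.addNamedOutput(job, "rare", TextOutputFormat.class,
            //         Text.class, LongWritable.class);
            mos.write("rare", key, new LongWritable(sum));
        } else {
            context.write(key, new LongWritable(sum));
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Mandatory: flushes and closes the record writers behind each named output.
        mos.close();
    }
}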
Example #1
Source File: ConvergeCuboidDataReducer.java From kylin-on-parquet-v2 with Apache License 2.0

@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    mos = new MultipleOutputs(context);

    String cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME);
    String segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeSegment cubeSegment = cube.getSegmentById(segmentID);
    CubeSegment oldSegment = cube.getOriginalSegmentToOptimize(cubeSegment);

    this.enableSharding = oldSegment.isEnableSharding();
    this.baseCuboid = cube.getCuboidScheduler().getBaseCuboidId();
}
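The matching doCleanup method is not part of this snippet, but a MultipleOutputs created in setup like this must be closed there (mos.close()); otherwise records buffered by its per-output writers may never reach the output files.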
Example #2
Source File: ConvergeCuboidDataReducer.java From kylin with Apache License 2.0

@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    mos = new MultipleOutputs(context);

    String cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME);
    String segmentID = context.getConfiguration().get(BatchConstants.CFG_CUBE_SEGMENT_ID);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeSegment cubeSegment = cube.getSegmentById(segmentID);
    CubeSegment oldSegment = cube.getOriginalSegmentToOptimize(cubeSegment);

    this.enableSharding = oldSegment.isEnableSharding();
    this.baseCuboid = cube.getCuboidScheduler().getBaseCuboidId();
}
Example #3
Source File: WARCWriterReducerClass.java From dkpro-c4corpus with Apache License 2.0

/**
 * Writes a single WARCWritable to the output with a specific output file prefix.
 *
 * @param warcWritable    warc record
 * @param multipleOutputs output
 * @throws IOException          exception
 * @throws InterruptedException exception
 */
// TODO move somewhere else?
public static void writeSingleWARCWritableToOutput(WARCWritable warcWritable,
        MultipleOutputs<NullWritable, WARCWritable> multipleOutputs)
        throws IOException, InterruptedException {
    WARCRecord.Header header = warcWritable.getRecord().getHeader();
    String license = header.getField(WARCRecord.WARCRecordFieldConstants.LICENSE);
    String language = header.getField(WARCRecord.WARCRecordFieldConstants.LANGUAGE);
    String noBoilerplate = header.getField(WARCRecord.WARCRecordFieldConstants.NO_BOILERPLATE);
    String minimalHtml = header.getField(WARCRecord.WARCRecordFieldConstants.MINIMAL_HTML);

    // set the file name prefix
    String fileName = createOutputFilePrefix(license, language, noBoilerplate, minimalHtml);

    // bottleneck of single reducer for all "Lic_none_Lang_en" pages (majority of Web)
    // if ("en".equals(language) && LicenseDetector.NO_LICENCE.equals(license)) {
    //     long simHash = Long
    //             .valueOf(header.getField(WARCRecord.WARCRecordFieldConstants.SIMHASH));
    //     int binNumber = getBinNumberFromSimHash(simHash);
    //     fileName = createOutputFilePrefix(license, language, noBoilerplate);
    // }

    multipleOutputs.write(NullWritable.get(), warcWritable, fileName);
}
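Note that this example uses the write(key, value, baseOutputPath) overload rather than a registered named output: the computed prefix becomes the base name of the output file, so a prefix such as Lic_none_Lang_en ends up as part files like Lic_none_Lang_en-r-00000, and a prefix containing "/" would create subdirectories under the job output path.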
Example #4
Source File: AbstractReasoningTool.java From rya with Apache License 2.0

/**
 * Set up a MapReduce job to output human-readable text.
 */
protected void configureTextOutput(String destination) {
    Path outPath;
    outPath = MRReasoningUtils.getOutputPath(job.getConfiguration(), destination);
    TextOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
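The LazyOutputFormat call here is the standard companion to MultipleOutputs: because all real records flow through the named outputs, the default output would otherwise materialize as empty part files, and LazyOutputFormat defers creating them until something is actually written.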
Example #5
Source File: HadoopMultipleOutputFormat.java From kylin with Apache License 2.0

@Override
public void open(int taskNumber, int numTasks) throws IOException {
    super.open(taskNumber, numTasks);

    synchronized (OPEN_MULTIPLE_MUTEX) {
        try {
            TaskInputOutputContext taskInputOutputContext = new ReduceContextImpl(configuration,
                    context.getTaskAttemptID(), new InputIterator(), new GenericCounter(), new GenericCounter(),
                    recordWriter, outputCommitter, new DummyReporter(), null,
                    BytesWritable.class, BytesWritable.class);
            this.writer = new MultipleOutputs(taskInputOutputContext);
        } catch (InterruptedException e) {
            throw new IOException("Could not create MultipleOutputs.", e);
        }
    }
}
Example #6
Source File: FinalResponseReducer.java From incubator-retired-pirk with Apache License 2.0

@Override
public void setup(Context ctx) throws IOException, InterruptedException {
    super.setup(ctx);

    mos = new MultipleOutputs<>(ctx);

    FileSystem fs = FileSystem.newInstance(ctx.getConfiguration());
    storage = new HadoopFileSystemStore(fs);
    String queryDir = ctx.getConfiguration().get("pirMR.queryInputDir");
    Query query = storage.recall(queryDir, Query.class);
    QueryInfo queryInfo = query.getQueryInfo();

    outputFile = ctx.getConfiguration().get("pirMR.outputFile");

    response = new Response(queryInfo);
}
Example #7
Source File: FeatureDataMapper.java From data-polygamy with BSD 3-Clause "New" or "Revised" License

@Override
public void setup(Context context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] fileSplitTokens = fileSplit.getPath().getParent().toString().split("/");
    dataset = fileSplitTokens[fileSplitTokens.length - 1];

    out = new MultipleOutputs<Text, Text>(context);
}
Example #8
Source File: FactDistinctColumnsJob.java From kylin with Apache License 2.0

private void setupReducer(Path output, CubeSegment cubeSeg) throws IOException {
    FactDistinctColumnsReducerMapping reducerMapping = new FactDistinctColumnsReducerMapping(
            cubeSeg.getCubeInstance());
    int numberOfReducers = reducerMapping.getTotalReducerNum();
    logger.info("{} has reducers {}.", this.getClass().getName(), numberOfReducers);
    if (numberOfReducers > 250) {
        throw new IllegalArgumentException(
                "The max reducer number for FactDistinctColumnsJob is 250, but now it is " + numberOfReducers
                        + ", decrease 'kylin.engine.mr.uhc-reducer-count'");
    }

    job.setReducerClass(FactDistinctColumnsReducer.class);
    job.setPartitionerClass(FactDistinctColumnPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    // make each reducer output to respective dir
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, ArrayPrimitiveWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS, SequenceFileOutputFormat.class,
            LongWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class,
            NullWritable.class, LongWritable.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
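The reducer side of this job then selects one of these named outputs per record. A hedged fragment of what such a write could look like (colName and colValue are illustrative placeholders, not copied from FactDistinctColumnsReducer):

// Illustrative only: route a record to the "column" named output registered
// above; a trailing "/" in the base path puts each column's part files
// into its own subdirectory under the job output path.
mos.write(BatchConstants.CFG_OUTPUT_COLUMN, NullWritable.get(), colValue,
        colName + "/");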
Example #9
Source File: BuildGlobalHiveDictPartBuildJob.java From kylin with Apache License 2.0

private void setOutput(Job job, String[] dicColsArr, String outputBase) {
    // make each reducer output to respective dir
    // eg: /user/kylin/tmp/kylin/globaldic_test/kylin-188c9f9d_dabb_944e_9f20_99dc95be66e6/kylin_sales_cube_mr/dict_column=KYLIN_SALES_SELLER_ID/part_sort
    for (int i = 0; i < dicColsArr.length; i++) {
        MultipleOutputs.addNamedOutput(job, i + "", TextOutputFormat.class, LongWritable.class, Text.class);
    }
    Path outputPath = new Path(outputBase);
    FileOutputFormat.setOutputPath(job, outputPath);
}
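The loop registers the named outputs as "0", "1", ... rather than the column names themselves, presumably because MultipleOutputs only accepts alphanumeric named-output names; a column such as KYLIN_SALES_SELLER_ID would be rejected because of its underscores.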
Example #10
Source File: BuildGlobalHiveDictTotalBuildJob.java From kylin with Apache License 2.0

private void setOutput(Job job, String[] dicColsArr, String outputBase) {
    // make each reducer output to respective dir
    // /user/prod_kylin/tmp/kylin2/globaldic_test/kylin-188c9f9d_dabb_944e_9f20_99dc95be66e6/bs_order_scene_day_new_cube_clone/dict_column=DM_ES_REPORT_ORDER_VIEW0420_DRIVER_ID/part_sort
    for (int i = 0; i < dicColsArr.length; i++) {
        MultipleOutputs.addNamedOutput(job, i + "", TextOutputFormat.class, Text.class, LongWritable.class);
    }
    Path outputPath = new Path(outputBase);
    FileOutputFormat.setOutputPath(job, outputPath);
}
Example #11
Source File: BuildGlobalHiveDictPartBuildReducer.java From kylin with Apache License 2.0

@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    mos = new MultipleOutputs(context);
    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    dicCols = config.getMrHiveDictColumnsExcludeRefColumns();
}
Example #12
Source File: UHCDictionaryReducer.java From kylin with Apache License 2.0

@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

    int taskId = context.getTaskAttemptID().getTaskID().getId();
    col = uhcColumns.get(taskId);
    logger.info("column name: " + col.getIdentity());

    if (cube.getDescriptor().getShardByColumns().contains(col)) {
        // for ShardByColumns
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
    } else {
        // for GlobalDictionaryColumns
        String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
        DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
        String builderClass = cubeDesc.getDictionaryBuilderClass(col);
        builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
        builder.init(dictionaryInfo, 0, hdfsDir);
    }
}
Example #13
Source File: ScalarFunctionDataMapper.java From data-polygamy with BSD 3-Clause "New" or "Revised" License

@Override
public void setup(Context context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] fileSplitTokens = fileSplit.getPath().getParent().toString().split("/");
    dataset = fileSplitTokens[fileSplitTokens.length - 1];

    out = new MultipleOutputs<Text, Text>(context);
}
Example #14
Source File: AggregationReducer.java From data-polygamy with BSD 3-Clause "New" or "Revised" License

@Override
public void setup(Context context) throws IOException, InterruptedException {
    String[] datasetNames = context.getConfiguration().get("dataset-name", "").split(",");
    String[] datasetIds = context.getConfiguration().get("dataset-id", "").split(",");
    for (int i = 0; i < datasetNames.length; i++)
        idToDataset.put(Integer.parseInt(datasetIds[i]), datasetNames[i]);

    out = new MultipleOutputs<SpatioTemporalWritable, FloatArrayWritable>(context);
    //out = new MultipleOutputs<Text,Text>(context);
}
Example #15
Source File: Task.java From WIFIProbe with Apache License 2.0

private boolean analyze(final String inputFilePath,
                        final String outputFilePath,
                        final Long startTime) throws Exception {
    Configuration conf = new Configuration();
    conf.setLong(Holistic.START_TIME, startTime);
    conf.setLong(Holistic.EXECUTE_TIME, executeHourTime);

    Job jobAnalyze = Job.getInstance(conf, "analyze");
    jobAnalyze.setJarByClass(Holistic.class);

    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.NEW_OLD_CUSTOMER,
            TextOutputFormat.class, KeyWrapper.class, Text.class);
    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.CUSTOMER_FLOW_KEY,
            TextOutputFormat.class, KeyWrapper.class, Text.class);
    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.CYCLE,
            TextOutputFormat.class, KeyWrapper.class, Text.class);
    MultipleOutputs.addNamedOutput(jobAnalyze, MapKeyConfig.IN_STORE_HOUR,
            TextOutputFormat.class, KeyWrapper.class, Text.class);

    jobAnalyze.setMapperClass(AnalysisMapper.class);
    jobAnalyze.setReducerClass(AnalysisReducer.class);
    jobAnalyze.setCombinerClass(AnalysisCombiner.class);

    jobAnalyze.setOutputKeyClass(LongWritable.class);
    jobAnalyze.setOutputValueClass(Text.class);

    jobAnalyze.setMapOutputKeyClass(KeyWrapper.class);
    jobAnalyze.setMapOutputValueClass(ValueWrapper.class);

    FileInputFormat.addInputPath(jobAnalyze, new Path(inputFilePath));
    FileOutputFormat.setOutputPath(jobAnalyze, new Path(outputFilePath));

    return jobAnalyze.waitForCompletion(true);
}
Example #16
Source File: ColumnMultReducer.java From incubator-retired-pirk with Apache License 2.0

@Override
public void setup(Context ctx) throws IOException, InterruptedException {
    super.setup(ctx);

    outputValue = new Text();
    mos = new MultipleOutputs<>(ctx);

    FileSystem fs = FileSystem.newInstance(ctx.getConfiguration());
    String queryDir = ctx.getConfiguration().get("pirMR.queryInputDir");
    query = new HadoopFileSystemStore(fs).recall(queryDir, Query.class);
}
Example #17
Source File: MultipleOutputs.java From hadoop with Apache License 2.0

/**
 * Creates and initializes multiple outputs support,
 * it should be instantiated in the Mapper/Reducer setup method.
 *
 * @param context the TaskInputOutputContext object
 */
public MultipleOutputs(
    TaskInputOutputContext<?, ?, KEYOUT, VALUEOUT> context) {
  this.context = context;
  namedOutputs = Collections.unmodifiableSet(
    new HashSet<String>(MultipleOutputs.getNamedOutputsList(context)));
  recordWriters = new HashMap<String, RecordWriter<?, ?>>();
  countersEnabled = getCountersEnabled(context);
}
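Here getNamedOutputsList(context) recovers the names registered by addNamedOutput from the job configuration; holding them in an unmodifiable set is what lets write(...) reject an unregistered named output with an IllegalArgumentException at task time rather than failing later.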
Example #18
Source File: UHCDictionaryJob.java From kylin with Apache License 2.0

private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
Example #19
Source File: UHCDictionaryJob.java From kylin-on-parquet-v2 with Apache License 2.0

private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
Example #20
Source File: AbstractReasoningTool.java From rya with Apache License 2.0

/**
 * Set up the MapReduce job to output a schema (TBox).
 */
protected void configureSchemaOutput() {
    Path outPath = MRReasoningUtils.getSchemaPath(job.getConfiguration());
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SchemaWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, "schemaobj",
        SequenceFileOutputFormat.class, NullWritable.class, SchemaWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
Example #21
Source File: AbstractReasoningTool.java From rya with Apache License 2.0

/**
 * Set up a MapReduce job to output newly derived triples.
 * @param intermediate True if this is intermediate data. Outputs
 *      to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration
            + MRReasoningUtils.TEMP_SUFFIX);
    } else {
        outPath = MRReasoningUtils.getOutputPath(conf,
            MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT,
        SequenceFileOutputFormat.class, Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT,
        SequenceFileOutputFormat.class, Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT,
        TextOutputFormat.class, Text.class, Text.class);
}
Example #22
Source File: ForwardChain.java From rya with Apache License 2.0

@Override
protected void setup(Context context) {
    debugOut = new MultipleOutputs<>(context);
    Configuration conf = context.getConfiguration();
    if (schema == null) {
        schema = MRReasoningUtils.loadSchema(context.getConfiguration());
    }
    debug = MRReasoningUtils.debug(conf);
}
Example #23
Source File: ForwardChain.java From rya with Apache License 2.0

@Override
public void setup(Context context) {
    mout = new MultipleOutputs<>(context);
    Configuration conf = context.getConfiguration();
    if (schema == null) {
        schema = MRReasoningUtils.loadSchema(conf);
    }
    debug = MRReasoningUtils.debug(conf);
}
Example #24
Source File: DuplicateElimination.java From rya with Apache License 2.0

@Override
public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    debug = MRReasoningUtils.debug(conf);
    if (debug) {
        debugOut = new MultipleOutputs<>(context);
    }
}
Example #25
Source File: DuplicateElimination.java From rya with Apache License 2.0

@Override
public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    mout = new MultipleOutputs<>(context);
    current = MRReasoningUtils.getCurrentIteration(conf);
    debug = MRReasoningUtils.debug(conf);
}
Example #26
Source File: DataSourceCompReducer.java From jumbune with GNU Lesser General Public License v3.0

@SuppressWarnings({ "rawtypes", "unchecked" })
protected void setup(Reducer.Context context) {
    Configuration conf = context.getConfiguration();
    Type type = new TypeToken<Map<String, String>>() {
    }.getType();
    filesMap = gson.fromJson(conf.get("filesMap"), type);
    validationInfo = gson.fromJson(conf.get("validationInfoJson"), DataSourceCompValidationInfo.class);
    multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
}
Example #27
Source File: DataSourceCompMapper.java From jumbune with GNU Lesser General Public License v3.0

@Override
@SuppressWarnings({ "unchecked", "rawtypes" })
protected void setup(Mapper.Context context) {
    Gson gson = new Gson();
    Configuration conf = context.getConfiguration();
    Type type = new TypeToken<Map<String, String>>() {
    }.getType();
    mapperInfo = gson.fromJson(conf.get("mapperInfoJson"), DataSourceCompMapperInfo.class);
    filesMap = gson.fromJson(conf.get("filesMap"), type);
    multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
}
Example #28
Source File: MultipleOutputsJob.java From hiped2 with Apache License 2.0

/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder().setArgs(args).addOptions(IOOptions.values()).build();
    int result = cli.runCmd();

    if (result != 0) {
        return result;
    }

    Path input = new Path(cli.getArgValueAsString(IOOptions.INPUT));
    Path output = new Path(cli.getArgValueAsString(IOOptions.OUTPUT));

    Configuration conf = super.getConf();

    Job job = new Job(conf);
    job.setJarByClass(MultipleOutputsJob.class);
    job.setMapperClass(Map.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setNumReduceTasks(0);

    MultipleOutputs.addNamedOutput(job, "partition",
        TextOutputFormat.class, Text.class, Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
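The Map class referenced above is not included in this listing; a hedged sketch of what a map-only writer for the "partition" named output could look like (the key/value handling here is illustrative, not taken from hiped2):

// Hypothetical sketch only: MultipleOutputsJob's actual Map class is not shown.
public static class Map extends Mapper<LongWritable, Text, Text, Text> {

    private MultipleOutputs<Text, Text> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<>(context);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Every record goes to the "partition" named output registered in run().
        mos.write("partition", new Text(key.toString()), value);
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }
}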
Example #29
Source File: BasicJobChaining.java From hadoop-map-reduce-patterns with Apache License 2.0

protected void setup(Context context) throws IOException, InterruptedException {
    average = getAveragePostsPerUser(context.getConfiguration());
    mos = new MultipleOutputs<Text, Text>(context);

    try {
        Path[] files = DistributedCache.getLocalCacheFiles(context.getConfiguration());

        if (files == null || files.length == 0) {
            throw new RuntimeException("User information is not set in DistributedCache");
        }

        // Read all files in the DistributedCache
        for (Path p : files) {
            BufferedReader rdr = new BufferedReader(new InputStreamReader(
                    new GZIPInputStream(new FileInputStream(new File(p.toString())))));

            String line;
            // For each record in the user file
            while ((line = rdr.readLine()) != null) {
                // Get the user ID and reputation
                Map<String, String> parsed = MRDPUtils.transformXmlToMap(line);
                String userId = parsed.get("Id");
                String reputation = parsed.get("Reputation");

                if (userId != null && reputation != null) {
                    // Map the user ID to the reputation
                    userIdToReputation.put(userId, reputation);
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
Example #30
Source File: BinningTags.java From hadoop-map-reduce-patterns with Apache License 2.0

@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: BinningTags <in> <out>");
        ToolRunner.printGenericCommandUsage(System.err);
        System.exit(2);
    }
    Job job = new Job(conf, "Binning Tags");
    job.setJarByClass(BinningTags.class);

    // Configure the MultipleOutputs by adding an output called "bins"
    // With the proper output format and mapper key/value pairs
    MultipleOutputs.addNamedOutput(job, "bins", TextOutputFormat.class,
            Text.class, NullWritable.class);

    // Enable the counters for the job
    // If there are a significant number of different named outputs, this
    // should be disabled
    MultipleOutputs.setCountersEnabled(job, true);

    // Map-only job
    job.setNumReduceTasks(0);

    job.setMapperClass(BinningMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
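The BinningMapper itself is not shown here; in this pattern each record is written to the single "bins" named output with a per-bin base path, along the lines of the following illustrative fragment (the tag names and base paths are assumptions, not copied from the project):

// Illustrative fragment of a BinningMapper.map(...) body: one named output,
// several base paths; each distinct base path becomes its own set of part
// files, e.g. hadoop-tag-m-00000.
if (tags.contains("hadoop")) {
    mos.write("bins", value, NullWritable.get(), "hadoop-tag");
}
if (tags.contains("pig")) {
    mos.write("bins", value, NullWritable.get(), "pig-tag");
}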