Java Code Examples for org.apache.hadoop.mapreduce.lib.input.NLineInputFormat#addInputPath()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.NLineInputFormat#addInputPath() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Phase3Step4LocalDeDuplication.java — from the dkpro-c4corpus project (Apache License 2.0)
@Override
public int run(String[] args)
        throws Exception
{
    // Validate CLI arguments up front so a bad invocation fails with a clear
    // usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
        throw new IllegalArgumentException(
                "Usage: <inputPath> <outputPath> (got " + args.length + " argument(s))");
    }

    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase3Step4LocalDeDuplication.class);
    job.setJobName(Phase3Step4LocalDeDuplication.class.getName());

    // paths
    String inputPath = args[0];
    // output: text files of ids to be deleted
    String outputPath = args[1];

    // input: each mapper reads at most LINES lines of the input file
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, new Path(inputPath));
    job.getConfiguration().setInt("mapreduce.input.lineinputformat.linespermap", LINES);

    // mapper
    job.setMapperClass(LocalGreedyDeDuplicationMapper.class);

    // LazyOutputFormat only creates part files for tasks that actually emit output
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // reducer
    job.setReducerClass(IDCollectorReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // Block until the job finishes; 0 = success, 1 = failure (ToolRunner convention).
    return job.waitForCompletion(true) ? 0 : 1;
}
 
Example 2
Source File: HDFSDistributor.java — from the DataGenerator project (Apache License 2.0)
/**
 * Builds and configures the data-generation MapReduce job.
 *
 * The job is map-only (zero reducers): each mapper receives exactly one line
 * of the input file via {@code NLineInputFormat} and writes gzip-compressed
 * output.
 *
 * @return the fully configured, not-yet-submitted {@link Job}
 * @throws IOException if the job instance cannot be created
 */
private Job prepareJob() throws IOException {
        // One input line per map task, plus the host the mappers report progress to.
        configuration.setInt("mapreduce.input.lineinputformat.linespermap", 1);
        configuration.set("reportingHost", this.reportingHost);

        // Gzip-compress both intermediate (map) output and final job output.
        // NOTE(review): both the current "mapreduce.*" and the legacy "mapred.*"
        // property names are set here, presumably for compatibility across
        // Hadoop versions — confirm before removing either set.
        configuration.setBoolean("mapreduce.map.output.compress", true);
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.setBoolean("mapred.output.compress", true);
        configuration.setClass("mapred.map.output.compression.codec", GzipCodec.class, CompressionCodec.class);
        configuration.setClass("mapred.output.compression.codec", GzipCodec.class, CompressionCodec.class);

        Job job = org.apache.hadoop.mapreduce.Job.getInstance(configuration);
        job.setJarByClass(HDFSDistributor.class);
        job.setJobName("PATH Test Data Generation");

        // Map-only pipeline: the generator mapper does all the work.
        job.setMapperClass(DataGeneratorMapper.class);
        job.setNumReduceTasks(0);

        // Input: split the driver file so each mapper gets a single line.
        job.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.addInputPath(job, mapperInputFilePath);

        // Output destination for the mappers.
        FileOutputFormat.setOutputPath(job, mapperOutputFilePath);

        return job;
    }