org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPaths

Source File: LeftJoin.java From BigData-In-Practice with Apache License 2.0

6 votes

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    GenericOptionsParser optionparser = new GenericOptionsParser(conf, args);
    conf = optionparser.getConfiguration();

    Job job = Job.getInstance(conf, "leftjoin");
    job.setJarByClass(LeftJoin.class);
    FileInputFormat.addInputPaths(job, conf.get("input_dir"));
    Path out = new Path(conf.get("output_dir"));
    FileOutputFormat.setOutputPath(job, out);
    job.setNumReduceTasks(conf.getInt("reduce_num", 1));

    job.setMapperClass(LeftJoinMapper.class);
    job.setReducerClass(LeftJoinReduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    conf.set("mapred.textoutputformat.separator", ",");

    return (job.waitForCompletion(true) ? 0 : 1);
}

Source File: Phase3Step3NearDupTuplesCreation.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase3Step3NearDupTuplesCreation.class);
    job.setJobName(Phase3Step3NearDupTuplesCreation.class.getName());

    // mapper
    job.setMapperClass(CreateTuplesMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(TreeSet.class);

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setNumReduceTasks(0); //must be added or the mapper wont be called

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: TopDomainCounter.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(TopDomainCounter.class);

    job.setJobName(TopDomainCounter.class.getName());

    // mapper
    job.setMapperClass(DomainMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: WordDistributionStatisticsCollector.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(WordDistributionStatisticsCollector.class);
    job.setJobName(WordDistributionStatisticsCollector.class.getName());

    // mapper
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // reducer
    job.setReducerClass(SumReducer.class);
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: ContentTypeAndSizeDistribution.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(ContentTypeAndSizeDistribution.class);

    job.setJobName(ContentTypeAndSizeDistribution.class.getName());

    // mapper
    job.setMapperClass(ContentAndSizeMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // reducer
    //        job.setReducerClass(DistributionReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: TextToSentencesSplitter.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf);
    job.setJarByClass(TextToSentencesSplitter.class);

    job.setJobName(TextToSentencesSplitter.class.getName());

    // mapper
    job.setMapperClass(TextToSentencesSplitter.MapperClass.class);
    job.setInputFormatClass(WARCInputFormat.class);

    // reducer
    job.setReducerClass(ReducerClass.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: PagesByURLExtractor.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    for (Map.Entry<String, String> next : job.getConfiguration()) {
        System.out.println(next.getKey() + ": " + next.getValue());
    }

    job.setJarByClass(PagesByURLExtractor.class);
    job.setJobName(PagesByURLExtractor.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // input
    job.setInputFormatClass(WARCInputFormat.class);

    // output
    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);
    FileOutputFormat.setCompressOutput(job, true);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    // load IDs to be searched for
    job.getConfiguration().set(MAPREDUCE_MAPPER_URLS, loadURLs(args[2]));

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: URIExtractor.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args)
        throws Exception
{

    Job job = Job.getInstance(getConf());
    // set from the command line
    job.setJarByClass(URIExtractor.class);
    job.setJobName(URIExtractor.class.getName());

    // mapper
    job.setMapperClass(URIExtractorMapper.class);
    job.setReducerClass(URIExtractorReducer.class);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // is necessary, so that Hadoop does not mix the map input format up.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: Phase3Step2DistinctDataJob.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{

    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase3Step2DistinctDataJob.class);
    job.setJobName(Phase3Step2DistinctDataJob.class.getName());

    //mapper
    job.setMapperClass(RemoveRedundantDataMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NullWritable.class);

    //reducer
    job.setReducerClass(RemoveRedundantDataReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    //paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    job.setInputFormatClass(TextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //i/o paths
    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: Phase1FullJob.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    // set from the command line

    job.setJarByClass(Phase1FullJob.class);
    job.setJobName(Phase1FullJob.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration()
            .setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(SimpleWarcWriterReducer.class);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: Phase2ExactMatchDeDuplication.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());
    //set from the command line

    job.setJarByClass(Phase2ExactMatchDeDuplication.class);
    job.setJobName(Phase2ExactMatchDeDuplication.class.getName());

    // mapper
    job.setMapperClass(ExactMatchDetectionMapper.class);

    // we will compress the mapper's output (use fast Snappy compressor)
    job.getConfiguration().setBoolean(Job.MAP_OUTPUT_COMPRESS, true);
    job.getConfiguration()
            .setClass(Job.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // reducer
    job.setReducerClass(UniqueWarcWriterReducer.class);
    // no combiner, as the output classes in mapper and reducer are different!

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(WARCOutputFormat.class);

    // mapper output data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: Phase3Step1ExtractNearDupInfo.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Phase3Step1ExtractNearDupInfo.class);
    job.setJobName(Phase3Step1ExtractNearDupInfo.class.getName());

    // mapper
    job.setMapperClass(MapperClass.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DocumentInfo.class);

    // reducer
    job.setReducerClass(DeDuplicationTextOutputReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(List.class);

    job.setInputFormatClass(WARCInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, DocumentInfoOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;

}

Source File: ConfigurationHelper.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * Job configurator
 *
 * @param job                      job instance
 * @param jarByClass               class of the jar
 * @param mapperClass              mapper
 * @param reducerClass             reducer
 * @param commaSeparatedInputFiles input paths
 * @param outputPath               output
 * @throws IOException I/O exception
 */
public static void configureJob(Job job, Class<?> jarByClass,
        Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass,
        String commaSeparatedInputFiles, String outputPath)
        throws IOException
{
    job.setJarByClass(jarByClass);
    job.setJobName(jarByClass.getName());

    // mapper
    job.setMapperClass(mapperClass);

    // reducer
    job.setReducerClass(reducerClass);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // prevent producing empty files
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}

Source File: WordCounterExample.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(WordCounterExample.class);

    job.setJobName(WordCounterExample.class.getName());

    // mapper
    job.setMapperClass(WordCounterMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: SimpleTextSearch.java From dkpro-c4corpus with Apache License 2.0

5 votes

@Override
public int run(String[] args)
        throws Exception
{
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(SimpleTextSearch.class);

    job.setJobName(SimpleTextSearch.class.getName());

    // mapper
    job.setMapperClass(TextSearchMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    // regex with a phrase to be searched for
    String regex = otherArgs[2];
    job.getConfiguration().set(MAPREDUCE_MAP_REGEX, regex);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

Source File: MultiFileWordCount.java From big-c with Apache License 2.0

5 votes

public int run(String[] args) throws Exception {

    if(args.length < 2) {
      printUsage();
      return 2;
    }

    Job job = Job.getInstance(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);

    //set the InputFormat of the job to our InputFormat
    job.setInputFormatClass(MyInputFormat.class);
    
    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    //use the defined mapper
    job.setMapperClass(MapClass.class);
    //use the WordCount Reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
  }

Source File: MultiFileWordCount.java From hadoop with Apache License 2.0

5 votes

public int run(String[] args) throws Exception {

    if(args.length < 2) {
      printUsage();
      return 2;
    }

    Job job = Job.getInstance(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);

    //set the InputFormat of the job to our InputFormat
    job.setInputFormatClass(MyInputFormat.class);
    
    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    //use the defined mapper
    job.setMapperClass(MapClass.class);
    //use the WordCount Reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
  }

Source File: GenericMRLoadGenerator.java From big-c with Apache License 2.0

4 votes

/**
 * Configure a job given argv.
 */
public static boolean parseArgs(String[] argv, Job job) throws IOException {
  if (argv.length < 1) {
    return 0 == printUsage();
  }
  for(int i=0; i < argv.length; ++i) {
    if (argv.length == i + 1) {
      System.out.println("ERROR: Required parameter missing from " +
          argv[i]);
      return 0 == printUsage();
    }
    try {
      if ("-r".equals(argv[i])) {
        job.setNumReduceTasks(Integer.parseInt(argv[++i]));
      } else if ("-inFormat".equals(argv[i])) {
        job.setInputFormatClass(
            Class.forName(argv[++i]).asSubclass(InputFormat.class));
      } else if ("-outFormat".equals(argv[i])) {
        job.setOutputFormatClass(
            Class.forName(argv[++i]).asSubclass(OutputFormat.class));
      } else if ("-outKey".equals(argv[i])) {
        job.setOutputKeyClass(
          Class.forName(argv[++i]).asSubclass(WritableComparable.class));
      } else if ("-outValue".equals(argv[i])) {
        job.setOutputValueClass(
          Class.forName(argv[++i]).asSubclass(Writable.class));
      } else if ("-keepmap".equals(argv[i])) {
        job.getConfiguration().set(MAP_PRESERVE_PERCENT, argv[++i]);
      } else if ("-keepred".equals(argv[i])) {
        job.getConfiguration().set(REDUCE_PRESERVE_PERCENT, argv[++i]);
      } else if ("-outdir".equals(argv[i])) {
        FileOutputFormat.setOutputPath(job, new Path(argv[++i]));
      } else if ("-indir".equals(argv[i])) {
        FileInputFormat.addInputPaths(job, argv[++i]);
      } else if ("-inFormatIndirect".equals(argv[i])) {
        job.getConfiguration().setClass(INDIRECT_INPUT_FORMAT,
            Class.forName(argv[++i]).asSubclass(InputFormat.class),
            InputFormat.class);
        job.setInputFormatClass(IndirectInputFormat.class);
      } else {
        System.out.println("Unexpected argument: " + argv[i]);
        return 0 == printUsage();
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + argv[i]);
      return 0 == printUsage();
    } catch (Exception e) {
      throw (IOException)new IOException().initCause(e);
    }
  }
  return true;
}

Source File: AbstractBulkLoadTool.java From phoenix with Apache License 2.0

4 votes

/**
 * Submits the jobs to the cluster.
 * Loads the HFiles onto the respective tables.
 * @throws Exception 
 */
public int submitJob(final Configuration conf, final String qualifiedTableName,
    final String inputPaths, final Path outputPath, List<TargetTableRef> tablesToBeLoaded, boolean hasLocalIndexes) throws Exception {
   
    Job job = Job.getInstance(conf, "Phoenix MapReduce import for " + qualifiedTableName);
    FileInputFormat.addInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(PhoenixTextInputFormat.class);
    job.setMapOutputKeyClass(TableRowkeyPair.class);
    job.setMapOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputKeyClass(TableRowkeyPair.class);
    job.setOutputValueClass(KeyValue.class);
    job.setReducerClass(FormatToKeyValueReducer.class);
    byte[][] splitKeysBeforeJob = null;
    try(org.apache.hadoop.hbase.client.Connection hbaseConn =
            ConnectionFactory.createConnection(job.getConfiguration())) {
        RegionLocator regionLocator = null;
        if(hasLocalIndexes) {
            try{
                regionLocator = hbaseConn.getRegionLocator(
                        TableName.valueOf(qualifiedTableName));
                splitKeysBeforeJob = regionLocator.getStartKeys();
            } finally {
                if (regionLocator != null) regionLocator.close();
            }
        }
        MultiHfileOutputFormat.configureIncrementalLoad(job, tablesToBeLoaded);

        final String tableNamesAsJson = TargetTableRefFunctions.NAMES_TO_JSON
                .apply(tablesToBeLoaded);
        final String logicalNamesAsJson = TargetTableRefFunctions.LOGICAL_NAMES_TO_JSON
                .apply(tablesToBeLoaded);

        job.getConfiguration().set(FormatToBytesWritableMapper.TABLE_NAMES_CONFKEY,
                tableNamesAsJson);
        job.getConfiguration().set(FormatToBytesWritableMapper.LOGICAL_NAMES_CONFKEY,
                logicalNamesAsJson);

        // give subclasses their hook
        setupJob(job);

        LOGGER.info("Running MapReduce import job from {} to {}", inputPaths, outputPath);
        boolean success = job.waitForCompletion(true);

        if (success) {
            if (hasLocalIndexes) {
                try {
                    regionLocator = hbaseConn.getRegionLocator(
                            TableName.valueOf(qualifiedTableName));
                    if(!IndexUtil.matchingSplitKeys(splitKeysBeforeJob,
                            regionLocator.getStartKeys())) {
                        LOGGER.error("The table " + qualifiedTableName + " has local indexes and"
                                + " there is split key mismatch before and after running"
                                + " bulkload job. Please rerun the job otherwise there may be"
                                + " inconsistencies between actual data and index data.");
                        return -1;
                    }
                } finally {
                    if (regionLocator != null) regionLocator.close();
                }
            }
            LOGGER.info("Loading HFiles from {}", outputPath);
            completebulkload(conf,outputPath,tablesToBeLoaded);
            LOGGER.info("Removing output directory {}", outputPath);
            if(!outputPath.getFileSystem(conf).delete(outputPath, true)) {
                LOGGER.error("Failed to delete the output directory {}", outputPath);
            }
            return 0;
        } else {
           return -1;
       }
   }
}

Source File: GenericMRLoadGenerator.java From hadoop with Apache License 2.0

4 votes

/**
 * Configure a job given argv.
 */
public static boolean parseArgs(String[] argv, Job job) throws IOException {
  if (argv.length < 1) {
    return 0 == printUsage();
  }
  for(int i=0; i < argv.length; ++i) {
    if (argv.length == i + 1) {
      System.out.println("ERROR: Required parameter missing from " +
          argv[i]);
      return 0 == printUsage();
    }
    try {
      if ("-r".equals(argv[i])) {
        job.setNumReduceTasks(Integer.parseInt(argv[++i]));
      } else if ("-inFormat".equals(argv[i])) {
        job.setInputFormatClass(
            Class.forName(argv[++i]).asSubclass(InputFormat.class));
      } else if ("-outFormat".equals(argv[i])) {
        job.setOutputFormatClass(
            Class.forName(argv[++i]).asSubclass(OutputFormat.class));
      } else if ("-outKey".equals(argv[i])) {
        job.setOutputKeyClass(
          Class.forName(argv[++i]).asSubclass(WritableComparable.class));
      } else if ("-outValue".equals(argv[i])) {
        job.setOutputValueClass(
          Class.forName(argv[++i]).asSubclass(Writable.class));
      } else if ("-keepmap".equals(argv[i])) {
        job.getConfiguration().set(MAP_PRESERVE_PERCENT, argv[++i]);
      } else if ("-keepred".equals(argv[i])) {
        job.getConfiguration().set(REDUCE_PRESERVE_PERCENT, argv[++i]);
      } else if ("-outdir".equals(argv[i])) {
        FileOutputFormat.setOutputPath(job, new Path(argv[++i]));
      } else if ("-indir".equals(argv[i])) {
        FileInputFormat.addInputPaths(job, argv[++i]);
      } else if ("-inFormatIndirect".equals(argv[i])) {
        job.getConfiguration().setClass(INDIRECT_INPUT_FORMAT,
            Class.forName(argv[++i]).asSubclass(InputFormat.class),
            InputFormat.class);
        job.setInputFormatClass(IndirectInputFormat.class);
      } else {
        System.out.println("Unexpected argument: " + argv[i]);
        return 0 == printUsage();
      }
    } catch (NumberFormatException except) {
      System.out.println("ERROR: Integer expected instead of " + argv[i]);
      return 0 == printUsage();
    } catch (Exception e) {
      throw (IOException)new IOException().initCause(e);
    }
  }
  return true;
}

Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat#addInputPaths()