Java Code Examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass()
The following examples show how to use org.apache.hadoop.mapreduce.Job#setInputFormatClass().
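Before the project examples, here is a minimal sketch of the typical driver pattern, assuming a hypothetical MyMapper class and placeholder input/output arguments; the setInputFormatClass(...) call tells the job which InputFormat to use for computing splits and creating record readers.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MinimalInputFormatDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "minimal setInputFormatClass example");
        job.setJarByClass(MinimalInputFormatDriver.class);
        job.setMapperClass(MyMapper.class);              // hypothetical mapper, not from the examples below
        job.setNumReduceTasks(0);                        // map-only, to keep the sketch short
        job.setInputFormatClass(TextInputFormat.class);  // the method documented on this page
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}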
Example 1
Source File: Step5.java From recsys-offline with Apache License 2.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    // TODO Auto-generated method stub
    Configuration conf1 = new Configuration();

    Job job1 = new Job(conf1, "wiki job five");
    job1.setNumReduceTasks(1);
    job1.setJarByClass(Step5.class);
    job1.setInputFormatClass(SequenceFileInputFormat.class);
    job1.setMapperClass(WikiMapper5.class);
    job1.setMapOutputKeyClass(VarLongWritable.class);
    job1.setMapOutputValueClass(VectorWritable.class);
    job1.setCombinerClass(WiKiCombiner5.class);
    job1.setReducerClass(WiKiReducer5.class);
    job1.setOutputKeyClass(VarLongWritable.class);
    job1.setOutputValueClass(RecommendedItemsWritable.class);
    // job1.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileInputFormat.addInputPath(job1, new Path(INPUT_PATH));
    FileOutputFormat.setOutputPath(job1, new Path(OUTPUT_PATH));

    if (!job1.waitForCompletion(true)) {
        System.exit(1); // run error then exit
    }
}
Example 2
Source File: BinaryLoader.java From marklogic-contentpump with Apache License 2.0
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: ContentLoader configFile inputDir");
        System.exit(2);
    }

    Job job = Job.getInstance(conf);
    job.setJarByClass(BinaryLoader.class);
    job.setInputFormatClass(BinaryInputFormat.class);
    job.setMapperClass(ContentMapper.class);
    job.setMapOutputKeyClass(DocumentURI.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(ContentOutputFormat.class);

    BinaryInputFormat.setInputPaths(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Example 3
Source File: WordDistributionStatisticsCollector.java From dkpro-c4corpus with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(WordDistributionStatisticsCollector.class);
    job.setJobName(WordDistributionStatisticsCollector.class.getName());

    // mapper
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // reducer
    job.setReducerClass(SumReducer.class);

    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = args[0];
    String outputPath = args[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 4
Source File: TestReflectInputOutputFormat.java From parquet-mr with Apache License 2.0
@Before
public void createParquetFile() throws Exception {
    // set up readers and writers not in MR
    conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
    AvroReadSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);
    AvroWriteSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);

    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);

    {
        final Job job = new Job(conf, "write");

        // input not really used
        TextInputFormat.addInputPath(job, inputPath);
        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(TestReflectInputOutputFormat.MyMapper.class);
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(AvroParquetOutputFormat.class);
        AvroParquetOutputFormat.setOutputPath(job, parquetPath);
        AvroParquetOutputFormat.setSchema(job, CAR_SCHEMA);
        AvroParquetOutputFormat.setAvroDataSupplier(job, ReflectDataSupplier.class);

        waitForJob(job);
    }
}
Example 5
Source File: WARCRecordCounter.java From dkpro-c4corpus with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    System.out.println("Other args: " + Arrays.toString(otherArgs));

    Job job = Job.getInstance(conf);
    job.setJarByClass(WARCRecordCounter.class);
    job.setJobName(WARCRecordCounter.class.getName());

    // mapper
    job.setMapperClass(ResponseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // combiner + reducer
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
Example 6
Source File: CompactionOrcJobConfigurator.java From incubator-gobblin with Apache License 2.0
protected void configureMapper(Job job) {
    job.setInputFormatClass(OrcValueCombineFileInputFormat.class);
    job.setMapperClass(OrcValueMapper.class);
    job.setMapOutputKeyClass(OrcKey.class);
    job.setMapOutputValueClass(OrcValue.class);
    job.setGroupingComparatorClass(OrcKeyComparator.class);
    job.setSortComparatorClass(OrcKeyComparator.class);
}
Example 7
Source File: Main.java From hiped2 with Apache License 2.0
public static double calcPageRank(Path inputPath, Path outputPath, int numNodes) throws Exception {
    Configuration conf = new Configuration();
    conf.setInt(Reduce.CONF_NUM_NODES_GRAPH, numNodes);

    Job job = new Job(conf);
    job.setJarByClass(Main.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    if (!job.waitForCompletion(true)) {
        throw new Exception("Job failed");
    }

    long summedConvergence = job.getCounters().findCounter(Reduce.Counter.CONV_DELTAS).getValue();
    double convergence =
        ((double) summedConvergence / Reduce.CONVERGENCE_SCALING_FACTOR) / (double) numNodes;

    System.out.println("======================================");
    System.out.println("=  Num nodes:           " + numNodes);
    System.out.println("=  Summed convergence:  " + summedConvergence);
    System.out.println("=  Convergence:         " + convergence);
    System.out.println("======================================");

    return convergence;
}
Example 8
Source File: JMatrixMultiplicationStep2.java From RecommendationEngine with MIT License
public static void run() throws IOException, ClassNotFoundException, InterruptedException {
    String inputPath = ItemBasedCFDriver.path.get("step8InputPath");
    String outputPath = ItemBasedCFDriver.path.get("step8OutputPath");

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    HDFS hdfs = new HDFS(conf);
    hdfs.rmr(outputPath);

    job.setMapperClass(Step2_Mapper.class);
    job.setReducerClass(Step2_Reducer.class);
    job.setJarByClass(JMatrixMultiplicationStep2.class);
    job.setNumReduceTasks(ItemBasedCFDriver.ReducerNumber);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.waitForCompletion(true);
}
Example 9
Source File: DataDrivenDBInputFormat.java From hadoop with Apache License 2.0
/**
 * setInput() takes a custom query and a separate "bounding query" to use
 * instead of the custom "count query" used by DBInputFormat.
 */
public static void setInput(Job job,
        Class<? extends DBWritable> inputClass,
        String inputQuery, String inputBoundingQuery) {
    DBInputFormat.setInput(job, inputClass, inputQuery, "");
    job.getConfiguration().set(DBConfiguration.INPUT_BOUNDING_QUERY, inputBoundingQuery);
    job.setInputFormatClass(DataDrivenDBInputFormat.class);
}
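A caller would typically pair the record query with a MIN/MAX bounding query over the split column, roughly as sketched below. The table, column, and EmployeeRecord names are placeholders rather than anything from the source above, and DBConfiguration.configureDB(...) is assumed to have set the JDBC driver and connection URL beforehand; "$CONDITIONS" is the token that DataDrivenDBInputFormat replaces with the per-split range predicate.

// Hypothetical usage sketch -- table "employees", column "id" and EmployeeRecord
// (a DBWritable implementation) are placeholders.
DBConfiguration.configureDB(job.getConfiguration(),
    "com.mysql.jdbc.Driver", "jdbc:mysql://localhost/mydb", "user", "password");
DataDrivenDBInputFormat.setInput(job, EmployeeRecord.class,
    "SELECT id, name FROM employees WHERE $CONDITIONS",
    "SELECT MIN(id), MAX(id) FROM employees");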
Example 10
Source File: AvroConversionBaseCreator.java From datacollector with Apache License 2.0
@Override
public Job call() throws Exception {
    // We're explicitly disabling speculative execution
    conf.set("mapreduce.map.speculative", "false");
    conf.set("mapreduce.map.maxattempts", "1");
    conf.set("mapreduce.job.user.classpath.first", "true");
    conf.set("mapreduce.task.classpath.user.precedence", "true");
    conf.set("mapreduce.task.classpath.first", "true");

    addNecessaryJarsToJob(conf);

    Job job = Job.getInstance(conf);

    // IO formats
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(NullOutputFormat.class);

    // Mapper & job output
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // It's map only job
    job.setNumReduceTasks(0);

    // General configuration
    job.setJarByClass(getClass());

    return job;
}
Example 11
Source File: PhoenixMapReduceUtil.java From phoenix with Apache License 2.0
/**
 *
 * @param job
 * @param inputClass DBWritable class
 * @param tableName Input table name
 * @param conditions Condition clause to be added to the WHERE clause.
 * @param fieldNames fields being projected for the SELECT query.
 */
public static void setInput(final Job job, final Class<? extends DBWritable> inputClass,
        final String tableName, final String conditions, final String... fieldNames) {
    job.setInputFormatClass(PhoenixInputFormat.class);
    final Configuration configuration = job.getConfiguration();
    PhoenixConfigurationUtil.setInputTableName(configuration, tableName);
    PhoenixConfigurationUtil.setSelectColumnNames(configuration, fieldNames);
    PhoenixConfigurationUtil.setInputClass(configuration, inputClass);
    PhoenixConfigurationUtil.setSchemaType(configuration, SchemaType.TABLE);
}
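A driver might call this helper roughly as follows; the STOCK table, WHERE condition, column names, and StockWritable class are illustrative placeholders, not anything defined in the snippet above.

// Hypothetical usage sketch -- StockWritable, the table and the columns are placeholders.
Job job = Job.getInstance(HBaseConfiguration.create(), "phoenix-input-example");
PhoenixMapReduceUtil.setInput(job, StockWritable.class, "STOCK",
    "RECORDING_YEAR = 2009", "STOCK_NAME", "RECORDING_YEAR", "RECORDINGS_QUARTER");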
Example 12
Source File: KafkaMRInput.java From kylin with Apache License 2.0
@Override
public void configureJob(Job job) {
    job.setInputFormatClass(SequenceFileInputFormat.class);
    String jobId = job.getConfiguration().get(BatchConstants.ARG_CUBING_JOB_ID);
    IJoinedFlatTableDesc flatHiveTableDesc = new CubeJoinedFlatTableDesc(cubeSegment);
    String inputPath = JoinedFlatTable.getTableDir(flatHiveTableDesc,
            JobBuilderSupport.getJobWorkingDir(conf, jobId));
    try {
        FileInputFormat.addInputPath(job, new Path(inputPath));
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
Example 13
Source File: JsonDataValidationExecutor.java From jumbune with GNU Lesser General Public License v3.0
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    StringBuilder sb = new StringBuilder();
    for (int j = 2; j < otherArgs.length; j++) {
        sb.append(otherArgs[j]);
    }

    LOGGER.debug("Arguments[ " + otherArgs.length + "]" + "and values respectively [" + otherArgs[0] + "], "
            + otherArgs[1] + ", [" + otherArgs[2] + "]" + ", [" + otherArgs[3] + "]," + otherArgs[4]);

    String inputpath = otherArgs[0];
    String outputpath = "/tmp/jumbune/dvjsonreport" + new Date().getTime();
    String json = otherArgs[1];
    String nullCondition = otherArgs[2];
    String regex = otherArgs[3];
    String dvDir = otherArgs[4];

    if (regex.isEmpty()) {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.REGEX_ARGUMENT, regex);
    }

    if (nullCondition.isEmpty()) {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, "");
    } else {
        conf.set(JsonDataVaildationConstants.NULL_ARGUMENT, nullCondition);
    }

    conf.set(JsonDataVaildationConstants.SLAVE_DIR, dvDir);
    conf.set(JsonDataVaildationConstants.JSON_ARGUMENT, json);

    FileSystem fs = FileSystem.get(conf);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "JSONDataValidation");
    job.setJarByClass(JsonDataValidationExecutor.class);
    job.setInputFormatClass(JsonFileInputFormat.class);
    job.setMapperClass(JsonDataValidationMapper.class);
    job.setPartitionerClass(JsonDataValidationPartitioner.class);
    job.setReducerClass(JsonDataValidationReducer.class);
    job.setNumReduceTasks(5);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(FileKeyViolationBean.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TotalReducerViolationBean.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    Path[] inputPaths = FileUtil.getAllJsonNestedFilePath(job, inputpath);
    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, new Path(outputpath));

    if (fs.exists(new Path(outputpath))) {
        fs.delete(new Path(outputpath), true);
    }

    job.waitForCompletion(true);

    Map<String, JsonViolationReport> jsonMap = readDataFromHdfs(conf, outputpath);
    final Gson gson = new Gson();
    final String jsonReport = gson.toJson(jsonMap);

    LOGGER.info("Completed DataValidation");
    LOGGER.info(JsonDataVaildationConstants.JSON_DV_REPORT + jsonReport);
}
Example 14
Source File: TopKPhaseJob.java From incubator-pinot with Apache License 2.0
public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName(name);
    job.setJarByClass(TopKPhaseJob.class);

    Configuration configuration = job.getConfiguration();
    FileSystem fs = FileSystem.get(configuration);

    // Properties
    LOGGER.info("Properties {}", props);

    // Input Path
    String inputPathDir = getAndSetConfiguration(configuration, TOPK_PHASE_INPUT_PATH);
    LOGGER.info("Input path dir: " + inputPathDir);
    for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
        LOGGER.info("Adding input:" + inputPath);
        Path input = new Path(inputPath);
        FileInputFormat.addInputPath(job, input);
    }

    // Output path
    Path outputPath = new Path(getAndSetConfiguration(configuration, TOPK_PHASE_OUTPUT_PATH));
    LOGGER.info("Output path dir: " + outputPath.toString());
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // Schema
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
    LOGGER.info("Schema : {}", avroSchema.toString(true));

    // ThirdEyeConfig
    String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty(
        props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty);
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
        props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
        props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
    job.getConfiguration().set(TOPK_PHASE_THIRDEYE_CONFIG.toString(),
        OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

    // Map config
    job.setMapperClass(TopKPhaseMapper.class);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // Combiner
    job.setCombinerClass(TopKPhaseCombiner.class);

    // Reduce config
    job.setReducerClass(TopKPhaseReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(1);

    job.waitForCompletion(true);
    return job;
}
Example 15
Source File: DomainStatistics.java From anthelion with Apache License 2.0
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]");
        return 1;
    }
    String inputDir = args[0];
    String outputDir = args[1];
    int numOfReducers = 1;

    if (args.length > 3) {
        numOfReducers = Integer.parseInt(args[3]);
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("DomainStatistics: starting at " + sdf.format(start));

    int mode = 0;
    String jobName = "DomainStatistics";
    if (args[2].equals("host")) {
        jobName = "Host statistics";
        mode = MODE_HOST;
    } else if (args[2].equals("domain")) {
        jobName = "Domain statistics";
        mode = MODE_DOMAIN;
    } else if (args[2].equals("suffix")) {
        jobName = "Suffix statistics";
        mode = MODE_SUFFIX;
    } else if (args[2].equals("tld")) {
        jobName = "TLD statistics";
        mode = MODE_TLD;
    }

    Configuration conf = getConf();
    conf.setInt("domain.statistics.mode", mode);
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    Job job = new Job(conf, jobName);
    job.setJarByClass(DomainStatistics.class);

    String[] inputDirsSpecs = inputDir.split(",");
    for (int i = 0; i < inputDirsSpecs.length; i++) {
        FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i]));
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(DomainStatisticsMapper.class);
    job.setReducerClass(DomainStatisticsReducer.class);
    job.setCombinerClass(DomainStatisticsCombiner.class);
    job.setNumReduceTasks(numOfReducers);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw e;
    }

    long end = System.currentTimeMillis();
    LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));

    return 0;
}
Example 16
Source File: MapReduceExercise.java From mongodb-hadoop-workshop with Apache License 2.0
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    if (args.length < 3) {
        System.err.println("Usage: MapReduceExercise "
            + "[mongodb input uri] "
            + "[mongodb output uri] "
            + "update=[true or false]");
        System.err.println("Example: MapReduceExercise "
            + "mongodb://127.0.0.1:27017/movielens.ratings "
            + "mongodb://127.0.0.1:27017/movielens.ratings.stats update=false");
        System.err.println("Example: MapReduceExercise "
            + "mongodb://127.0.0.1:27017/movielens.ratings "
            + "mongodb://127.0.0.1:27017/movielens.movies update=true");
        System.exit(-1);
    }

    Class outputValueClass = BSONWritable.class;
    Class reducerClass = Reduce.class;

    if (args[2].equals("update=true")) {
        outputValueClass = MongoUpdateWritable.class;
        reducerClass = ReduceUpdater.class;
    }

    Configuration conf = new Configuration();

    // Set MongoDB-specific configuration items
    conf.setClass("mongo.job.mapper", Map.class, Mapper.class);
    conf.setClass("mongo.job.reducer", reducerClass, Reducer.class);

    conf.setClass("mongo.job.mapper.output.key", IntWritable.class, Object.class);
    conf.setClass("mongo.job.mapper.output.value", DoubleWritable.class, Object.class);

    conf.setClass("mongo.job.output.key", NullWritable.class, Object.class);
    conf.setClass("mongo.job.output.value", outputValueClass, Object.class);

    conf.set("mongo.input.uri", args[0]);
    conf.set("mongo.output.uri", args[1]);

    Job job = Job.getInstance(conf);

    // Set Hadoop-specific job parameters
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(outputValueClass);

    job.setMapperClass(Map.class);
    job.setReducerClass(reducerClass);

    job.setJarByClass(MapReduceExercise.class);

    job.submit();
}
Example 17
Source File: HalyardBulkExport.java From Halyard with Apache License 2.0
@Override
protected int run(CommandLine cmd) throws Exception {
    if (!cmd.getArgList().isEmpty()) {
        throw new HalyardExport.ExportException("Unknown arguments: " + cmd.getArgList().toString());
    }
    String source = cmd.getOptionValue('s');
    String queryFiles = cmd.getOptionValue('q');
    String target = cmd.getOptionValue('t');
    if (!target.contains("{0}")) {
        throw new HalyardExport.ExportException(
            "Bulk export target must contain '{0}' to be replaced by stripped filename of the actual SPARQL query.");
    }
    getConf().set(SOURCE, source);
    getConf().set(TARGET, target);
    String driver = cmd.getOptionValue('c');
    if (driver != null) {
        getConf().set(JDBC_DRIVER, driver);
    }
    String props[] = cmd.getOptionValues('p');
    if (props != null) {
        for (int i = 0; i < props.length; i++) {
            props[i] = Base64.encodeBase64String(props[i].getBytes(StandardCharsets.UTF_8));
        }
        getConf().setStrings(JDBC_PROPERTIES, props);
    }
    if (cmd.hasOption('i')) {
        getConf().set(HalyardBulkUpdate.ELASTIC_INDEX_URL, cmd.getOptionValue('i'));
    }
    TableMapReduceUtil.addDependencyJars(getConf(),
        HalyardExport.class,
        NTriplesUtil.class,
        Rio.class,
        AbstractRDFHandler.class,
        RDFFormat.class,
        RDFParser.class,
        HTable.class,
        HBaseConfiguration.class,
        AuthenticationProtos.class,
        Trace.class,
        Gauge.class);
    HBaseConfiguration.addHbaseResources(getConf());
    String cp = cmd.getOptionValue('l');
    if (cp != null) {
        String jars[] = cp.split(":");
        StringBuilder newCp = new StringBuilder();
        for (int i = 0; i < jars.length; i++) {
            if (i > 0) newCp.append(':');
            newCp.append(addTmpFile(jars[i])); // append classpath entries to tmpfiles and trim paths from the classpath
        }
        getConf().set(JDBC_CLASSPATH, newCp.toString());
    }
    Job job = Job.getInstance(getConf(), "HalyardBulkExport " + source + " -> " + target);
    job.setJarByClass(HalyardBulkExport.class);
    job.setMaxMapAttempts(1);
    job.setMapperClass(BulkExportMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Void.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(QueryInputFormat.class);
    QueryInputFormat.setQueriesFromDirRecursive(job.getConfiguration(), queryFiles, false, 0);
    job.setOutputFormatClass(NullOutputFormat.class);
    TableMapReduceUtil.initCredentials(job);
    if (job.waitForCompletion(true)) {
        LOG.info("Bulk Export Completed..");
        return 0;
    }
    return -1;
}
Example 18
Source File: BlurOutputFormatMiniClusterTest.java From incubator-retired-blur with Apache License 2.0
@Test
public void testBlurOutputFormat() throws IOException, InterruptedException, ClassNotFoundException,
        BlurException, TException {
    fileSystem.delete(inDir, true);
    String tableName = "testBlurOutputFormat";
    writeRecordsFile("in/part1", 1, 1, 1, 1, "cf1");
    writeRecordsFile("in/part2", 1, 1, 2, 1, "cf1");

    Job job = Job.getInstance(conf, "blur index");
    job.setJarByClass(BlurOutputFormatMiniClusterTest.class);
    job.setMapperClass(CsvBlurMapper.class);
    job.setInputFormatClass(TextInputFormat.class);

    FileInputFormat.addInputPath(job, new Path(TEST_ROOT_DIR + "/in"));
    String tableUri = new Path(TEST_ROOT_DIR + "/blur/" + tableName)
        .makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory()).toString();
    CsvBlurMapper.addColumns(job, "cf1", "col");

    TableDescriptor tableDescriptor = new TableDescriptor();
    tableDescriptor.setShardCount(1);
    tableDescriptor.setTableUri(tableUri);
    tableDescriptor.setName(tableName);

    Iface client = getClient();
    client.createTable(tableDescriptor);

    BlurOutputFormat.setupJob(job, tableDescriptor);
    Path output = new Path(TEST_ROOT_DIR + "/out");
    BlurOutputFormat.setOutputPath(job, output);

    Path tablePath = new Path(tableUri);
    Path shardPath = new Path(tablePath, ShardUtil.getShardName(0));
    FileStatus[] listStatus = fileSystem.listStatus(shardPath);
    System.out.println("======" + listStatus.length);
    for (FileStatus fileStatus : listStatus) {
        System.out.println(fileStatus.getPath());
    }
    assertEquals(3, listStatus.length);

    assertTrue(job.waitForCompletion(true));
    Counters ctrs = job.getCounters();
    System.out.println("Counters: " + ctrs);

    client.loadData(tableName, output.toString());

    while (true) {
        TableStats tableStats = client.tableStats(tableName);
        System.out.println(tableStats);
        if (tableStats.getRowCount() > 0) {
            break;
        }
        Thread.sleep(100);
    }

    assertTrue(fileSystem.exists(tablePath));
    assertFalse(fileSystem.isFile(tablePath));

    FileStatus[] listStatusAfter = fileSystem.listStatus(shardPath);
    assertEquals(12, listStatusAfter.length);
}
Example 19
Source File: HadoopSegmentCreationJob.java From incubator-pinot with Apache License 2.0
public void run() throws Exception {
    _logger.info("Starting {}", getClass().getSimpleName());

    // Initialize all directories
    _outputDirFileSystem = FileSystem.get(new Path(_outputDir).toUri(), getConf());
    JobPreparationHelper.mkdirs(_outputDirFileSystem, new Path(_outputDir), _defaultPermissionsMask);
    JobPreparationHelper.mkdirs(_outputDirFileSystem, new Path(_stagingDir), _defaultPermissionsMask);
    Path stagingInputDir = new Path(_stagingDir, "input");
    JobPreparationHelper.mkdirs(_outputDirFileSystem, stagingInputDir, _defaultPermissionsMask);

    // Gather all data files
    List<Path> dataFilePaths = getDataFilePaths(_inputPattern);
    int numDataFiles = dataFilePaths.size();
    if (numDataFiles == 0) {
        String errorMessage = "No data file founded with pattern: " + _inputPattern;
        _logger.error(errorMessage);
        throw new RuntimeException(errorMessage);
    } else {
        _logger.info("Creating segments with data files: {}", dataFilePaths);
        for (int i = 0; i < numDataFiles; i++) {
            Path dataFilePath = dataFilePaths.get(i);
            try (DataOutputStream dataOutputStream =
                    _outputDirFileSystem.create(new Path(stagingInputDir, Integer.toString(i)))) {
                dataOutputStream.write(StringUtil.encodeUtf8(dataFilePath.toString() + " " + i));
                dataOutputStream.flush();
            }
        }
    }

    // Set up the job
    Job job = Job.getInstance(getConf());
    job.setJarByClass(getClass());
    job.setJobName(getClass().getName());

    Configuration jobConf = job.getConfiguration();
    String hadoopTokenFileLocation = System.getenv("HADOOP_TOKEN_FILE_LOCATION");
    if (hadoopTokenFileLocation != null) {
        jobConf.set("mapreduce.job.credentials.binary", hadoopTokenFileLocation);
    }
    jobConf.setInt(JobContext.NUM_MAPS, numDataFiles);

    // Set table config and schema
    TableConfig tableConfig = getTableConfig();
    if (tableConfig != null) {
        validateTableConfig(tableConfig);
        jobConf.set(JobConfigConstants.TABLE_CONFIG, tableConfig.toJsonString());
    }
    jobConf.set(JobConfigConstants.SCHEMA, getSchema().toSingleLineJsonString());

    // Set additional configurations
    for (Map.Entry<Object, Object> entry : _properties.entrySet()) {
        jobConf.set(entry.getKey().toString(), entry.getValue().toString());
    }

    job.setMapperClass(getMapperClass());
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, stagingInputDir);
    FileOutputFormat.setOutputPath(job, new Path(_stagingDir, "output"));

    addDepsJarToDistributedCache(job);
    addAdditionalJobProperties(job);

    // Submit the job
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed: " + job);
    }

    moveSegmentsToOutputDir();

    // Delete the staging directory
    _logger.info("Deleting the staging directory: {}", _stagingDir);
    _outputDirFileSystem.delete(new Path(_stagingDir), true);
}
Example 20
Source File: DBInputFormat.java From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job The map-reduce job
 * @param inputClass the class object implementing DBWritable, which is the
 * Java object holding tuple fields.
 * @param inputQuery the input query to select fields. Example :
 * "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
 * @param inputCountQuery the input query that returns
 * the number of records in the table.
 * Example : "SELECT COUNT(f1) FROM Mytable"
 * @see #setInput(Job, Class, String, String, String, String...)
 */
public static void setInput(Job job,
        Class<? extends DBWritable> inputClass,
        String inputQuery, String inputCountQuery) {
    job.setInputFormatClass(DBInputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setInputClass(inputClass);
    dbConf.setInputQuery(inputQuery);
    dbConf.setInputCountQuery(inputCountQuery);
}
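Putting the javadoc's example queries together, a driver could wire this up roughly as follows; MyTableWritable and the JDBC connection settings are placeholders, and the connection is assumed to have been configured through DBConfiguration beforehand.

// Hypothetical usage sketch -- MyTableWritable and the connection settings are placeholders.
DBConfiguration.configureDB(job.getConfiguration(),
    "com.mysql.jdbc.Driver", "jdbc:mysql://localhost/mydb", "user", "password");
DBInputFormat.setInput(job, MyTableWritable.class,
    "SELECT f1, f2, f3 FROM Mytable ORDER BY f1",
    "SELECT COUNT(f1) FROM Mytable");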