org.apache.avro.mapred.AvroJob Java Examples

Source File: AvroAsJsonOutputFormat.java From iow-hadoop-streaming with Apache License 2.0

6 votes

/**
 * Configures an Avro data file writer from the settings held in a job configuration.
 *
 * <p>When output compression is enabled for the job, a codec is installed: deflate at the
 * configured level if the codec name equals {@code DEFLATE_CODEC} (also the default when no
 * name is set), otherwise whatever {@code CodecFactory.fromString} yields for the name.
 * The sync interval is then applied, and every job property whose key starts with
 * {@code AvroJob.TEXT_PREFIX} or {@code AvroJob.BINARY_PREFIX} is copied into the file
 * metadata under the key with that prefix stripped.
 *
 * @param <K> the datum type written by {@code writer}
 * @param writer the data file writer to configure
 * @param job the job configuration supplying the settings
 * @throws UnsupportedEncodingException if the ISO-8859-1 charset is not supported
 */
static <K> void configureDataFileWriter(DataFileWriter<K> writer,
    JobConf job) throws UnsupportedEncodingException {

    if (FileOutputFormat.getCompressOutput(job)) {
        final int deflateLevel = job.getInt(
                org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
                org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
        final String codec = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
        final CodecFactory codecFactory;
        if (codec.equals(DEFLATE_CODEC)) {
            codecFactory = CodecFactory.deflateCodec(deflateLevel);
        } else {
            codecFactory = CodecFactory.fromString(codec);
        }
        writer.setCodec(codecFactory);
    }

    final int syncInterval = job.getInt(
            org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY,
            DEFAULT_SYNC_INTERVAL);
    writer.setSyncInterval(syncInterval);

    // Job properties carrying one of the Avro metadata prefixes become file metadata.
    for (Map.Entry<String,String> property : job) {
        final String key = property.getKey();
        if (key.startsWith(AvroJob.TEXT_PREFIX)) {
            writer.setMeta(key.substring(AvroJob.TEXT_PREFIX.length()), property.getValue());
        }
        if (key.startsWith(AvroJob.BINARY_PREFIX)) {
            // Binary values are URL-decoded and written as raw ISO-8859-1 bytes.
            final byte[] rawValue = URLDecoder.decode(property.getValue(), "ISO-8859-1")
                    .getBytes("ISO-8859-1");
            writer.setMeta(key.substring(AvroJob.BINARY_PREFIX.length()), rawValue);
        }
    }
}

Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0

6 votes

/**
 * Builds a JobConf for a map-reduce job, starting from the base configuration returned by
 * the no-argument {@code createJobConf()} and registering the mapper, the reducer and the
 * two schemas on it.
 *
 * @param mapperClass AvroMapper subclass used for the map phase.
 * @param reducerClass AvroReducer subclass used for the reduce phase.
 * @param mapperOutputSchema Schema of the mapper output. Must be an instance of org.apache.avro.mapred.Pair
 * @param outputSchema Schema of the reducer output.
 * @return The configured JobConf.
 * @throws IOException propagated from the base {@code createJobConf()}
 * @throws URISyntaxException propagated from the base {@code createJobConf()}
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Class<? extends AvroReducer> reducerClass,
                             Schema mapperOutputSchema,
                             Schema outputSchema) throws IOException, URISyntaxException
{
  final JobConf jobConf = createJobConf();

  // Processing classes first, then the schemas.
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  AvroJob.setMapOutputSchema(jobConf, mapperOutputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  return jobConf;
}

Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0

6 votes

/**
 * Builds a JobConf for a map-reduce job that also runs a combiner. Starts from the base
 * configuration returned by the no-argument {@code createJobConf()} and registers the
 * mapper, reducer, combiner and the two schemas on it.
 *
 * @param mapperClass AvroMapper subclass used for the map phase.
 * @param reducerClass AvroReducer subclass used for the reduce phase.
 * @param combinerClass AvroReducer subclass used as the combiner.
 * @param mapperOutputSchema Schema of the mapper output. Must be an instance of org.apache.avro.mapred.Pair
 * @param outputSchema Schema of the reducer output.
 * @return The configured JobConf.
 * @throws IOException propagated from the base {@code createJobConf()}
 * @throws URISyntaxException propagated from the base {@code createJobConf()}
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Class<? extends AvroReducer> reducerClass,
                             Class<? extends AvroReducer> combinerClass,
                             Schema mapperOutputSchema,
                             Schema outputSchema) throws IOException, URISyntaxException
{
  final JobConf jobConf = createJobConf();

  // Processing classes first, then the schemas.
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  AvroJob.setCombinerClass(jobConf, combinerClass);
  AvroJob.setMapOutputSchema(jobConf, mapperOutputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  return jobConf;
}

Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0

6 votes

/**
 * Builds a JobConf for a map-only job whose input schema is supplied by the caller.
 * The number of reduce tasks is set to zero; {@code AvroReducer} itself is registered
 * as the reducer class.
 *
 * @param mapperClass AvroMapper subclass used for the map phase.
 * @param inputSchema Schema of the input data.
 * @param outputSchema Schema of the mapper output.
 * @return The configured JobConf.
 * @throws IOException propagated from the base {@code createJobConf()}
 * @throws URISyntaxException propagated from the base {@code createJobConf()}
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass, 
                             Schema inputSchema, 
                             Schema outputSchema) throws IOException, URISyntaxException
{
  final JobConf jobConf = createJobConf();

  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, AvroReducer.class);
  AvroJob.setInputSchema(jobConf, inputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  // Zero reduce tasks makes this a map-only job.
  jobConf.setNumReduceTasks(0);

  return jobConf;
}

Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0

6 votes

/**
 * Builds a JobConf for a map-reduce job whose input schema is supplied by the caller.
 * Starts from the base configuration returned by the no-argument {@code createJobConf()}.
 *
 * @param mapperClass AvroMapper subclass used for the map phase.
 * @param reducerClass AvroReducer subclass used for the reduce phase.
 * @param inputSchema Schema of the input data.
 * @param mapperOutputSchema Schema of the mapper output. Must be an instance of org.apache.avro.mapred.Pair
 * @param outputSchema Schema of the reducer output.
 * @return The configured JobConf.
 * @throws IOException propagated from the base {@code createJobConf()}
 * @throws URISyntaxException propagated from the base {@code createJobConf()}
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Class<? extends AvroReducer> reducerClass,
                             Schema inputSchema,
                             Schema mapperOutputSchema,
                             Schema outputSchema) throws IOException, URISyntaxException
{
  final JobConf jobConf = createJobConf();

  // Processing classes first, then the three schemas.
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  AvroJob.setInputSchema(jobConf, inputSchema);
  AvroJob.setMapOutputSchema(jobConf, mapperOutputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  return jobConf;
}

Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0

6 votes

/**
 * Builds a JobConf for a map-reduce job that runs a combiner and whose input schema is
 * supplied by the caller. Starts from the base configuration returned by the no-argument
 * {@code createJobConf()}.
 *
 * @param mapperClass AvroMapper subclass used for the map phase.
 * @param reducerClass AvroReducer subclass used for the reduce phase.
 * @param combinerClass AvroReducer subclass used as the combiner.
 * @param inputSchema Schema of the input data.
 * @param mapperOutputSchema Schema of the mapper output. Must be an instance of org.apache.avro.mapred.Pair
 * @param outputSchema Schema of the reducer output.
 * @return The configured JobConf.
 * @throws IOException propagated from the base {@code createJobConf()}
 * @throws URISyntaxException propagated from the base {@code createJobConf()}
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Class<? extends AvroReducer> reducerClass,
                             Class<? extends AvroReducer> combinerClass,
                             Schema inputSchema,
                             Schema mapperOutputSchema,
                             Schema outputSchema) throws IOException, URISyntaxException
{
  final JobConf jobConf = createJobConf();

  // Processing classes first, then the three schemas.
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  AvroJob.setCombinerClass(jobConf, combinerClass);
  AvroJob.setInputSchema(jobConf, inputSchema);
  AvroJob.setMapOutputSchema(jobConf, mapperOutputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  return jobConf;
}

Source File: RegressionTest.java From ml-ease with Apache License 2.0

6 votes

/**
 * Stores the configuration and, when it is non-null, initialises this instance from it:
 * reads the Avro output schema, feeds the model found at {@code MODEL_PATH} into
 * {@code _modelConsumer}, and reads the {@code LAMBDA} and {@code BINARY_FEATURE} settings.
 *
 * <p>A failure to read the model is reported through the logger and does not abort
 * configuration; the remaining settings are still applied.
 *
 * @param conf the configuration to apply; when {@code null} only the superclass is updated
 */
@Override
public void setConf(Configuration conf)
{
  super.setConf(conf);
  if (conf == null)
  {
    return;
  }
  _outputSchema = AvroJob.getOutputSchema(conf);
  AvroDistributedCacheFileReader modelReader =
      new AvroDistributedCacheFileReader(new JobConf(conf));
  try
  {
    modelReader.build(conf.get(MODEL_PATH), _modelConsumer);
    _modelConsumer.done();
  }
  catch (IOException e)
  {
    // Still best-effort (no rethrow), but report through the logger with the cause
    // attached instead of printing the stack trace to stderr.
    _logger.error("Failed to load the model from " + conf.get(MODEL_PATH), e);
  }
  _lambda = conf.getFloat(LAMBDA, 0);
  _ignoreValue = conf.getBoolean(BINARY_FEATURE, false);
  _logger.info("Loaded the model for test, size:" + _modelConsumer.get().size());
}

Source File: ItemModelTest.java From ml-ease with Apache License 2.0

5 votes

/**
 * Builds the JobConf for this job. The output schema is a record named
 * "PerItemTestOutput" holding a copy of every input field followed by a float field
 * "pred"; the map output schema pairs a string key with the input schema.
 *
 * @param mapperClass AvroMapper subclass used for the map phase.
 * @param reducerClass AvroReducer subclass used for the reduce phase.
 * @return The configured JobConf.
 * @throws IOException propagated from the base {@code createJobConf()}
 * @throws URISyntaxException propagated from the base {@code createJobConf()}
 * @throws IllegalStateException if no input schema could be determined
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass) throws IOException, URISyntaxException
{
  final JobConf jobConf = createJobConf();
  final Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(jobConf));
  if (inputSchema == null)
  {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema);

  // Copy each input field (name, schema, doc; null as the last argument), then append "pred".
  final List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field inputField : inputSchema.getFields())
  {
    Schema.Field copiedField =
        new Schema.Field(inputField.name(), inputField.schema(), inputField.doc(), null);
    outputFields.add(copiedField);
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  final Schema outputSchema = Schema.createRecord("PerItemTestOutput",
                                                  "Test output for PerItemTest",
                                                  "com.linkedin.lab.regression.avro",
                                                  false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(jobConf, outputSchema);
  final Schema mapOutputSchema = Pair.getPairSchema(Schema.create(Type.STRING), inputSchema);
  AvroJob.setMapOutputSchema(jobConf, mapOutputSchema);
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  return jobConf;
}

Source File: AvroRecordWriter.java From spork with Apache License 2.0

5 votes

/**
 * Configures an Avro data file writer from the settings held in a job configuration.
 *
 * <p>When output compression is enabled for the job, a codec is installed: deflate at the
 * configured level if the codec name equals {@code DEFLATE_CODEC} (also the default when no
 * name is set), otherwise whatever {@code CodecFactory.fromString} yields for the name.
 * The sync interval is then applied, and every job property whose key starts with
 * {@code AvroJob.TEXT_PREFIX} or {@code AvroJob.BINARY_PREFIX} is copied into the file
 * metadata under the key with that prefix stripped.
 *
 * @param writer the data file writer to configure
 * @param job the job configuration supplying the settings
 * @throws UnsupportedEncodingException if the ISO-8859-1 charset is not supported
 */
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer,
    JobConf job) throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(job)) {
    final int deflateLevel = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
    final String codec = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    final CodecFactory codecFactory;
    if (codec.equals(DEFLATE_CODEC)) {
      codecFactory = CodecFactory.deflateCodec(deflateLevel);
    } else {
      codecFactory = CodecFactory.fromString(codec);
    }
    writer.setCodec(codecFactory);
  }

  // When no sync interval is configured, fall back to the larger of io.file.buffer.size
  // and the default (core-default.xml has io.file.buffer.size as 4K).
  final int bufferSize = job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL);
  final int fallbackInterval = Math.max(bufferSize, DEFAULT_SYNC_INTERVAL);
  writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY, fallbackInterval));

  // Job properties carrying one of the Avro metadata prefixes become file metadata.
  for (Map.Entry<String,String> property : job) {
    final String key = property.getKey();
    if (key.startsWith(AvroJob.TEXT_PREFIX)) {
      writer.setMeta(key.substring(AvroJob.TEXT_PREFIX.length()), property.getValue());
    }
    if (key.startsWith(AvroJob.BINARY_PREFIX)) {
      // Binary values are URL-decoded and written as raw ISO-8859-1 bytes.
      final byte[] rawValue = URLDecoder.decode(property.getValue(), "ISO-8859-1")
          .getBytes("ISO-8859-1");
      writer.setMeta(key.substring(AvroJob.BINARY_PREFIX.length()), rawValue);
    }
  }
}

Source File: BloomFilterCreator.java From hiped2 with Apache License 2.0

5 votes

/**
 * Job driver: parses the command line, configures a job that reads key/value text input
 * and writes Avro output using the {@code AvroBytesRecord} schema, then runs it and
 * waits for the result.
 *
 * @param args the command-line arguments
 * @return the non-zero CLI status if argument handling fails; otherwise 0 when the job
 *         succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  final Cli cli = Cli.builder()
      .setArgs(args)
      .addOptions(CliCommonOpts.MrIoOpts.values())
      .build();
  final int cliStatus = cli.runCmd();
  if (cliStatus != 0) {
    return cliStatus;
  }

  final Path input = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  final Path output = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  final JobConf job = new JobConf(super.getConf());
  job.setJarByClass(BloomFilterCreator.class);

  // Avro output settings.
  job.set(AvroJob.OUTPUT_SCHEMA, AvroBytesRecord.SCHEMA.toString());
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setInputFormat(KeyValueTextInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  // Intermediate and final key/value types are the same.
  job.setMapOutputKeyClass(NullWritable.class);
  job.setMapOutputValueClass(BloomFilter.class);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(BloomFilter.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  if (JobClient.runJob(job).isSuccessful()) {
    return 0;
  }
  return 1;
}

Source File: RegressionTest.java From ml-ease with Apache License 2.0

5 votes

/**
 * Builds the JobConf for this job. The output schema is a record named "AdmmTestOutput"
 * holding a copy of every input field followed by a float field "pred"; the map output
 * schema pairs a float key with that output schema.
 *
 * @param mapperClass AvroMapper subclass used for the map phase.
 * @param reducerClass AvroReducer subclass used for the reduce phase.
 * @return The configured JobConf.
 * @throws IOException propagated from the base {@code createJobConf()}
 * @throws URISyntaxException propagated from the base {@code createJobConf()}
 * @throws IllegalStateException if no input schema could be determined
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass) throws IOException, URISyntaxException
{
  final JobConf jobConf = createJobConf();
  final Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(jobConf));
  if (inputSchema == null)
  {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema);

  // Copy each input field (name, schema, doc; null as the last argument), then append "pred".
  final List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field inputField : inputSchema.getFields())
  {
    Schema.Field copiedField =
        new Schema.Field(inputField.name(), inputField.schema(), inputField.doc(), null);
    outputFields.add(copiedField);
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  final Schema outputSchema = Schema.createRecord("AdmmTestOutput",
                                                  "Test output for AdmmTest",
                                                  "com.linkedin.lab.regression.avro",
                                                  false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(jobConf, outputSchema);
  final Schema mapOutputSchema = Pair.getPairSchema(Schema.create(Type.FLOAT), outputSchema);
  AvroJob.setMapOutputSchema(jobConf, mapOutputSchema);
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  return jobConf;
}

Source File: AvroFileAccessor.java From pxf with Apache License 2.0

5 votes

/**
 * Prepares Avro-specific state and then delegates to the superclass to open the resource.
 *
 * @return whatever the superclass {@code openForRead()} returns
 * @throws Exception if the superclass fails to open the resource
 */
@Override
public boolean openForRead() throws Exception {
    // Register the schema on the job configuration before the superclass opens the input.
    AvroJob.setInputSchema(jobConf, schema);

    // Fresh wrapper instance, kept in a field for use during iteration.
    avroWrapper = new AvroWrapper<>();

    final boolean opened = super.openForRead();
    return opened;
}

Source File: ItemModelTest.java From ml-ease with Apache License 2.0

5 votes

/**
 * Stores the configuration and, when it is non-null, initialises this instance from it:
 * reads the Avro output schema and the {@code LAMBDA} / {@code BINARY_FEATURE} settings,
 * then loads the linear models found at {@code MODEL_PATH} into a
 * {@code ReadLinearModelConsumer} built from the lambda key, the reduce task id and the
 * configured number of reducers.
 *
 * @param conf the configuration to apply; when {@code null} only the superclass is updated
 * @throws RuntimeException if "mapred.task.partition" is missing or negative, or if the
 *         model cannot be read (the underlying {@code IOException} is attached as the cause)
 */
@Override
public void setConf(Configuration conf)
{
  super.setConf(conf);
  if (conf == null)
  {
    return;
  }
  _outputSchema = AvroJob.getOutputSchema(conf);
  _lambda = conf.getFloat(LAMBDA, 0);
  _ignoreValue = conf.getBoolean(BINARY_FEATURE, false);
  String modelPath = conf.get(MODEL_PATH, "");
  _logger.info("Going to read model files from distributed cache at:" + modelPath);
  int reduceTaskId = conf.getInt("mapred.task.partition", -1);
  _logger.info("The reduce task id=" + reduceTaskId);
  if (reduceTaskId < 0)
  {
    throw new RuntimeException("Can't read reduce task id from mapred.task.partition!");
  }
  int nReducers = conf.getInt(NUM_REDUCERS, -1);
  String lambdaKey = String.valueOf(_lambda) + "#";
  _consumer = new ReadLinearModelConsumer(lambdaKey, reduceTaskId, nReducers);
  AvroDistributedCacheFileReader modelReader =
      new AvroDistributedCacheFileReader(new JobConf(conf));
  try
  {
    modelReader.build(modelPath, _consumer);
    _consumer.done();
  }
  catch (IOException e)
  {
    // Chain the IOException as the cause so its stack trace is not lost.
    throw new RuntimeException("Can't load model, error=" + e, e);
  }
  _logger.info("Loaded linear models, number of models loaded="
      + _consumer.get().size());
}

Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0

5 votes

/**
 * Builds a JobConf for a map-only job. No input schema is set here; the job keeps whatever
 * the base {@code createJobConf()} configured. The number of reduce tasks is set to zero;
 * {@code AvroReducer} itself is registered as the reducer class.
 *
 * @param mapperClass AvroMapper subclass used for the map phase.
 * @param outputSchema Schema of the mapper output.
 * @return The configured JobConf.
 * @throws IOException propagated from the base {@code createJobConf()}
 * @throws URISyntaxException propagated from the base {@code createJobConf()}
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass, 
                             Schema outputSchema) throws IOException, URISyntaxException
{
  final JobConf jobConf = createJobConf();

  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, AvroReducer.class);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  // Zero reduce tasks makes this a map-only job.
  jobConf.setNumReduceTasks(0);

  return jobConf;
}

Source File: DBImportMapReduce.java From hiped2 with Apache License 2.0

4 votes

/**
 * Job driver: parses the command line, configures a map-only job that reads rows through
 * {@code DBInputFormat} and writes them as Avro using the {@code Stock} schema, then runs
 * it and waits for the result.
 *
 * @param args the command-line arguments
 * @return the non-zero CLI status if argument handling fails; otherwise 0 when the job
 *         succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  final Cli cli = Cli.builder()
      .setArgs(args)
      .addOptions(CliCommonOpts.OutputFileOption.values())
      .build();
  final int cliStatus = cli.runCmd();
  if (cliStatus != 0) {
    return cliStatus;
  }

  final Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.OutputFileOption.OUTPUT));

  final Configuration baseConf = super.getConf();

  // Database connection settings must be on the configuration before the JobConf copies it.
  final String jdbcUrl = "jdbc:mysql://localhost/sqoop_test" +
      "?user=hip_sqoop_user&password=password";
  DBConfiguration.configureDB(baseConf, "com.mysql.jdbc.Driver", jdbcUrl);

  final JobConf job = new JobConf(baseConf);
  job.setJarByClass(DBImportMapReduce.class);

  job.setInputFormat(DBInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);
  AvroJob.setOutputSchema(job, Stock.SCHEMA$);
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setMapperClass(Map.class);

  // Four map tasks, no reducers.
  job.setNumMapTasks(4);
  job.setNumReduceTasks(0);

  job.setMapOutputKeyClass(AvroWrapper.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setOutputKeyClass(AvroWrapper.class);
  job.setOutputValueClass(NullWritable.class);

  FileOutputFormat.setOutputPath(job, outputPath);

  // Row query plus the count query handed to DBInputFormat.
  DBInputFormat.setInput(
      job,
      StockDbWritable.class,
      "select * from stocks",
      "SELECT COUNT(id) FROM stocks");

  if (JobClient.runJob(job).isSuccessful()) {
    return 0;
  }
  return 1;
}

Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0

4 votes

/**
 * Builds the base JobConf shared by the other {@code createJobConf} overloads.
 *
 * <p>Sets the job name, Avro input/output formats and deflate level 9, then applies
 * optional settings read from {@code _config}: the Hadoop UGI, local-mode settings, child
 * JVM options, input paths (after which the input schema is set from
 * {@code AvroUtils.getAvroInputSchema}), and the output path (deleted first, with the
 * recursive flag, when "force.output.overwrite" is true). Finally every config key whose
 * lower-cased form starts with {@code HADOOP_PREFIX} is copied into the JobConf with
 * that prefix removed.
 *
 * @return The configured JobConf.
 * @throws IOException if a file-system operation performed during setup fails
 * @throws URISyntaxException declared for callers; not thrown directly by the statements here
 */
protected  JobConf createJobConf() throws IOException, URISyntaxException
{
  final JobConf jobConf = new JobConf();

  jobConf.setJobName(getJobId());
  jobConf.setInputFormat(AvroInputFormat.class);
  jobConf.setOutputFormat(AvroOutputFormat.class);

  AvroOutputFormat.setDeflateLevel(jobConf, 9);

  final String ugi = _config.getString("hadoop.job.ugi", null);
  if (ugi != null)
  {
    jobConf.set("hadoop.job.ugi", ugi);
  }

  final boolean runLocally = _config.getBoolean("is.local", false);
  if (runLocally)
  {
    jobConf.set("mapred.job.tracker", "local");
    jobConf.set("fs.default.name", "file:///");
    jobConf.set("mapred.local.dir", "/tmp/map-red");

    _log.info("Running locally, no hadoop jar set.");
  }

  // Forward child JVM options when they are configured.
  if (_config.containsKey("mapred.child.java.opts"))
  {
    final String childOpts = _config.getString("mapred.child.java.opts");
    jobConf.set("mapred.child.java.opts", childOpts);
    _log.info("mapred.child.java.opts set to " + childOpts);
  }

  if (_config.containsKey(INPUT_PATHS))
  {
    for (String pathname : _config.getStringList(INPUT_PATHS))
    {
      AvroUtils.addAllSubPaths(jobConf, new Path(pathname));
    }
    // The schema is looked up only after all input paths have been added.
    AvroJob.setInputSchema(jobConf, AvroUtils.getAvroInputSchema(jobConf));
  }

  if (_config.containsKey(OUTPUT_PATH))
  {
    AvroOutputFormat.setOutputPath(jobConf, new Path(_config.get(OUTPUT_PATH)));

    if (_config.getBoolean("force.output.overwrite", false))
    {
      final Path existingOutput = FileOutputFormat.getOutputPath(jobConf);
      existingOutput.getFileSystem(jobConf).delete(existingOutput, true);
    }
  }

  // Pass through every prefixed entry as a plain Hadoop setting.
  for (String key : _config.keySet())
  {
    if (!key.toLowerCase().startsWith(HADOOP_PREFIX))
    {
      continue;
    }
    jobConf.set(key.substring(HADOOP_PREFIX.length()), _config.get(key));
  }
  return jobConf;
}

Source File: AvroMixedMapReduce.java From hiped2 with Apache License 2.0

3 votes

/**
 * Job driver: parses the command line, configures a job that reads Avro {@code Stock}
 * records and writes Avro {@code StockAvg} records, then runs it and waits for the result.
 *
 * @param args the command-line arguments
 * @return the non-zero CLI status if argument handling fails; otherwise 0 when the job
 *         succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  final Cli cli = Cli.builder()
      .setArgs(args)
      .addOptions(CliCommonOpts.MrIoOpts.values())
      .build();
  final int cliStatus = cli.runCmd();
  if (cliStatus != 0) {
    return cliStatus;
  }

  final Path input = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  final Path output = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  final JobConf job = new JobConf(super.getConf());
  job.setJarByClass(AvroMixedMapReduce.class);

  // Avro schemas and output codec.
  job.set(AvroJob.INPUT_SCHEMA, Stock.SCHEMA$.toString());
  job.set(AvroJob.OUTPUT_SCHEMA, StockAvg.SCHEMA$.toString());
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setInputFormat(AvroInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  // Intermediate and final key/value types are the same.
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DoubleWritable.class);

  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  if (JobClient.runJob(job).isSuccessful()) {
    return 0;
  }
  return 1;
}

Source File: SmallFilesMapReduce.java From hiped2 with Apache License 2.0

3 votes

/**
 * Job driver: parses the command line, configures a map-only job that reads Avro input
 * using the {@code SmallFilesWrite} schema and writes text output, then runs it and waits
 * for the result.
 *
 * @param args the command-line arguments
 * @return the non-zero CLI status if argument handling fails; otherwise 0 when the job
 *         succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  final Cli cli = Cli.builder()
      .setArgs(args)
      .addOptions(CliCommonOpts.MrIoOpts.values())
      .build();
  final int cliStatus = cli.runCmd();
  if (cliStatus != 0) {
    return cliStatus;
  }

  // The input stays a String; the output is wrapped in a Path.
  final String input = cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT);
  final Path output = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  final JobConf job = new JobConf(super.getConf());
  job.setJarByClass(SmallFilesMapReduce.class);

  job.set(AvroJob.INPUT_SCHEMA, SmallFilesWrite.SCHEMA.toString());

  job.setInputFormat(AvroInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);

  job.setMapperClass(Map.class);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  // Zero reduce tasks makes this a map-only job.
  job.setNumReduceTasks(0);

  if (JobClient.runJob(job).isSuccessful()) {
    return 0;
  }
  return 1;
}

org.apache.avro.mapred.AvroJob Java Examples