org.apache.avro.mapred.AvroJob Java Examples
The following examples show how to use
org.apache.avro.mapred.AvroJob.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AvroAsJsonOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 6 votes |
static <K> void configureDataFileWriter(DataFileWriter<K> writer, JobConf job) throws UnsupportedEncodingException { if (FileOutputFormat.getCompressOutput(job)) { int level = job.getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY, org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL); String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC); CodecFactory factory = codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level) : CodecFactory.fromString(codecName); writer.setCodec(factory); } writer.setSyncInterval(job.getInt(org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY, DEFAULT_SYNC_INTERVAL)); // copy metadata from job for (Map.Entry<String,String> e : job) { if (e.getKey().startsWith(AvroJob.TEXT_PREFIX)) writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()),e.getValue()); if (e.getKey().startsWith(AvroJob.BINARY_PREFIX)) writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()), URLDecoder.decode(e.getValue(), "ISO-8859-1") .getBytes("ISO-8859-1")); } }
Example #2
Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0 | 6 votes |
/**
 * Builds the configuration for a map-reduce job on top of the base
 * configuration returned by {@code createJobConf()}.
 *
 * @param mapperClass        AvroMapper subclass used for the map phase
 * @param reducerClass       AvroReducer subclass used for the reduce phase
 * @param mapperOutputSchema schema of the mapper output; must be an
 *                           org.apache.avro.mapred.Pair schema
 * @param outputSchema       schema of the reducer output
 * @return the configured JobConf
 * @throws IOException        if building the base configuration fails
 * @throws URISyntaxException if building the base configuration fails
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Class<? extends AvroReducer> reducerClass,
                             Schema mapperOutputSchema,
                             Schema outputSchema)
    throws IOException, URISyntaxException {
  JobConf jobConf = createJobConf();

  // Processing classes.
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);

  // Schemas for the intermediate and final output.
  AvroJob.setMapOutputSchema(jobConf, mapperOutputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  return jobConf;
}
Example #3
Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0 | 6 votes |
/**
 * Builds the configuration for a map-reduce job that also runs a combiner,
 * on top of the base configuration returned by {@code createJobConf()}.
 *
 * @param mapperClass        AvroMapper subclass used for the map phase
 * @param reducerClass       AvroReducer subclass used for the reduce phase
 * @param combinerClass      AvroReducer subclass used as the combiner
 * @param mapperOutputSchema schema of the mapper output; must be an
 *                           org.apache.avro.mapred.Pair schema
 * @param outputSchema       schema of the reducer output
 * @return the configured JobConf
 * @throws IOException        if building the base configuration fails
 * @throws URISyntaxException if building the base configuration fails
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Class<? extends AvroReducer> reducerClass,
                             Class<? extends AvroReducer> combinerClass,
                             Schema mapperOutputSchema,
                             Schema outputSchema)
    throws IOException, URISyntaxException {
  JobConf jobConf = createJobConf();

  // Processing classes.
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  AvroJob.setCombinerClass(jobConf, combinerClass);

  // Schemas for the intermediate and final output.
  AvroJob.setMapOutputSchema(jobConf, mapperOutputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  return jobConf;
}
Example #4
Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0 | 6 votes |
/**
 * Builds the configuration for a map-only job whose input schema is given
 * explicitly.
 *
 * @param mapperClass  AvroMapper subclass implementing the map phase
 * @param inputSchema  schema of the input data
 * @param outputSchema schema of the mapper output
 * @return the configured JobConf
 * @throws IOException        if building the base configuration fails
 * @throws URISyntaxException if building the base configuration fails
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Schema inputSchema,
                             Schema outputSchema)
    throws IOException, URISyntaxException {
  JobConf jobConf = createJobConf();

  AvroJob.setMapperClass(jobConf, mapperClass);
  // The base AvroReducer is registered, but the reduce task count is set
  // to zero below, so the job is map-only.
  AvroJob.setReducerClass(jobConf, AvroReducer.class);

  AvroJob.setInputSchema(jobConf, inputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  jobConf.setNumReduceTasks(0);
  return jobConf;
}
Example #5
Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0 | 6 votes |
/**
 * Builds the configuration for a map-reduce job whose input schema is given
 * explicitly.
 *
 * @param mapperClass        AvroMapper subclass used for the map phase
 * @param reducerClass       AvroReducer subclass used for the reduce phase
 * @param inputSchema        schema of the input data
 * @param mapperOutputSchema schema of the mapper output; must be an
 *                           org.apache.avro.mapred.Pair schema
 * @param outputSchema       schema of the reducer output
 * @return the configured JobConf
 * @throws IOException        if building the base configuration fails
 * @throws URISyntaxException if building the base configuration fails
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Class<? extends AvroReducer> reducerClass,
                             Schema inputSchema,
                             Schema mapperOutputSchema,
                             Schema outputSchema)
    throws IOException, URISyntaxException {
  JobConf jobConf = createJobConf();

  // Processing classes.
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);

  // Schemas for the input, the intermediate output and the final output.
  AvroJob.setInputSchema(jobConf, inputSchema);
  AvroJob.setMapOutputSchema(jobConf, mapperOutputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  return jobConf;
}
Example #6
Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0 | 6 votes |
/**
 * Builds the configuration for a map-reduce job that runs a combiner and
 * whose input schema is given explicitly.
 *
 * @param mapperClass        AvroMapper subclass used for the map phase
 * @param reducerClass       AvroReducer subclass used for the reduce phase
 * @param combinerClass      AvroReducer subclass used as the combiner
 * @param inputSchema        schema of the input data
 * @param mapperOutputSchema schema of the mapper output; must be an
 *                           org.apache.avro.mapred.Pair schema
 * @param outputSchema       schema of the reducer output
 * @return the configured JobConf
 * @throws IOException        if building the base configuration fails
 * @throws URISyntaxException if building the base configuration fails
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Class<? extends AvroReducer> reducerClass,
                             Class<? extends AvroReducer> combinerClass,
                             Schema inputSchema,
                             Schema mapperOutputSchema,
                             Schema outputSchema)
    throws IOException, URISyntaxException {
  JobConf jobConf = createJobConf();

  // Processing classes.
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  AvroJob.setCombinerClass(jobConf, combinerClass);

  // Schemas for the input, the intermediate output and the final output.
  AvroJob.setInputSchema(jobConf, inputSchema);
  AvroJob.setMapOutputSchema(jobConf, mapperOutputSchema);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  return jobConf;
}
Example #7
Source File: RegressionTest.java From ml-ease with Apache License 2.0 | 6 votes |
/**
 * Stores the configuration and initialises this object from it: reads the
 * output schema, loads the model from the path configured under
 * {@code MODEL_PATH}, and reads the lambda and binary-feature settings.
 * Does nothing beyond delegating to the superclass when {@code conf} is
 * {@code null}.
 *
 * @param conf the job configuration; may be {@code null}
 * @throws RuntimeException if the model cannot be read
 */
@Override
public void setConf(Configuration conf) {
  super.setConf(conf);
  if (conf == null) {
    return;
  }

  _outputSchema = AvroJob.getOutputSchema(conf);

  AvroDistributedCacheFileReader modelReader =
      new AvroDistributedCacheFileReader(new JobConf(conf));
  try {
    modelReader.build(conf.get(MODEL_PATH), _modelConsumer);
    _modelConsumer.done();
  } catch (IOException e) {
    // Fail fast instead of printing the stack trace and carrying on without
    // a model; the original exception is kept as the cause.
    throw new RuntimeException("Can't load model from " + conf.get(MODEL_PATH), e);
  }

  _lambda = conf.getFloat(LAMBDA, 0);
  _ignoreValue = conf.getBoolean(BINARY_FEATURE, false);
  _logger.info("Loaded the model for test, size:" + _modelConsumer.get().size());
}
Example #8
Source File: ItemModelTest.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Builds the job configuration for the per-item test. The output record
 * schema is the input schema's fields plus a float field named "pred"; the
 * map output is a Pair of a string key and the input schema.
 *
 * @param mapperClass  AvroMapper subclass used for the map phase
 * @param reducerClass AvroReducer subclass used for the reduce phase
 * @return the configured JobConf
 * @throws IOException           if building the base configuration fails
 * @throws URISyntaxException    if building the base configuration fails
 * @throws IllegalStateException if no input schema could be determined
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass)
    throws IOException, URISyntaxException {
  JobConf jobConf = createJobConf();

  Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(jobConf));
  if (inputSchema == null) {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema.toString());

  // Rebuild each input field as a fresh Field (name, schema and doc are
  // carried over; null is passed as the last constructor argument), then
  // append the prediction field.
  List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field source : inputSchema.getFields()) {
    outputFields.add(new Schema.Field(source.name(), source.schema(), source.doc(), null));
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  Schema outputSchema = Schema.createRecord(
      "PerItemTestOutput",
      "Test output for PerItemTest",
      "com.linkedin.lab.regression.avro",
      false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(jobConf, outputSchema);
  Schema mapOutputSchema = Pair.getPairSchema(Schema.create(Type.STRING), inputSchema);
  AvroJob.setMapOutputSchema(jobConf, mapOutputSchema);
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  return jobConf;
}
Example #9
Source File: AvroRecordWriter.java From spork with Apache License 2.0 | 5 votes |
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer, JobConf job) throws UnsupportedEncodingException { if (FileOutputFormat.getCompressOutput(job)) { int level = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL); String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC); CodecFactory factory = codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level) : CodecFactory.fromString(codecName); writer.setCodec(factory); } // Do max as core-default.xml has io.file.buffer.size as 4K writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY, Math.max( job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL))); // copy metadata from job for (Map.Entry<String,String> e : job) { if (e.getKey().startsWith(AvroJob.TEXT_PREFIX)) writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue()); if (e.getKey().startsWith(AvroJob.BINARY_PREFIX)) writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()), URLDecoder.decode(e.getValue(), "ISO-8859-1") .getBytes("ISO-8859-1")); } }
Example #10
Source File: BloomFilterCreator.java From hiped2 with Apache License 2.0 | 5 votes |
/**
 * Job driver: parses the command line, configures a job that reads
 * key/value text input and writes Avro output, then runs it and waits for
 * the result.
 *
 * @param args the command-line arguments
 * @return the non-zero CLI result if argument handling fails; otherwise 0
 *         when the job succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder()
      .setArgs(args)
      .addOptions(CliCommonOpts.MrIoOpts.values())
      .build();
  int cliStatus = cli.runCmd();
  if (cliStatus != 0) {
    return cliStatus;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  JobConf job = new JobConf(super.getConf());
  job.setJarByClass(BloomFilterCreator.class);

  // Avro output settings: record schema and codec.
  job.set(AvroJob.OUTPUT_SCHEMA, AvroBytesRecord.SCHEMA.toString());
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setInputFormat(KeyValueTextInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(NullWritable.class);
  job.setMapOutputValueClass(BloomFilter.class);
  job.setOutputKeyClass(NullWritable.class);
  job.setOutputValueClass(BloomFilter.class);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  boolean succeeded = JobClient.runJob(job).isSuccessful();
  return succeeded ? 0 : 1;
}
Example #11
Source File: RegressionTest.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Builds the job configuration for the regression test. The output record
 * schema is the input schema's fields plus a float field named "pred"; the
 * map output is a Pair of a float key and that output schema.
 *
 * @param mapperClass  AvroMapper subclass used for the map phase
 * @param reducerClass AvroReducer subclass used for the reduce phase
 * @return the configured JobConf
 * @throws IOException           if building the base configuration fails
 * @throws URISyntaxException    if building the base configuration fails
 * @throws IllegalStateException if no input schema could be determined
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass)
    throws IOException, URISyntaxException {
  JobConf jobConf = createJobConf();

  Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(jobConf));
  if (inputSchema == null) {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema.toString());

  // Rebuild each input field as a fresh Field (name, schema and doc are
  // carried over; null is passed as the last constructor argument), then
  // append the prediction field.
  List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field source : inputSchema.getFields()) {
    outputFields.add(new Schema.Field(source.name(), source.schema(), source.doc(), null));
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  Schema outputSchema = Schema.createRecord(
      "AdmmTestOutput",
      "Test output for AdmmTest",
      "com.linkedin.lab.regression.avro",
      false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(jobConf, outputSchema);
  Schema mapOutputSchema = Pair.getPairSchema(Schema.create(Type.FLOAT), outputSchema);
  AvroJob.setMapOutputSchema(jobConf, mapOutputSchema);
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  return jobConf;
}
Example #12
Source File: AvroFileAccessor.java From pxf with Apache License 2.0 | 5 votes |
@Override public boolean openForRead() throws Exception { // Pass the schema to the AvroInputFormat AvroJob.setInputSchema(jobConf, schema); // The avroWrapper required for the iteration avroWrapper = new AvroWrapper<>(); return super.openForRead(); }
Example #13
Source File: ItemModelTest.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Stores the configuration and initialises this object from it: reads the
 * output schema, lambda and binary-feature settings, then loads the linear
 * models from the path configured under {@code MODEL_PATH}. Does nothing
 * beyond delegating to the superclass when {@code conf} is {@code null}.
 *
 * @param conf the job configuration; may be {@code null}
 * @throws RuntimeException if "mapred.task.partition" is missing or
 *         negative, or if the model files cannot be read
 */
@Override
public void setConf(Configuration conf) {
  super.setConf(conf);
  if (conf == null) {
    return;
  }

  _outputSchema = AvroJob.getOutputSchema(conf);
  _lambda = conf.getFloat(LAMBDA, 0);
  _ignoreValue = conf.getBoolean(BINARY_FEATURE, false);

  String modelPath = conf.get(MODEL_PATH, "");
  _logger.info("Going to read model files from distributed cache at:" + modelPath);

  int reduceTaskId = conf.getInt("mapred.task.partition", -1);
  _logger.info("The reduce task id=" + reduceTaskId);
  if (reduceTaskId < 0) {
    throw new RuntimeException("Can't read reduce task id from mapred.task.partition!");
  }

  int nReducers = conf.getInt(NUM_REDUCERS, -1);
  String lambdaKey = String.valueOf(_lambda) + "#";
  _consumer = new ReadLinearModelConsumer(lambdaKey, reduceTaskId, nReducers);

  AvroDistributedCacheFileReader modelReader =
      new AvroDistributedCacheFileReader(new JobConf(conf));
  try {
    modelReader.build(modelPath, _consumer);
    _consumer.done();
  } catch (IOException e) {
    // Chain the IOException as the cause so its stack trace is not lost.
    throw new RuntimeException("Can't load model, error=" + e, e);
  }
  _logger.info("Loaded linear models, number of models loaded=" + _consumer.get().size());
}
Example #14
Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Builds the configuration for a map-only job on top of the base
 * configuration returned by {@code createJobConf()}; no input schema is set
 * here.
 *
 * @param mapperClass  AvroMapper subclass implementing the map phase
 * @param outputSchema schema of the mapper output
 * @return the configured JobConf
 * @throws IOException        if building the base configuration fails
 * @throws URISyntaxException if building the base configuration fails
 */
public JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                             Schema outputSchema)
    throws IOException, URISyntaxException {
  JobConf jobConf = createJobConf();

  AvroJob.setMapperClass(jobConf, mapperClass);
  // The base AvroReducer is registered, but the reduce task count is set
  // to zero below, so the job is map-only.
  AvroJob.setReducerClass(jobConf, AvroReducer.class);
  AvroJob.setOutputSchema(jobConf, outputSchema);

  jobConf.setNumReduceTasks(0);
  return jobConf;
}
Example #15
Source File: DBImportMapReduce.java From hiped2 with Apache License 2.0 | 4 votes |
/**
 * Job driver: parses the command line, configures a map-only job that reads
 * rows from a database and writes Avro output, then runs it and waits for
 * the result.
 *
 * @param args the command-line arguments
 * @return the non-zero CLI result if argument handling fails; otherwise 0
 *         when the job succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder()
      .setArgs(args)
      .addOptions(CliCommonOpts.OutputFileOption.values())
      .build();
  int cliStatus = cli.runCmd();
  if (cliStatus != 0) {
    return cliStatus;
  }

  Path output = new Path(cli.getArgValueAsString(CliCommonOpts.OutputFileOption.OUTPUT));

  Configuration conf = super.getConf();

  // NOTE(review): the JDBC URL, user and password are hard-coded here.
  DBConfiguration.configureDB(
      conf,
      "com.mysql.jdbc.Driver",
      "jdbc:mysql://localhost/sqoop_test" + "?user=hip_sqoop_user&password=password");

  JobConf job = new JobConf(conf);
  job.setJarByClass(DBImportMapReduce.class);

  job.setInputFormat(DBInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);
  AvroJob.setOutputSchema(job, Stock.SCHEMA$);
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  // Map-only job: four map tasks, no reducers.
  job.setMapperClass(Map.class);
  job.setNumMapTasks(4);
  job.setNumReduceTasks(0);

  job.setMapOutputKeyClass(AvroWrapper.class);
  job.setMapOutputValueClass(NullWritable.class);
  job.setOutputKeyClass(AvroWrapper.class);
  job.setOutputValueClass(NullWritable.class);

  FileOutputFormat.setOutputPath(job, output);

  // Input query and the matching count query.
  DBInputFormat.setInput(
      job,
      StockDbWritable.class,
      "select * from stocks",
      "SELECT COUNT(id) FROM stocks");

  RunningJob runningJob = JobClient.runJob(job);
  if (runningJob.isSuccessful()) {
    return 0;
  }
  return 1;
}
Example #16
Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0 | 4 votes |
/** * Sets up various standard settings in the JobConf. You probably don't want to mess with this. * * @return A configured JobConf. * @throws IOException * @throws URISyntaxException */ protected JobConf createJobConf() throws IOException, URISyntaxException { JobConf conf = new JobConf(); conf.setJobName(getJobId()); conf.setInputFormat(AvroInputFormat.class); conf.setOutputFormat(AvroOutputFormat.class); AvroOutputFormat.setDeflateLevel(conf, 9); String hadoop_ugi = _config.getString("hadoop.job.ugi", null); if (hadoop_ugi != null) { conf.set("hadoop.job.ugi", hadoop_ugi); } if (_config.getBoolean("is.local", false)) { conf.set("mapred.job.tracker", "local"); conf.set("fs.default.name", "file:///"); conf.set("mapred.local.dir", "/tmp/map-red"); _log.info("Running locally, no hadoop jar set."); } // set JVM options if present if (_config.containsKey("mapred.child.java.opts")) { conf.set("mapred.child.java.opts", _config.getString("mapred.child.java.opts")); _log.info("mapred.child.java.opts set to " + _config.getString("mapred.child.java.opts")); } if (_config.containsKey(INPUT_PATHS)) { List<String> inputPathnames = _config.getStringList(INPUT_PATHS); for (String pathname : inputPathnames) { AvroUtils.addAllSubPaths(conf, new Path(pathname)); } AvroJob.setInputSchema(conf, AvroUtils.getAvroInputSchema(conf)); } if (_config.containsKey(OUTPUT_PATH)) { Path path = new Path(_config.get(OUTPUT_PATH)); AvroOutputFormat.setOutputPath(conf, path); if (_config.getBoolean("force.output.overwrite", false)) { FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf); fs.delete(FileOutputFormat.getOutputPath(conf), true); } } // set all hadoop configs for (String key : _config.keySet()) { String lowerCase = key.toLowerCase(); if ( lowerCase.startsWith(HADOOP_PREFIX)) { String newKey = key.substring(HADOOP_PREFIX.length()); conf.set(newKey, _config.get(key)); } } return conf; }
Example #17
Source File: AvroMixedMapReduce.java From hiped2 with Apache License 2.0 | 3 votes |
/**
 * Job driver: parses the command line, configures a job with Avro input and
 * Avro output, then runs it and waits for the result.
 *
 * @param args the command-line arguments
 * @return the non-zero CLI result if argument handling fails; otherwise 0
 *         when the job succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder()
      .setArgs(args)
      .addOptions(CliCommonOpts.MrIoOpts.values())
      .build();
  int cliStatus = cli.runCmd();
  if (cliStatus != 0) {
    return cliStatus;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  JobConf job = new JobConf(super.getConf());
  job.setJarByClass(AvroMixedMapReduce.class);

  // Avro settings: input schema, output schema and output codec.
  job.set(AvroJob.INPUT_SCHEMA, Stock.SCHEMA$.toString());
  job.set(AvroJob.OUTPUT_SCHEMA, StockAvg.SCHEMA$.toString());
  job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

  job.setInputFormat(AvroInputFormat.class);
  job.setOutputFormat(AvroOutputFormat.class);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DoubleWritable.class);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  boolean succeeded = JobClient.runJob(job).isSuccessful();
  return succeeded ? 0 : 1;
}
Example #18
Source File: SmallFilesMapReduce.java From hiped2 with Apache License 2.0 | 3 votes |
/**
 * Job driver: parses the command line, configures a map-only job that reads
 * Avro input and writes text output, then runs it and waits for the result.
 *
 * @param args the command-line arguments
 * @return the non-zero CLI result if argument handling fails; otherwise 0
 *         when the job succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder()
      .setArgs(args)
      .addOptions(CliCommonOpts.MrIoOpts.values())
      .build();
  int cliStatus = cli.runCmd();
  if (cliStatus != 0) {
    return cliStatus;
  }

  // The input stays a plain String and is handed to the String overload of
  // setInputPaths below.
  String inputPath = cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT);
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  JobConf job = new JobConf(super.getConf());
  job.setJarByClass(SmallFilesMapReduce.class);

  job.set(AvroJob.INPUT_SCHEMA, SmallFilesWrite.SCHEMA.toString());
  job.setInputFormat(AvroInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);

  job.setMapperClass(Map.class);

  FileInputFormat.setInputPaths(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  // No reducers: the mapper output is the job output.
  job.setNumReduceTasks(0);

  boolean succeeded = JobClient.runJob(job).isSuccessful();
  return succeeded ? 0 : 1;
}