org.apache.avro.mapred.AvroInputFormat Java Examples
The following examples show how to use
org.apache.avro.mapred.AvroInputFormat.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AvroDataSupplier.java From tablasco with Apache License 2.0 | 5 votes |
/**
 * Loads the Avro data found at {@code dataPath} through the Spark context and exposes it as a
 * {@link DistributedTable}.
 *
 * <p>The headers are taken from the first element produced by mapping the RDD keys with
 * {@code AvroHeadersFunction}; the rows are produced by mapping the whole RDD with
 * {@code AvroRowsFunction}.
 *
 * @return a table made of the extracted headers and the row RDD
 */
@Override
public DistributedTable get() {
    JavaPairRDD<AvroWrapper, NullWritable> avroPairs =
            this.sparkContext.hadoopFile(
                    this.dataPath.toString(),
                    AvroInputFormat.class,
                    AvroWrapper.class,
                    NullWritable.class);
    LOGGER.info("data location: {}", this.dataPath);

    List<String> columnNames = avroPairs.keys().map(new AvroHeadersFunction()).first();
    LOGGER.info("data headers: {}", columnNames);

    JavaRDD<List<Object>> rowData = avroPairs.map(new AvroRowsFunction(columnNames));
    return new DistributedTable(columnNames, rowData);
}
Example #2
Source File: AvroUtils.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Recursively registers every non-ignored file found beneath a root path as a job input.
 *
 * <p>Directories are descended into; plain files are added through
 * {@code AvroInputFormat.addInputPath}. Entries for which {@code shouldPathBeIgnored} returns
 * true are skipped. A root path that does not exist results in no inputs being added.
 *
 * @param conf The JobConf receiving the input paths.
 * @param path The root path to walk.
 * @throws IOException if a file-system call made while walking the tree fails
 * @throws IllegalArgumentException if the root path itself is one that should be ignored
 */
public static void addAllSubPaths(JobConf conf, Path path) throws IOException {
    if (shouldPathBeIgnored(path)) {
        throw new IllegalArgumentException(String.format("Path[%s] should be ignored.", path));
    }

    final FileSystem fileSystem = path.getFileSystem(conf);
    if (!fileSystem.exists(path)) {
        return;
    }

    for (FileStatus entry : fileSystem.listStatus(path)) {
        Path child = entry.getPath();
        if (shouldPathBeIgnored(child)) {
            continue;
        }
        if (entry.isDir()) {
            addAllSubPaths(conf, child);
        } else {
            AvroInputFormat.addInputPath(conf, child);
        }
    }
}
Example #3
Source File: AvroUtils.java From ml-ease with Apache License 2.0 | 5 votes |
/**
 * Runs an Avro Hadoop job described by the given job configuration and blocks until it finishes.
 *
 * <p>The configured input paths and the output path are logged before the job is submitted.
 *
 * @param conf the job configuration to run
 * @throws Exception if the job cannot be submitted or monitored, or if it finishes unsuccessfully
 */
public static void runAvroJob(JobConf conf) throws Exception {
    Path[] inputPaths = AvroInputFormat.getInputPaths(conf);
    _log.info("Running hadoop job with input paths:");
    for (Path inputPath : inputPaths) {
        _log.info(inputPath);
    }
    _log.info("Output path=" + AvroOutputFormat.getOutputPath(conf));

    Job job = new Job(conf);
    job.setJarByClass(AvroUtils.class);

    // waitForCompletion signals an unsuccessful job through its boolean result rather than by
    // throwing, so the result must be checked or a failed job would look like a success.
    if (!job.waitForCompletion(true)) {
        throw new java.io.IOException("Hadoop job failed: " + conf.getJobName());
    }
}
Example #4
Source File: AvroFileAccessor.java From pxf with Apache License 2.0 | 4 votes |
/**
 * Creates an accessor that hands a new {@code AvroInputFormat<GenericRecord>} to the superclass
 * constructor and picks up the {@code AvroUtilities} instance returned by
 * {@code AvroUtilities.getInstance()}.
 */
public AvroFileAccessor() {
    super(new AvroInputFormat<GenericRecord>());
    this.avroUtilities = AvroUtilities.getInstance();
}
Example #5
Source File: AbstractAvroJob.java From ml-ease with Apache License 2.0 | 4 votes |
/** * Sets up various standard settings in the JobConf. You probably don't want to mess with this. * * @return A configured JobConf. * @throws IOException * @throws URISyntaxException */ protected JobConf createJobConf() throws IOException, URISyntaxException { JobConf conf = new JobConf(); conf.setJobName(getJobId()); conf.setInputFormat(AvroInputFormat.class); conf.setOutputFormat(AvroOutputFormat.class); AvroOutputFormat.setDeflateLevel(conf, 9); String hadoop_ugi = _config.getString("hadoop.job.ugi", null); if (hadoop_ugi != null) { conf.set("hadoop.job.ugi", hadoop_ugi); } if (_config.getBoolean("is.local", false)) { conf.set("mapred.job.tracker", "local"); conf.set("fs.default.name", "file:///"); conf.set("mapred.local.dir", "/tmp/map-red"); _log.info("Running locally, no hadoop jar set."); } // set JVM options if present if (_config.containsKey("mapred.child.java.opts")) { conf.set("mapred.child.java.opts", _config.getString("mapred.child.java.opts")); _log.info("mapred.child.java.opts set to " + _config.getString("mapred.child.java.opts")); } if (_config.containsKey(INPUT_PATHS)) { List<String> inputPathnames = _config.getStringList(INPUT_PATHS); for (String pathname : inputPathnames) { AvroUtils.addAllSubPaths(conf, new Path(pathname)); } AvroJob.setInputSchema(conf, AvroUtils.getAvroInputSchema(conf)); } if (_config.containsKey(OUTPUT_PATH)) { Path path = new Path(_config.get(OUTPUT_PATH)); AvroOutputFormat.setOutputPath(conf, path); if (_config.getBoolean("force.output.overwrite", false)) { FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf); fs.delete(FileOutputFormat.getOutputPath(conf), true); } } // set all hadoop configs for (String key : _config.keySet()) { String lowerCase = key.toLowerCase(); if ( lowerCase.startsWith(HADOOP_PREFIX)) { String newKey = key.substring(HADOOP_PREFIX.length()); conf.set(newKey, _config.get(key)); } } return conf; }
Example #6
Source File: AvroStorage.java From spork with Apache License 2.0 | 4 votes |
/**
 * Returns the ship files that {@code FuncUtils.getShipFiles} resolves for the classes this
 * storage function depends on: {@code Schema} and {@code AvroInputFormat}.
 *
 * @return the list produced by {@code FuncUtils.getShipFiles} for those two classes
 */
@Override
public List<String> getShipFiles() {
    return FuncUtils.getShipFiles(new Class[] {Schema.class, AvroInputFormat.class});
}
Example #7
Source File: AvroMixedMapReduce.java From hiped2 with Apache License 2.0 | 3 votes |
/**
 * Configures and launches the MapReduce job: Avro input and output, with {@code Text} keys and
 * {@code DoubleWritable} values for both the map output and the job output.
 *
 * @param args the command-line arguments
 * @return the command-line parser's non-zero code if parsing fails; otherwise 0 when the job
 *     succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder()
            .setArgs(args)
            .addOptions(CliCommonOpts.MrIoOpts.values())
            .build();
    int cliStatus = cli.runCmd();
    if (cliStatus != 0) {
        return cliStatus;
    }

    Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

    Configuration baseConf = super.getConf();
    JobConf job = new JobConf(baseConf);
    job.setJarByClass(AvroMixedMapReduce.class);

    // Avro schemas and output compression codec
    job.set(AvroJob.INPUT_SCHEMA, Stock.SCHEMA$.toString());
    job.set(AvroJob.OUTPUT_SCHEMA, StockAvg.SCHEMA$.toString());
    job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

    // Input/output formats
    job.setInputFormat(AvroInputFormat.class);
    job.setOutputFormat(AvroOutputFormat.class);

    // Map and reduce stages with their key/value types
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    if (JobClient.runJob(job).isSuccessful()) {
        return 0;
    }
    return 1;
}
Example #8
Source File: SmallFilesMapReduce.java From hiped2 with Apache License 2.0 | 3 votes |
/**
 * Configures and launches a map-only job (zero reduce tasks) that reads Avro input and writes
 * text output.
 *
 * @param args the command-line arguments
 * @return the command-line parser's non-zero code if parsing fails; otherwise 0 when the job
 *     succeeds and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder()
            .setArgs(args)
            .addOptions(CliCommonOpts.MrIoOpts.values())
            .build();
    int cliStatus = cli.runCmd();
    if (cliStatus != 0) {
        return cliStatus;
    }

    // The input is kept as a String and passed to FileInputFormat as-is.
    String inputPath = cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT);
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

    Configuration baseConf = super.getConf();
    JobConf job = new JobConf(baseConf);
    job.setJarByClass(SmallFilesMapReduce.class);

    job.set(AvroJob.INPUT_SCHEMA, SmallFilesWrite.SCHEMA.toString());

    job.setInputFormat(AvroInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    job.setMapperClass(Map.class);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setNumReduceTasks(0);

    if (JobClient.runJob(job).isSuccessful()) {
        return 0;
    }
    return 1;
}