org.apache.avro.mapreduce.AvroKeyOutputFormat Java Examples
The following examples show how to use
org.apache.avro.mapreduce.AvroKeyOutputFormat.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HadoopSegmentPreprocessingJob.java From incubator-pinot with Apache License 2.0 | 5 votes |
private void setSchemaParams(Job job, Schema avroSchema) throws IOException { AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, avroSchema); AvroMultipleOutputs.setCountersEnabled(job, true); // Use LazyOutputFormat to avoid creating empty files. LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class); // Input and output paths. FileInputFormat.setInputPaths(job, _inputSegmentDir); FileOutputFormat.setOutputPath(job, _preprocessedOutputDir); }
Example #2
Source File: AvroHdfsFileSink.java From components with Apache License 2.0 | 4 votes |
public AvroHdfsFileSink(UgiDoAs doAs, String path, boolean overwrite, boolean mergeOutput) { super(doAs, path, overwrite, mergeOutput, (Class) AvroKeyOutputFormat.class); }
Example #3
Source File: ExportHBaseTableToAvro.java From HBase-ToHDFS with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { if (args.length == 0) { System.out.println("ExportHBaseTableToAvro {tableName} {ColumnFamily} {outputPath} {compressionCodec snappy,gzip} {schemaLocationOnHdfs} {rowKeyColumn.Optional}"); return; } String table = args[0]; String columnFamily = args[1]; String outputPath = args[2]; String compressionCodec = args[3]; String schemaFilePath = args[4]; String rowKeyColumn = ""; if (args.length > 5) { rowKeyColumn = args[5]; } Job job = Job.getInstance(); HBaseConfiguration.addHbaseResources(job.getConfiguration()); job.setJarByClass(ExportHBaseTableToAvro.class); job.setJobName("ExportHBaseTableToAvro "); job.getConfiguration().set(ROW_KEY_COLUMN_CONF, rowKeyColumn); job.getConfiguration().set(SCHEMA_FILE_LOCATION_CONF, schemaFilePath); Scan scan = new Scan(); scan.setCaching(500); // 1 is the default in Scan, which will be bad for // MapReduce jobs scan.setCacheBlocks(false); // don't set to true for MR jobs scan.addFamily(Bytes.toBytes(columnFamily)); TableMapReduceUtil.initTableMapperJob(table, // input HBase table name scan, // Scan instance to control CF and attribute selection MyMapper.class, // mapper null, // mapper output key null, // mapper output value job); job.setOutputFormatClass(AvroKeyOutputFormat.class); AvroKeyOutputFormat.setOutputPath(job, new Path(outputPath)); Schema.Parser parser = new Schema.Parser(); FileSystem fs = FileSystem.get(job.getConfiguration()); AvroJob.setOutputKeySchema(job, parser.parse(fs.open(new Path(schemaFilePath)))); if (compressionCodec.equals("snappy")) { AvroKeyOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); } else if (compressionCodec.equals("gzip")) { AvroKeyOutputFormat.setOutputCompressorClass(job, GzipCodec.class); } else { // nothing } job.setNumReduceTasks(0); boolean b = job.waitForCompletion(true); }
Example #4
Source File: AggregationPhaseJob.java From incubator-pinot with Apache License 2.0 | 4 votes |
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJobName(name); job.setJarByClass(AggregationPhaseJob.class); FileSystem fs = FileSystem.get(getConf()); Configuration configuration = job.getConfiguration(); // Properties LOGGER.info("Properties {}", props); // Input Path String inputPathDir = getAndSetConfiguration(configuration, AGG_PHASE_INPUT_PATH); LOGGER.info("Input path dir: " + inputPathDir); for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) { LOGGER.info("Adding input:" + inputPath); Path input = new Path(inputPath); FileInputFormat.addInputPath(job, input); } // Output path Path outputPath = new Path(getAndSetConfiguration(configuration, AGG_PHASE_OUTPUT_PATH)); LOGGER.info("Output path dir: " + outputPath.toString()); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(job, outputPath); // Schema Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir); LOGGER.info("Schema : {}", avroSchema.toString(true)); job.getConfiguration().set(AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString()); // ThirdEyeConfig String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty); String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty); ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props); LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode()); job.getConfiguration().set(AGG_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); // Map config job.setMapperClass(AggregationMapper.class); job.setInputFormatClass(AvroKeyInputFormat.class); job.setMapOutputKeyClass(BytesWritable.class); job.setMapOutputValueClass(BytesWritable.class); // Reduce config job.setReducerClass(AggregationReducer.class); job.setOutputKeyClass(AvroKey.class); job.setOutputValueClass(NullWritable.class); AvroJob.setOutputKeySchema(job, avroSchema); job.setOutputFormatClass(AvroKeyOutputFormat.class); String numReducers = props.getProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.getName()); LOGGER.info("Num Reducers : {}", numReducers); if (StringUtils.isNotBlank(numReducers)) { job.setNumReduceTasks(Integer.valueOf(numReducers)); LOGGER.info("Setting num reducers {}", job.getNumReduceTasks()); } job.waitForCompletion(true); Counter counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS); LOGGER.info(counter.getDisplayName() + " : " + counter.getValue()); if (counter.getValue() == 0) { throw new IllegalStateException("No input records in " + inputPathDir); } counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED); LOGGER.info(counter.getDisplayName() + " : " + counter.getValue()); for (String metric : thirdeyeConfig.getMetricNames()) { counter = job.getCounters().findCounter(thirdeyeConfig.getCollection(), metric); LOGGER.info(counter.getDisplayName() + " : " + counter.getValue()); } return job; }
Example #5
Source File: DerivedColumnTransformationPhaseJob.java From incubator-pinot with Apache License 2.0 | 4 votes |
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJobName(name); job.setJarByClass(DerivedColumnTransformationPhaseJob.class); Configuration configuration = job.getConfiguration(); FileSystem fs = FileSystem.get(configuration); // Input Path String inputPathDir = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH); LOGGER.info("Input path dir: " + inputPathDir); for (String inputPath : inputPathDir.split(",")) { LOGGER.info("Adding input:" + inputPath); Path input = new Path(inputPath); FileInputFormat.addInputPath(job, input); } // Topk path String topkPath = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH); LOGGER.info("Topk path : " + topkPath); // Output path Path outputPath = new Path(getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH)); LOGGER.info("Output path dir: " + outputPath.toString()); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(job, outputPath); // Schema Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir); LOGGER.info("Schema : {}", avroSchema.toString(true)); // ThirdEyeConfig String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty); String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty); ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props); job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode()); // New schema Schema outputSchema = newSchema(thirdeyeConfig); job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(), outputSchema.toString()); // Map config job.setMapperClass(DerivedColumnTransformationPhaseMapper.class); job.setInputFormatClass(AvroKeyInputFormat.class); job.setMapOutputKeyClass(AvroKey.class); job.setMapOutputValueClass(NullWritable.class); AvroJob.setOutputKeySchema(job, outputSchema); LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class); AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, outputSchema); job.setNumReduceTasks(0); job.waitForCompletion(true); return job; }
Example #6
Source File: SSTableExport.java From aegisthus with Apache License 2.0 | 4 votes |
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(SSTableExport.class); CommandLine cl = getOptions(args); if (cl == null) { return 1; } // Check all of the paths and load the sstable version from the input filenames List<Path> paths = Lists.newArrayList(); if (cl.hasOption(Feature.CMD_ARG_INPUT_FILE)) { for (String input : cl.getOptionValues(Feature.CMD_ARG_INPUT_FILE)) { checkVersionFromFilename(input); paths.add(new Path(input)); } } if (cl.hasOption(Feature.CMD_ARG_INPUT_DIR)) { paths.addAll(getDataFiles(job.getConfiguration(), cl.getOptionValue(Feature.CMD_ARG_INPUT_DIR))); } String avroSchemaString = getAvroSchema(cl.getOptionValue(Feature.CMD_ARG_AVRO_SCHEMA_FILE), job.getConfiguration()); Schema avroSchema = new Schema.Parser().parse(avroSchemaString); // At this point we have the version of sstable that we can use for this run job.getConfiguration().set(Aegisthus.Feature.CONF_SSTABLE_VERSION, version.toString()); if (job.getConfiguration().get(Aegisthus.Feature.CONF_CQL_SCHEMA) != null) { setConfigurationFromCql(job.getConfiguration()); } job.setInputFormatClass(AegisthusInputFormat.class); job.setMapperClass(CQLMapper.class); job.setOutputFormatClass(AvroKeyOutputFormat.class); AvroJob.setOutputKeySchema(job, avroSchema); // Map-only job job.setNumReduceTasks(0); TextInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()])); FileOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(Feature.CMD_ARG_OUTPUT_DIR))); job.submit(); System.out.println(job.getJobID()); System.out.println(job.getTrackingURL()); boolean success = job.waitForCompletion(true); return success ? 0 : 1; }
Example #7
Source File: BloomFilterCreator.java From hiped2 with Apache License 2.0 | 3 votes |
/** * The MapReduce driver - setup and launch the job. * * @param args the command-line arguments * @return the process exit code * @throws Exception if something goes wrong */ public int run(final String[] args) throws Exception { Cli cli = Cli.builder().setArgs(args).addOptions(ReplicatedJoin.UserOptions.values()).build(); int result = cli.runCmd(); if (result != 0) { return result; } Path usersPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.USERS)); Path outputPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.OUTPUT)); Configuration conf = super.getConf(); Job job = new Job(conf); job.setJarByClass(BloomFilterCreator.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); AvroJob.setOutputKeySchema(job, AvroBytesRecord.SCHEMA); job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, SnappyCodec.class.getName()); job.setOutputFormatClass(AvroKeyOutputFormat.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(BloomFilter.class); FileInputFormat.setInputPaths(job, usersPath); FileOutputFormat.setOutputPath(job, outputPath); job.setNumReduceTasks(1); return job.waitForCompletion(true) ? 0 : 1; }