org.apache.avro.mapreduce.AvroKeyInputFormat Java Examples
The following examples show how to use
org.apache.avro.mapreduce.AvroKeyInputFormat.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HiveIncrPullSource.java From hudi with Apache License 2.0 | 6 votes |
@Override protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) { try { // find the source commit to pull Option<String> commitToPull = findCommitToPull(lastCheckpointStr); if (!commitToPull.isPresent()) { return new InputBatch<>(Option.empty(), lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : ""); } // read the files out. List<FileStatus> commitDeltaFiles = Arrays.asList(fs.listStatus(new Path(incrPullRootPath, commitToPull.get()))); String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(",")); JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration()); return new InputBatch<>(Option.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))), String.valueOf(commitToPull.get())); } catch (IOException ioe) { throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe); } }
Example #2
Source File: AvroHdfsFileSource.java From components with Apache License 2.0 | 5 votes |
/**
 * Creates a file source that reads Avro data through {@code AvroKeyInputFormat}, with
 * {@code AvroKey} as the key class and {@code NullWritable} as the value class.
 *
 * @param doAs forwarded unchanged to the superclass constructor
 * @param filepattern forwarded unchanged to the superclass constructor
 * @param lac coder stored on this instance and used to build the default key coder
 * @param extraConfig forwarded unchanged to the superclass constructor
 * @param serializableSplit forwarded unchanged to the superclass constructor
 */
private AvroHdfsFileSource(UgiDoAs doAs, String filepattern, LazyAvroCoder<?> lac, ExtraHadoopConfiguration extraConfig, SerializableSplit serializableSplit) {
  // NOTE(review): the raw (Class) cast presumably works around the superclass's generic
  // input-format parameter; confirm against the superclass signature.
  super(doAs, filepattern, (Class) AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, extraConfig, serializableSplit);
  this.lac = lac;
  // Keys are coded via a wrapper around the supplied coder; values are NullWritable.
  setDefaultCoder(LazyAvroKeyWrapper.of(lac), WritableCoder.of(NullWritable.class));
}
Example #3
Source File: AvroDFSSource.java From hudi with Apache License 2.0 | 4 votes |
/**
 * Reads the Avro files at the given path string into an RDD of generic records.
 *
 * @param pathStr path string handed as-is to {@code newAPIHadoopFile}
 * @return the datum of every Avro key read, cast to {@code GenericRecord}
 */
private JavaRDD<GenericRecord> fromFiles(String pathStr) {
  JavaPairRDD<AvroKey, NullWritable> keyedRecords = sparkContext.newAPIHadoopFile(
      pathStr,
      AvroKeyInputFormat.class,
      AvroKey.class,
      NullWritable.class,
      sparkContext.hadoopConfiguration());

  // The values are NullWritable placeholders; only the keys hold records.
  JavaRDD<AvroKey> avroKeys = keyedRecords.keys();
  return avroKeys.map(key -> (GenericRecord) key.datum());
}
Example #4
Source File: CombineAvroKeyInputFormat.java From incubator-pinot with Apache License 2.0 | 4 votes |
/**
 * Creates a record-reader wrapper backed by a freshly constructed {@code AvroKeyInputFormat}.
 *
 * @param split combined split, forwarded unchanged to the superclass constructor
 * @param context task attempt context, forwarded unchanged to the superclass constructor
 * @param index forwarded unchanged to the superclass constructor
 *     (presumably the position of the file within {@code split}; confirm against the superclass)
 * @throws IOException declared by this constructor; the body only delegates to the superclass constructor
 * @throws InterruptedException declared by this constructor; the body only delegates to the superclass constructor
 */
public AvroKeyRecordReaderWrapper(CombineFileSplit split, TaskAttemptContext context, Integer index) throws IOException, InterruptedException {
  super(new AvroKeyInputFormat<>(), split, context, index);
}
Example #5
Source File: AggregationPhaseJob.java From incubator-pinot with Apache License 2.0 | 4 votes |
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJobName(name); job.setJarByClass(AggregationPhaseJob.class); FileSystem fs = FileSystem.get(getConf()); Configuration configuration = job.getConfiguration(); // Properties LOGGER.info("Properties {}", props); // Input Path String inputPathDir = getAndSetConfiguration(configuration, AGG_PHASE_INPUT_PATH); LOGGER.info("Input path dir: " + inputPathDir); for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) { LOGGER.info("Adding input:" + inputPath); Path input = new Path(inputPath); FileInputFormat.addInputPath(job, input); } // Output path Path outputPath = new Path(getAndSetConfiguration(configuration, AGG_PHASE_OUTPUT_PATH)); LOGGER.info("Output path dir: " + outputPath.toString()); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(job, outputPath); // Schema Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir); LOGGER.info("Schema : {}", avroSchema.toString(true)); job.getConfiguration().set(AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString()); // ThirdEyeConfig String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty); String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty); ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props); LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode()); job.getConfiguration().set(AGG_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); 
// Map config job.setMapperClass(AggregationMapper.class); job.setInputFormatClass(AvroKeyInputFormat.class); job.setMapOutputKeyClass(BytesWritable.class); job.setMapOutputValueClass(BytesWritable.class); // Reduce config job.setReducerClass(AggregationReducer.class); job.setOutputKeyClass(AvroKey.class); job.setOutputValueClass(NullWritable.class); AvroJob.setOutputKeySchema(job, avroSchema); job.setOutputFormatClass(AvroKeyOutputFormat.class); String numReducers = props.getProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.getName()); LOGGER.info("Num Reducers : {}", numReducers); if (StringUtils.isNotBlank(numReducers)) { job.setNumReduceTasks(Integer.valueOf(numReducers)); LOGGER.info("Setting num reducers {}", job.getNumReduceTasks()); } job.waitForCompletion(true); Counter counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS); LOGGER.info(counter.getDisplayName() + " : " + counter.getValue()); if (counter.getValue() == 0) { throw new IllegalStateException("No input records in " + inputPathDir); } counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED); LOGGER.info(counter.getDisplayName() + " : " + counter.getValue()); for (String metric : thirdeyeConfig.getMetricNames()) { counter = job.getCounters().findCounter(thirdeyeConfig.getCollection(), metric); LOGGER.info(counter.getDisplayName() + " : " + counter.getValue()); } return job; }
Example #6
Source File: TopKPhaseJob.java From incubator-pinot with Apache License 2.0 | 4 votes |
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJobName(name); job.setJarByClass(TopKPhaseJob.class); Configuration configuration = job.getConfiguration(); FileSystem fs = FileSystem.get(configuration); // Properties LOGGER.info("Properties {}", props); // Input Path String inputPathDir = getAndSetConfiguration(configuration, TOPK_PHASE_INPUT_PATH); LOGGER.info("Input path dir: " + inputPathDir); for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) { LOGGER.info("Adding input:" + inputPath); Path input = new Path(inputPath); FileInputFormat.addInputPath(job, input); } // Output path Path outputPath = new Path(getAndSetConfiguration(configuration, TOPK_PHASE_OUTPUT_PATH)); LOGGER.info("Output path dir: " + outputPath.toString()); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(job, outputPath); // Schema Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir); LOGGER.info("Schema : {}", avroSchema.toString(true)); // ThirdEyeConfig String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty); String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty); ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props); LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode()); job.getConfiguration().set(TOPK_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); // Map config job.setMapperClass(TopKPhaseMapper.class); 
job.setInputFormatClass(AvroKeyInputFormat.class); job.setMapOutputKeyClass(BytesWritable.class); job.setMapOutputValueClass(BytesWritable.class); // Combiner job.setCombinerClass(TopKPhaseCombiner.class); // Reduce config job.setReducerClass(TopKPhaseReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); job.setNumReduceTasks(1); job.waitForCompletion(true); return job; }
Example #7
Source File: DerivedColumnTransformationPhaseJob.java From incubator-pinot with Apache License 2.0 | 4 votes |
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJobName(name); job.setJarByClass(DerivedColumnTransformationPhaseJob.class); Configuration configuration = job.getConfiguration(); FileSystem fs = FileSystem.get(configuration); // Input Path String inputPathDir = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH); LOGGER.info("Input path dir: " + inputPathDir); for (String inputPath : inputPathDir.split(",")) { LOGGER.info("Adding input:" + inputPath); Path input = new Path(inputPath); FileInputFormat.addInputPath(job, input); } // Topk path String topkPath = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH); LOGGER.info("Topk path : " + topkPath); // Output path Path outputPath = new Path(getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH)); LOGGER.info("Output path dir: " + outputPath.toString()); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(job, outputPath); // Schema Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir); LOGGER.info("Schema : {}", avroSchema.toString(true)); // ThirdEyeConfig String dimensionTypesProperty = ThirdeyeAvroUtils.getDimensionTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_NAMES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_DIMENSION_TYPES.toString(), dimensionTypesProperty); String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty); ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props); job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(), 
OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode()); // New schema Schema outputSchema = newSchema(thirdeyeConfig); job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(), outputSchema.toString()); // Map config job.setMapperClass(DerivedColumnTransformationPhaseMapper.class); job.setInputFormatClass(AvroKeyInputFormat.class); job.setMapOutputKeyClass(AvroKey.class); job.setMapOutputValueClass(NullWritable.class); AvroJob.setOutputKeySchema(job, outputSchema); LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class); AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, outputSchema); job.setNumReduceTasks(0); job.waitForCompletion(true); return job; }
Example #8
Source File: OSMRunner.java From geowave with Apache License 2.0 | 4 votes |
@Override public int run(final String[] args) throws Exception { final Configuration conf = getConf(); conf.set("tableName", ingestOptions.getQualifiedTableName()); conf.set("osmVisibility", ingestOptions.getVisibilityOptions().getVisibility()); // job settings final Job job = Job.getInstance(conf, ingestOptions.getJobName()); job.setJarByClass(OSMRunner.class); switch (ingestOptions.getMapperType()) { case "NODE": { configureSchema(AvroNode.getClassSchema()); inputAvroFile = ingestOptions.getNodesBasePath(); job.setMapperClass(OSMNodeMapper.class); break; } case "WAY": { configureSchema(AvroWay.getClassSchema()); inputAvroFile = ingestOptions.getWaysBasePath(); job.setMapperClass(OSMWayMapper.class); break; } case "RELATION": { configureSchema(AvroRelation.getClassSchema()); inputAvroFile = ingestOptions.getRelationsBasePath(); job.setMapperClass(OSMRelationMapper.class); break; } default: break; } if ((avroSchema == null) || (inputAvroFile == null)) { throw new MissingArgumentException( "argument for mapper type must be one of: NODE, WAY, or RELATION"); } enableLocalityGroups(ingestOptions); // input format job.setInputFormatClass(AvroKeyInputFormat.class); FileInputFormat.setInputPaths(job, inputAvroFile); AvroJob.setInputKeySchema(job, avroSchema); // mappper job.setOutputKeyClass(Text.class); job.setOutputValueClass(Mutation.class); job.setOutputFormatClass(AccumuloOutputFormat.class); AccumuloOutputFormat.setConnectorInfo( job, accumuloOptions.getUser(), new PasswordToken(accumuloOptions.getPassword())); AccumuloOutputFormat.setCreateTables(job, true); AccumuloOutputFormat.setDefaultTableName(job, ingestOptions.getQualifiedTableName()); AccumuloOutputFormat.setZooKeeperInstance( job, new ClientConfiguration().withInstance(accumuloOptions.getInstance()).withZkHosts( accumuloOptions.getZookeeper())); // reducer job.setNumReduceTasks(0); return job.waitForCompletion(true) ? 0 : -1; }
Example #9
Source File: FileSystemViewKeyInputFormat.java From kite with Apache License 2.0 | 4 votes |
/**
 * Supplies the input format used to read entities of type {@code E}.
 *
 * @return a new {@code AvroKeyInputFormat} instance on every call
 */
@Override
FileInputFormat<AvroKey<E>, NullWritable> getInputFormat() {
  final AvroKeyInputFormat<E> avroFormat = new AvroKeyInputFormat<E>();
  return avroFormat;
}