org.apache.avro.mapred.AvroValue Java Examples
The following examples show how to use
org.apache.avro.mapred.AvroValue.
The source file, originating project, and license are noted above each example.
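As a primer, here is a minimal, hypothetical sketch (not taken from any of the projects below) of the pattern the examples share: new AvroValue<>(datum) wraps an Avro record for Hadoop's shuffle, and value.datum() unwraps it again on the read side. The class name and types here are placeholders.

import java.io.IOException;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative only: not code from the projects below.
public class PassThroughMapper
    extends Mapper<AvroKey<GenericRecord>, NullWritable, AvroKey<GenericRecord>, AvroValue<GenericRecord>> {

  @Override
  protected void map(AvroKey<GenericRecord> key, NullWritable ignored, Context context)
      throws IOException, InterruptedException {
    GenericRecord record = key.datum();                             // unwrap the Avro datum
    context.write(new AvroKey<>(record), new AvroValue<>(record));  // wrap key and value for the shuffle
  }
}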
Example #1
Source File: RegressionNaiveTrain.java From ml-ease with Apache License 2.0
@Override
public int getPartition(AvroKey<String> key, AvroValue<Integer> value, int numPartitions) {
  String k = key.datum().toString();
  if (_partitionIdMap != null) {
    if (_partitionIdMap.containsKey(k)) {
      int partitionId = _partitionIdMap.get(k);
      return partitionId % numPartitions;
    }
  }
  return Math.abs(k.hashCode()) % numPartitions;
}
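A custom partitioner like this is registered on the job like any other Hadoop partitioner. The following is a hypothetical wiring sketch, not code from ml-ease; the class names and reducer count are placeholders.

import java.io.IOException;

import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public final class PartitionerWiring {

  /** Placeholder partitioner with the same key/value types as the example above. */
  public static class ExampleHashPartitioner extends Partitioner<AvroKey<String>, AvroValue<Integer>> {
    @Override
    public int getPartition(AvroKey<String> key, AvroValue<Integer> value, int numPartitions) {
      return (key.datum().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
  }

  public static Job newJob(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, "avro-partitioned-job");
    job.setPartitionerClass(ExampleHashPartitioner.class); // wire in the custom partitioner
    job.setNumReduceTasks(10);                             // numPartitions passed to getPartition()
    return job;
  }
}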
Example #2
Source File: HadoopSegmentPreprocessingJob.java From incubator-pinot with Apache License 2.0
private void setHadoopJobConfigs(Job job, int numInputPaths) {
  job.getConfiguration().set(JobContext.JOB_NAME, this.getClass().getName());
  // Turn this on to always firstly use class paths that user specifies.
  job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, "true");
  // Turn this off since we don't need an empty file in the output directory
  job.getConfiguration().set(FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, "false");

  job.setJarByClass(HadoopSegmentPreprocessingJob.class);

  String hadoopTokenFileLocation = System.getenv(UserGroupInformation.HADOOP_TOKEN_FILE_LOCATION);
  if (hadoopTokenFileLocation != null) {
    job.getConfiguration().set(MRJobConfig.MAPREDUCE_JOB_CREDENTIALS_BINARY, hadoopTokenFileLocation);
  }

  // Mapper configs.
  job.setMapperClass(SegmentPreprocessingMapper.class);
  job.setMapOutputKeyClass(AvroKey.class);
  job.setMapOutputValueClass(AvroValue.class);
  job.getConfiguration().setInt(JobContext.NUM_MAPS, numInputPaths);

  // Reducer configs.
  job.setReducerClass(SegmentPreprocessingReducer.class);
  job.setOutputKeyClass(AvroKey.class);
  job.setOutputValueClass(NullWritable.class);
}
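Jobs that set AvroKey/AvroValue as map output classes, as above, also need the Avro serialization layer to know which schemas back those wrappers. Below is a companion sketch using the standard AvroJob helpers; it is not part of HadoopSegmentPreprocessingJob, and the schema arguments are placeholders.

import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.hadoop.mapreduce.Job;

// Illustrative only: registers the schemas behind the AvroKey/AvroValue map output types.
public final class AvroShuffleConfig {
  private AvroShuffleConfig() {
  }

  public static void setMapOutputSchemas(Job job, Schema keySchema, Schema valueSchema) {
    AvroJob.setMapOutputKeySchema(job, keySchema);      // schema of the AvroKey datum
    AvroJob.setMapOutputValueSchema(job, valueSchema);  // schema of the AvroValue datum
  }
}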
Example #3
Source File: TransformPhaseJob.java From incubator-pinot with Apache License 2.0
@Override
public void reduce(IntWritable key, Iterable<AvroValue<GenericRecord>> values, Context context)
    throws IOException, InterruptedException {
  for (AvroValue<GenericRecord> value : values) {
    GenericRecord record = value.datum();
    context.write(new AvroKey<GenericRecord>(record), NullWritable.get());
  }
}
Example #4
Source File: ItemModelTest.java From ml-ease with Apache License 2.0
@Override
public int getPartition(AvroKey<String> key, AvroValue<GenericData.Record> value, int numPartitions) {
  return Math.abs(key.datum().hashCode()) % numPartitions;
}
Example #5
Source File: TransformPhaseJob.java From incubator-pinot with Apache License 2.0
@Override
public void map(AvroKey<GenericRecord> recordWrapper, NullWritable value, Context context)
    throws IOException, InterruptedException {
  GenericRecord record = recordWrapper.datum();
  GenericRecord outputRecord = transformUDF.transformRecord(sourceName, record);
  if (outputRecord != null) {
    IntWritable key = new IntWritable(reducerKey);
    reducerKey = (reducerKey == numReducers) ? (1) : (reducerKey + 1);
    context.write(key, new AvroValue<GenericRecord>(outputRecord));
  }
}
Example #6
Source File: SegmentPreprocessingMapper.java From incubator-pinot with Apache License 2.0
@Override
public void map(AvroKey<GenericRecord> record, NullWritable value, final Context context)
    throws IOException, InterruptedException {
  if (_isAppend) {
    // Normalize time column value and check against sample value
    String timeColumnValue = record.datum().get(_timeColumn).toString();
    String normalizedTimeColumnValue = _normalizedDateSegmentNameGenerator.getNormalizedDate(timeColumnValue);

    if (!normalizedTimeColumnValue.equals(_sampleNormalizedTimeColumnValue) && _firstInstanceOfMismatchedTime) {
      _firstInstanceOfMismatchedTime = false;
      // TODO: Create a custom exception and gracefully catch this exception outside, changing what the path to input
      // into segment creation should be
      LOGGER.warn("This segment contains multiple time units. Sample is {}, current is {}",
          _sampleNormalizedTimeColumnValue, normalizedTimeColumnValue);
    }
  }

  final GenericRecord inputRecord = record.datum();
  final Schema schema = inputRecord.getSchema();
  Preconditions.checkArgument(_outputSchema.equals(schema), "The schema of all avro files should be the same!");

  GenericRecord outputKey = new GenericData.Record(_outputKeySchema);
  if (_sortedColumn == null) {
    outputKey.put("hashcode", inputRecord.hashCode());
  } else if (_enablePartitioning) {
    outputKey.put(_sortedColumn, inputRecord.get(_sortedColumn));
  } else {
    outputKey.put(_sortedColumn, inputRecord.get(_sortedColumn));
    outputKey.put("hashcode", inputRecord.hashCode());
  }

  try {
    context.write(new AvroKey<>(outputKey), new AvroValue<>(inputRecord));
  } catch (Exception e) {
    LOGGER.error("Exception when writing context on mapper!");
    throw e;
  }
}
Example #7
Source File: SegmentPreprocessingReducer.java From incubator-pinot with Apache License 2.0
@Override
public void reduce(final T inputRecord, final Iterable<AvroValue<GenericRecord>> values, final Context context)
    throws IOException, InterruptedException {
  for (final AvroValue<GenericRecord> value : values) {
    String fileName = generateFileName();
    _multipleOutputs.write(new AvroKey<>(value.datum()), NullWritable.get(), fileName);
  }
}
Example #8
Source File: SimpleAvroJob.java From datafu with Apache License 2.0
@Override
protected void reduce(AvroKey<GenericRecord> key, Iterable<AvroValue<GenericRecord>> values, Context context)
    throws IOException, InterruptedException {
  long count = 0L;
  for (AvroValue<GenericRecord> value : values) {
    count += (Long) value.datum().get("count");
  }
  output.put("id", key.datum().get("id"));
  output.put("count", count);
  context.write(new AvroKey<GenericRecord>(output), null);
}
Example #9
Source File: TimePartitioner.java From datafu with Apache License 2.0
@Override
public int getPartition(AvroKey<GenericRecord> key, AvroValue<GenericRecord> value, int numReduceTasks) {
  if (numReduceTasks != this.numReducers) {
    throw new RuntimeException("numReduceTasks " + numReduceTasks + " does not match expected " + this.numReducers);
  }

  Long time = (Long) key.datum().get("time");
  if (time == null) {
    throw new RuntimeException("time is null");
  }

  List<Integer> partitions = this.partitionMapping.get(time);
  if (partitions == null) {
    throw new RuntimeException("Couldn't find partition for " + time);
  }

  GenericRecord extractedKey = (GenericRecord) key.datum().get("value");
  if (extractedKey == null) {
    throw new RuntimeException("extracted key is null");
  }

  int partitionIndex = (extractedKey.hashCode() & Integer.MAX_VALUE) % partitions.size();
  return partitions.get(partitionIndex);
}
Example #10
Source File: PartitioningCombiner.java From datafu with Apache License 2.0
@SuppressWarnings("unchecked") public void reduce(Object keyObj, Iterable<Object> values, ReduceContext<Object,Object,Object,Object> context) throws IOException, InterruptedException { Accumulator<GenericRecord,GenericRecord> acc = getAccumulator(); if (acc == null) { throw new RuntimeException("No accumulator set for combiner!"); } acc.cleanup(); long accumulatedCount = 0; for (Object valueObj : values) { AvroValue<GenericRecord> value = (AvroValue<GenericRecord>)valueObj; acc.accumulate(value.datum()); accumulatedCount++; } if (accumulatedCount > 0) { GenericRecord intermediateValue = acc.getFinal(); if (intermediateValue != null) { context.write((AvroKey<GenericRecord>)keyObj,new AvroValue<GenericRecord>(intermediateValue)); } } }
Example #11
Source File: AvroKeyValueIdentityMapper.java From datafu with Apache License 2.0
@Override
protected void map(Object keyObj, Object valueObj, Context context)
    throws java.io.IOException, java.lang.InterruptedException {
  @SuppressWarnings("unchecked")
  GenericRecord input = ((AvroKey<GenericRecord>) keyObj).datum();
  GenericRecord key = (GenericRecord) input.get("key");
  GenericRecord value = (GenericRecord) input.get("value");
  context.write(new AvroKey<GenericRecord>(key), new AvroValue<GenericRecord>(value));
}
Example #12
Source File: CollapsingMapper.java From datafu with Apache License 2.0
public void collect(GenericRecord key, GenericRecord value) throws IOException, InterruptedException {
  if (key == null) {
    throw new RuntimeException("key is null");
  }
  if (value == null) {
    throw new RuntimeException("value is null");
  }
  getContext().write(new AvroKey<GenericRecord>(key), new AvroValue<GenericRecord>(value));
}
Example #13
Source File: AvroKeyValueMapReduce.java From hiped2 with Apache License 2.0
@Override
protected void reduce(Text key, Iterable<DoubleWritable> values, Context context)
    throws IOException, InterruptedException {
  Mean mean = new Mean();
  for (DoubleWritable val : values) {
    mean.increment(val.get());
  }
  StockAvg avg = new StockAvg();
  avg.setSymbol(key.toString());
  avg.setAvg(mean.getResult());
  context.write(key, new AvroValue<StockAvg>(avg));
}
Example #14
Source File: AvroKeyValueMapReduce.java From hiped2 with Apache License 2.0
@Override
public void map(AvroKey<CharSequence> key, AvroValue<Stock> value, Context context)
    throws IOException, InterruptedException {
  context.write(new Text(key.toString()), new DoubleWritable(value.datum().getOpen()));
}
Example #15
Source File: AvroKeyValueMapReduce.java From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();
  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(AvroKeyValueMapReduce.class);

  FileInputFormat.setInputPaths(job, inputPath);
  job.setInputFormatClass(AvroKeyValueInputFormat.class);
  AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.STRING));
  AvroJob.setInputValueSchema(job, Stock.SCHEMA$);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(AvroValue.class);
  job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
  AvroJob.setOutputValueSchema(job, StockAvg.SCHEMA$);

  FileOutputFormat.setOutputPath(job, outputPath);

  return job.waitForCompletion(true) ? 0 : 1;
}
Example #16
Source File: AvroKeyDedupReducer.java From incubator-gobblin with Apache License 2.0
@Override
public int compare(AvroValue<GenericRecord> o1, AvroValue<GenericRecord> o2) {
  GenericRecord record1 = o1.datum();
  GenericRecord record2 = o2.datum();
  for (String deltaFieldName : this.deltaSchemaProvider.getDeltaFieldNames(record1)) {
    if (record1.get(deltaFieldName).equals(record2.get(deltaFieldName))) {
      continue;
    }
    return ((Comparable) record1.get(deltaFieldName)).compareTo(record2.get(deltaFieldName));
  }
  return 0;
}
Example #17
Source File: AvroKeyMapper.java From incubator-gobblin with Apache License 2.0
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  this.keySchema = AvroJob.getMapOutputKeySchema(context.getConfiguration());
  this.outKey = new AvroKey<>();
  this.outKey.datum(new GenericData.Record(this.keySchema));
  this.outValue = new AvroValue<>();
}
Example #18
Source File: RegressionAdmmTrain.java From ml-ease with Apache License 2.0
@Override
public int getPartition(AvroKey<Integer> key, AvroValue<RegressionPrepareOutput> value, int numPartitions) {
  Integer keyInt = key.datum();
  if (keyInt < 0 || keyInt >= numPartitions) {
    throw new RuntimeException("Map key is wrong! key has to be in the range of [0,numPartitions-1].");
  }
  return keyInt;
}
Example #19
Source File: PartitioningMapper.java From datafu with Apache License 2.0
public void collect(GenericRecord key, GenericRecord value) throws IOException, InterruptedException {
  wrappedKey.put("value", key);
  context.write(new AvroKey<GenericRecord>(wrappedKey), new AvroValue<GenericRecord>(value));
}
Example #20
Source File: CollapsingCombiner.java From datafu with Apache License 2.0
@SuppressWarnings("unchecked") public void reduce(Object keyObj, Iterable<Object> values, ReduceContext<Object,Object,Object,Object> context) throws IOException, InterruptedException { Accumulator<GenericRecord,GenericRecord> acc = getAccumulator(); if (acc == null) { throw new RuntimeException("No combiner factory set"); } long accumulatedCount = 0; acc.cleanup(); for (Object valueObj : values) { GenericRecord value = ((AvroValue<GenericRecord>)valueObj).datum(); if (value.getSchema().getFullName().equals(getSchemas().getIntermediateValueSchema().getFullName())) { acc.accumulate(value); accumulatedCount++; } else if (value.getSchema().getFullName().equals(getSchemas().getDatedIntermediateValueSchema().getFullName())) { if (!_reusePreviousOutput) { throw new RuntimeException("Did not expect " + getSchemas().getDatedIntermediateValueSchema().getFullName()); } Long time = (Long)value.get("time"); GenericRecord data = (GenericData.Record)value.get("value"); if (time == null) { throw new RuntimeException("time is null"); } if (data == null) { throw new RuntimeException("value is null"); } if (time >= _beginTime && time <= _endTime) { acc.accumulate(data); accumulatedCount++; } else if (time < _beginTime) { // pass through unchanged, reducer will handle it context.write((AvroKey<GenericRecord>)keyObj,new AvroValue<GenericRecord>(value)); } else { throw new RuntimeException(String.format("Time %d is greater than end time %d",time,_endTime)); } } else if (value.getSchema().getFullName().equals(getSchemas().getOutputValueSchema().getFullName())) { if (!_reusePreviousOutput) { throw new RuntimeException("Did not expect " + getSchemas().getOutputValueSchema().getFullName()); } // pass through unchanged, reducer will handle it context.write((AvroKey<GenericRecord>)keyObj,new AvroValue<GenericRecord>(value)); } else { throw new RuntimeException("Unexpected type: " + value.getSchema().getFullName()); } } if (accumulatedCount > 0) { GenericRecord intermediateValue = acc.getFinal(); if (intermediateValue != null) { context.write((AvroKey<GenericRecord>)keyObj,new AvroValue<GenericRecord>(intermediateValue)); } } }
Example #21
Source File: KeyDedupReducerTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testAvroReduce() throws IOException, InterruptedException {
  Schema keySchema = new Schema.Parser().parse(AVRO_KEY_SCHEMA);
  GenericRecordBuilder keyRecordBuilder = new GenericRecordBuilder(keySchema.getField("key").schema());
  keyRecordBuilder.set("partitionKey", 1);
  keyRecordBuilder.set("environment", "test");
  keyRecordBuilder.set("subKey", "2");
  GenericRecord record = keyRecordBuilder.build();

  keyRecordBuilder = new GenericRecordBuilder(keySchema);
  keyRecordBuilder.set("key", record);
  GenericRecord keyRecord = keyRecordBuilder.build();

  // Test reducer with delta field "scn"
  Schema fullSchema = new Schema.Parser().parse(AVRO_FULL_SCHEMA);
  AvroValue<GenericRecord> fullRecord1 = new AvroValue<>();
  AvroValue<GenericRecord> fullRecord2 = new AvroValue<>();
  AvroValue<GenericRecord> fullRecord3 = new AvroValue<>();
  AvroValue<GenericRecord> fullRecord4 = new AvroValue<>();

  GenericRecordBuilder fullRecordBuilder1 = new GenericRecordBuilder(fullSchema);
  fullRecordBuilder1.set("key", record);
  fullRecordBuilder1.set("scn", 123);
  fullRecordBuilder1.set("scn2", 100);
  fullRecord1.datum(fullRecordBuilder1.build());
  fullRecordBuilder1.set("scn", 125);
  fullRecordBuilder1.set("scn2", 1);
  fullRecord2.datum(fullRecordBuilder1.build());
  fullRecordBuilder1.set("scn", 124);
  fullRecordBuilder1.set("scn2", 10);
  fullRecord3.datum(fullRecordBuilder1.build());
  fullRecordBuilder1.set("scn", 122);
  fullRecordBuilder1.set("scn2", 1000);
  fullRecord4.datum(fullRecordBuilder1.build());

  Configuration conf = mock(Configuration.class);
  when(conf.get(AvroKeyDedupReducer.DELTA_SCHEMA_PROVIDER))
      .thenReturn(FieldAttributeBasedDeltaFieldsProvider.class.getName());
  when(conf.get(FieldAttributeBasedDeltaFieldsProvider.ATTRIBUTE_FIELD)).thenReturn("attributes_json");
  when(conf.get(FieldAttributeBasedDeltaFieldsProvider.DELTA_PROP_NAME,
      FieldAttributeBasedDeltaFieldsProvider.DEFAULT_DELTA_PROP_NAME))
      .thenReturn(FieldAttributeBasedDeltaFieldsProvider.DEFAULT_DELTA_PROP_NAME);
  RecordKeyDedupReducerBase<AvroKey<GenericRecord>, AvroValue<GenericRecord>, AvroKey<GenericRecord>, NullWritable> reducer =
      new AvroKeyDedupReducer();

  WrappedReducer.Context reducerContext = mock(WrappedReducer.Context.class);
  when(reducerContext.getConfiguration()).thenReturn(conf);
  Counter moreThan1Counter = new GenericCounter();
  when(reducerContext.getCounter(RecordKeyDedupReducerBase.EVENT_COUNTER.MORE_THAN_1)).thenReturn(moreThan1Counter);

  Counter dedupedCounter = new GenericCounter();
  when(reducerContext.getCounter(RecordKeyDedupReducerBase.EVENT_COUNTER.DEDUPED)).thenReturn(dedupedCounter);

  Counter recordCounter = new GenericCounter();
  when(reducerContext.getCounter(RecordKeyDedupReducerBase.EVENT_COUNTER.RECORD_COUNT)).thenReturn(recordCounter);
  reducer.setup(reducerContext);
  doNothing().when(reducerContext).write(any(AvroKey.class), any(NullWritable.class));
  List<AvroValue<GenericRecord>> valueIterable =
      Lists.newArrayList(fullRecord1, fullRecord2, fullRecord3, fullRecord4);

  AvroKey<GenericRecord> key = new AvroKey<>();
  key.datum(keyRecord);
  reducer.reduce(key, valueIterable, reducerContext);
  Assert.assertEquals(reducer.getOutKey().datum(), fullRecord2.datum());

  // Test reducer without delta field
  Configuration conf2 = mock(Configuration.class);
  when(conf2.get(AvroKeyDedupReducer.DELTA_SCHEMA_PROVIDER)).thenReturn(null);
  when(reducerContext.getConfiguration()).thenReturn(conf2);
  RecordKeyDedupReducerBase<AvroKey<GenericRecord>, AvroValue<GenericRecord>, AvroKey<GenericRecord>, NullWritable> reducer2 =
      new AvroKeyDedupReducer();
  reducer2.setup(reducerContext);
  reducer2.reduce(key, valueIterable, reducerContext);
  Assert.assertEquals(reducer2.getOutKey().datum(), fullRecord1.datum());

  // Test reducer with compound delta key.
  Schema fullSchema2 = new Schema.Parser().parse(AVRO_FULL_SCHEMA_WITH_TWO_DELTA_FIELDS);
  GenericRecordBuilder fullRecordBuilder2 = new GenericRecordBuilder(fullSchema2);
  fullRecordBuilder2.set("key", record);
  fullRecordBuilder2.set("scn", 123);
  fullRecordBuilder2.set("scn2", 100);
  fullRecord1.datum(fullRecordBuilder2.build());
  fullRecordBuilder2.set("scn", 125);
  fullRecordBuilder2.set("scn2", 1000);
  fullRecord2.datum(fullRecordBuilder2.build());
  fullRecordBuilder2.set("scn", 126);
  fullRecordBuilder2.set("scn2", 1000);
  fullRecord3.datum(fullRecordBuilder2.build());
  fullRecordBuilder2.set("scn", 130);
  fullRecordBuilder2.set("scn2", 100);
  fullRecord4.datum(fullRecordBuilder2.build());

  List<AvroValue<GenericRecord>> valueIterable2 =
      Lists.newArrayList(fullRecord1, fullRecord2, fullRecord3, fullRecord4);
  reducer.reduce(key, valueIterable2, reducerContext);
  Assert.assertEquals(reducer.getOutKey().datum(), fullRecord3.datum());
}
Example #22
Source File: CompactionAvroJobConfigurator.java From incubator-gobblin with Apache License 2.0
protected void configureMapper(Job job) {
  job.setInputFormatClass(AvroKeyRecursiveCombineFileInputFormat.class);
  job.setMapperClass(AvroKeyMapper.class);
  job.setMapOutputKeyClass(AvroKey.class);
  job.setMapOutputValueClass(AvroValue.class);
}
Example #23
Source File: SimpleAvroJob.java From datafu with Apache License 2.0
@Override
protected void map(AvroKey<GenericRecord> input, NullWritable unused, Context context)
    throws IOException, InterruptedException {
  key.put("id", input.datum().get("id"));
  context.write(new AvroKey<GenericRecord>(key), new AvroValue<GenericRecord>(value));
}
Example #24
Source File: AvroKeyDedupReducer.java From incubator-gobblin with Apache License 2.0
@Override
protected void setOutValue(AvroValue<GenericRecord> valueToRetain) {
  // do nothing since initReusableObject has assigned value for outValue.
}
Example #25
Source File: AvroKeyDedupReducer.java From incubator-gobblin with Apache License 2.0
@Override
protected void setOutKey(AvroValue<GenericRecord> valueToRetain) {
  outKey.datum(valueToRetain.datum());
}
Example #26
Source File: MRCompactorAvroKeyDedupJobRunner.java From incubator-gobblin with Apache License 2.0
@Override
protected void setMapOutputValueClass(Job job) {
  job.setMapOutputValueClass(AvroValue.class);
}
Example #27
Source File: GenericPartitioner.java From incubator-pinot with Apache License 2.0
@Override
public int getPartition(T genericRecordAvroKey, AvroValue<GenericRecord> genericRecordAvroValue, int numPartitions) {
  final GenericRecord inputRecord = genericRecordAvroValue.datum();
  final Object partitionColumnValue = inputRecord.get(_partitionColumn);
  return _partitionFunction.getPartition(partitionColumnValue);
}