Java Code Examples for org.apache.avro.Schema#setFields()
The following examples show how to use org.apache.avro.Schema#setFields().
The original project and source file for each example are noted above it.
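Before diving into the examples, here is a minimal, self-contained sketch (written against a recent Avro release) of the pattern they all share: create a record schema, build a list of fields, and attach them with setFields(). The record and field names below are illustrative only, not taken from any of the projects.

import java.util.Arrays;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;

public class SetFieldsSketch {
  public static void main(String[] args) {
    // Create an empty record schema (name, doc, namespace, isError); fields are
    // attached in a second step, which is also what makes recursive schemas possible.
    Schema user = Schema.createRecord("User", "An example record", "com.example", false);

    // Attach the fields. setFields() may be called only once per schema instance;
    // a second call throws an AvroRuntimeException.
    user.setFields(Arrays.asList(
        new Field("name", Schema.create(Schema.Type.STRING), "The user's name", null),
        new Field("age", Schema.create(Schema.Type.INT), "The user's age", null)));

    System.out.println(user.toString(true)); // pretty-printed JSON schema
  }
}

Note that the examples below target different Avro versions: pre-1.9 code passes Jackson JsonNode defaults (e.g. NullNode.getInstance()) and reads them back with field.defaultValue(), while Avro 1.9+ code passes plain Object defaults (e.g. JsonProperties.NULL_VALUE) and uses field.defaultVal().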
Example 1
Source File: AvroSchemaConverter190Int96Avro17.java From datacollector with Apache License 2.0
private Schema convertFields(String name, List<Type> parquetFields) {
  List<Schema.Field> fields = new ArrayList<Schema.Field>();
  for (Type parquetType : parquetFields) {
    Schema fieldSchema = convertField(parquetType);
    if (parquetType.isRepetition(REPEATED)) {
      throw new UnsupportedOperationException(
          "REPEATED not supported outside LIST or MAP. Type: " + parquetType);
    } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) {
      fields.add(new Schema.Field(
          parquetType.getName(), optional(fieldSchema), null, NULL_VALUE));
    } else { // REQUIRED
      fields.add(new Schema.Field(
          parquetType.getName(), fieldSchema, null, (Object) null));
    }
  }
  Schema schema = Schema.createRecord(name, null, null, false);
  schema.setFields(fields);
  return schema;
}
Example 2
Source File: MRCompactorAvroKeyDedupJobRunner.java From incubator-gobblin with Apache License 2.0
public static Optional<Schema> getKeySchemaFromRecord(Schema record) {
  Preconditions.checkArgument(record.getType() == Schema.Type.RECORD);

  List<Field> fields = Lists.newArrayList();
  for (Field field : record.getFields()) {
    Optional<Schema> newFieldSchema = getKeySchema(field);
    if (newFieldSchema.isPresent()) {
      fields.add(new Field(field.name(), newFieldSchema.get(), field.doc(), field.defaultValue()));
    }
  }
  if (!fields.isEmpty()) {
    Schema newSchema =
        Schema.createRecord(record.getName(), record.getDoc(), record.getName(), false);
    newSchema.setFields(fields);
    return Optional.of(newSchema);
  } else {
    return Optional.absent();
  }
}
Example 3
Source File: RedshiftIT.java From digdag with Apache License 2.0
private byte[] avroTestData(List<Schema.Field> fields, List<Map<String, Object>> records)
    throws IOException {
  Schema schema = Schema.createRecord("testdata", null, null, false);
  schema.setFields(fields);

  ByteArrayOutputStream out = new ByteArrayOutputStream();
  GenericDatumWriter<GenericData.Record> datum = new GenericDatumWriter<>(schema);
  DataFileWriter<GenericData.Record> writer = new DataFileWriter<>(datum);
  writer.create(schema, out);
  for (Map<String, Object> record : records) {
    GenericData.Record r = new GenericData.Record(schema);
    for (Map.Entry<String, Object> item : record.entrySet()) {
      r.put(item.getKey(), item.getValue());
    }
    writer.append(r);
  }
  writer.close();

  return out.toByteArray();
}
Example 4
Source File: TestAvroSchemaConverter.java From parquet-mr with Apache License 2.0
@Test
public void testOptionalArrayElement() throws Exception {
  Schema schema = Schema.createRecord("record1", null, null, false);
  Schema optionalIntArray = Schema.createArray(optional(Schema.create(INT)));
  schema.setFields(Arrays.asList(
      new Schema.Field("myintarray", optionalIntArray, null, null)
  ));
  testRoundTripConversion(
      NEW_BEHAVIOR, schema,
      "message record1 {\n" +
      "  required group myintarray (LIST) {\n" +
      "    repeated group list {\n" +
      "      optional int32 element;\n" +
      "    }\n" +
      "  }\n" +
      "}\n");
}
Example 5
Source File: TestTableConversion.java From kite with Apache License 2.0
@Test
public void testConvertStructs() {
  Schema recordSchema = Schema.createRecord("inner", null, null, false);
  recordSchema.setFields(Lists.newArrayList(
      new Schema.Field("a", optional(Schema.create(Schema.Type.INT)), null, NULL_DEFAULT),
      new Schema.Field("b", optional(Schema.create(Schema.Type.BYTES)), null, NULL_DEFAULT)
  ));
  Schema structOfStructsSchema = Schema.createRecord("test", null, null, false);
  structOfStructsSchema.setFields(Lists.newArrayList(
      new Schema.Field("str", optional(Schema.create(Schema.Type.STRING)), null, NULL_DEFAULT),
      new Schema.Field("inner", optional(recordSchema), null, NULL_DEFAULT)
  ));
  Assert.assertEquals("Should convert struct of structs",
      structOfStructsSchema,
      HiveSchemaConverter.convert(startPath, "test", STRUCT_OF_STRUCTS_TYPE, NO_REQUIRED_FIELDS));
}
Example 6
Source File: InputAvroSchemaTest.java From dbeam with Apache License 2.0
private Schema createRecordSchema(
    final String recordName,
    final String recordDoc,
    final String recordNamespace,
    final String[] fieldNames,
    final String[] fieldDocs) {
  Schema inputSchema = Schema.createRecord(recordName, recordDoc, recordNamespace, false);
  final List<Schema.Field> fields = new ArrayList<>();
  for (int i = 0; i < fieldNames.length; i++) {
    String fieldName = fieldNames[i];
    String fieldDoc = fieldDocs[i];
    // Note: each field's type is the record being built, producing a recursive schema;
    // this works because setFields() is only called after the field list is assembled.
    fields.add(new Schema.Field(fieldName, inputSchema, fieldDoc));
  }
  inputSchema.setFields(fields);
  return inputSchema;
}
Example 7
Source File: TestAvroSchemaConverter.java From parquet-mr with Apache License 2.0
@Test
public void testArrayOfOptionalRecordsOldBehavior() throws Exception {
  Schema innerRecord = Schema.createRecord("InnerRecord", null, null, false);
  Schema optionalString = optional(Schema.create(Schema.Type.STRING));
  innerRecord.setFields(Lists.newArrayList(
      new Schema.Field("s1", optionalString, null, JsonProperties.NULL_VALUE),
      new Schema.Field("s2", optionalString, null, JsonProperties.NULL_VALUE)
  ));
  Schema schema = Schema.createRecord("HasArray", null, null, false);
  schema.setFields(Lists.newArrayList(
      new Schema.Field("myarray", Schema.createArray(optional(innerRecord)), null, null)
  ));
  System.err.println("Avro schema: " + schema.toString(true));

  // Cannot use round-trip assertion because the InnerRecord optional is removed
  testAvroToParquetConversion(schema,
      "message HasArray {\n" +
      "  required group myarray (LIST) {\n" +
      "    repeated group array {\n" +
      "      optional binary s1 (UTF8);\n" +
      "      optional binary s2 (UTF8);\n" +
      "    }\n" +
      "  }\n" +
      "}\n");
}
Example 8
Source File: AvroSchemaCodecFormat.java From funcj with MIT License
@Override
public Object encode(
    CodecCoreEx<WithSchema, Object, Config> core,
    T value,
    Object out) {
  final String path = out + "." + type.getSimpleName();

  final List<Schema.Field> fieldSchema =
      fields.entrySet().stream()
          .map(en -> new Schema.Field(
              en.getKey(),
              (Schema) en.getValue().encodeField(value, path)))
          .collect(toList());

  final Schema schema = Schema.createRecord(path, null, null, false);
  schema.setFields(fieldSchema);
  return schema;
}
Example 9
Source File: TestCopyCommandClusterNewField.java From kite with Apache License 2.0
@Override
public Schema getEvolvedSchema(Schema original) {
  List<Schema.Field> fields = Lists.newArrayList();
  // NULL is listed first in the union so the null default is valid:
  // a union field's default must match the union's first branch.
  fields.add(new Schema.Field("new",
      Schema.createUnion(ImmutableList.of(
          Schema.create(Schema.Type.NULL),
          Schema.create(Schema.Type.STRING))),
      "New field", NullNode.getInstance()));

  for (Schema.Field field : original.getFields()) {
    fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
        field.defaultValue()));
  }

  Schema evolved = Schema.createRecord(original.getName(), original.getDoc(),
      original.getNamespace(), false);
  evolved.setFields(fields);
  return evolved;
}
Example 10
Source File: AvroKeyValueSinkWriter.java From flink with Apache License 2.0
/**
 * Creates a KeyValuePair generic record schema.
 *
 * @return A schema for a generic record with two fields: 'key' and 'value'.
 */
public static Schema getSchema(Schema keySchema, Schema valueSchema) {
  Schema schema = Schema.createRecord(
      KEY_VALUE_PAIR_RECORD_NAME, "A key/value pair",
      KEY_VALUE_PAIR_RECORD_NAMESPACE, false);
  schema.setFields(Arrays.asList(
      new Schema.Field(KEY_FIELD, keySchema, "The key", null),
      new Schema.Field(VALUE_FIELD, valueSchema, "The value", null)));
  return schema;
}
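As a quick follow-up, here is a hypothetical usage sketch of the helper above. It assumes the KEY_FIELD and VALUE_FIELD constants resolve to "key" and "value", the convention used by Hadoop's AvroKeyValue records; the field names are an assumption, not confirmed from the Flink source.

// Hypothetical usage; "key"/"value" field names are assumed.
Schema kvSchema = getSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG));
GenericData.Record pair = new GenericData.Record(kvSchema);
pair.put("key", "clicks");
pair.put("value", 42L);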
Example 11
Source File: HoodieAvroUtils.java From hudi with Apache License 2.0
private static Schema initRecordKeySchema() {
  Schema.Field recordKeyField = new Schema.Field(
      HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance());
  Schema recordKeySchema = Schema.createRecord("HoodieRecordKey", "", "", false);
  recordKeySchema.setFields(Collections.singletonList(recordKeyField));
  return recordKeySchema;
}
Example 12
Source File: AvroSchemaGenerator.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public Schema generate(String schemaNameOverride) throws IOException {
  ClassWriter classWriter = new ClassWriter(options, connManager, tableName, null);
  Map<String, Integer> columnTypes = classWriter.getColumnTypes();
  String[] columnNames = classWriter.getColumnNames(columnTypes);

  List<Field> fields = new ArrayList<Field>();
  for (String columnName : columnNames) {
    String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName));
    int sqlType = columnTypes.get(columnName);
    Schema avroSchema = toAvroSchema(sqlType, columnName);
    Field field = new Field(cleanedCol, avroSchema, null, NullNode.getInstance());
    field.addProp("columnName", columnName);
    field.addProp("sqlType", Integer.toString(sqlType));
    fields.add(field);
  }

  TableClassName tableClassName = new TableClassName(options);
  String shortClassName = tableClassName.getShortClassForTable(tableName);
  String avroTableName = (tableName == null ? TableClassName.QUERY_RESULT : tableName);
  String avroName = schemaNameOverride != null ? schemaNameOverride :
      (shortClassName == null ? avroTableName : shortClassName);
  String avroNamespace = tableClassName.getPackageForTable();

  String doc = "Sqoop import of " + avroTableName;
  Schema schema = Schema.createRecord(avroName, doc, avroNamespace, false);
  schema.setFields(fields);
  schema.addProp("tableName", avroTableName);

  return schema;
}
Example 13
Source File: RegressionTest.java From ml-ease with Apache License 2.0
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass)
    throws IOException, URISyntaxException {
  JobConf conf = createJobConf();
  Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(conf));
  if (inputSchema == null) {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema.toString());

  List<Schema.Field> inputFields = inputSchema.getFields();
  Schema.Field predField = new Schema.Field("pred", Schema.create(Type.FLOAT), "", null);
  List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field field : inputFields) {
    outputFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null));
  }
  outputFields.add(predField);

  Schema outputSchema = Schema.createRecord("AdmmTestOutput",
      "Test output for AdmmTest", "com.linkedin.lab.regression.avro", false);
  outputSchema.setFields(outputFields);
  AvroJob.setOutputSchema(conf, outputSchema);
  AvroJob.setMapOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.FLOAT), outputSchema));
  AvroJob.setMapperClass(conf, mapperClass);
  AvroJob.setReducerClass(conf, reducerClass);

  return conf;
}
Example 14
Source File: SchemaBuilder.java From xml-avro with Apache License 2.0
private Schema createGroupSchema(String name, XSModelGroup groupTerm) {
  Schema record = Schema.createRecord(name, null, null, false);
  schemas.put(name, record);

  Map<String, Schema.Field> fields = new HashMap<>();
  createGroupFields(groupTerm, fields, false);
  record.setFields(new ArrayList<>(fields.values()));

  return Schema.createArray(record);
}
Example 15
Source File: PigSchema2Avro.java From spork with Apache License 2.0
/**
 * Convert pig data to Avro record.
 */
protected static Schema convertRecord(ResourceFieldSchema[] pigFields, boolean nullable)
    throws IOException {
  AvroStorageLog.funcCall("convertRecord");

  // Type name is required for Avro record
  String typeName = getRecordName();
  Schema outSchema = Schema.createRecord(typeName, null, null, false);

  List<Schema.Field> outFields = new ArrayList<Schema.Field>();
  for (int i = 0; i < pigFields.length; i++) {
    /* get schema */
    Schema fieldSchema = convert(pigFields[i], nullable);

    /* get field name of output */
    String outname = pigFields[i].getName();
    if (outname == null)
      outname = FIELD_NAME + "_" + i; // field name cannot be null

    /* get doc of output */
    String desc = pigFields[i].getDescription();

    outFields.add(new Field(outname, fieldSchema, desc, null));
  }

  outSchema.setFields(outFields);
  return outSchema;
}
Example 16
Source File: PigSchema2Avro.java From Cubert with Apache License 2.0
/**
 * Validate a Pig tuple is compatible with Avro record. If the Avro schema
 * is not complete (with uncovered fields), then convert those fields using
 * methods in set 1.
 *
 * Notice that users can get rid of Pig tuple wrappers, e.g. an Avro schema
 * "int" is compatible with a Pig schema "T:(int)".
 */
protected static Schema validateAndConvertRecord(Schema avroSchema,
    ResourceFieldSchema[] pigFields) throws IOException {

  /* Get rid of Pig tuple wrappers. */
  if (!avroSchema.getType().equals(Schema.Type.RECORD)) {
    if (pigFields.length != 1)
      throw new IOException("Expect only one field in Pig tuple schema. Avro schema is "
          + avroSchema.getType());
    return validateAndConvert(avroSchema, pigFields[0]);
  }

  /* validate and convert a pig tuple with avro record */
  boolean isPartialSchema = AvroStorageUtils.isUDPartialRecordSchema(avroSchema);
  AvroStorageLog.details("isPartialSchema=" + isPartialSchema);

  String typeName = isPartialSchema ? getRecordName() : avroSchema.getName();
  Schema outSchema = Schema.createRecord(typeName, avroSchema.getDoc(),
      avroSchema.getNamespace(), false);

  List<Schema.Field> inFields = avroSchema.getFields();
  if (!isPartialSchema && inFields.size() != pigFields.length) {
    throw new IOException("Expect " + inFields.size() + " fields in pig schema."
        + " But there are " + pigFields.length);
  }

  List<Schema.Field> outFields = new ArrayList<Schema.Field>();
  for (int i = 0; i < pigFields.length; i++) {
    /* get user defined avro field schema */
    Field inputField = isPartialSchema
        ? AvroStorageUtils.getUDField(avroSchema, i) : inFields.get(i);

    /* get schema */
    Schema fieldSchema = null;
    if (inputField == null) {
      /* convert pig schema (nullable) */
      fieldSchema = convert(pigFields[i], true);
    } else if (inputField.schema() == null) {
      /* convert pig schema (not-null) */
      fieldSchema = convert(pigFields[i], false);
    } else {
      /* validate pigFields[i] with given avro schema */
      fieldSchema = validateAndConvert(inputField.schema(), pigFields[i]);
    }

    /* get field name of output */
    String outname = (isPartialSchema) ? pigFields[i].getName() : inputField.name();
    if (outname == null)
      outname = FIELD_NAME + "_" + i; // field name cannot be null

    /* get doc of output */
    String doc = (isPartialSchema) ? pigFields[i].getDescription() : inputField.doc();

    JsonNode defaultvalue = (inputField != null) ? inputField.defaultValue() : null;

    outFields.add(new Field(outname, fieldSchema, doc, defaultvalue));
  }

  outSchema.setFields(outFields);
  return outSchema;
}
Example 17
Source File: AvroProjectionParquetMapReduce.java From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();
  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(AvroProjectionParquetMapReduce.class);

  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, inputPath);

  // predicate pushdown
  AvroParquetInputFormat.setUnboundRecordFilter(job, GoogleStockFilter.class);

  // projection pushdown
  Schema projection = Schema.createRecord(Stock.SCHEMA$.getName(),
      Stock.SCHEMA$.getDoc(), Stock.SCHEMA$.getNamespace(), false);
  List<Schema.Field> fields = Lists.newArrayList();
  for (Schema.Field field : Stock.SCHEMA$.getFields()) {
    if ("symbol".equals(field.name()) || "open".equals(field.name())) {
      fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
          field.defaultValue(), field.order()));
    }
  }
  projection.setFields(fields);
  AvroParquetInputFormat.setRequestedProjection(job, projection);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroParquetOutputFormat.setSchema(job, StockAvg.SCHEMA$);

  return job.waitForCompletion(true) ? 0 : 1;
}
Example 18
Source File: Schemas.java From parquet-mr with Apache License 2.0
/**
 * Merges two {@link Schema} instances or returns {@code null}.
 * <p>
 * The two schemas are merged if they are the same type. Records are merged
 * if the two records have the same name or have no names but have a
 * significant number of shared fields.
 * <p>
 * @see {@link #mergeOrUnion} to return a union when a merge is not possible.
 *
 * @param left a {@code Schema}
 * @param right a {@code Schema}
 * @return a merged {@code Schema} or {@code null} if merging is not possible
 */
private static Schema mergeOnly(Schema left, Schema right) {
  if (Objects.equal(left, right)) {
    return left;
  }

  // handle primitive type promotion; doesn't promote integers to floats
  switch (left.getType()) {
    case INT:
      if (right.getType() == Schema.Type.LONG) {
        return right;
      }
      break;
    case LONG:
      if (right.getType() == Schema.Type.INT) {
        return left;
      }
      break;
    case FLOAT:
      if (right.getType() == Schema.Type.DOUBLE) {
        return right;
      }
      break;
    case DOUBLE:
      if (right.getType() == Schema.Type.FLOAT) {
        return left;
      }
  }

  // any other cases where the types don't match must be combined by a union
  if (left.getType() != right.getType()) {
    return null;
  }

  switch (left.getType()) {
    case UNION:
      return union(left, right);
    case RECORD:
      if (left.getName() == null && right.getName() == null
          && fieldSimilarity(left, right) < SIMILARITY_THRESH) {
        return null;
      } else if (!Objects.equal(left.getName(), right.getName())) {
        return null;
      }

      Schema combinedRecord = Schema.createRecord(
          coalesce(left.getName(), right.getName()),
          coalesce(left.getDoc(), right.getDoc()),
          coalesce(left.getNamespace(), right.getNamespace()),
          false
      );
      combinedRecord.setFields(mergeFields(left, right));

      return combinedRecord;

    case MAP:
      return Schema.createMap(
          mergeOrUnion(left.getValueType(), right.getValueType()));

    case ARRAY:
      return Schema.createArray(
          mergeOrUnion(left.getElementType(), right.getElementType()));

    case ENUM:
      if (!Objects.equal(left.getName(), right.getName())) {
        return null;
      }
      Set<String> symbols = Sets.newLinkedHashSet();
      symbols.addAll(left.getEnumSymbols());
      symbols.addAll(right.getEnumSymbols());
      return Schema.createEnum(
          left.getName(),
          coalesce(left.getDoc(), right.getDoc()),
          coalesce(left.getNamespace(), right.getNamespace()),
          ImmutableList.copyOf(symbols)
      );

    default:
      // all primitives are handled before the switch by the equality check.
      // schemas that reach this point are not primitives and also not any of
      // the above known types.
      throw new UnsupportedOperationException(
          "Unknown schema type: " + left.getType());
  }
}
Example 19
Source File: MarketoRuntimeTestBase.java From components with Apache License 2.0
public Schema getFullDynamicSchema() {
  Schema emptySchema = Schema.createRecord("dynamic", null, null, false);
  emptySchema.setFields(new ArrayList<Field>());
  emptySchema = AvroUtils.setIncludeAllFields(emptySchema, true);
  return emptySchema;
}
Example 20
Source File: TestReflectInputOutputFormat.java From parquet-mr with Apache License 2.0
@Test
public void testReadWrite() throws Exception {

  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);

  final Job job = new Job(conf, "read");
  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, parquetPath);

  // Test push-down predicates by using an electric car filter
  AvroParquetInputFormat.setUnboundRecordFilter(job, ElectricCarFilter.class);

  // Test schema projection by dropping the optional extras
  Schema projection = Schema.createRecord(CAR_SCHEMA.getName(),
      CAR_SCHEMA.getDoc(), CAR_SCHEMA.getNamespace(), false);
  List<Schema.Field> fields = Lists.newArrayList();
  for (Schema.Field field : ReflectData.get().getSchema(Car.class).getFields()) {
    if (!"optionalExtra".equals(field.name())) {
      fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
          field.defaultVal(), field.order()));
    }
  }
  projection.setFields(fields);
  AvroParquetInputFormat.setRequestedProjection(job, projection);

  job.setMapperClass(TestReflectInputOutputFormat.MyMapper2.class);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  AvroParquetOutputFormat.setOutputPath(job, outputPath);
  AvroParquetOutputFormat.setSchema(job, CAR_SCHEMA);

  waitForJob(job);

  final Path mapperOutput = new Path(outputPath.toString(), "part-m-00000.parquet");
  try (final AvroParquetReader<Car> out = new AvroParquetReader<Car>(conf, mapperOutput)) {
    Car car;
    Car previousCar = null;
    int lineNumber = 0;
    while ((car = out.read()) != null) {
      if (previousCar != null) {
        // Testing reference equality here. The "model" field should be dictionary-encoded.
        assertTrue(car.model == previousCar.model);
      }

      // Make sure that predicate push down worked as expected
      if (car.engine.type == EngineType.PETROL) {
        fail("UnboundRecordFilter failed to remove cars with PETROL engines");
      }

      // Note we use lineNumber * 2 because of predicate push down
      Car expectedCar = nextRecord(lineNumber * 2);

      // We removed the optional extra field using projection so we shouldn't
      // see it here...
      expectedCar.optionalExtra = null;

      assertEquals("line " + lineNumber, expectedCar, car);
      ++lineNumber;
      previousCar = car;
    }
  }
}