Java Code Examples for org.apache.avro.Schema#setFields()
The following examples show how to use org.apache.avro.Schema#setFields().
The original project and source file for each example are noted above it.
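Before diving into the examples, here is a minimal, self-contained sketch (written against a recent Avro release) of the pattern they all share: create a record schema, build a list of fields, and attach them with setFields(). The record and field names below are illustrative only, not taken from any of the projects.

import java.util.Arrays;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;

public class SetFieldsSketch {
  public static void main(String[] args) {
    // Create an empty record schema (name, doc, namespace, isError); fields are
    // attached in a second step, which is also what makes recursive schemas possible.
    Schema user = Schema.createRecord("User", "An example record", "com.example", false);

    // Attach the fields. setFields() may be called only once per schema instance;
    // a second call throws an AvroRuntimeException.
    user.setFields(Arrays.asList(
        new Field("name", Schema.create(Schema.Type.STRING), "The user's name", null),
        new Field("age", Schema.create(Schema.Type.INT), "The user's age", null)));

    System.out.println(user.toString(true)); // pretty-printed JSON schema
  }
}

Note that the examples below target different Avro versions: pre-1.9 code passes Jackson JsonNode defaults (e.g. NullNode.getInstance()) and reads them back with field.defaultValue(), while Avro 1.9+ code passes plain Object defaults (e.g. JsonProperties.NULL_VALUE) and uses field.defaultVal().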
Example 1
Source File: AvroSchemaConverter190Int96Avro17.java From datacollector with Apache License 2.0
private Schema convertFields(String name, List<Type> parquetFields) {
  List<Schema.Field> fields = new ArrayList<Schema.Field>();
  for (Type parquetType : parquetFields) {
    Schema fieldSchema = convertField(parquetType);
    if (parquetType.isRepetition(REPEATED)) {
      throw new UnsupportedOperationException(
          "REPEATED not supported outside LIST or MAP. Type: " + parquetType);
    } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) {
      fields.add(new Schema.Field(
          parquetType.getName(), optional(fieldSchema), null, NULL_VALUE));
    } else { // REQUIRED
      fields.add(new Schema.Field(
          parquetType.getName(), fieldSchema, null, (Object) null));
    }
  }
  Schema schema = Schema.createRecord(name, null, null, false);
  schema.setFields(fields);
  return schema;
}
Example 2
Source File: MRCompactorAvroKeyDedupJobRunner.java From incubator-gobblin with Apache License 2.0
public static Optional<Schema> getKeySchemaFromRecord(Schema record) {
  Preconditions.checkArgument(record.getType() == Schema.Type.RECORD);

  List<Field> fields = Lists.newArrayList();
  for (Field field : record.getFields()) {
    Optional<Schema> newFieldSchema = getKeySchema(field);
    if (newFieldSchema.isPresent()) {
      fields.add(new Field(field.name(), newFieldSchema.get(), field.doc(), field.defaultValue()));
    }
  }
  if (!fields.isEmpty()) {
    Schema newSchema =
        Schema.createRecord(record.getName(), record.getDoc(), record.getName(), false);
    newSchema.setFields(fields);
    return Optional.of(newSchema);
  } else {
    return Optional.absent();
  }
}
Example 3
Source File: RedshiftIT.java From digdag with Apache License 2.0
private byte[] avroTestData(List<Schema.Field> fields, List<Map<String, Object>> records)
    throws IOException {
  Schema schema = Schema.createRecord("testdata", null, null, false);
  schema.setFields(fields);

  ByteArrayOutputStream out = new ByteArrayOutputStream();
  GenericDatumWriter<GenericData.Record> datum = new GenericDatumWriter<>(schema);
  DataFileWriter<GenericData.Record> writer = new DataFileWriter<>(datum);
  writer.create(schema, out);
  for (Map<String, Object> record : records) {
    GenericData.Record r = new GenericData.Record(schema);
    for (Map.Entry<String, Object> item : record.entrySet()) {
      r.put(item.getKey(), item.getValue());
    }
    writer.append(r);
  }
  writer.close();

  return out.toByteArray();
}
Example 4
Source File: TestAvroSchemaConverter.java From parquet-mr with Apache License 2.0
@Test
public void testOptionalArrayElement() throws Exception {
  Schema schema = Schema.createRecord("record1", null, null, false);
  Schema optionalIntArray = Schema.createArray(optional(Schema.create(INT)));
  schema.setFields(Arrays.asList(
      new Schema.Field("myintarray", optionalIntArray, null, null)
  ));
  testRoundTripConversion(
      NEW_BEHAVIOR, schema,
      "message record1 {\n" +
      "  required group myintarray (LIST) {\n" +
      "    repeated group list {\n" +
      "      optional int32 element;\n" +
      "    }\n" +
      "  }\n" +
      "}\n");
}
Example 5
Source File: TestTableConversion.java From kite with Apache License 2.0
@Test
public void testConvertStructs() {
  Schema recordSchema = Schema.createRecord("inner", null, null, false);
  recordSchema.setFields(Lists.newArrayList(
      new Schema.Field("a", optional(Schema.create(Schema.Type.INT)), null, NULL_DEFAULT),
      new Schema.Field("b", optional(Schema.create(Schema.Type.BYTES)), null, NULL_DEFAULT)
  ));
  Schema structOfStructsSchema = Schema.createRecord("test", null, null, false);
  structOfStructsSchema.setFields(Lists.newArrayList(
      new Schema.Field("str", optional(Schema.create(Schema.Type.STRING)), null, NULL_DEFAULT),
      new Schema.Field("inner", optional(recordSchema), null, NULL_DEFAULT)
  ));
  Assert.assertEquals("Should convert struct of structs",
      structOfStructsSchema,
      HiveSchemaConverter.convert(startPath, "test", STRUCT_OF_STRUCTS_TYPE, NO_REQUIRED_FIELDS));
}
Example 6
Source File: InputAvroSchemaTest.java From dbeam with Apache License 2.0
private Schema createRecordSchema(
    final String recordName,
    final String recordDoc,
    final String recordNamespace,
    final String[] fieldNames,
    final String[] fieldDocs) {
  Schema inputSchema = Schema.createRecord(recordName, recordDoc, recordNamespace, false);
  final List<Schema.Field> fields = new ArrayList<>();
  for (int i = 0; i < fieldNames.length; i++) {
    String fieldName = fieldNames[i];
    String fieldDoc = fieldDocs[i];
    // Note: each field's type is the record being built, producing a recursive schema;
    // this works because setFields() is only called after the field list is assembled.
    fields.add(new Schema.Field(fieldName, inputSchema, fieldDoc));
  }
  inputSchema.setFields(fields);
  return inputSchema;
}
Example 7
Source File: TestAvroSchemaConverter.java From parquet-mr with Apache License 2.0
@Test
public void testArrayOfOptionalRecordsOldBehavior() throws Exception {
  Schema innerRecord = Schema.createRecord("InnerRecord", null, null, false);
  Schema optionalString = optional(Schema.create(Schema.Type.STRING));
  innerRecord.setFields(Lists.newArrayList(
      new Schema.Field("s1", optionalString, null, JsonProperties.NULL_VALUE),
      new Schema.Field("s2", optionalString, null, JsonProperties.NULL_VALUE)
  ));
  Schema schema = Schema.createRecord("HasArray", null, null, false);
  schema.setFields(Lists.newArrayList(
      new Schema.Field("myarray", Schema.createArray(optional(innerRecord)), null, null)
  ));
  System.err.println("Avro schema: " + schema.toString(true));

  // Cannot use round-trip assertion because the InnerRecord optional is removed
  testAvroToParquetConversion(schema,
      "message HasArray {\n" +
      "  required group myarray (LIST) {\n" +
      "    repeated group array {\n" +
      "      optional binary s1 (UTF8);\n" +
      "      optional binary s2 (UTF8);\n" +
      "    }\n" +
      "  }\n" +
      "}\n");
}
Example 8
Source File: AvroSchemaCodecFormat.java From funcj with MIT License
@Override
public Object encode(
    CodecCoreEx<WithSchema, Object, Config> core,
    T value,
    Object out) {
  final String path = out + "." + type.getSimpleName();

  final List<Schema.Field> fieldSchema =
      fields.entrySet().stream()
          .map(en -> new Schema.Field(
              en.getKey(),
              (Schema) en.getValue().encodeField(value, path)))
          .collect(toList());

  final Schema schema = Schema.createRecord(path, null, null, false);
  schema.setFields(fieldSchema);
  return schema;
}
Example 9
Source File: TestCopyCommandClusterNewField.java From kite with Apache License 2.0
@Override
public Schema getEvolvedSchema(Schema original) {
  List<Schema.Field> fields = Lists.newArrayList();
  // NULL is listed first in the union so the null default is valid:
  // a union field's default must match the union's first branch.
  fields.add(new Schema.Field("new",
      Schema.createUnion(ImmutableList.of(
          Schema.create(Schema.Type.NULL),
          Schema.create(Schema.Type.STRING))),
      "New field", NullNode.getInstance()));

  for (Schema.Field field : original.getFields()) {
    fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
        field.defaultValue()));
  }

  Schema evolved = Schema.createRecord(original.getName(), original.getDoc(),
      original.getNamespace(), false);
  evolved.setFields(fields);
  return evolved;
}
Example 10
Source File: AvroKeyValueSinkWriter.java From flink with Apache License 2.0
/**
 * Creates a KeyValuePair generic record schema.
 *
 * @return A schema for a generic record with two fields: 'key' and 'value'.
 */
public static Schema getSchema(Schema keySchema, Schema valueSchema) {
  Schema schema = Schema.createRecord(
      KEY_VALUE_PAIR_RECORD_NAME, "A key/value pair",
      KEY_VALUE_PAIR_RECORD_NAMESPACE, false);
  schema.setFields(Arrays.asList(
      new Schema.Field(KEY_FIELD, keySchema, "The key", null),
      new Schema.Field(VALUE_FIELD, valueSchema, "The value", null)));
  return schema;
}
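As a quick follow-up, here is a hypothetical usage sketch of the helper above. It assumes the KEY_FIELD and VALUE_FIELD constants resolve to "key" and "value", the convention used by Hadoop's AvroKeyValue records; the field names are an assumption, not confirmed from the Flink source.

// Hypothetical usage; "key"/"value" field names are assumed.
Schema kvSchema = getSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.LONG));
GenericData.Record pair = new GenericData.Record(kvSchema);
pair.put("key", "clicks");
pair.put("value", 42L);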
Example 11
Source File: HoodieAvroUtils.java From hudi with Apache License 2.0
private static Schema initRecordKeySchema() {
  Schema.Field recordKeyField = new Schema.Field(
      HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance());
  Schema recordKeySchema = Schema.createRecord("HoodieRecordKey", "", "", false);
  recordKeySchema.setFields(Collections.singletonList(recordKeyField));
  return recordKeySchema;
}
Example 12
Source File: AvroSchemaGenerator.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public Schema generate(String schemaNameOverride) throws IOException {
  ClassWriter classWriter = new ClassWriter(options, connManager, tableName, null);
  Map<String, Integer> columnTypes = classWriter.getColumnTypes();
  String[] columnNames = classWriter.getColumnNames(columnTypes);

  List<Field> fields = new ArrayList<Field>();
  for (String columnName : columnNames) {
    String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName));
    int sqlType = columnTypes.get(columnName);
    Schema avroSchema = toAvroSchema(sqlType, columnName);
    Field field = new Field(cleanedCol, avroSchema, null, NullNode.getInstance());
    field.addProp("columnName", columnName);
    field.addProp("sqlType", Integer.toString(sqlType));
    fields.add(field);
  }

  TableClassName tableClassName = new TableClassName(options);
  String shortClassName = tableClassName.getShortClassForTable(tableName);
  String avroTableName = (tableName == null ? TableClassName.QUERY_RESULT : tableName);
  String avroName = schemaNameOverride != null ? schemaNameOverride :
      (shortClassName == null ? avroTableName : shortClassName);
  String avroNamespace = tableClassName.getPackageForTable();

  String doc = "Sqoop import of " + avroTableName;
  Schema schema = Schema.createRecord(avroName, doc, avroNamespace, false);
  schema.setFields(fields);
  schema.addProp("tableName", avroTableName);

  return schema;
}
Example 13
Source File: RegressionTest.java From ml-ease with Apache License 2.0
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass)
    throws IOException, URISyntaxException {
  JobConf conf = createJobConf();
  Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(conf));
  if (inputSchema == null) {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema.toString());

  List<Schema.Field> inputFields = inputSchema.getFields();
  Schema.Field predField = new Schema.Field("pred", Schema.create(Type.FLOAT), "", null);
  List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field field : inputFields) {
    outputFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), null));
  }
  outputFields.add(predField);

  Schema outputSchema = Schema.createRecord("AdmmTestOutput",
      "Test output for AdmmTest", "com.linkedin.lab.regression.avro", false);
  outputSchema.setFields(outputFields);
  AvroJob.setOutputSchema(conf, outputSchema);
  AvroJob.setMapOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.FLOAT), outputSchema));
  AvroJob.setMapperClass(conf, mapperClass);
  AvroJob.setReducerClass(conf, reducerClass);

  return conf;
}
Example 14
Source File: SchemaBuilder.java From xml-avro with Apache License 2.0
private Schema createGroupSchema(String name, XSModelGroup groupTerm) {
  Schema record = Schema.createRecord(name, null, null, false);
  schemas.put(name, record);

  Map<String, Schema.Field> fields = new HashMap<>();
  createGroupFields(groupTerm, fields, false);
  record.setFields(new ArrayList<>(fields.values()));

  return Schema.createArray(record);
}
Example 15
Source File: PigSchema2Avro.java From spork with Apache License 2.0
/**
 * Convert pig data to Avro record.
 */
protected static Schema convertRecord(ResourceFieldSchema[] pigFields, boolean nullable)
    throws IOException {
  AvroStorageLog.funcCall("convertRecord");

  // Type name is required for Avro record
  String typeName = getRecordName();
  Schema outSchema = Schema.createRecord(typeName, null, null, false);

  List<Schema.Field> outFields = new ArrayList<Schema.Field>();
  for (int i = 0; i < pigFields.length; i++) {
    /* get schema */
    Schema fieldSchema = convert(pigFields[i], nullable);

    /* get field name of output */
    String outname = pigFields[i].getName();
    if (outname == null)
      outname = FIELD_NAME + "_" + i; // field name cannot be null

    /* get doc of output */
    String desc = pigFields[i].getDescription();

    outFields.add(new Field(outname, fieldSchema, desc, null));
  }

  outSchema.setFields(outFields);
  return outSchema;
}
Example 16
Source File: PigSchema2Avro.java From Cubert with Apache License 2.0
/**
 * Validate a Pig tuple is compatible with Avro record. If the Avro schema
 * is not complete (with uncovered fields), then convert those fields using
 * methods in set 1.
 *
 * Notice that users can get rid of Pig tuple wrappers, e.g. an Avro schema
 * "int" is compatible with a Pig schema "T:(int)".
 */
protected static Schema validateAndConvertRecord(Schema avroSchema,
    ResourceFieldSchema[] pigFields) throws IOException {

  /* Get rid of Pig tuple wrappers. */
  if (!avroSchema.getType().equals(Schema.Type.RECORD)) {
    if (pigFields.length != 1)
      throw new IOException("Expect only one field in Pig tuple schema. Avro schema is "
          + avroSchema.getType());
    return validateAndConvert(avroSchema, pigFields[0]);
  }

  /* validate and convert a pig tuple with avro record */
  boolean isPartialSchema = AvroStorageUtils.isUDPartialRecordSchema(avroSchema);
  AvroStorageLog.details("isPartialSchema=" + isPartialSchema);

  String typeName = isPartialSchema ? getRecordName() : avroSchema.getName();
  Schema outSchema = Schema.createRecord(typeName, avroSchema.getDoc(),
      avroSchema.getNamespace(), false);

  List<Schema.Field> inFields = avroSchema.getFields();
  if (!isPartialSchema && inFields.size() != pigFields.length) {
    throw new IOException("Expect " + inFields.size() + " fields in pig schema."
        + " But there are " + pigFields.length);
  }

  List<Schema.Field> outFields = new ArrayList<Schema.Field>();
  for (int i = 0; i < pigFields.length; i++) {
    /* get user defined avro field schema */
    Field inputField = isPartialSchema
        ? AvroStorageUtils.getUDField(avroSchema, i) : inFields.get(i);

    /* get schema */
    Schema fieldSchema = null;
    if (inputField == null) {
      /* convert pig schema (nullable) */
      fieldSchema = convert(pigFields[i], true);
    } else if (inputField.schema() == null) {
      /* convert pig schema (not-null) */
      fieldSchema = convert(pigFields[i], false);
    } else {
      /* validate pigFields[i] with given avro schema */
      fieldSchema = validateAndConvert(inputField.schema(), pigFields[i]);
    }

    /* get field name of output */
    String outname = (isPartialSchema) ? pigFields[i].getName() : inputField.name();
    if (outname == null)
      outname = FIELD_NAME + "_" + i; // field name cannot be null

    /* get doc of output */
    String doc = (isPartialSchema) ? pigFields[i].getDescription() : inputField.doc();

    JsonNode defaultvalue = (inputField != null) ? inputField.defaultValue() : null;

    outFields.add(new Field(outname, fieldSchema, doc, defaultvalue));
  }

  outSchema.setFields(outFields);
  return outSchema;
}
Example 17
Source File: AvroProjectionParquetMapReduce.java From hiped2 with Apache License 2.0
/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {
  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int result = cli.runCmd();
  if (result != 0) {
    return result;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(AvroProjectionParquetMapReduce.class);

  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, inputPath);

  // predicate pushdown
  AvroParquetInputFormat.setUnboundRecordFilter(job, GoogleStockFilter.class);

  // projection pushdown
  Schema projection = Schema.createRecord(Stock.SCHEMA$.getName(),
      Stock.SCHEMA$.getDoc(), Stock.SCHEMA$.getNamespace(), false);
  List<Schema.Field> fields = Lists.newArrayList();
  for (Schema.Field field : Stock.SCHEMA$.getFields()) {
    if ("symbol".equals(field.name()) || "open".equals(field.name())) {
      fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
          field.defaultValue(), field.order()));
    }
  }
  projection.setFields(fields);
  AvroParquetInputFormat.setRequestedProjection(job, projection);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroParquetOutputFormat.setSchema(job, StockAvg.SCHEMA$);

  return job.waitForCompletion(true) ? 0 : 1;
}
Example 18
Source File: Schemas.java From parquet-mr with Apache License 2.0
/**
 * Merges two {@link Schema} instances or returns {@code null}.
 * <p>
 * The two schemas are merged if they are the same type. Records are merged
 * if the two records have the same name or have no names but have a
 * significant number of shared fields.
 * <p>
 * @see {@link #mergeOrUnion} to return a union when a merge is not possible.
 *
 * @param left a {@code Schema}
 * @param right a {@code Schema}
 * @return a merged {@code Schema} or {@code null} if merging is not possible
 */
private static Schema mergeOnly(Schema left, Schema right) {
  if (Objects.equal(left, right)) {
    return left;
  }

  // handle primitive type promotion; doesn't promote integers to floats
  switch (left.getType()) {
    case INT:
      if (right.getType() == Schema.Type.LONG) {
        return right;
      }
      break;
    case LONG:
      if (right.getType() == Schema.Type.INT) {
        return left;
      }
      break;
    case FLOAT:
      if (right.getType() == Schema.Type.DOUBLE) {
        return right;
      }
      break;
    case DOUBLE:
      if (right.getType() == Schema.Type.FLOAT) {
        return left;
      }
  }

  // any other cases where the types don't match must be combined by a union
  if (left.getType() != right.getType()) {
    return null;
  }

  switch (left.getType()) {
    case UNION:
      return union(left, right);
    case RECORD:
      if (left.getName() == null && right.getName() == null
          && fieldSimilarity(left, right) < SIMILARITY_THRESH) {
        return null;
      } else if (!Objects.equal(left.getName(), right.getName())) {
        return null;
      }

      Schema combinedRecord = Schema.createRecord(
          coalesce(left.getName(), right.getName()),
          coalesce(left.getDoc(), right.getDoc()),
          coalesce(left.getNamespace(), right.getNamespace()),
          false
      );
      combinedRecord.setFields(mergeFields(left, right));

      return combinedRecord;

    case MAP:
      return Schema.createMap(
          mergeOrUnion(left.getValueType(), right.getValueType()));

    case ARRAY:
      return Schema.createArray(
          mergeOrUnion(left.getElementType(), right.getElementType()));

    case ENUM:
      if (!Objects.equal(left.getName(), right.getName())) {
        return null;
      }
      Set<String> symbols = Sets.newLinkedHashSet();
      symbols.addAll(left.getEnumSymbols());
      symbols.addAll(right.getEnumSymbols());
      return Schema.createEnum(
          left.getName(),
          coalesce(left.getDoc(), right.getDoc()),
          coalesce(left.getNamespace(), right.getNamespace()),
          ImmutableList.copyOf(symbols)
      );

    default:
      // all primitives are handled before the switch by the equality check.
      // schemas that reach this point are not primitives and also not any of
      // the above known types.
      throw new UnsupportedOperationException(
          "Unknown schema type: " + left.getType());
  }
}
Example 19
Source File: MarketoRuntimeTestBase.java From components with Apache License 2.0
public Schema getFullDynamicSchema() {
  Schema emptySchema = Schema.createRecord("dynamic", null, null, false);
  emptySchema.setFields(new ArrayList<Field>());
  emptySchema = AvroUtils.setIncludeAllFields(emptySchema, true);
  return emptySchema;
}
Example 20
Source File: TestReflectInputOutputFormat.java From parquet-mr with Apache License 2.0
@Test
public void testReadWrite() throws Exception {

  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);

  final Job job = new Job(conf, "read");
  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, parquetPath);

  // Test push-down predicates by using an electric car filter
  AvroParquetInputFormat.setUnboundRecordFilter(job, ElectricCarFilter.class);

  // Test schema projection by dropping the optional extras
  Schema projection = Schema.createRecord(CAR_SCHEMA.getName(),
      CAR_SCHEMA.getDoc(), CAR_SCHEMA.getNamespace(), false);
  List<Schema.Field> fields = Lists.newArrayList();
  for (Schema.Field field : ReflectData.get().getSchema(Car.class).getFields()) {
    if (!"optionalExtra".equals(field.name())) {
      fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
          field.defaultVal(), field.order()));
    }
  }
  projection.setFields(fields);
  AvroParquetInputFormat.setRequestedProjection(job, projection);

  job.setMapperClass(TestReflectInputOutputFormat.MyMapper2.class);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  AvroParquetOutputFormat.setOutputPath(job, outputPath);
  AvroParquetOutputFormat.setSchema(job, CAR_SCHEMA);

  waitForJob(job);

  final Path mapperOutput = new Path(outputPath.toString(), "part-m-00000.parquet");
  try (final AvroParquetReader<Car> out = new AvroParquetReader<Car>(conf, mapperOutput)) {
    Car car;
    Car previousCar = null;
    int lineNumber = 0;
    while ((car = out.read()) != null) {
      if (previousCar != null) {
        // Testing reference equality here. The "model" field should be dictionary-encoded.
        assertTrue(car.model == previousCar.model);
      }

      // Make sure that predicate push down worked as expected
      if (car.engine.type == EngineType.PETROL) {
        fail("UnboundRecordFilter failed to remove cars with PETROL engines");
      }

      // Note we use lineNumber * 2 because of predicate push down
      Car expectedCar = nextRecord(lineNumber * 2);

      // We removed the optional extra field using projection so we shouldn't
      // see it here...
      expectedCar.optionalExtra = null;

      assertEquals("line " + lineNumber, expectedCar, car);
      ++lineNumber;
      previousCar = car;
    }
  }
}