org.apache.parquet.hadoop.ParquetWriter Java Examples

Source File: IntegrationTestHelper.java From circus-train with Apache License 2.0

9 votes

/**
 * Writes a single Avro record as a Parquet file named {@code parquet0000} inside the
 * {@code hour=<hour>} folder below {@code tableUri}.
 *
 * <p>The record always carries {@code id}. When {@code fieldName} is non-null, the field is
 * additionally populated: a {@link Map} value is converted into a nested record built from the
 * field's schema, any other non-null value is stored as-is, and {@code null} leaves the field unset.
 *
 * @param tableUri  base location of the table; its path is used as a local directory
 * @param schema    Avro schema of the record to write
 * @param hour      value of the {@code hour} partition
 * @param id        value stored in the {@code id} field
 * @param fieldName optional name of an extra field to populate, or {@code null}
 * @param data      value for {@code fieldName}; may be a map, a plain value, or {@code null}
 * @return the URI of the partition folder
 * @throws IOException if the Parquet file cannot be written
 */
URI createData(
    URI tableUri,
    Schema schema,
    String hour,
    int id,
    String fieldName,
    Object data) throws IOException {
  GenericData.Record record = new GenericData.Record(schema);
  record.put("id", id);

  if (fieldName != null) {
    Schema.Field field = schema.getField(fieldName);
    Schema fieldSchema = field.schema();
    if (data instanceof Map) {
      // A map is interpreted as the content of a nested record: key = field name, value = field value.
      GenericData.Record schemaRecord = new GenericData.Record(fieldSchema);
      ((Map<String, String>) data).forEach(schemaRecord::put);
      record.put(fieldName, schemaRecord);
    } else if (data != null) {
      record.put(fieldName, data);
    }
  }

  URI partition = URI.create(tableUri + "/hour=" + hour);
  String path = partition.getPath();
  File parentFolder = new File(path);
  parentFolder.mkdirs();
  File partitionFile = new File(parentFolder, "parquet0000");
  Path filePath = new Path(partitionFile.toURI());

  // try-with-resources closes the writer on every path and keeps a write failure as the
  // primary exception instead of letting a failing close() replace it.
  try (ParquetWriter<GenericData.Record> writer = AvroParquetWriter.<GenericData.Record>builder(filePath)
      .withSchema(schema)
      .withConf(new Configuration())
      .build()) {
    writer.write(record);
  }
  return partition;
}

Source File: TestReadWrite.java From parquet-mr with Apache License 2.0

7 votes

/**
 * Writes a record whose {@code mymap} field (a map of plain INT values) contains a
 * {@code null} value and expects the attempt to end in a {@link RuntimeException}.
 */
@Test(expected=RuntimeException.class)
public void testMapRequiredValueWithNull() throws Exception {
  Schema recordSchema = Schema.createRecord("record1", null, null, false);
  Schema.Field mapField =
      new Schema.Field("mymap", Schema.createMap(Schema.create(Schema.Type.INT)), null, null);
  recordSchema.setFields(Lists.newArrayList(mapField));

  Path target = new Path(createTempFile().getPath());

  try (ParquetWriter<GenericRecord> out = AvroParquetWriter
      .<GenericRecord>builder(target)
      .withSchema(recordSchema)
      .withConf(testConf)
      .build()) {

    // One of the three entries deliberately maps to null.
    Map<String, Integer> values = new HashMap<>();
    values.put("thirty-four", 34);
    values.put("eleventy-one", null);
    values.put("one-hundred", 100);

    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(recordSchema);
    recordBuilder.set("mymap", values);
    GenericData.Record datum = recordBuilder.build();

    out.write(datum);
  }
}

Source File: AvroTestUtil.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes the given data to a new Parquet file inside {@code temp} using the supplied Avro data
 * model and schema.
 *
 * <p>NOTE(review): {@code conf} is accepted but not consulted by this method; the writer is
 * built with the builder's default configuration. Confirm whether that is intended.
 *
 * @param temp   temporary folder that provides the target file
 * @param conf   Hadoop configuration (currently unused here)
 * @param model  Avro data model handed to the writer
 * @param schema Avro schema of the data
 * @param data   records to write, in order
 * @return the written file
 * @throws IOException if the file cannot be created or written
 */
@SuppressWarnings("unchecked")
public static <D> File write(TemporaryFolder temp, Configuration conf, GenericData model, Schema schema, D... data)
    throws IOException {
  // The writer must create the file itself, so the placeholder made by the rule is removed first.
  final File target = temp.newFile();
  Assert.assertTrue(target.delete());

  final Path targetPath = new Path(target.toString());
  try (ParquetWriter<D> out = AvroParquetWriter
    .<D>builder(targetPath)
    .withDataModel(model)
    .withSchema(schema)
    .build()) {
    for (int i = 0; i < data.length; i++) {
      out.write(data[i]);
    }
  }

  return target;
}

Source File: HiveTestUtil.java From hudi with Apache License 2.0

6 votes

/**
 * Generates test records and writes them to {@code filePath} as a GZIP-compressed Parquet file
 * through a {@code HoodieAvroWriteSupport} backed by a simple bloom filter.
 *
 * @param filePath              destination of the Parquet file
 * @param isParquetSchemaSimple selects the schema and the record generator: simple test records
 *                              when {@code true}, evolved test records otherwise
 * @throws IOException        if the writer cannot be created or closed
 * @throws URISyntaxException declared for the record generators used by this helper
 */
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
    throws IOException, URISyntaxException {
  Schema schema = getTestDataSchema(isParquetSchemaSimple);
  org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
  BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);

  List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100));

  // try-with-resources guarantees the writer is closed even when a write fails or fail() throws.
  try (ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
      ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf())) {
    testRecords.forEach(s -> {
      try {
        writer.write(s);
      } catch (IOException e) {
        // Write problems are reported as test failures rather than propagated.
        fail("IOException while writing test records as parquet" + e.toString());
      }
    });
  }
}

Source File: ExaParquetWriterImpl.java From hadoop-etl-udfs with MIT License

6 votes

/**
 * Creates the writer by delegating to the superclass constructor with a {@code TupleWriteSupport}
 * for {@code schema}, the codec resolved from {@code compressionType}, and the default
 * {@code ParquetWriter} block size, page size, dictionary and validation settings.
 * Afterwards the target path and schema are printed to standard output and the row
 * {@code Tuple} is created.
 *
 * @param schema                     Parquet schema of the rows to write
 * @param numColumns                 passed through to the {@code Tuple}
 * @param conf                       configuration for the write support and the writer
 * @param path                       destination of the Parquet file
 * @param compressionType            codec name, resolved via {@code CompressionCodecName.fromConf}
 * @param exa                        iterator handed to the {@code Tuple}
 * @param firstColumnIndex           passed through to the {@code Tuple}
 * @param dynamicPartitionExaColNums passed through to the {@code Tuple}
 * @throws Exception if the superclass constructor or the {@code Tuple} creation fails
 */
private ExaParquetWriterImpl(final MessageType schema,
                             final int numColumns,
                             final Configuration conf,
                             final Path path,
                             final String compressionType,
                             final ExaIterator exa,
                             final int firstColumnIndex,
                             final List<Integer> dynamicPartitionExaColNums) throws Exception {
    // The page size default is passed twice: once as page size, once as dictionary page size.
    super(path,
            new TupleWriteSupport(schema, conf),
            CompressionCodecName.fromConf(compressionType),
            ParquetWriter.DEFAULT_BLOCK_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
            ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
            PARQUET_WRITER_VERSION,
            conf);

    // Diagnostic output of the destination and schema.
    System.out.println("Path: " + path.toString());
    System.out.println("Parquet schema:\n" + schema);

    // Create Tuple object with ExaIterator reference.
    this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums);
}

Source File: TestHoodieAvroWriteSupport.java From hudi with Apache License 2.0

6 votes

/**
 * Writes 1000 random row keys through a {@code HoodieAvroWriteSupport} into a Parquet file in
 * {@code tempDir}, registering every key with the write support as it goes.
 *
 * @param tempDir temporary directory that receives {@code test.parquet}
 * @throws IOException if the file cannot be written
 */
@Test
public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }
  String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString();
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = BloomFilterFactory.createBloomFilter(
      1000, 0.0001, 10000,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter().convert(schema), schema, filter);

  // try-with-resources closes the writer even if a write fails part-way through.
  try (ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE)) {
    for (String rowKey : rowKeys) {
      GenericRecord rec = new GenericData.Record(schema);
      rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
      writer.write(rec);
      writeSupport.add(rowKey);
    }
  }
}

Source File: ParquetFileReaderTest.java From kafka-connect-fs with Apache License 2.0

6 votes

/**
 * Writes {@code NUM_RECORDS} generated records to a local temporary Parquet file, records an
 * offset for each index in {@code fsConfig}, and then moves the file into the file system under test.
 *
 * @param fsConfig configuration of the file system under test
 * @param args     not used by this implementation
 * @return the path of the data file inside the target file system
 * @throws IOException if the temp file cannot be created or moved
 */
@Override
protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException {
    FileSystem targetFs = fsConfig.getFs();
    File localFile = File.createTempFile("test-", "." + getFileExtension());

    try (ParquetWriter writer = AvroParquetWriter.<GenericRecord>builder(new Path(localFile.toURI()))
            .withConf(targetFs.getConf())
            .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
            .withSchema(readerSchema)
            .build()) {
        for (int index = 0; index < NUM_RECORDS; index++) {
            String uuid = UUID.randomUUID().toString();

            GenericRecord datum = new GenericData.Record(readerSchema);
            datum.put(FIELD_INDEX, index);
            datum.put(FIELD_NAME, String.format("%d_name_%s", index, uuid));
            datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, uuid));

            try {
                fsConfig.offsetsByIndex().put(index, (long) index);
                writer.write(datum);
            } catch (IOException ioe) {
                // Same unchecked wrapping as before, so callers observe a RuntimeException.
                throw new RuntimeException(ioe);
            }
        }
    }

    Path remoteDir = new Path(fsConfig.getFsUri());
    Path remoteFile = new Path(remoteDir, localFile.getName());
    targetFs.moveFromLocalFile(new Path(localFile.getAbsolutePath()), remoteFile);
    return remoteFile;
}

Source File: TestParquetUtils.java From hudi with Apache License 2.0

6 votes

/**
 * Writes one record per row key to a GZIP-compressed Parquet file and registers every key with
 * the {@code HoodieAvroWriteSupport}.
 *
 * @param typeCode              bloom filter type code passed to the bloom filter factory
 * @param filePath              destination of the Parquet file
 * @param rowKeys               record keys to write, in order
 * @param schema                Avro schema of the records
 * @param addPartitionPathField whether to also populate the partition path metadata field
 * @param partitionPath         value for the partition path field when it is populated
 * @throws Exception if the file cannot be written
 */
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema, boolean addPartitionPathField, String partitionPath) throws Exception {
  // Write out a parquet file
  BloomFilter filter = BloomFilterFactory
      .createBloomFilter(1000, 0.0001, 10000, typeCode);
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);

  // try-with-resources closes the writer even if a write fails part-way through.
  try (ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE)) {
    for (String rowKey : rowKeys) {
      GenericRecord rec = new GenericData.Record(schema);
      rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
      if (addPartitionPathField) {
        rec.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath);
      }
      writer.write(rec);
      writeSupport.add(rowKey);
    }
  }
}

Source File: HoodieParquetWriter.java From hudi with Apache License 2.0

6 votes

/**
 * Creates a Parquet writer for {@code file}, taking the write support, codec, block size and
 * page size from {@code parquetConfig}.
 *
 * @param instantTime              stored on the writer as-is
 * @param file                     target file; converted to a Hoodie path before use
 * @param parquetConfig            source of the Hadoop configuration, write support, codec,
 *                                 sizes, max file size and compression ratio
 * @param schema                   not referenced inside this constructor body
 * @param sparkTaskContextSupplier stored on the writer as-is
 * @throws IOException if the underlying writer or file system cannot be set up
 */
public HoodieParquetWriter(String instantTime, Path file, HoodieParquetConfig parquetConfig,
    Schema schema, SparkTaskContextSupplier sparkTaskContextSupplier) throws IOException {
  // The page size is passed twice: as page size and as dictionary page size.
  super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
      ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
      parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      ParquetWriter.DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  // The path conversion and file system registration are repeated here because the values
  // computed for the super(...) call cannot be captured in locals beforehand.
  this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
  this.fs =
      (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  // We cannot accurately measure the snappy compressed output file size. We are choosing a
  // conservative 10%
  // NOTE(review): the code below uses parquetConfig.getCompressionRatio() and the codec from
  // parquetConfig, not a literal 10% or a fixed snappy codec — confirm the comment above is current.
  // TODO - compute this compression ratio dynamically by looking at the bytes written to the
  // stream and the actual file size reported by HDFS
  this.maxFileSize = parquetConfig.getMaxFileSize()
      + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
  this.writeSupport = parquetConfig.getWriteSupport();
  this.instantTime = instantTime;
  this.sparkTaskContextSupplier = sparkTaskContextSupplier;
}

Source File: TestHDFSParquetImporter.java From hudi with Apache License 2.0

6 votes

/**
 * Generates 96 records (keys 0..95, timestamps one hour apart starting at 2017-02-03 00:00:00
 * as parsed by the commit formatter) and writes them to {@code file1.parquet} in {@code srcFolder}.
 *
 * @param srcFolder folder that receives the Parquet file
 * @return the generated records, in the order written
 * @throws ParseException if the fixed start timestamp cannot be parsed
 * @throws IOException    if the Parquet file cannot be written
 */
public List<GenericRecord> createInsertRecords(Path srcFolder) throws ParseException, IOException {
  final Path parquetFile = new Path(srcFolder.toString(), "file1.parquet");
  final long baseEpochSeconds = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000;

  final List<GenericRecord> generated = new ArrayList<>();
  long key = 0;
  while (key < 96) {
    long timestamp = baseEpochSeconds + TimeUnit.HOURS.toSeconds(key);
    generated.add(HoodieTestDataGenerator.generateGenericRecord(
        Long.toString(key), "rider-" + key, "driver-" + key, timestamp));
    key++;
  }

  try (ParquetWriter<GenericRecord> out = AvroParquetWriter.<GenericRecord>builder(parquetFile)
      .withSchema(HoodieTestDataGenerator.AVRO_SCHEMA)
      .withConf(HoodieTestUtils.getDefaultHadoopConf())
      .build()) {
    for (GenericRecord datum : generated) {
      out.write(datum);
    }
  }
  return generated;
}

Source File: TestHDFSParquetImporter.java From hudi with Apache License 2.0

6 votes

/**
 * Generates the upsert batch and writes it to {@code file1.parquet} in {@code srcFolder}:
 * 11 records with keys 0..10 followed by 4 records with keys 96..99.
 *
 * @param srcFolder folder that receives the Parquet file
 * @return the generated records, in the order written
 * @throws ParseException if the fixed start timestamp cannot be parsed
 * @throws IOException    if the Parquet file cannot be written
 */
public List<GenericRecord> createUpsertRecords(Path srcFolder) throws ParseException, IOException {
  final Path parquetFile = new Path(srcFolder.toString(), "file1.parquet");
  final long baseEpochSeconds = HoodieActiveTimeline.COMMIT_FORMATTER.parse("20170203000000").getTime() / 1000;
  final List<GenericRecord> generated = new ArrayList<>();

  // Updates: keys 0..10 inclusive, i.e. 11 records (the loop bound is exclusive 11).
  long key = 0;
  while (key < 11) {
    long timestamp = baseEpochSeconds + TimeUnit.HOURS.toSeconds(key);
    generated.add(HoodieTestDataGenerator.generateGenericRecord(
        Long.toString(key), "rider-upsert-" + key, "driver-upsert" + key, timestamp));
    key++;
  }

  // Inserts: keys 96..99, i.e. 4 records.
  key = 96;
  while (key < 100) {
    long timestamp = baseEpochSeconds + TimeUnit.HOURS.toSeconds(key);
    generated.add(HoodieTestDataGenerator.generateGenericRecord(
        Long.toString(key), "rider-upsert-" + key, "driver-upsert" + key, timestamp));
    key++;
  }

  try (ParquetWriter<GenericRecord> out = AvroParquetWriter.<GenericRecord>builder(parquetFile)
      .withSchema(HoodieTestDataGenerator.AVRO_SCHEMA)
      .withConf(HoodieTestUtils.getDefaultHadoopConf())
      .build()) {
    for (GenericRecord datum : generated) {
      out.write(datum);
    }
  }
  return generated;
}

Source File: TestColumnSizeCommand.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Creates a Parquet file with {@code numRecord} rows of random values for the two required
 * columns {@code DocId} (INT64) and {@code Num} (INT32).
 *
 * @return absolute path of the written file
 * @throws IOException if the file cannot be written
 */
private String createParquetFile() throws IOException {
  PrimitiveType docIdColumn = new PrimitiveType(REQUIRED, INT64, "DocId");
  PrimitiveType numColumn = new PrimitiveType(REQUIRED, INT32, "Num");
  MessageType schema = new MessageType("schema", docIdColumn, numColumn);

  // The example write support picks the schema up from the configuration.
  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  Random rnd = new Random();
  try (ParquetWriter writer = builder.build()) {
    int written = 0;
    while (written < numRecord) {
      SimpleGroup row = new SimpleGroup(schema);
      row.add("DocId", rnd.nextLong());
      row.add("Num", rnd.nextInt());
      writer.write(row);
      written++;
    }
  }

  return file;
}

Source File: ProtoParquetWriterWithOffset.java From garmadon with Apache License 2.0

6 votes

/**
 * Stores the collaborators of this writer, reads the default block size of {@code fs} for
 * {@code finalHdfsDir}, and initializes the latest-committed-timestamp gauge.
 *
 * @param writer              The actual Proto + Parquet writer
 * @param temporaryHdfsPath   The path to which the writer will output events
 * @param finalHdfsDir        The directory to write the final output to (renamed from temporaryHdfsPath)
 * @param fs                  The filesystem on which both the temporary and final files reside
 * @param fileNamer           File-naming logic for the final path
 * @param dayStartTime        The day partition the final file will go to
 * @param eventName           Event name used for logging &amp; monitoring
 * @param protoMetadataWriter Callback taking two strings; presumably writes a key/value pair of
 *                            metadata — TODO confirm against its call sites
 * @param partition           Partition number stored on this writer; presumably the source
 *                            partition of the events — TODO confirm
 */
public ProtoParquetWriterWithOffset(ParquetWriter<MESSAGE_KIND> writer, Path temporaryHdfsPath,
                                    Path finalHdfsDir, FileSystem fs, OffsetComputer fileNamer,
                                    LocalDateTime dayStartTime, String eventName,
                                    BiConsumer<String, String> protoMetadataWriter, int partition) {
    this.writer = writer;
    this.temporaryHdfsPath = temporaryHdfsPath;
    this.finalHdfsDir = finalHdfsDir;
    this.fs = fs;
    this.fileNamer = fileNamer;
    this.dayStartTime = dayStartTime;
    this.eventName = eventName;
    // Captured once at construction time from the file system's default for the final directory.
    this.fsBlockSize = fs.getDefaultBlockSize(finalHdfsDir);
    this.protoMetadataWriter = protoMetadataWriter;
    this.partition = partition;

    initializeLatestCommittedTimestampGauge();
}

Source File: FetchParquetTest.java From nifi with Apache License 2.0

6 votes

/**
 * Writes {@code numUsers} user records, each with a one-element {@code favorite_colors} array,
 * to {@code parquetFile} using {@code schemaWithArray}. An existing file is deleted first.
 *
 * @param parquetFile destination file
 * @param numUsers    number of user records to write
 * @throws IOException if the file cannot be written
 */
private void writeParquetUsersWithArray(final File parquetFile, int numUsers) throws IOException {
    if (parquetFile.exists()) {
        Assert.assertTrue(parquetFile.delete());
    }

    final AvroParquetWriter.Builder<GenericRecord> writerBuilder = createAvroParquetWriter(parquetFile, schemaWithArray);

    final Schema favoriteColorsSchema = schemaWithArray.getField("favorite_colors").schema();

    try (final ParquetWriter<GenericRecord> writer = writerBuilder.build()) {
        for (int i = 0; i < numUsers; i++) {
            // Build the record from the same schema the writer uses. The previous code created it
            // from the array-less schema and stored the array under "favorite_color", which does
            // not match the "favorite_colors" field the array schema was taken from.
            // NOTE(review): assumes schemaWithArray also declares "name" and "favorite_number" — confirm.
            final GenericRecord user = new GenericData.Record(schemaWithArray);
            user.put("name", "Bob" + i);
            user.put("favorite_number", i);

            final GenericData.Array<String> colors = new GenericData.Array<>(1, favoriteColorsSchema);
            colors.add("blue" + i);

            user.put("favorite_colors", colors);

            writer.write(user);
        }
    }
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes a Parquet file as described by {@code context}, runs the context's verification, and
 * finally deletes the file.
 *
 * @param context supplies the schema, writer settings, target path, the data to write and the
 *                verification to run afterwards
 * @throws IOException if writing or verifying fails
 */
public static void writeAndTest(WriteContext context) throws IOException {
  // Create the configuration, and then apply the schema to our configuration.
  Configuration configuration = new Configuration();
  GroupWriteSupport.setSchema(context.schema, configuration);
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();

  // Create the writer properties
  final int blockSize = context.blockSize;
  final int pageSize = context.pageSize;
  final int dictionaryPageSize = pageSize;
  final boolean enableDictionary = context.enableDictionary;
  final boolean enableValidation = context.enableValidation;
  ParquetProperties.WriterVersion writerVersion = context.version;
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  // try-with-resources closes the writer even when context.write fails; on success the file is
  // fully flushed before it is verified below.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(context.fsPath,
      groupWriteSupport, codec, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, enableValidation, writerVersion, configuration)) {
    context.write(writer);
  }

  context.test();

  context.path.delete();
}

Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0

6 votes

/**
 * Writes {@code values} as rows of a single required {@code int32_field} column into a new,
 * uncompressed Parquet file.
 *
 * @param values  values to write, one row per element
 * @param version Parquet writer version to use
 * @return the written file
 * @throws IOException if the file cannot be created or written
 */
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  // Only the unique temp name is needed; the placeholder file is removed so the writer can create it.
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  // try-with-resources closes the writer even if a write fails part-way through.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
    for (int value : values) {
      writer.write(f.newGroup()
              .append("int32_field", value));
    }
  }
  return file;
}

Source File: ColumnSizeCommandTest.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Creates a Parquet file with {@code numRecord} rows of random values for the two required
 * columns {@code DocId} (INT64) and {@code Num} (INT32).
 *
 * @return absolute path of the written file
 * @throws IOException if the file cannot be written
 */
private String createParquetFile() throws IOException {
  PrimitiveType docIdColumn = new PrimitiveType(REQUIRED, INT64, "DocId");
  PrimitiveType numColumn = new PrimitiveType(REQUIRED, INT32, "Num");
  MessageType schema = new MessageType("schema", docIdColumn, numColumn);

  // The example write support picks the schema up from the configuration.
  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  Random rnd = new Random();
  try (ParquetWriter writer = builder.build()) {
    int written = 0;
    while (written < numRecord) {
      SimpleGroup row = new SimpleGroup(schema);
      row.add("DocId", rnd.nextLong());
      row.add("Num", rnd.nextInt());
      writer.write(row);
      written++;
    }
  }

  return file;
}

Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Registers the phone-book schema on {@code builder}, builds the writer, and writes one group
 * per user.
 *
 * @param builder writer builder to configure and build
 * @param users   users to write, in order
 * @throws IOException if writing fails
 */
public static void write(ParquetWriter.Builder<Group, ?> builder, List<User> users) throws IOException {
  final String schemaText = schema.toString();
  builder.config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schemaText);
  try (ParquetWriter<Group> out = builder.build()) {
    for (User user : users) {
      Group row = groupFromUser(user);
      out.write(row);
    }
  }
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes {@code recordCount} groups of random values. For every column the matching random
 * generator supplies the value; optional columns are left out of a group whenever the
 * generator decides to produce a null.
 *
 * @param writer destination for the generated groups
 * @throws IOException if writing fails
 */
@Override
public void write(ParquetWriter<Group> writer) throws IOException {
  for (int index = 0; index < recordCount; index++) {
    Group group = new SimpleGroup(super.schema);

    final int columnCnt = schema.getFieldCount();
    for (int column = 0; column < columnCnt; ++column) {
      Type type = schema.getType(column);
      RandomValueGenerator<?> generator = randomGenerators.get(column);

      // The null decision is only consulted for optional columns (short-circuit order matters).
      boolean leaveUnset = type.isRepetition(OPTIONAL) && generator.shouldGenerateNull();
      if (!leaveUnset) {
        switch (type.asPrimitiveType().getPrimitiveTypeName()) {
        case BINARY:
        case FIXED_LEN_BYTE_ARRAY:
        case INT96:
          // All binary-like types take their value from the binary generator.
          group.append(type.getName(), ((RandomBinaryBase<?>) generator).nextBinaryValue());
          break;
        case INT32:
          group.append(type.getName(), (Integer) generator.nextValue());
          break;
        case INT64:
          group.append(type.getName(), (Long) generator.nextValue());
          break;
        case FLOAT:
          group.append(type.getName(), (Float) generator.nextValue());
          break;
        case DOUBLE:
          group.append(type.getName(), (Double) generator.nextValue());
          break;
        case BOOLEAN:
          group.append(type.getName(), (Boolean) generator.nextValue());
          break;
        default:
          // Any other primitive type is left unset.
          break;
        }
      }
    }
    writer.write(group);
  }
}

Source File: ScroogeBinaryTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a Scrooge {@code StringAndBinary} with the Scrooge write support and reads it back
 * with the parquet-thrift reader, checking that both fields survive the round trip.
 */
@Test
public void testScroogeBinaryEncoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));

  // Only the unique name is needed; the writer must create the file itself.
  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  // try-with-resources closes the writer even if the write fails.
  try (ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class))) {
    writer.write(expected);
  }

  // read using the parquet-thrift version to isolate the write path
  org.apache.parquet.thrift.test.binary.StringAndBinary record;
  try (ParquetReader<org.apache.parquet.thrift.test.binary.StringAndBinary> reader = ThriftParquetReader.<org.apache.parquet.thrift.test.binary.StringAndBinary>
      build(path)
      .withThriftClass(org.apache.parquet.thrift.test.binary.StringAndBinary.class)
      .build()) {
    record = reader.read();
  }

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s);
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b);
}

Source File: TestReadWrite.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes one record with a nested list structure ({@code l1} containing records that hold
 * {@code l2}) and verifies that the inner value can be read back.
 */
@Test
public void testNestedLists() throws Exception {
  Schema schema = new Schema.Parser().parse(
    Resources.getResource("nested_array.avsc").openStream());
  Path file = new Path(createTempFile().getPath());

  // Parquet writer; try-with-resources closes it even if building or writing the record fails.
  try (ParquetWriter parquetWriter = AvroParquetWriter.builder(file).withSchema(schema)
    .withConf(testConf)
    .build()) {

    // Navigates field l1 -> second union branch -> element type -> second union branch.
    Schema innerRecordSchema = schema.getField("l1").schema().getTypes()
      .get(1).getElementType().getTypes().get(1);

    GenericRecord record = new GenericRecordBuilder(schema)
      .set("l1", Collections.singletonList(
        new GenericRecordBuilder(innerRecordSchema).set("l2", Collections.singletonList("hello")).build()
      ))
      .build();

    parquetWriter.write(record);
  }

  // The reader was previously never closed; it is now released as soon as the record is read.
  GenericRecord nextRecord;
  try (AvroParquetReader<GenericRecord> reader = new AvroParquetReader(testConf, file)) {
    nextRecord = reader.read();
  }

  assertNotNull(nextRecord);
  assertNotNull(nextRecord.get("l1"));
  List l1List = (List) nextRecord.get("l1");
  assertNotNull(l1List.get(0));
  List l2List = (List) ((GenericRecord) l1List.get(0)).get("l2");
  assertEquals(str("hello"), l2List.get(0));
}

Source File: AvroToParquetConverterUtil.java From datacollector with Apache License 2.0

5 votes

/**
 * Picks the writer builder that matches the Parquet version on the classpath.
 *
 * <ul>
 *   <li>major version above 1: the stock {@code AvroParquetWriter.builder}</li>
 *   <li>1.9 and later 1.x: {@code AvroParquetWriterBuilder190Int96}, with the configured time
 *       zone when {@code AvroParquetConstants.TIMEZONE} is defined</li>
 *   <li>anything older, or an unparsable version string: {@code AvroParquetWriterBuilder}</li>
 * </ul>
 *
 * @param tempFile   file the writer will write to
 * @param avroSchema Avro schema applied to the builder
 * @param conf       configuration consulted for the time zone property
 * @return the builder with the schema already applied
 */
private static ParquetWriter.Builder getParquetWriterBuilder(Path tempFile, Schema avroSchema, Configuration conf) {
  // Parquet Avro pre-1.9 doesn't work with logical types, so in that case we use custom Builder that injects our own
  // avro schema -> parquet schema generator class (which is a copy of the one that was provided in PARQUET-358).
  // Additionally, Parquet Avro 1.9.x does not support converting from Avro timestamps (logical types TIMESTAMP_MILLIS
  // and TIMESTAMP_MICROS) and so we have to extend Parquet Avro classes to support timestamps conversion.
  try {
    final SemanticVersion version = SemanticVersion.parse(Version.VERSION_NUMBER);

    if (version.major > 1) {
      LOG.debug("Creating AvroParquetWriter.builder");
      return AvroParquetWriter.builder(tempFile).withSchema(avroSchema);
    }

    final boolean oneNineOrLater = version.major == 1 && version.minor >= 9;
    if (!oneNineOrLater) {
      LOG.debug("Creating AvroParquetWriterBuilder");
      return new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
    }

    LOG.debug("Creating AvroParquetWriterBuilder190Int96");
    if (propertyDefined(conf, AvroParquetConstants.TIMEZONE)) {
      String timeZoneId = conf.get(AvroParquetConstants.TIMEZONE);
      return new AvroParquetWriterBuilder190Int96(tempFile, timeZoneId).withSchema(avroSchema);
    } else {
      return new AvroParquetWriterBuilder190Int96(tempFile).withSchema(avroSchema);
    }
  } catch (SemanticVersion.SemanticVersionParseException e) {
    LOG.warn("Can't parse parquet version string: " + Version.VERSION_NUMBER, e);
    return new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
  }
}

Source File: ParquetFileTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates the test Parquet file with ten rows covering the int32, int64, float, double,
 * binary and fixed-length byte array fields; the fixed-length value is 12 random bytes per row.
 *
 * @throws IOException if the file cannot be written
 */
private void createTestParquetFile() throws IOException {
  File target = parquetFile();
  Path targetPath = new Path(target.getPath());
  Configuration conf = new Configuration();

  MessageType schema = createSchema();
  SimpleGroupFactory groups = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, conf);

  try (
    ParquetWriter<Group> out = new ParquetWriter<>(
      targetPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf)) {
    int row = 0;
    while (row < 10) {
      final byte[] fixedBytes = new byte[12];
      ThreadLocalRandom.current().nextBytes(fixedBytes);

      // Colors cycle through the COLORS array.
      Group group = groups.newGroup()
       .append(INT32_FIELD, 32 + row)
       .append(INT64_FIELD, 64L + row)
       .append(FLOAT_FIELD, 1.0f + row)
       .append(DOUBLE_FIELD, 2.0d + row)
       .append(BINARY_FIELD, Binary.fromString(COLORS[row % COLORS.length]))
       .append(FIXED_LEN_BYTE_ARRAY_FIELD,
         Binary.fromConstantByteArray(fixedBytes));
      out.write(group);
      row++;
    }
  }
}

Source File: ScroogeBinaryTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a Scrooge {@code StringAndBinary} and reads it back through the Scrooge read support
 * with the Scrooge record converter, checking that both fields survive the round trip.
 */
@Test
@SuppressWarnings("unchecked")
public void testScroogeBinaryDecoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));

  // Only the unique name is needed; the writer must create the file itself.
  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  // try-with-resources closes the writer even if the write fails.
  try (ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class))) {
    writer.write(expected);
  }

  Configuration conf = new Configuration();
  conf.set("parquet.thrift.converter.class", ScroogeRecordConverter.class.getName());

  StringAndBinary record;
  try (ParquetReader<StringAndBinary> reader = ParquetReader.<StringAndBinary>
      builder(new ScroogeReadSupport(), path)
      .withConf(conf)
      .build()) {
    record = reader.read();
  }

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s());
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b());
}

Source File: TestSimpleRecordConverter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates the test Parquet file containing a single row with fixed values for the int32,
 * int64, float, double, binary and fixed-length byte array fields.
 *
 * @throws IOException if the file cannot be written
 */
private void createTestParquetFile() throws IOException {
  Path targetPath = new Path(testFile().getPath());
  Configuration conf = new Configuration();

  MessageType schema = createSchema();
  SimpleGroupFactory groups = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, conf);

  try (
    ParquetWriter<Group> out = new ParquetWriter<>(
      targetPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf)) {
    // Twelve bytes, 1 through 12, for the fixed-length field.
    final byte[] fixedBytes = new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };

    Group row = groups.newGroup()
     .append(INT32_FIELD, 32)
     .append(INT64_FIELD, 64L)
     .append(FLOAT_FIELD, 1.0f)
     .append(DOUBLE_FIELD, 2.0d)
     .append(BINARY_FIELD, Binary.fromString("foobar"))
     .append(FIXED_LEN_BYTE_ARRAY_FIELD, Binary.fromConstantByteArray(fixedBytes));
    out.write(row);
  }
}

Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a Parquet file with {@code numRecord} identical rows: {@code DocId}, {@code Name},
 * {@code Gender} and an optional {@code Links} group holding one {@code Backward} and one
 * {@code Forward} value.
 *
 * @param prefix prefix passed to the temp-file helper
 * @return the path of the written file as returned by the temp-file helper
 * @throws IOException if the file cannot be written
 */
private String createParquetFile(String prefix) throws IOException {
  GroupType linksGroup = new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward"));
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    linksGroup);

  // The example write support picks the schema up from the configuration.
  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  try (ParquetWriter writer = builder.build()) {
    int written = 0;
    while (written < numRecord) {
      SimpleGroup row = new SimpleGroup(schema);
      row.add("DocId", 1L);
      row.add("Name", "foo");
      row.add("Gender", "male");

      // Field index 0 is Backward, index 1 is Forward.
      Group links = row.addGroup("Links");
      links.add(0, 2L);
      links.add(1, 3L);

      writer.write(row);
      written++;
    }
  }

  return file;
}

Source File: TestUtil.java From flink with Apache License 2.0

5 votes

/**
 * Writes the given records to a new, randomly named Parquet file inside {@code folder}, using
 * a row group size of 10.
 *
 * @param folder  directory that receives the file
 * @param schema  Avro schema of the records
 * @param records records to write, in order
 * @return path of the written file
 * @throws IOException if the file cannot be written
 */
public static Path createTempParquetFile(File folder, Schema schema, List<IndexedRecord> records) throws IOException {
	Path path = new Path(folder.getPath(), UUID.randomUUID().toString());

	// try-with-resources closes the writer even if a write fails part-way through.
	try (ParquetWriter<IndexedRecord> writer = AvroParquetWriter.<IndexedRecord>builder(
		new org.apache.hadoop.fs.Path(path.toUri())).withSchema(schema).withRowGroupSize(10).build()) {
		for (IndexedRecord record : records) {
			writer.write(record);
		}
	}

	return path;
}

Source File: ParquetAvroWriters.java From flink with Apache License 2.0

5 votes

/**
 * Builds an Avro Parquet writer for {@code out} from a schema given as a string.
 *
 * @param schemaString Avro schema in its textual form
 * @param dataModel    Avro data model applied to the writer
 * @param out          destination of the Parquet data
 * @return the configured writer
 * @throws IOException if the writer cannot be created
 */
private static <T> ParquetWriter<T> createAvroParquetWriter(
		String schemaString,
		GenericData dataModel,
		OutputFile out) throws IOException {

	final Schema.Parser parser = new Schema.Parser();
	final Schema parsedSchema = parser.parse(schemaString);

	return AvroParquetWriter.<T>builder(out)
			.withSchema(parsedSchema)
			.withDataModel(dataModel)
			.build();
}

Source File: ParquetRowDataBuilder.java From flink with Apache License 2.0

5 votes

@Override
public ParquetWriter<RowData> createWriter(OutputFile out) throws IOException {
	Configuration conf = configuration.conf();
	return new ParquetRowDataBuilder(out, rowType, utcTimestamp)
			.withCompressionCodec(getParquetCompressionCodec(conf))
			.withRowGroupSize(getBlockSize(conf))
			.withPageSize(getPageSize(conf))
			.withDictionaryPageSize(getDictionaryPageSize(conf))
			.withMaxPaddingSize(conf.getInt(
					MAX_PADDING_BYTES, ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
			.withDictionaryEncoding(getEnableDictionary(conf))
			.withValidation(getValidation(conf))
			.withWriterVersion(getWriterVersion(conf))
			.withConf(conf).build();
}

Source File: AvroParquetConvertMapper.java From datacollector with Apache License 2.0

5 votes

@Override
protected void initializeWriter(
    Path tempFile,
    Schema avroSchema,
    Configuration conf,
    Context context
) throws IOException {
  ParquetWriter.Builder builder = AvroToParquetConverterUtil.initializeWriter(tempFile, avroSchema, conf);

  // Parquet writer
  parquetWriter = builder
      .withConf(context.getConfiguration())
      .build();
}

org.apache.parquet.hadoop.ParquetWriter Java Examples