org.apache.parquet.hadoop.ParquetWriter#write

Source File: IntegrationTestHelper.java From circus-train with Apache License 2.0

9 votes

/**
 * Writes one Avro record as a parquet file named {@code parquet0000} inside the
 * {@code hour=<hour>} partition folder below {@code tableUri}.
 *
 * <p>The record always receives {@code id}. When {@code fieldName} is non-null the
 * field is looked up in {@code schema}; a {@link Map} value is copied entry by entry
 * into a nested record built from that field's schema, any other non-null value is
 * stored as-is, and a null value leaves the field unset.
 *
 * @param tableUri  base location of the table
 * @param schema    Avro schema of the record to write
 * @param hour      value used for the {@code hour=} partition folder
 * @param id        value stored in the {@code id} field
 * @param fieldName optional name of an additional field to populate, may be null
 * @param data      value for {@code fieldName}, may be null
 * @return the URI of the partition folder
 * @throws IOException if writing or closing the parquet file fails
 */
URI createData(
    URI tableUri,
    Schema schema,
    String hour,
    int id,
    String fieldName,
    Object data) throws IOException {
  GenericData.Record row = new GenericData.Record(schema);
  row.put("id", id);

  if (fieldName != null) {
    // Resolved up front for every non-null fieldName, exactly like the lookup it replaces.
    Schema nestedSchema = schema.getField(fieldName).schema();
    if (data instanceof Map) {
      GenericData.Record nested = new GenericData.Record(nestedSchema);
      Map<String, String> entries = (Map<String, String>) data;
      entries.forEach((key, value) -> nested.put(key, value));
      row.put(fieldName, nested);
    } else if (data != null) {
      row.put(fieldName, data);
    }
  }

  URI partition = URI.create(tableUri + "/hour=" + hour);
  File partitionDir = new File(partition.getPath());
  partitionDir.mkdirs();
  File target = new File(partitionDir, "parquet0000");
  ParquetWriter<GenericData.Record> writer = AvroParquetWriter
      .<GenericData.Record>builder(new Path(target.toURI()))
      .withSchema(schema)
      .withConf(new Configuration())
      .build();

  try {
    writer.write(row);
  } finally {
    writer.close();
  }
  return partition;
}

Source File: TestHoodieAvroWriteSupport.java From hudi with Apache License 2.0

6 votes

/**
 * Writes 1000 random UUID record keys to a GZIP-compressed parquet file through a
 * {@link HoodieAvroWriteSupport} and passes every key to {@code writeSupport.add}.
 *
 * <p>There are no explicit assertions; the test succeeds when writing and closing
 * the file complete without an exception.
 */
@Test
public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }
  String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString();
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = BloomFilterFactory.createBloomFilter(
      1000, 0.0001, 10000,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter().convert(schema), schema, filter);
  // try-with-resources: the writer is closed even when write() or add() throws,
  // so a failing run does not leak the open file handle.
  try (ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE)) {
    for (String rowKey : rowKeys) {
      GenericRecord rec = new GenericData.Record(schema);
      rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
      writer.write(rec);
      writeSupport.add(rowKey);
    }
  }
}

Source File: TestParquetUtils.java From hudi with Apache License 2.0

6 votes

/**
 * Writes one record per row key to a GZIP-compressed parquet file and passes every
 * key to the {@link HoodieAvroWriteSupport} backing the writer.
 *
 * @param typeCode              bloom filter type code handed to {@code BloomFilterFactory}
 * @param filePath              destination of the parquet file
 * @param rowKeys               record keys to write, one record each
 * @param schema                Avro schema of the records
 * @param addPartitionPathField whether to also populate the partition path metadata field
 * @param partitionPath         value for the partition path field when it is populated
 * @throws Exception if writing or closing the file fails
 */
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema, boolean addPartitionPathField, String partitionPath) throws Exception {
  BloomFilter filter = BloomFilterFactory
      .createBloomFilter(1000, 0.0001, 10000, typeCode);
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  // try-with-resources: close the writer even if a write fails part-way through.
  try (ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE)) {
    for (String rowKey : rowKeys) {
      GenericRecord rec = new GenericData.Record(schema);
      rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
      if (addPartitionPathField) {
        rec.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath);
      }
      writer.write(rec);
      writeSupport.add(rowKey);
    }
  }
}

Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0

6 votes

/**
 * Creates an uncompressed parquet file with a single required {@code int32_field}
 * column holding the given values, one row per value.
 *
 * @param values  values to write, in order
 * @param version parquet writer version to use
 * @return the file that was written
 * @throws IOException if the temp file cannot be created or writing fails
 */
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  // createTempFile creates the file on disk; remove it so only the unique path is
  // reused (presumably because the writer will not overwrite an existing file - TODO confirm).
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  // try-with-resources: the writer is closed even when a write throws.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
    for (int value : values) {
      writer.write(f.newGroup()
              .append("int32_field", value));
    }
  }
  return file;
}

Source File: DictionaryFilterTest.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes {@code nElements} generated rows to {@code writer} and then closes it.
 *
 * <p>This method takes ownership of the writer: it is closed before returning,
 * whether or not all rows were written successfully.
 *
 * @param f      factory used to create one group per row
 * @param writer destination writer; closed by this method
 * @throws IOException if writing a row or closing the writer fails
 */
private static void writeData(SimpleGroupFactory f, ParquetWriter<Group> writer) throws IOException {
  try {
    for (int i = 0; i < nElements; i++) {
      int index = i % ALPHABET.length();

      Group group = f.newGroup()
          .append("binary_field", ALPHABET.substring(index, index+1))
          .append("single_value_field", "sharp")
          .append("fixed_field", DECIMAL_VALUES[i % DECIMAL_VALUES.length])
          .append("int32_field", intValues[i % intValues.length])
          .append("int64_field", longValues[i % longValues.length])
          .append("double_field", toDouble(intValues[i % intValues.length]))
          .append("float_field", toFloat(intValues[i % intValues.length]))
          .append("plain_int32_field", i)
          // First half of the rows reuse single letters, the second half gets random UUIDs.
          .append("fallback_binary_field", i < (nElements / 2) ?
              ALPHABET.substring(index, index+1) : UUID.randomUUID().toString())
          .append("int96_field", INT96_VALUES[i % INT96_VALUES.length]);

      // Leave the optional field unset whenever index is a multiple of 10,
      // so that some rows carry a null for it.
      if (index % 10 > 0) {
        group.append("optional_single_value_field", "sharp");
      }

      writer.write(group);
    }
  } finally {
    // Close on every path so a failed write does not leak the writer.
    writer.close();
  }
}

Source File: TestReadWrite.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Round-trips a record with a list nested inside a list ({@code l1} holding records
 * whose {@code l2} field is a list of strings) through Avro parquet write and read,
 * and checks that the inner value survives.
 */
@Test
public void testNestedLists() throws Exception {
  Schema schema = new Schema.Parser().parse(
    Resources.getResource("nested_array.avsc").openStream());
  Path file = new Path(createTempFile().getPath());

  Schema innerRecordSchema = schema.getField("l1").schema().getTypes()
    .get(1).getElementType().getTypes().get(1);

  GenericRecord record = new GenericRecordBuilder(schema)
    .set("l1", Collections.singletonList(
      new GenericRecordBuilder(innerRecordSchema).set("l2", Collections.singletonList("hello")).build()
    ))
    .build();

  // The record is built before the writer is opened, and the writer is scoped with
  // try-with-resources, so no failure path leaves it open.
  try (ParquetWriter parquetWriter = AvroParquetWriter.builder(file).withSchema(schema)
    .withConf(testConf)
    .build()) {
    parquetWriter.write(record);
  }

  // The reader is closed as well, including when an assertion fails.
  try (AvroParquetReader<GenericRecord> reader = new AvroParquetReader(testConf, file)) {
    GenericRecord nextRecord = reader.read();

    assertNotNull(nextRecord);
    assertNotNull(nextRecord.get("l1"));
    List l1List = (List) nextRecord.get("l1");
    assertNotNull(l1List.get(0));
    List l2List = (List) ((GenericRecord) l1List.get(0)).get("l2");
    assertEquals(str("hello"), l2List.get(0));
  }
}

Source File: TestUtil.java From flink with Apache License 2.0

5 votes

/**
 * Writes the given Avro records to a new, randomly named parquet file in {@code folder}.
 *
 * @param folder  directory that receives the file
 * @param schema  Avro schema of the records
 * @param records records to write, in order
 * @return path of the written file
 * @throws IOException if writing or closing the file fails
 */
public static Path createTempParquetFile(File folder, Schema schema, List<IndexedRecord> records) throws IOException {
	Path path = new Path(folder.getPath(), UUID.randomUUID().toString());

	// try-with-resources: the writer is closed even if writing a record fails.
	try (ParquetWriter<IndexedRecord> writer = AvroParquetWriter.<IndexedRecord>builder(
		new org.apache.hadoop.fs.Path(path.toUri())).withSchema(schema).withRowGroupSize(10).build()) {
		for (IndexedRecord record : records) {
			writer.write(record);
		}
	}

	return path;
}

Source File: ScroogeBinaryTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a Scrooge {@code StringAndBinary} with {@code ScroogeWriteSupport}, reads it
 * back with {@code ScroogeReadSupport}, and checks that both fields round-trip.
 */
@Test
@SuppressWarnings("unchecked")
public void testScroogeBinaryDecoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));

  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  // Only the unique path is needed; the file itself is removed before writing.
  temp.delete();

  Path path = new Path(temp.getPath());

  // try-with-resources: the writer is closed even when write() throws.
  try (ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class))) {
    writer.write(expected);
  }

  Configuration conf = new Configuration();
  conf.set("parquet.thrift.converter.class", ScroogeRecordConverter.class.getName());
  StringAndBinary record;
  // The reader is closed even when read() throws.
  try (ParquetReader<StringAndBinary> reader = ParquetReader.<StringAndBinary>
      builder(new ScroogeReadSupport(), path)
      .withConf(conf)
      .build()) {
    record = reader.read();
  }

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s());
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b());
}

Source File: ScroogeBinaryTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a Scrooge {@code StringAndBinary} with {@code ScroogeWriteSupport} and reads
 * it back through the parquet-thrift reader, checking that both fields round-trip.
 */
@Test
public void testScroogeBinaryEncoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));

  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  // Only the unique path is needed; the file itself is removed before writing.
  temp.delete();

  Path path = new Path(temp.getPath());

  // try-with-resources: the writer is closed even when write() throws.
  try (ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class))) {
    writer.write(expected);
  }

  // read using the parquet-thrift version to isolate the write path
  org.apache.parquet.thrift.test.binary.StringAndBinary record;
  // The reader is closed even when read() throws.
  try (ParquetReader<org.apache.parquet.thrift.test.binary.StringAndBinary> reader = ThriftParquetReader.<org.apache.parquet.thrift.test.binary.StringAndBinary>
      build(path)
      .withThriftClass(org.apache.parquet.thrift.test.binary.StringAndBinary.class)
      .build()) {
    record = reader.read();
  }

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s);
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b);
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes {@code recordCount} groups to {@code writer}, filling each column from the
 * generator registered at the same index in {@code randomGenerators}.
 *
 * <p>An OPTIONAL column is left out of a group when its generator asks for a null.
 * The writer is not closed here.
 *
 * @param writer destination for the generated groups
 * @throws IOException if writing a group fails
 */
@Override
public void write(ParquetWriter<Group> writer) throws IOException {
  for (int row = 0; row < recordCount; row++) {
    Group record = new SimpleGroup(super.schema);

    final int fieldCount = schema.getFieldCount();
    for (int col = 0; col < fieldCount; ++col) {
      Type fieldType = schema.getType(col);
      RandomValueGenerator<?> source = randomGenerators.get(col);
      // The generator is only consulted for OPTIONAL columns (short-circuit &&).
      boolean leaveNull = fieldType.isRepetition(OPTIONAL) && source.shouldGenerateNull();
      if (!leaveNull) {
        switch (fieldType.asPrimitiveType().getPrimitiveTypeName()) {
        case BINARY:
        case FIXED_LEN_BYTE_ARRAY:
        case INT96:
          record.append(fieldType.getName(), ((RandomBinaryBase<?>) source).nextBinaryValue());
          break;
        case INT32:
          record.append(fieldType.getName(), (Integer) source.nextValue());
          break;
        case INT64:
          record.append(fieldType.getName(), (Long) source.nextValue());
          break;
        case FLOAT:
          record.append(fieldType.getName(), (Float) source.nextValue());
          break;
        case DOUBLE:
          record.append(fieldType.getName(), (Double) source.nextValue());
          break;
        case BOOLEAN:
          record.append(fieldType.getName(), (Boolean) source.nextValue());
          break;
        default:
          // Any other primitive type is skipped without writing a value.
          break;
        }
      }
    }
    writer.write(record);
  }
}

Source File: TestFiltersWithMissingColumns.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates {@code test.parquet} in the temp folder before each test and stores its
 * location in {@code path}. The file holds 1000 rows with a required INT64
 * {@code id} (0..999) and a required UTF8 {@code data} column ("data-" + id).
 */
@Before
public void createDataFile() throws Exception {
  File target = temp.newFile("test.parquet");
  this.path = new Path(target.toString());

  MessageType messageType = Types.buildMessage()
      .required(INT64).named("id")
      .required(BINARY).as(UTF8).named("data")
      .named("test");

  SimpleGroupFactory groups = new SimpleGroupFactory(messageType);

  // OVERWRITE because temp.newFile has already created the file on disk.
  ParquetWriter<Group> out = ExampleParquetWriter.builder(path)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withType(messageType)
      .build();

  try {
    for (long id = 0; id < 1000; id++) {
      Group row = groups.newGroup();
      row.add(0, id);
      row.add(1, "data-" + id);
      out.write(row);
    }
  } finally {
    out.close();
  }
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a list of values to a parquet file as rows of a single required column
 * named {@code field}.
 *
 * <p>Dictionary encoding is switched on or off through {@code enableDictionary}, so
 * callers can target specific encodings for the given type and writer version.
 *
 * @param file             destination file
 * @param type             primitive type of the column; FIXED_LEN_BYTE_ARRAY uses {@code FIXED_LENGTH}
 * @param values           values to write; each must be castable to the Java type matching {@code type}
 * @param rowGroupSize     row group size passed to the writer
 * @param pageSize         page size passed to the writer
 * @param enableDictionary whether dictionary encoding is enabled
 * @param version          parquet writer version
 * @throws IOException              if writing or closing the file fails
 * @throws IllegalArgumentException if {@code type} is not handled and {@code values} is non-empty
 */
private void writeValuesToFile(Path file, PrimitiveTypeName type, List<?> values, int rowGroupSize, int pageSize, boolean enableDictionary, WriterVersion version) throws IOException {
  MessageType schema;
  if (type == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    schema = Types.buildMessage().required(type).length(FIXED_LENGTH).named("field").named("test");
  } else {
    schema = Types.buildMessage().required(type).named("field").named("test");
  }

  SimpleGroupFactory message = new SimpleGroupFactory(schema);
  // The builder gets no explicit type; the schema travels via the configuration.
  GroupWriteSupport.setSchema(schema, configuration);

  // try-with-resources: the writer is closed on every path, including the
  // IllegalArgumentException thrown below for an unhandled type.
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withCompressionCodec(compression)
      .withRowGroupSize(rowGroupSize)
      .withPageSize(pageSize)
      .withDictionaryPageSize(TEST_DICT_PAGE_SIZE)
      .withDictionaryEncoding(enableDictionary)
      .withWriterVersion(version)
      .withConf(configuration)
      .build()) {
    for (Object o : values) {
      switch (type) {
        case BOOLEAN:
          writer.write(message.newGroup().append("field", (Boolean) o));
          break;
        case INT32:
          writer.write(message.newGroup().append("field", (Integer) o));
          break;
        case INT64:
          writer.write(message.newGroup().append("field", (Long) o));
          break;
        case FLOAT:
          writer.write(message.newGroup().append("field", (Float) o));
          break;
        case DOUBLE:
          writer.write(message.newGroup().append("field", (Double) o));
          break;
        case INT96:
        case BINARY:
        case FIXED_LEN_BYTE_ARRAY:
          writer.write(message.newGroup().append("field", (Binary) o));
          break;
        default:
          throw new IllegalArgumentException("Unknown type name: " + type);
      }
    }
  }
}

Source File: DirectWriterTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a parquet file by issuing a single {@code write(null)} on a writer backed
 * by a {@code DirectWriteSupport} built from the given type, writer and metadata.
 *
 * @param type     message type of the file
 * @param writer   callback handed to {@code DirectWriteSupport}
 * @param metadata extra metadata handed to {@code DirectWriteSupport}
 * @return path of the written file
 * @throws IOException if the temp file cannot be created or writing fails
 */
protected Path writeDirect(MessageType type, DirectWriter writer,
                         Map<String, String> metadata) throws IOException {
  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  // Only the unique path is needed; the file itself is removed before writing.
  temp.delete();

  Path path = new Path(temp.getPath());

  // try-with-resources: the writer is closed even when write() throws.
  try (ParquetWriter<Void> parquetWriter = new ParquetWriter<Void>(
      path, new DirectWriteSupport(type, writer, metadata))) {
    parquetWriter.write(null);
  }

  return path;
}

Source File: PageChecksumDataGenerator.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Generates {@code nRows} rows of test data into {@code outFile} unless the file
 * already exists, in which case a message is printed and nothing is written.
 *
 * <p>Each row has a {@code long_field} (the row index), a random UUID string in
 * {@code binary_field}, and a nested {@code group} with four {@code int_field}
 * values drawn from a {@link Random} seeded with 42.
 *
 * @param outFile        destination file
 * @param nRows          number of rows to write
 * @param writeChecksums whether page write checksums are enabled
 * @param compression    compression codec for the file
 * @throws IOException if writing or closing the file fails
 */
public void generateData(Path outFile, int nRows, boolean writeChecksums,
                         CompressionCodecName compression) throws IOException {
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  GroupFactory groupFactory = new SimpleGroupFactory(SCHEMA);
  Random rand = new Random(42);

  // try-with-resources: the writer is closed even when a write throws.
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(outFile)
      .withConf(configuration)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withCompressionCodec(compression)
      .withDictionaryEncoding(true)
      .withType(SCHEMA)
      .withPageWriteChecksumEnabled(writeChecksums)
      .build()) {
    for (int i = 0; i < nRows; i++) {
      Group group = groupFactory.newGroup();
      group
        .append("long_field", (long) i)
        .append("binary_field", randomUUID().toString())
        .addGroup("group")
        // Force dictionary encoding by performing modulo
        // (keeps every value within -99..99).
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100);
      writer.write(group);
    }
  }
}

Source File: TestUtil.java From flink with Apache License 2.0

5 votes

/**
 * Writes the given Avro records to a new, randomly named parquet file in {@code folder}.
 *
 * @param folder  directory that receives the file
 * @param schema  Avro schema of the records
 * @param records records to write, in order
 * @return path of the written file
 * @throws IOException if writing or closing the file fails
 */
public static Path createTempParquetFile(File folder, Schema schema, List<IndexedRecord> records) throws IOException {
	Path path = new Path(folder.getPath(), UUID.randomUUID().toString());

	// try-with-resources: the writer is closed even if writing a record fails.
	try (ParquetWriter<IndexedRecord> writer = AvroParquetWriter.<IndexedRecord>builder(
		new org.apache.hadoop.fs.Path(path.toUri())).withSchema(schema).withRowGroupSize(10).build()) {
		for (IndexedRecord record : records) {
			writer.write(record);
		}
	}

	return path;
}

Source File: TestParquetInLining.java From hudi with Apache License 2.0

5 votes

/**
 * Writes records to a parquet file on the in-memory path, embeds the resulting bytes
 * in an outer file, and verifies that reading the embedded range through the inline
 * path yields the same records.
 */
@Test
public void testSimpleInlineFileSystem() throws IOException {
  Path outerInMemFSPath = getRandomOuterInMemPath();
  Path outerPath = new Path(FILE_SCHEME + outerInMemFSPath.toString().substring(outerInMemFSPath.toString().indexOf(':')));
  generatedPath = outerPath;
  List<GenericRecord> recordsToWrite = getParquetHoodieRecords();
  // try-with-resources: the writer is closed even when a write fails, and is
  // guaranteed closed before the written bytes are read back below.
  try (ParquetWriter inlineWriter = new AvroParquetWriter(outerInMemFSPath, HoodieTestDataGenerator.AVRO_SCHEMA,
      CompressionCodecName.GZIP, 100 * 1024 * 1024, 1024 * 1024, true, inMemoryConf)) {
    // write few records
    for (GenericRecord rec : recordsToWrite) {
      inlineWriter.write(rec);
    }
  }
  byte[] inlineBytes = getBytesToInline(outerInMemFSPath);
  long startOffset = generateOuterFile(outerPath, inlineBytes);

  long inlineLength = inlineBytes.length;

  // Generate phantom inline file
  Path inlinePath = getPhantomFile(outerPath, startOffset, inlineLength);

  // instantiate Parquet reader; closed even if reading or the assertion fails
  try (ParquetReader inLineReader = AvroParquetReader.builder(inlinePath).withConf(inlineConf).build()) {
    List<GenericRecord> records = readParquetGenericRecords(inLineReader);
    assertArrayEquals(recordsToWrite.toArray(), records.toArray());
  }
}

Source File: DataGenerator.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Generates {@code nRows} rows covering every primitive parquet type into
 * {@code outFile}, unless the file already exists, in which case a message is
 * printed and nothing is written.
 *
 * <p>Note that the generated schema is stored in {@code configuration} via
 * {@code GroupWriteSupport.setSchema}, so the caller's configuration is modified.
 *
 * @param outFile               destination file
 * @param configuration         configuration used for the existence check and the writer
 * @param version               parquet writer version
 * @param blockSize             block size passed to the writer
 * @param pageSize              page size passed to the writer
 * @param fixedLenByteArraySize length of the {@code flba_field} column
 * @param codec                 compression codec
 * @param nRows                 number of rows to write
 * @throws IOException if writing or closing the file fails
 */
public void generateData(Path outFile, Configuration configuration, ParquetProperties.WriterVersion version,
                         int blockSize, int pageSize, int fixedLenByteArraySize, CompressionCodecName codec, int nRows)
        throws IOException
{
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  System.out.println("Generating data @ " + outFile);

  MessageType schema = parseMessageType(
          "message test { "
                  + "required binary binary_field; "
                  + "required int32 int32_field; "
                  + "required int64 int64_field; "
                  + "required boolean boolean_field; "
                  + "required float float_field; "
                  + "required double double_field; "
                  + "required fixed_len_byte_array(" + fixedLenByteArraySize +") flba_field; "
                  + "required int96 int96_field; "
                  + "} ");

  GroupWriteSupport.setSchema(schema, configuration);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  //generate some data for the fixed len byte array field
  char[] chars = new char[fixedLenByteArraySize];
  Arrays.fill(chars, '*');

  // try-with-resources: the writer is closed even when a write throws.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(outFile, new GroupWriteSupport(), codec, blockSize,
                                                              pageSize, DICT_PAGE_SIZE, true, false, version, configuration)) {
    for (int i = 0; i < nRows; i++) {
      writer.write(
        f.newGroup()
          .append("binary_field", randomUUID().toString())
          .append("int32_field", i)
          .append("int64_field", 64L)
          .append("boolean_field", true)
          .append("float_field", 1.0f)
          .append("double_field", 2.0d)
          .append("flba_field", new String(chars))
          .append("int96_field", Binary.fromConstantByteArray(new byte[12]))
      );
    }
  }
}

Java Code Examples for org.apache.parquet.hadoop.ParquetWriter#write()