org.apache.parquet.example.data.Group Java Examples

Source File: TestColumnIndexes.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes a Parquet file filled with generated records into the given directory.
 *
 * <p>The number of records is drawn from a {@link Random} seeded with {@code seed}
 * (between 1 and {@code MAX_TOTAL_ROWS} inclusive); the same random source is then
 * used to build the value generators and each record.
 *
 * @param directory the directory the file is created in
 * @return the path of the written file
 * @throws IOException if the file cannot be written
 */
public Path write(Path directory) throws IOException {
  Path target = new Path(directory, "testColumnIndexes_" + this + ".parquet");

  Random rng = new Random(seed);
  int totalRows = rng.nextInt(MAX_TOTAL_ROWS) + 1;
  List<Supplier<?>> valueSuppliers = buildGenerators(totalRows, rng);

  Configuration configuration = new Configuration();
  ParquetOutputFormat.setColumnIndexTruncateLength(configuration, columnIndexTruncateLength);

  try (ParquetWriter<Group> out = ExampleParquetWriter.builder(target)
      .withType(SCHEMA)
      .withPageRowCountLimit(pageRowCountLimit)
      .withConf(configuration)
      .build()) {
    int remaining = totalRows;
    while (remaining > 0) {
      out.write(createGroup(valueSuppliers, rng));
      remaining--;
    }
  }
  return target;
}

Source File: ParquetResolver.java From pxf with Apache License 2.0

6 votes

/**
 * Builds a {@link OneRow} whose data is a Parquet group populated from the given fields.
 *
 * @param record list of {@link OneField}, one per column of the tuple description
 * @return the constructed {@link OneRow}, with a {@code null} key and the group as data
 * @throws IOException if constructing a row from the fields failed
 */
@Override
public OneRow setFields(List<OneField> record) throws IOException {
    validateSchema();
    Group row = groupFactory.newGroup();

    int position = 0;
    for (OneField current : record) {
        ColumnDescriptor descriptor = context.getTupleDescription().get(position);

        // Values coming from Greenplum for char (BPCHAR) columns are right
        // trimmed, matching Hive, which right trims char fields on write.
        // Hive's String and varchar types are not right trimmed, and Hive
        // does not trim tabs or newlines.
        boolean isBpchar = descriptor.getDataType() == DataType.BPCHAR;
        if (isBpchar && current.val instanceof String) {
            current.val = Utilities.rightTrimWhiteSpace((String) current.val);
        }

        fillGroup(position, current, row, schema.getType(position));
        position++;
    }

    return new OneRow(null, row);
}

Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Reads {@code numRecord} records from the given Parquet file and asserts that every
 * column that is not listed in {@code prunePaths} still carries its expected value.
 *
 * @param inputFile  path of the Parquet file to read
 * @param prunePaths column paths that were pruned and therefore must not be checked
 * @throws IOException if the file cannot be read
 */
private void validateColumns(String inputFile, List<String> prunePaths) throws IOException {
  // try-with-resources: the reader is closed even when a read or an assertion fails.
  try (ParquetReader<Group> reader =
      ParquetReader.builder(new GroupReadSupport(), new Path(inputFile)).withConf(conf).build()) {
    for (int i = 0; i < numRecord; i++) {
      Group group = reader.read();
      if (!prunePaths.contains("DocId")) {
        assertEquals(1L, group.getLong("DocId", 0));
      }
      if (!prunePaths.contains("Name")) {
        assertEquals("foo", group.getBinary("Name", 0).toStringUsingUTF8());
      }
      if (!prunePaths.contains("Gender")) {
        assertEquals("male", group.getBinary("Gender", 0).toStringUsingUTF8());
      }
      // The nested fields can only be inspected when the whole "Links" group was kept.
      if (!prunePaths.contains("Links")) {
        Group subGroup = group.getGroup("Links", 0);
        if (!prunePaths.contains("Links.Backward")) {
          assertEquals(2L, subGroup.getLong("Backward", 0));
        }
        if (!prunePaths.contains("Links.Forward")) {
          assertEquals(3L, subGroup.getLong("Forward", 0));
        }
      }
    }
  }
}

Source File: ParquetResolverTest.java From pxf with Apache License 2.0

6 votes

/**
 * Reads every record of a Parquet file found on the classpath under {@code parquet/}
 * and asserts that the number of records read equals {@code expectedSize}.
 *
 * @param file         name of the resource below {@code parquet/}
 * @param expectedSize number of records the file is expected to contain
 * @param schema       schema used to materialize the records
 * @return all records of the file, in read order
 * @throws IOException if the file cannot be read
 */
@SuppressWarnings("deprecation")
private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException {
    List<Group> result = new ArrayList<>();
    String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath();
    Path path = new Path(parquetFile);

    // try-with-resources: the file reader is closed even when reading a row group fails.
    try (ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER)) {
        // The column IO depends only on the schema, so it is built once for all row groups.
        MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
        PageReadStore rowGroup;
        while ((rowGroup = fileReader.readNextRowGroup()) != null) {
            RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
            long rowCount = rowGroup.getRowCount();
            for (long i = 0; i < rowCount; i++) {
                result.add(recordReader.read());
            }
        }
    }
    assertEquals(expectedSize, result.size());
    return result;
}

Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0

6 votes

/**
 * Creates a Parquet file with a single required {@code int32_field} column and writes
 * one record per element of {@code values}.
 *
 * @param values  the values to write, one record each
 * @param version the Parquet writer version to use
 * @return the written file
 * @throws IOException if the temporary file cannot be created or written
 */
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  // Only the unique name is needed; the file itself is removed before the writer runs.
  // NOTE(review): presumably because the writer will not overwrite an existing file — confirm.
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  // try-with-resources: the writer is closed even when writing a record fails.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
    for (int value : values) {
      writer.write(f.newGroup()
              .append("int32_field", value));
    }
  }
  return file;
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes a Parquet file as described by the given context, runs the context's
 * verification, and finally deletes the context's file.
 *
 * @param context supplies the schema, writer settings, the data to write and the checks to run
 * @throws IOException if writing or verifying fails
 */
public static void writeAndTest(WriteContext context) throws IOException {
  // Create the configuration, and then apply the schema to our configuration.
  Configuration configuration = new Configuration();
  GroupWriteSupport.setSchema(context.schema, configuration);
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();

  // Create the writer properties
  final int blockSize = context.blockSize;
  final int pageSize = context.pageSize;
  // The dictionary page size deliberately mirrors the data page size.
  final int dictionaryPageSize = pageSize;
  final boolean enableDictionary = context.enableDictionary;
  final boolean enableValidation = context.enableValidation;
  ParquetProperties.WriterVersion writerVersion = context.version;
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  // try-with-resources: the writer is closed even when context.write(...) throws,
  // and it is always closed before the file is verified.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(context.fsPath,
      groupWriteSupport, codec, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, enableValidation, writerVersion, configuration)) {
    context.write(writer);
  }

  context.test();

  context.path.delete();
}

Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0

6 votes

/**
 * Reads all groups from the given Parquet file and converts each one into a
 * {@link TestRecord} built from its partition, sequence and payload fields.
 *
 * <p>A failure while closing the reader is only printed to standard output and does
 * not fail the call.
 *
 * @param outputFile the Parquet file to read
 * @return the converted records, in file order
 * @throws IOException if reading the file fails
 */
protected List<TestRecord> readParquetFilesGroup(File outputFile)
    throws IOException {
  List<Group> groups = new ArrayList<>();
  ParquetReader<Group> parquetReader = null;
  try {
    parquetReader = new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport());
    Group current;
    while ((current = parquetReader.read()) != null) {
      groups.add(current);
    }
  } finally {
    if (parquetReader != null) {
      try {
        parquetReader.close();
      } catch (Exception closeFailure) {
        // Best effort: report the close failure but keep the records already read.
        System.out.println(closeFailure.getMessage());
      }
    }
  }

  return groups.stream()
      .map(group -> {
        return new TestRecord(
            group.getInteger(TestConstants.PARTITION_FIELD_NAME, 0),
            group.getLong(TestConstants.SEQUENCE_FIELD_NAME, 0),
            group.getString(TestConstants.PAYLOAD_FIELD_NAME, 0));
      })
      .collect(Collectors.toList());
}

Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Benchmark setup: creates a temporary Parquet file and fills it with one record per
 * generated value. Field 0 holds the value; fields 1 to 5 hold generated dummy strings.
 *
 * @throws IOException if the temporary file cannot be created or written
 */
@Setup
public void writeFile() throws IOException {
  WriteConfigurator configurator = getWriteConfigurator();
  String prefix = "benchmark-filtering_" + characteristic + '_' + configurator + '_';
  file = new Path(Files.createTempFile(prefix, ".parquet").toAbsolutePath().toString());

  long[] values = generateData();
  characteristic.arrangeData(values);

  try (ParquetWriter<Group> out = configurator.configureBuilder(ExampleParquetWriter.builder(file)
      .config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString())
      .withRowGroupSize(Integer.MAX_VALUE) // Ensure to have one row-group per file only
      .withWriteMode(OVERWRITE))
      .build()) {
    for (int row = 0; row < values.length; row++) {
      Group record = new SimpleGroup(SCHEMA);
      record.add(0, values[row]);
      // Columns 1..5 are filled with dummy strings, in column order.
      for (int column = 1; column <= 5; column++) {
        record.add(column, Binary.fromString(dummyGenerator.nextString()));
      }
      out.write(record);
    }
  }
}

Source File: PageChecksumReadBenchmarks.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Reads {@code nRows} records from the file and feeds every read field value to the
 * blackhole so that the reads cannot be optimized away.
 *
 * @param file            the Parquet file to read
 * @param nRows           number of records to read
 * @param verifyChecksums whether page checksum verification is enabled on the reader
 * @param blackhole       sink consuming the values that were read
 * @throws IOException if reading fails
 */
private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole blackhole)
  throws IOException {
  try (ParquetReader<Group> parquetReader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(configuration)
      .usePageChecksumVerification(verifyChecksums)
      .build()) {
    int row = 0;
    while (row < nRows) {
      Group record = parquetReader.read();
      blackhole.consume(record.getLong("long_field", 0));
      blackhole.consume(record.getBinary("binary_field", 0));

      // The nested group carries four repetitions of int_field.
      Group nested = record.getGroup("group", 0);
      for (int repetition = 0; repetition < 4; repetition++) {
        blackhole.consume(nested.getInteger("int_field", repetition));
      }
      row++;
    }
  }
}

Source File: TestColumnIO.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Checks the given groups against a table of expected values, row by row.
 *
 * <p>A {@code null} expected value means the field must have no repetitions; any other
 * value means the field must have exactly one repetition whose integer value equals it.
 *
 * @param groups1 the groups to validate, in the order of the rows of {@code e1}
 * @param e1      expected values, indexed by row and then by field
 */
private void validateGroups(List<Group> groups1, Object[][] e1) {
  Iterator<Group> actualRows = groups1.iterator();
  int row = 0;
  for (Object[] expectedRow : e1) {
    Group actual = actualRows.next();
    for (int field = 0; field < expectedRow.length; field++) {
      Object expected = expectedRow[field];
      if (expected != null) {
        assertEquals("looking for r[" + row + "][" + field + "][0]=" + expected, 1, actual.getFieldRepetitionCount(field));
        assertEquals(expected, actual.getInteger(field, 0));
      } else {
        assertEquals(0, actual.getFieldRepetitionCount(field));
      }
    }
    row++;
  }
}

Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes an address book with a single person to a file, reads it back as groups and
 * checks that exactly one record with the expected person count and email is found.
 */
@Test
public void testWriteFile() throws IOException, InterruptedException, TException {
  final AddressBook a = new AddressBook(
      Arrays.asList(
          new Person(
              new Name("Bob", "Roberts"),
              0,
              "[email protected]",
              Arrays.asList(new PhoneNumber("1234567890")))));

  final Path fileToCreate = createFile(a);

  int i = 0;
  // try-with-resources: the reader is closed even when an assertion fails.
  try (ParquetReader<Group> reader = createRecordReader(fileToCreate)) {
    Group g = null;
    while ((g = reader.read()) != null) {
      // just some sanity check, we're testing the various layers somewhere else
      assertEquals(a.persons.size(), g.getFieldRepetitionCount("persons"));
      assertEquals(a.persons.get(0).email, g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
      ++i;
    }
  }
  assertEquals("read 1 record", 1, i);

}

Source File: TestColumnIO.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Runs the schema round-trip check on a single record that sets each of the fields
 * "a" to "h" of the {@code oneOfEach} schema once.
 */
@Test
public void testOneOfEach() {
  MessageType schemaUnderTest = MessageTypeParser.parseMessageType(oneOfEach);
  GroupFactory factory = new SimpleGroupFactory(schemaUnderTest);

  Group record = factory.newGroup();
  record.append("a", 1L);
  record.append("b", 2);
  record.append("c", 3.0f);
  record.append("d", 4.0d);
  record.append("e", true);
  record.append("f", Binary.fromString("6"));
  record.append("g", new NanoTime(1234, System.currentTimeMillis() * 1000));
  record.append("h", Binary.fromString("abc"));

  testSchema(schemaUnderTest, Arrays.asList(record));
}

Source File: DictionaryFilterTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds a GZIP-compressed, dictionary-encoded Parquet writer for the given file and
 * writer version, then hands it to {@code writeData} together with a group factory.
 *
 * @param version the Parquet writer version to use
 * @param file    the file to write
 * @throws IOException if the writer cannot be created or writing fails
 */
private static void prepareFile(WriterVersion version, Path file) throws IOException {
  GroupWriteSupport.setSchema(schema, conf);

  ExampleParquetWriter.Builder writerBuilder = ExampleParquetWriter.builder(file)
      .withWriterVersion(version)
      .withCompressionCodec(GZIP)
      .withRowGroupSize(1024*1024)
      .withPageSize(1024)
      .enableDictionaryEncoding()
      .withDictionaryPageSize(2*1024)
      .withConf(conf);

  SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
  ParquetWriter<Group> parquetWriter = writerBuilder.build();
  // NOTE(review): the writer is not closed here; presumably writeData closes it — confirm.
  writeData(groupFactory, parquetWriter);
}

Source File: TestColumnIndexes.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates one record by asking each column's generator for a value.
 *
 * <p>Columns without a generator are left unset, and an optional column is left unset
 * with a probability of 1 in 50.
 *
 * @param generators one value supplier per schema column; {@code null} entries are skipped
 * @param random     random source deciding whether an optional column is left unset
 * @return the populated group
 */
private Group createGroup(List<Supplier<?>> generators, Random random) {
  Group record = FACTORY.newGroup();
  int fieldCount = SCHEMA.getFieldCount();
  for (int index = 0; index < fieldCount; ++index) {
    Type columnType = SCHEMA.getType(index);
    Supplier<?> supplier = generators.get(index);
    if (supplier == null) {
      continue;
    }
    // 2% chance of null value for an optional column
    if (columnType.isRepetition(OPTIONAL) && random.nextInt(50) == 0) {
      continue;
    }

    String columnName = columnType.getName();
    switch (columnType.asPrimitiveType().getPrimitiveTypeName()) {
    case INT32:
      record.append(columnName, (Integer) supplier.get());
      break;
    case INT64:
      record.append(columnName, (Long) supplier.get());
      break;
    case FLOAT:
      record.append(columnName, (Float) supplier.get());
      break;
    case DOUBLE:
      record.append(columnName, (Double) supplier.get());
      break;
    case BOOLEAN:
      record.append(columnName, (Boolean) supplier.get());
      break;
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
    case INT96:
      // All three physical types are supplied as Binary values.
      record.append(columnName, (Binary) supplier.get());
      break;
    }
  }
  return record;
}

Source File: TestFiltered.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Filters the written test records on {@code Name.Url} with three different URLs and
 * checks which records come back for each filter.
 */
@Test
public void testFilterOnString() {
  MessageColumnIO messageIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore pages = writeTestRecords(messageIO, 1);
  RecordMaterializer<Group> materializer = new GroupRecordConverter(schema);

  // Matching the A url must yield record 1
  RecordReaderImplementation<Group> readerForA = (RecordReaderImplementation<Group>)
      messageIO.getRecordReader(pages, materializer,
          FilterCompat.get(column("Name.Url", equalTo("http://A"))));
  readOne(readerForA, "r2 filtered out", r1);

  // Matching the B url in record 1 must yield nothing, because only the first
  // instance of the column is compared
  RecordReaderImplementation<Group> readerForB = (RecordReaderImplementation<Group>)
      messageIO.getRecordReader(pages, materializer,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));
  List<Group> matchesForB = readAll(readerForB);
  assertEquals("There should be no matching records: " + matchesForB , 0, matchesForB.size());

  // Matching the C url must yield record 2
  RecordReaderImplementation<Group> readerForC = (RecordReaderImplementation<Group>)
      messageIO.getRecordReader(pages, materializer,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));
  readOne(readerForC, "r1 filtered out", r2);

}

Source File: TestRecordLevelFilters.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads the phone book with a predicate on {@code name} that matches no record and
 * expects an empty result.
 */
@Test
public void testAllFilter() throws Exception {
  FilterPredicate matchesNothing = eq(binaryColumn("name"), Binary.fromString("no matches"));

  List<Group> matches = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(matchesNothing));

  assertEquals(new ArrayList<Group>(), matches);
}

Source File: TestFiltered.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Filters the written test records on {@code Name.Url}, first with a function-based
 * string predicate and then with two equality filters, and checks which records come
 * back for each filter.
 */
@Test
public void testApplyFunctionFilterOnString() {
  MessageColumnIO messageIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore pages = writeTestRecords(messageIO, 1);
  RecordMaterializer<Group> materializer = new GroupRecordConverter(schema);

  // The function-based predicate must yield record 1
  RecordReaderImplementation<Group> readerForFunction = (RecordReaderImplementation<Group>)
      messageIO.getRecordReader(pages, materializer,
          FilterCompat.get(column("Name.Url", applyFunctionToString(new StringEndsWithAPredicate()))));
  readOne(readerForFunction, "r2 filtered out", r1);

  // Matching the B url in record 1 must yield nothing, because only the first
  // instance of the column is compared
  RecordReaderImplementation<Group> readerForB = (RecordReaderImplementation<Group>)
      messageIO.getRecordReader(pages, materializer,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));
  List<Group> matchesForB = readAll(readerForB);
  assertEquals("There should be no matching records: " + matchesForB , 0, matchesForB.size());

  // Matching the C url must yield record 2
  RecordReaderImplementation<Group> readerForC = (RecordReaderImplementation<Group>)
      messageIO.getRecordReader(pages, materializer,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));
  readOne(readerForC, "r1 filtered out", r2);

}

Source File: TestMultipleWriteRead.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads the file record by record and asserts that the string form of each record
 * equals the string form of the corresponding expected group.
 *
 * @param file the Parquet file to read
 * @param data the expected groups, in file order
 * @throws IOException if reading fails
 */
private void validateFile(Path file, List<Group> data) throws IOException {
  try (ParquetReader<Group> parquetReader = ParquetReader.builder(new GroupReadSupport(), file).build()) {
    for (Group expectedGroup : data) {
      String expectedText = expectedGroup.toString();
      String actualText = parquetReader.read().toString();
      assertEquals(expectedText, actualText);
    }
  }
}

Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads {@code numRecord} records from the given Parquet file and asserts that each
 * one matches the corresponding entry of {@code testDocs}.
 *
 * @param file      path of the Parquet file to read
 * @param numRecord number of records to read and compare
 * @param testDocs  the expected column values, indexed by record
 * @throws IOException if the file cannot be read
 */
private void validateColumns(String file, int numRecord, TestDocs testDocs) throws IOException {
  // try-with-resources: the reader is closed even when a read or an assertion fails.
  try (ParquetReader<Group> reader =
      ParquetReader.builder(new GroupReadSupport(), new Path(file)).withConf(conf).build()) {
    for (int i = 0; i < numRecord; i++) {
      Group group = reader.read();
      assertTrue(group.getLong("DocId", 0) == testDocs.docId[i]);
      // Expected value first, so that failure messages label the two sides correctly.
      assertArrayEquals(testDocs.name[i].getBytes(), group.getBinary("Name", 0).getBytes());
      assertArrayEquals(testDocs.gender[i].getBytes(), group.getBinary("Gender", 0).getBytes());
      Group subGroup = group.getGroup("Links", 0);
      assertArrayEquals(testDocs.linkBackward[i].getBytes(), subGroup.getBinary("Backward", 0).getBytes());
      assertArrayEquals(testDocs.linkForward[i].getBytes(), subGroup.getBinary("Forward", 0).getBytes());
    }
  }
}

Source File: TestParquetWriterAppendBlocks.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Appends two files into one combined file and checks that reading the combined file
 * yields the records of the first file followed by the records of the second.
 */
@Test
public void testBasicBehavior() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  // try-with-resources: the reader is closed even when an assertion fails.
  try (ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), combinedFile)
      .build()) {
    Group next;
    while ((next = reader.read()) != null) {
      Group expectedNext = expected.removeFirst();
      // check each value; equals is not supported for simple records
      Assert.assertEquals("Each id should match",
          expectedNext.getInteger("id", 0), next.getInteger("id", 0));
      Assert.assertEquals("Each string should match",
          expectedNext.getString("string", 0), next.getString("string", 0));
    }
  }

  Assert.assertEquals("All records should be present", 0, expected.size());
}

Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads a single record through a reader that filters {@code BaseContext.COLUMN} for
 * equality with a random long, and hands the result to the blackhole.
 *
 * @param blackhole sink consuming the record that was read
 * @param context   supplies the random source and the reader builder
 * @throws Exception if building the reader or reading fails
 */
private void benchmark(Blackhole blackhole, BaseContext context) throws Exception {
  long wanted = context.getRandom().nextLong();
  FilterPredicate equalsWanted = FilterApi.eq(BaseContext.COLUMN, wanted);
  try (ParquetReader<Group> filteredReader = context.createReaderBuilder()
      .withFilter(FilterCompat.get(equalsWanted))
      .build()) {
    Group firstMatch = filteredReader.read();
    blackhole.consume(firstMatch);
  }
}

Source File: ParquetResolverTest.java From pxf with Apache License 2.0

5 votes

/**
 * Resolves the three rows of {@code proto-repeated-string.parquet}, whose schema is a
 * single repeated UTF8 column, and expects each row to come back as one TEXT field
 * holding the values as a JSON-style array string.
 */
@Test
public void testGetFields_Primitive_RepeatedString() throws IOException {
    List<Type> columns = new ArrayList<>();
    columns.add(new PrimitiveType(Type.Repetition.REPEATED, PrimitiveTypeName.BINARY, "myString", OriginalType.UTF8));
    schema = new MessageType("TestProtobuf.StringArray", columns);
    context.setMetadata(schema);
    context.setTupleDescription(getColumnDescriptorsFromSchema(schema));
    resolver.initialize(context);

    List<Group> groups = readParquetFile("proto-repeated-string.parquet", 3, schema);

    // Expected value of the single field, indexed by row.
    String[] expectedValues = {
        "[\"hello\",\"world\"]",
        "[\"good\",\"bye\"]",
        "[\"one\",\"two\",\"three\"]"
    };

    for (int row = 0; row < expectedValues.length; row++) {
        List<OneField> fields = assertRow(groups, row, 1);
        assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
        assertEquals(expectedValues[row], fields.get(0).val);
    }

}

Source File: TestTupleRecordConsumer.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds a group with a map field "a" holding two entries ("foo" and "bar"), each
 * with a value tuple whose field "b" repeats the key, and runs the from-groups check
 * against the matching Pig schema.
 */
@Test
public void testMaps() throws ExecException, ParserException {
  String pigSchemaString = "a: [(b: chararray)]";
  SimpleGroup root = new SimpleGroup(getMessageType(pigSchemaString));
  Group mapField = root.addGroup("a");

  Group fooEntry = mapField.addGroup("map");
  fooEntry.append("key", "foo").addGroup("value").append("b", "foo");

  Group barEntry = mapField.addGroup("map");
  barEntry.append("key", "bar").addGroup("value").append("b", "bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(root));
}

Source File: PageChecksumDataGenerator.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Generates a Parquet file with {@code nRows} records unless the file already exists,
 * in which case a message is printed and nothing is written.
 *
 * @param outFile        the file to create
 * @param nRows          number of records to write
 * @param writeChecksums whether page checksums are written
 * @param compression    compression codec for the file
 * @throws IOException if the file cannot be written
 */
public void generateData(Path outFile, int nRows, boolean writeChecksums,
                         CompressionCodecName compression) throws IOException {
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  // try-with-resources: the writer is closed even when writing a record fails.
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(outFile)
    .withConf(configuration)
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .withCompressionCodec(compression)
    .withDictionaryEncoding(true)
    .withType(SCHEMA)
    .withPageWriteChecksumEnabled(writeChecksums)
    .build()) {

    GroupFactory groupFactory = new SimpleGroupFactory(SCHEMA);
    // Fixed seed so that the generated int_field values are the same on every run.
    Random rand = new Random(42);
    for (int i = 0; i < nRows; i++) {
      Group group = groupFactory.newGroup();
      group
        .append("long_field", (long) i)
        .append("binary_field", randomUUID().toString())
        .addGroup("group")
        // Force dictionary encoding by performing modulo
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100);
      writer.write(group);
    }
  }
}

Source File: TestSimpleRecordConverter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes the test Parquet file with a single record that sets the int32, int64,
 * float, double, binary and fixed-length byte array fields.
 *
 * @throws IOException if the file cannot be written
 */
private void createTestParquetFile() throws IOException {
  Path target = new Path(testFile().getPath());
  Configuration configuration = new Configuration();

  MessageType messageType = createSchema();
  SimpleGroupFactory groupFactory = new SimpleGroupFactory(messageType);
  GroupWriteSupport.setSchema(messageType, configuration);

  try (
    ParquetWriter<Group> out = new ParquetWriter<>(
      target,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      configuration)) {
    byte[] fixedBytes = new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };

    Group record = groupFactory.newGroup();
    record.append(INT32_FIELD, 32);
    record.append(INT64_FIELD, 64L);
    record.append(FLOAT_FIELD, 1.0f);
    record.append(DOUBLE_FIELD, 2.0d);
    record.append(BINARY_FIELD, Binary.fromString("foobar"));
    record.append(FIXED_LEN_BYTE_ARRAY_FIELD, Binary.fromConstantByteArray(fixedBytes));

    out.write(record);
  }
}

Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a temporary Parquet file containing {@code numRecord} identical records with
 * the fields DocId, Name, Gender and an optional Links group (Backward, Forward).
 *
 * @param prefix prefix passed to {@code createTempFile}
 * @return the path of the written file
 * @throws IOException if the file cannot be created or written
 */
private String createParquetFile(String prefix) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  // Parameterized writer type instead of the raw ParquetWriter, so writes are type-checked.
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", 1L);
      g.add("Name", "foo");
      g.add("Gender", "male");
      Group links = g.addGroup("Links");
      // Index 0 is "Backward" and index 1 is "Forward", per the schema above.
      links.add(0, 2L);
      links.add(1, 3L);
      writer.write(g);
    }
  }

  return file;
}

Source File: ParquetFileAccessor.java From pxf with Apache License 2.0

5 votes

/**
 * Reads the next record and adds the time spent in the read to the running total.
 *
 * @return a row wrapping the record that was read, or null when the reader returns
 *         no more records
 * @throws IOException if unable to read
 */
@Override
public OneRow readNextObject() throws IOException {
    final long start = System.nanoTime();
    Group record = fileReader.read();
    totalReadTimeInNanos += System.nanoTime() - start;

    if (record == null) {
        return null;
    }

    rowsRead++;
    return new OneRow(null, record);
}

Source File: TestZstandardCodec.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Converts one input line into a group with the key as the int field "line" and the
 * text as the field "content", and emits it with a null key.
 *
 * @param key             the input key, narrowed to an int for the "line" field
 * @param value           the line text
 * @param outputCollector receives the produced group
 * @param reporter        not used by this method
 * @throws IOException if collecting the output fails
 */
@Override
public void map(LongWritable key, Text value, OutputCollector<Void, Group> outputCollector, Reporter reporter) throws IOException {
  int lineNumber = (int) key.get();
  String content = value.toString();

  Group record = factory.newGroup();
  record.append("line", lineNumber);
  record.append("content", content);

  outputCollector.collect(null, record);
}

Source File: TestConstants.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Converts a test record into a Parquet group by copying its payload, sequence and
 * partition into the fields of the same names.
 *
 * @param record the record to convert
 * @return a new group based on {@code PARQUET_SCHEMA} holding the record's values
 */
@Override
public Group convertToParquetGroup(TestRecord record) {
  Group parquetGroup = new SimpleGroup(PARQUET_SCHEMA);

  parquetGroup.add(PAYLOAD_FIELD_NAME, record.getPayload());
  parquetGroup.add(SEQUENCE_FIELD_NAME, Long.valueOf(record.getSequence()));
  parquetGroup.add(PARTITION_FIELD_NAME, record.getPartition());

  return parquetGroup;
}

Source File: ParquetFileTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes the test Parquet file with ten records. Numeric fields grow with the record
 * index, the binary field cycles through {@code COLORS}, and the fixed-length field
 * holds 12 random bytes.
 *
 * @throws IOException if the file cannot be written
 */
private void createTestParquetFile() throws IOException {
  File target = parquetFile();
  Path targetPath = new Path(target.getPath());
  Configuration configuration = new Configuration();

  MessageType messageType = createSchema();
  SimpleGroupFactory groupFactory = new SimpleGroupFactory(messageType);
  GroupWriteSupport.setSchema(messageType, configuration);

  try (
    ParquetWriter<Group> out = new ParquetWriter<>(
      targetPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      configuration)) {
    for (int row = 0; row < 10; row++) {
      final byte[] randomBytes = new byte[12];
      ThreadLocalRandom.current().nextBytes(randomBytes);

      Group record = groupFactory.newGroup();
      record.append(INT32_FIELD, 32 + row);
      record.append(INT64_FIELD, 64L + row);
      record.append(FLOAT_FIELD, 1.0f + row);
      record.append(DOUBLE_FIELD, 2.0d + row);
      record.append(BINARY_FIELD, Binary.fromString(COLORS[row % COLORS.length]));
      record.append(FIXED_LEN_BYTE_ARRAY_FIELD, Binary.fromConstantByteArray(randomBytes));

      out.write(record);
    }
  }
}

org.apache.parquet.example.data.Group Java Examples