org.apache.parquet.hadoop.example.GroupReadSupport Java Examples

Source File: TestFiltersWithMissingColumns.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Counts how many records a filtered reader yields for the given file.
 *
 * @param path the Parquet file to read
 * @param pred the predicate handed to the reader as its filter
 * @return the number of non-null records returned by the reader
 * @throws IOException if building, reading from or closing the reader fails
 */
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException{
  ParquetReader<Group> filteredReader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();

  long matched = 0;
  try {
    // read() returning null marks the end of the data
    for (Group record = filteredReader.read(); record != null; record = filteredReader.read()) {
      matched++;
    }
  } finally {
    filteredReader.close();
  }
  return matched;
}

Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Reads {@code numRecord} records from {@code inputFile} and checks that every
 * column that is not listed in {@code prunePaths} still carries the expected value.
 *
 * @param inputFile  path of the Parquet file to validate
 * @param prunePaths dotted column paths that were pruned and must therefore not be read
 * @throws IOException if the file cannot be opened or read
 */
private void validateColumns(String inputFile, List<String> prunePaths) throws IOException {
  // try-with-resources: the reader is released even when an assertion fails mid-loop
  try (ParquetReader<Group> reader =
      ParquetReader.builder(new GroupReadSupport(), new Path(inputFile)).withConf(conf).build()) {
    for (int i = 0; i < numRecord; i++) {
      Group group = reader.read();
      if (!prunePaths.contains("DocId")) {
        assertEquals(1L, group.getLong("DocId", 0));
      }
      if (!prunePaths.contains("Name")) {
        assertEquals("foo", group.getBinary("Name", 0).toStringUsingUTF8());
      }
      if (!prunePaths.contains("Gender")) {
        assertEquals("male", group.getBinary("Gender", 0).toStringUsingUTF8());
      }
      // The nested group is only touched when the whole "Links" group was kept
      if (!prunePaths.contains("Links")) {
        Group subGroup = group.getGroup("Links", 0);
        if (!prunePaths.contains("Links.Backward")) {
          assertEquals(2L, subGroup.getLong("Backward", 0));
        }
        if (!prunePaths.contains("Links.Forward")) {
          assertEquals(3L, subGroup.getLong("Forward", 0));
        }
      }
    }
  }
}

Source File: PageChecksumReadBenchmarks.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Reads {@code nRows} records from {@code file} and feeds every field to the
 * blackhole so the reads cannot be optimized away.
 *
 * @param file            the Parquet file to read
 * @param nRows           number of records to read
 * @param verifyChecksums passed to the builder's page checksum verification switch
 * @param blackhole       sink consuming each value that is read
 * @throws IOException if the file cannot be opened or read
 */
private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole blackhole)
  throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(configuration)
      .usePageChecksumVerification(verifyChecksums)
      .build()) {
    for (int row = 0; row < nRows; row++) {
      Group record = reader.read();
      blackhole.consume(record.getLong("long_field", 0));
      blackhole.consume(record.getBinary("binary_field", 0));
      Group nested = record.getGroup("group", 0);
      // the nested int_field is consumed at indexes 0 through 3
      for (int idx = 0; idx < 4; idx++) {
        blackhole.consume(nested.getInteger("int_field", idx));
      }
    }
  }
}

Source File: ReadBenchmarks.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Reads {@code nRows} records from {@code parquetFile} and passes one value of
 * every field to the blackhole.
 *
 * @param parquetFile the Parquet file to read
 * @param nRows       number of records to read
 * @param blackhole   sink consuming each value that is read
 * @throws IOException if the file cannot be opened or read
 */
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException
{
  // try-with-resources: the reader is closed even if a read throws part-way through
  try (ParquetReader<Group> reader =
      ParquetReader.builder(new GroupReadSupport(), parquetFile).withConf(configuration).build()) {
    for (int i = 0; i < nRows; i++) {
      Group group = reader.read();
      blackhole.consume(group.getBinary("binary_field", 0));
      blackhole.consume(group.getInteger("int32_field", 0));
      blackhole.consume(group.getLong("int64_field", 0));
      blackhole.consume(group.getBoolean("boolean_field", 0));
      blackhole.consume(group.getFloat("float_field", 0));
      blackhole.consume(group.getDouble("double_field", 0));
      blackhole.consume(group.getBinary("flba_field", 0));
      blackhole.consume(group.getInt96("int96_field", 0));
    }
  }
}

Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds a {@link Group} reader for the given file, initializing the read
 * support with the schema stored in the file's footer.
 *
 * @param parquetFilePath the Parquet file to open
 * @return a reader over the file's records
 * @throws IOException if the footer cannot be read or the reader cannot be created
 */
private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
  final Configuration hadoopConf = new Configuration(true);
  final GroupReadSupport support = new GroupReadSupport();

  // The schema handed to init() comes straight from the file footer
  final MessageType fileSchema = ParquetFileReader
      .readFooter(hadoopConf, parquetFilePath)
      .getFileMetaData()
      .getSchema();
  support.init(hadoopConf, null, fileSchema);

  return new ParquetReader<Group>(parquetFilePath, support);
}

Source File: DeprecatedInputFormatTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Runs two jobs back to back: a write job that turns the text input into a
 * Parquet file using the given compression codec, then a read job (configured
 * through {@code jobConf}) that reads that Parquet file with
 * {@link GroupReadSupport} and writes text output.
 *
 * @param codec compression codec set on the write job's output format
 * @throws IOException            if file system access or job execution fails
 * @throws ClassNotFoundException declared for the job submission calls
 * @throws InterruptedException   declared for the job submission/wait calls
 */
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {

    // Start from a clean slate: remove output left over from a previous run
    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
      // Write job: text input -> Parquet at parquetPath, map-only
      writeJob = new Job(conf, "write");
      TextInputFormat.addInputPath(writeJob, inputPath);
      writeJob.setInputFormatClass(TextInputFormat.class);
      writeJob.setNumReduceTasks(0);
      ExampleOutputFormat.setCompression(writeJob, codec);
      ExampleOutputFormat.setOutputPath(writeJob, parquetPath);
      writeJob.setOutputFormatClass(ExampleOutputFormat.class);
      writeJob.setMapperClass(ReadMapper.class);
      ExampleOutputFormat.setSchema(
              writeJob,
              MessageTypeParser.parseMessageType(
                      writeSchema));
      writeJob.submit();
      waitForJob(writeJob);
    }
    {
      // Read job: Parquet at parquetPath -> text at outputPath, map-only,
      // reading with readSchema through GroupReadSupport
      jobConf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
      jobConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, GroupReadSupport.class.getCanonicalName());
      jobConf.setInputFormat(MyDeprecatedInputFormat.class);
      MyDeprecatedInputFormat.setInputPaths(jobConf, parquetPath);
      jobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
      org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(jobConf, outputPath);
      jobConf.setMapperClass(DeprecatedWriteMapper.class);
      jobConf.setNumReduceTasks(0);
      mapRedJob = JobClient.runJob(jobConf);
    }
  }

Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads users from {@code file} through a reader builder configured with the
 * given filter and with {@code schema} set as the requested read schema.
 *
 * @param filter               filter applied by the reader
 * @param schema               schema whose string form is set as {@code PARQUET_READ_SCHEMA}
 * @param useOtherFiltering    switches the dictionary, stats and record filters together
 * @param useColumnIndexFilter switches the column index filter
 * @return the users produced by {@code PhoneBookWriter.readUsers} for the configured builder
 * @throws IOException if reading fails
 */
private List<User> readUsersWithProjection(Filter filter, MessageType schema, boolean useOtherFiltering, boolean useColumnIndexFilter) throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      .useDictionaryFilter(useOtherFiltering)
      .useStatsFilter(useOtherFiltering)
      .useRecordFilter(useOtherFiltering)
      .useColumnIndexFilter(useColumnIndexFilter)
      .set(ReadSupport.PARQUET_READ_SCHEMA, schema.toString()));
}

Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads users from {@code file} through a reader builder configured with the
 * given filter and filtering switches.
 *
 * @param filter               filter applied by the reader
 * @param useOtherFiltering    switches the dictionary, stats and record filters together
 * @param useColumnIndexFilter switches the column index filter
 * @return the users produced by {@code PhoneBookWriter.readUsers} for the configured builder
 * @throws IOException if reading fails
 */
private List<User> readUsers(Filter filter, boolean useOtherFiltering, boolean useColumnIndexFilter)
    throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      .useDictionaryFilter(useOtherFiltering)
      .useStatsFilter(useOtherFiltering)
      .useRecordFilter(useOtherFiltering)
      .useColumnIndexFilter(useColumnIndexFilter));
}

Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads {@code numRecord} records from {@code file} and checks each one against
 * the values at the same index in {@code testDocs}.
 *
 * @param file      path of the Parquet file to validate
 * @param numRecord number of records to read and compare
 * @param testDocs  expected values, indexed by record position
 * @throws IOException if the file cannot be opened or read
 */
private void validateColumns(String file, int numRecord, TestDocs testDocs) throws IOException {
  // try-with-resources: the reader is released even when an assertion fails mid-loop
  try (ParquetReader<Group> reader =
      ParquetReader.builder(new GroupReadSupport(), new Path(file)).withConf(conf).build()) {
    for (int i = 0; i < numRecord; i++) {
      Group group = reader.read();
      assertTrue(group.getLong("DocId", 0) == testDocs.docId[i]);
      assertArrayEquals(group.getBinary("Name", 0).getBytes(), testDocs.name[i].getBytes());
      assertArrayEquals(group.getBinary("Gender", 0).getBytes(), testDocs.gender[i].getBytes());
      Group subGroup = group.getGroup("Links", 0);
      assertArrayEquals(subGroup.getBinary("Backward", 0).getBytes(), testDocs.linkBackward[i].getBytes());
      assertArrayEquals(subGroup.getBinary("Forward", 0).getBytes(), testDocs.linkForward[i].getBytes());
    }
  }
}

Source File: TestMultipleWriteRead.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Checks that the filtered reader over {@code file} yields, in order, records
 * whose string form equals that of each element of {@code data}.
 *
 * @param file   the Parquet file to read
 * @param filter filter applied by the reader
 * @param data   the expected records, in reading order
 * @throws IOException if the file cannot be opened or read
 */
private void validateFile(Path file, Filter filter, Stream<Group> data) throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      .build()) {
    Iterator<Group> expectedRecords = data.iterator();
    while (expectedRecords.hasNext()) {
      String expectedText = expectedRecords.next().toString();
      String actualText = reader.read().toString();
      assertEquals(expectedText, actualText);
    }
  }
}

Source File: TestMultipleWriteRead.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Checks that reading {@code file} yields, in order, records whose string form
 * equals that of each element of {@code data}.
 *
 * @param file the Parquet file to read
 * @param data the expected records, in reading order
 * @throws IOException if the file cannot be opened or read
 */
private void validateFile(Path file, List<Group> data) throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).build()) {
    for (Group expectedRecord : data) {
      String expectedText = expectedRecord.toString();
      String actualText = reader.read().toString();
      assertEquals(expectedText, actualText);
    }
  }
}

Source File: TestParquetWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes 100 copies of an empty record (its optional list left unset) with a
 * page row count limit of 10, then reads the file back and checks both the
 * content and the number of records.
 */
@Test
public void testNullValuesWithPageRowLimit() throws IOException {
  final int recordCount = 100;
  MessageType listSchema = Types.buildMessage()
      .optionalList().optionalElement(BINARY).as(stringType()).named("str_list")
      .named("msg");
  Configuration writeConf = new Configuration();
  GroupWriteSupport.setSchema(listSchema, writeConf);

  GroupFactory groups = new SimpleGroupFactory(listSchema);
  Group emptyRecord = groups.newGroup();

  // Only the unique temp name is needed: the file itself is deleted before writing
  File tempFile = temp.newFile();
  tempFile.delete();
  Path target = new Path(tempFile.getAbsolutePath());

  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(target)
      .withPageRowCountLimit(10)
      .withConf(writeConf)
      .build()) {
    int written = 0;
    while (written < recordCount) {
      writer.write(emptyRecord);
      written++;
    }
  }

  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), target).build()) {
    int readRecordCount = 0;
    Group current;
    while ((current = reader.read()) != null) {
      assertEquals(emptyRecord.toString(), current.toString());
      readRecordCount++;
    }
    assertEquals("Number of written records should be equal to the read one", recordCount, readRecordCount);
  }
}

Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds a filtered {@link Group} reader for {@code file}, using a fresh
 * configuration on which the class's {@code schema} has been set.
 *
 * @param file   the Parquet file to open
 * @param filter filter applied by the reader
 * @return the configured reader
 * @throws IOException if the reader cannot be built
 */
private static ParquetReader<Group> createReader(Path file, Filter filter) throws IOException {
  final Configuration readerConf = new Configuration();
  GroupWriteSupport.setSchema(schema, readerConf);

  final ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(readerConf)
      .withFilter(filter)
      .build();
  return reader;
}

Source File: TestBloomFiltering.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads users from {@code file} through a reader builder configured with the
 * given predicate and filtering switches.
 *
 * @param filter            predicate wrapped with {@code FilterCompat.get} and applied by the reader
 * @param useOtherFiltering switches the dictionary, stats, record and column index filters together
 * @param useBloomFilter    switches the bloom filter
 * @return the users produced by {@code PhoneBookWriter.readUsers} for the configured builder
 * @throws IOException if reading fails
 */
private List<PhoneBookWriter.User> readUsers(FilterPredicate filter, boolean useOtherFiltering,
                                             boolean useBloomFilter) throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
    .withFilter(FilterCompat.get(filter))
    .useDictionaryFilter(useOtherFiltering)
    .useStatsFilter(useOtherFiltering)
    .useRecordFilter(useOtherFiltering)
    .useBloomFilter(useBloomFilter)
    .useColumnIndexFilter(useOtherFiltering));
}

Source File: ParquetFileAccessor.java From pxf with Apache License 2.0

5 votes

/**
 * Opens the resource for read: derives the read schema and record filter from
 * the request context, then creates a reader restricted to the requested
 * file split.
 *
 * @return {@code true} once the reader has been created
 * @throws IOException if opening the resource failed
 */
@Override
public boolean openForRead() throws IOException {
    file = new Path(context.getDataSource());
    final FileSplit split = HdfsUtilities.parseFileSplit(context);

    // Schema as stored in the parquet file, plus a name -> type lookup for it
    final MessageType fileSchema = getSchema(file, split);
    final Map<String, Type> fieldsByName = getOriginalFieldsMap(fileSchema);
    // Either the full set or a subset (in case of column projection)
    // of the greenplum schema
    final MessageType readSchema = buildReadSchema(fieldsByName, fileSchema);
    // Record filter used for predicate push-down
    final FilterCompat.Filter recordFilter =
            getRecordFilter(context.getFilterString(), fieldsByName, readSchema);

    // Column projection is communicated to the reader through the configuration
    configuration.set(PARQUET_READ_SCHEMA, readSchema.toString());

    // The reader only covers the byte range of this split
    final long rangeStart = split.getStart();
    final long rangeEnd = rangeStart + split.getLength();
    fileReader = ParquetReader.builder(new GroupReadSupport(), file)
            .withConf(configuration)
            .withFileRange(rangeStart, rangeEnd)
            .withFilter(recordFilter)
            .build();
    context.setMetadata(readSchema);
    return true;
}

Source File: TestParquetWriterAppendBlocks.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Appends two files into a new file whose schema keeps only the "string"
 * column, then checks that every row group has a single column and that the
 * string values come back in the original order.
 */
@Test
public void testAllowDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");

  Path droppedColumnFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetMetadata footer = ParquetFileReader.readFooter(
      CONF, droppedColumnFile, NO_FILTER);
  for (BlockMetaData rowGroup : footer.getBlocks()) {
    Assert.assertEquals("Should have only the string column",
        1, rowGroup.getColumns().size());
  }

  // try-with-resources: the reader was previously never closed
  try (ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), droppedColumnFile)
      .build()) {
    Group next;
    while ((next = reader.read()) != null) {
      Group expectedNext = expected.removeFirst();
      Assert.assertEquals("Each string should match",
          expectedNext.getString("string", 0), next.getString("string", 0));
    }
  }

  Assert.assertEquals("All records should be present", 0, expected.size());
}

Source File: TestParquetWriterAppendBlocks.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Appends two files into a combined file and checks that reading it back
 * yields the records of both inputs, in order.
 */
@Test
public void testBasicBehavior() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  // try-with-resources: the reader was previously never closed
  try (ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), combinedFile)
      .build()) {
    Group next;
    while ((next = reader.read()) != null) {
      Group expectedNext = expected.removeFirst();
      // check each value; equals is not supported for simple records
      Assert.assertEquals("Each id should match",
          expectedNext.getInteger("id", 0), next.getInteger("id", 0));
      Assert.assertEquals("Each string should match",
          expectedNext.getString("string", 0), next.getString("string", 0));
    }
  }

  Assert.assertEquals("All records should be present", 0, expected.size());
}

Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a reader builder for {@code file} that uses {@link GroupReadSupport},
 * sets the example schema property from {@code SCHEMA} and then lets the
 * current read configurator adjust it.
 *
 * @return the builder returned by the read configurator
 * @throws IOException if the input file cannot be resolved
 */
public ParquetReader.Builder<Group> createReaderBuilder() throws IOException {
  ReadConfigurator configurator = getReadConfigurator();
  HadoopInputFile input = HadoopInputFile.fromPath(file, new Configuration());

  // Anonymous subclass: only the read support needs to be supplied
  ParquetReader.Builder<Group> groupBuilder = new ParquetReader.Builder<Group>(input) {
    @Override
    protected ReadSupport<Group> getReadSupport() {
      return new GroupReadSupport();
    }
  };
  return configurator.configureBuilder(
      groupBuilder.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString()));
}

Source File: AbstractParquetFileReader.java From attic-apex-malhar with Apache License 2.0

5 votes

/**
 * Opens the file through the superclass and additionally creates a Parquet
 * reader for the same path using GroupReadSupport.
 *
 * @param path the file to open
 * @return the input stream obtained from the superclass
 * @throws IOException if the file or the reader cannot be opened
 */
@Override
protected InputStream openFile(Path path) throws IOException
{
  final InputStream stream = super.openFile(path);

  // The read support is initialized with this operator's configuration and schema
  final GroupReadSupport groupSupport = new GroupReadSupport();
  groupSupport.init(configuration, null, schema);
  reader = new ParquetReader<>(path, groupSupport);

  return stream;
}

Source File: TestConvertAvroToParquet.java From nifi with Apache License 2.0

4 votes

/**
 * Runs the Avro fixture through the processor, writes the resulting flow file
 * to {@code tmpParquet}, reads it back as Parquet and checks the values of the
 * first record.
 */
@Test
public void test_Data() throws Exception {

    // Load the Avro fixture into memory; the input stream is closed as soon as it is drained
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (FileInputStream fileInputStream = new FileInputStream(tmpAvro)) {
        int readedBytes;
        byte[] buf = new byte[1024];
        while ((readedBytes = fileInputStream.read(buf)) > 0) {
            out.write(buf, 0, readedBytes);
        }
    }
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

    // Save the flowfile
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    try (FileOutputStream fos = new FileOutputStream(tmpParquet)) {
        fos.write(resultContents);
        fos.flush();
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);

    // Collect all records; the reader is closed even if reading fails
    List<Group> parquetRecords = new ArrayList<Group>();
    try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(tmpParquet.getAbsolutePath()))
                    .withConf(conf)
                    .build()) {
        Group current;
        current = reader.read();
        while (current != null) {
            assertTrue(current instanceof Group);
            parquetRecords.add(current);
            current = reader.read();
        }
    }

    Group firstRecord = parquetRecords.get(0);

    // Assertions use (expected, actual) order so failure messages read correctly

    // Primitive
    assertEquals(1, firstRecord.getInteger("myint", 0));
    assertEquals(2, firstRecord.getLong("mylong", 0));
    assertEquals(true, firstRecord.getBoolean("myboolean", 0));
    assertEquals(3.1, firstRecord.getFloat("myfloat", 0), 0.0001);
    assertEquals(4.1, firstRecord.getDouble("mydouble", 0), 0.001);
    assertEquals("hello", firstRecord.getString("mybytes", 0));
    assertEquals("hello", firstRecord.getString("mystring", 0));

    // Nested
    assertEquals(1, firstRecord.getGroup("mynestedrecord",0).getInteger("mynestedint",0));

    // Array
    assertEquals(1, firstRecord.getGroup("myarray",0).getGroup("list",0).getInteger("element", 0));
    assertEquals(2, firstRecord.getGroup("myarray",0).getGroup("list",1).getInteger("element", 0));

    // Map
    assertEquals(1, firstRecord.getGroup("mymap",0).getGroup("map",0).getInteger("value", 0));
    assertEquals(2, firstRecord.getGroup("mymap",0).getGroup("map",1).getInteger("value", 0));

    // Fixed
    assertEquals("A", firstRecord.getString("myfixed",0));

}

Source File: TestParquetWriter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * For every writer version and for two value cardinalities (modulo 10 and
 * 1000), writes 1000 records, reads them back verifying each field, and then
 * checks the encodings recorded for "binary_field" and the object model
 * property in the footer.
 */
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  enforceEmptyDir(conf, root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  // Expected encoding of binary_field, keyed by "<modulo>-<writer version>"
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      // try-with-resources: the writer is closed even if a write fails
      try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
        for (int i = 0; i < 1000; i++) {
          writer.write(
              f.newGroup()
              .append("binary_field", "test" + (i % modulo))
              .append("int32_field", 32)
              .append("int64_field", 64L)
              .append("boolean_field", true)
              .append("float_field", 1.0f)
              .append("double_field", 2.0d)
              .append("flba_field", "foo")
              .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
        }
      }
      // try-with-resources: the reader is closed even if an assertion fails
      try (ParquetReader<Group> reader =
          ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build()) {
        for (int i = 0; i < 1000; i++) {
          Group group = reader.read();
          assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
          assertEquals(32, group.getInteger("int32_field", 0));
          assertEquals(64L, group.getLong("int64_field", 0));
          assertEquals(true, group.getBoolean("boolean_field", 0));
          assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
          assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
          assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
          assertEquals(Binary.fromConstantByteArray(new byte[12]),
              group.getInt96("int96_field",0));
        }
      }
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
      assertEquals("Object model property should be example",
          "example", footer.getFileMetaData().getKeyValueMetaData()
              .get(ParquetWriter.OBJECT_MODEL_NAME_PROP));
    }
  }
}

Source File: TestParquetWriterNewPage.java From parquet-mr with Apache License 2.0

4 votes

/**
 * For every writer version and for two value cardinalities (modulo 10 and
 * 1000), writes 1000 records that leave the optional "null_field" unset, reads
 * them back verifying each field, and then checks the encodings recorded for
 * "binary_field" in the footer.
 */
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  FileSystem fs = root.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "optional binary null_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  // Expected encoding of binary_field, keyed by "<modulo>-<writer version>"
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      // try-with-resources: the writer is closed even if a write fails
      try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
        for (int i = 0; i < 1000; i++) {
          writer.write(
              f.newGroup()
              .append("binary_field", "test" + (i % modulo))
              .append("int32_field", 32)
              .append("int64_field", 64L)
              .append("boolean_field", true)
              .append("float_field", 1.0f)
              .append("double_field", 2.0d)
              .append("flba_field", "foo")
              .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
        }
      }

      // try-with-resources: the reader is closed even if an assertion fails
      try (ParquetReader<Group> reader =
          ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build()) {
        for (int i = 0; i < 1000; i++) {
          Group group = reader.read();
          assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
          assertEquals(32, group.getInteger("int32_field", 0));
          assertEquals(64L, group.getLong("int64_field", 0));
          assertEquals(true, group.getBoolean("boolean_field", 0));
          assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
          assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
          assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
          assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field",
              0));
          // null_field was never appended, so it must have no values
          assertEquals(0, group.getFieldRepetitionCount("null_field"));
        }
      }
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
    }
  }
}

Source File: TestInputOutputFormatWithPadding.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Writes {@code FILE_CONTENT} to Parquet through a MapReduce job configured
 * with a 1024 block size and a max padding of 1023, checks that every row group
 * starts at a multiple of 1024, then reads the data back through a second job
 * and compares it with the original content.
 */
@Test
public void testBasicBehaviorWithPadding() throws Exception {
  HadoopOutputFile.getBlockFileSystems().add("file");
  // The entry above is shared static state: the finally block removes it even
  // when the test fails, so a failure here cannot leak into other tests.
  try {
    File inputFile = temp.newFile();
    // try-with-resources: the stream is closed even if the write fails
    try (FileOutputStream out = new FileOutputStream(inputFile)) {
      out.write(FILE_CONTENT.getBytes(StandardCharsets.UTF_8));
    }

    File tempFolder = temp.newFolder();
    tempFolder.delete();
    Path tempPath = new Path(tempFolder.toURI());

    File outputFolder = temp.newFile();
    outputFolder.delete();

    Configuration conf = new Configuration();
    // May test against multiple hadoop versions
    conf.set("dfs.block.size", "1024");
    conf.set("dfs.blocksize", "1024");
    conf.set("dfs.blockSize", "1024");
    conf.set("fs.local.block.size", "1024");

    // don't use a cached FS with a different block size
    conf.set("fs.file.impl.disable.cache", "true");

    // disable summary metadata, it isn't needed
    conf.set("parquet.enable.summary-metadata", "false");
    conf.set("parquet.example.schema", PARQUET_TYPE.toString());

    {
      Job writeJob = new Job(conf, "write");
      writeJob.setInputFormatClass(TextInputFormat.class);
      TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

      writeJob.setOutputFormatClass(ParquetOutputFormat.class);
      writeJob.setMapperClass(Writer.class);
      writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
      ParquetOutputFormat.setWriteSupportClass(writeJob, GroupWriteSupport.class);
      ParquetOutputFormat.setBlockSize(writeJob, 1024);
      ParquetOutputFormat.setPageSize(writeJob, 512);
      ParquetOutputFormat.setDictionaryPageSize(writeJob, 512);
      ParquetOutputFormat.setEnableDictionary(writeJob, true);
      ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
      ParquetOutputFormat.setOutputPath(writeJob, tempPath);

      waitForJob(writeJob);
    }

    // make sure padding was added
    File parquetFile = getDataFile(tempFolder);
    ParquetMetadata footer = ParquetFileReader.readFooter(conf,
        new Path(parquetFile.toString()), ParquetMetadataConverter.NO_FILTER);
    for (BlockMetaData block : footer.getBlocks()) {
      Assert.assertTrue("Block should start at a multiple of the block size",
          block.getStartingPos() % 1024 == 0);
    }

    {
      Job readJob = new Job(conf, "read");
      readJob.setInputFormatClass(NoSplits.class);
      ParquetInputFormat.setReadSupportClass(readJob, GroupReadSupport.class);
      TextInputFormat.addInputPath(readJob, tempPath);

      readJob.setOutputFormatClass(TextOutputFormat.class);
      readJob.setMapperClass(Reader.class);
      readJob.setNumReduceTasks(0); // write directly to text without reduce
      TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

      waitForJob(readJob);
    }

    File dataFile = getDataFile(outputFolder);
    Assert.assertNotNull("Should find a data file", dataFile);

    StringBuilder contentBuilder = new StringBuilder();
    for (String line : Files.readAllLines(dataFile.toPath(), StandardCharsets.UTF_8)) {
      contentBuilder.append(line);
    }
    String reconstructed = contentBuilder.toString();
    Assert.assertEquals("Should match written file content",
        FILE_CONTENT, reconstructed);
  } finally {
    HadoopOutputFile.getBlockFileSystems().remove("file");
  }
}

org.apache.parquet.hadoop.example.GroupReadSupport Java Examples