org.apache.parquet.hadoop.ParquetFileReader Java Examples
The following examples show how to use
org.apache.parquet.hadoop.ParquetFileReader.
Each example notes the source file and the project it comes from.
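Before the individual examples, here is a minimal sketch of the access pattern most of them share: open a reader, inspect the footer metadata, then iterate row groups and materialize records. It is assembled only from API calls that appear in the examples below (ParquetFileReader.open, HadoopInputFile.fromPath, getFooter, readNextRowGroup); the file path and the printing of records are placeholders, not taken from any of the projects listed here.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

import java.io.IOException;

public class ParquetFileReaderSketch {
  public static void main(String[] args) throws IOException {
    // Hypothetical input path; replace with a real Parquet file.
    Path path = new Path("/tmp/example.parquet");
    Configuration conf = new Configuration();

    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      // Footer metadata: file schema, key/value metadata, row group info.
      ParquetMetadata footer = reader.getFooter();
      MessageType schema = footer.getFileMetaData().getSchema();
      System.out.println(schema);

      // Row group data: iterate row groups and materialize records.
      MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
      PageReadStore rowGroup;
      while ((rowGroup = reader.readNextRowGroup()) != null) {
        RecordReader<?> recordReader =
            columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
        for (long i = 0; i < rowGroup.getRowCount(); i++) {
          System.out.println(recordReader.read());
        }
      }
    }
  }
}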
Example #1
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
      "Both input and output parquet file paths are required.");
  Preconditions.checkArgument(codec != null,
      "The codec cannot be null");

  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, getConf()),
      HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
Example #2
Source File: HdfsOffsetComputer.java From garmadon with Apache License 2.0
protected Long getMaxOffset(Map<String, FinalEventPartitionFile> dateFinalEventPartitionFile) {
  // Get max offset from all files for a partition
  return dateFinalEventPartitionFile
      .values()
      .stream()
      .flatMap(finalEventPartitionFile -> {
        try (ParquetFileReader pFR = ParquetFileReader.open(fs.getConf(), finalEventPartitionFile.getFilePath())) {
          return pFR.getFooter().getBlocks().stream();
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      })
      .map(b -> b.getColumns().stream()
          .filter(column -> Arrays.stream(column.getPath().toArray()).allMatch(path -> path.equals("kafka_offset")))
          .findFirst()
          .map(ColumnChunkMetaData::getStatistics)
          .map(Statistics::genericGetMax)
          .map(Long.class::cast)
          .orElse(NO_OFFSET))
      .mapToLong(Long::longValue)
      .max()
      .orElse(NO_OFFSET);
}
Example #3
Source File: Schemas.java From parquet-mr with Apache License 2.0
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
  Path path = new Path(location);
  FileSystem fs = path.getFileSystem(conf);

  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
Example #4
Source File: ShowMetaCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  boolean showOriginalTypes = options.hasOption('o');

  Configuration conf = new Configuration();
  Path inputPath = new Path(input);
  FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
  List<Footer> footers = ParquetFileReader.readFooters(conf, inputFileStatus, false);

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();

  for (Footer f : footers) {
    out.format("file: %s%n", f.getFile());
    MetadataUtils.showDetails(out, f.getParquetMetadata(), showOriginalTypes);
    out.flushColumns();
  }
}
Example #5
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf),
      HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #6
Source File: ParquetUtils.java From incubator-pinot with Apache License 2.0
/**
 * Returns the schema for the given Parquet file path.
 */
public static Schema getParquetSchema(Path path) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(getConfiguration(), path, ParquetMetadataConverter.NO_FILTER);
  Map<String, String> metaData = footer.getFileMetaData().getKeyValueMetaData();
  String schemaString = metaData.get("parquet.avro.schema");
  if (schemaString == null) {
    // Try the older property
    schemaString = metaData.get("avro.schema");
  }
  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
}
Example #7
Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0
public void createParquetInputFile(List<Record> records) throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    rowGroupMetadata = reader.getRowGroups().get(0);
    parquetSchema = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
Example #8
Source File: ParquetResolverTest.java From pxf with Apache License 2.0
@SuppressWarnings("deprecation") private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException { List<Group> result = new ArrayList<>(); String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath(); Path path = new Path(parquetFile); ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER); PageReadStore rowGroup; while ((rowGroup = fileReader.readNextRowGroup()) != null) { MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema); RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema)); long rowCount = rowGroup.getRowCount(); for (long i = 0; i < rowCount; i++) { result.add(recordReader.read()); } } fileReader.close(); assertEquals(expectedSize, result.size()); return result; }
Example #9
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String cargs[] = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema are not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 2);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Links");
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
Example #10
Source File: PentahoApacheInputFormat.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
Example #11
Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf),
      HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #12
Source File: SchemaCommand.java From parquet-mr with Apache License 2.0
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);

    switch (format) {
      case PARQUET:
        return new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)
            .getFileMetaData().getSchema().toString();
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
Example #13
Source File: ParquetReader.java From tajo with Apache License 2.0
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
Example #14
Source File: TestStatistics.java From parquet-mr with Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
Example #15
Source File: Schemas.java From kite with Apache License 2.0
public static Schema fromParquet(FileSystem fs, Path location) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), location);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
Example #16
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0
@Test
public void testReadSimpleGroup() throws IOException {
  Long[] array = {1L};
  GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
      .set("bar", "test")
      .set("foo", 32L)
      .set("arr", array).build();

  Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(3, row.getArity());
  assertEquals(32L, row.getField(0));
  assertEquals("test", row.getField(1));
  assertArrayEquals(array, (Long[]) row.getField(2));
  assertTrue(rowReader.reachEnd());
}
Example #17
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Test
public void testPruneNestedColumn() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove nested column
  String cargs[] = {inputFile, outputFile, "Links.Backward"};
  executeCommandLine(cargs);

  // Verify the schema are not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 4);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Name");
  assertEquals(fields.get(2).getName(), "Gender");
  assertEquals(fields.get(3).getName(), "Links");
  List<Type> subFields = fields.get(3).asGroupType().getFields();
  assertEquals(subFields.size(), 1);
  assertEquals(subFields.get(0).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Links.Backward");
  validateColumns(inputFile, prunePaths);
}
Example #18
Source File: DictionaryFilterTest.java From parquet-mr with Apache License 2.0
@Before
public void setUp() throws Exception {
  reader = ParquetFileReader.open(conf, file);
  ParquetMetadata meta = reader.getFooter();
  ccmd = meta.getBlocks().get(0).getColumns();
  dictionaries = reader.getDictionaryReader(meta.getBlocks().get(0));
}
Example #19
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0
@Test
public void testMapGroup() throws IOException {
  Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
      .getType().equals(Schema.Type.MAP));
  ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
  map.put("testKey", "testValue");

  GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("foo", 32L)
      .set("spamMap", map.build())
      .build();

  Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());

  assertEquals(32L, row.getField(0));
  Map<?, ?> result = (Map<?, ?>) row.getField(1);
  assertEquals(result.get("testKey").toString(), "testValue");
  assertTrue(rowReader.reachEnd());
}
Example #20
Source File: TestConvertAvroToParquet.java From nifi with Apache License 2.0
@Test
public void test_Meta_Info() throws Exception {
  FileInputStream fileInputStream = new FileInputStream(tmpAvro);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  int readedBytes;
  byte[] buf = new byte[1024];
  while ((readedBytes = fileInputStream.read(buf)) > 0) {
    out.write(buf, 0, readedBytes);
  }
  out.close();

  Map<String, String> attributes = new HashMap<String, String>() {{
    put(CoreAttributes.FILENAME.key(), "test.avro");
  }};
  runner.enqueue(out.toByteArray(), attributes);
  runner.run();

  MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

  // Save the flowfile
  byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
  FileOutputStream fos = new FileOutputStream(tmpParquet);
  fos.write(resultContents);
  fos.flush();
  fos.close();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  ParquetMetadata metaData;
  metaData = ParquetFileReader.readFooter(conf, new Path(tmpParquet.getAbsolutePath()), NO_FILTER);

  // #number of records
  long nParquetRecords = 0;
  for (BlockMetaData meta : metaData.getBlocks()) {
    nParquetRecords += meta.getRowCount();
  }
  long nAvroRecord = records.size();
  assertEquals(nParquetRecords, nAvroRecord);
}
Example #21
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file,
      ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file,
      metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
Example #22
Source File: TestThriftToParquetFileWriter.java From parquet-mr with Apache License 2.0
private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
  Configuration configuration = new Configuration(true);

  GroupReadSupport readSupport = new GroupReadSupport();
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();

  readSupport.init(configuration, null, schema);
  return new ParquetReader<Group>(parquetFilePath, readSupport);
}
Example #23
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0
@Test
public void testReadNestedGroup() throws IOException {
  Schema schema = unWrapSchema(NESTED_SCHEMA.getField("bar").schema());
  GenericData.Record barRecord = new GenericRecordBuilder(schema)
      .set("spam", 31L).build();

  GenericData.Record record = new GenericRecordBuilder(NESTED_SCHEMA)
      .set("foo", 32L)
      .set("bar", barRecord)
      .build();

  Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
  MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertFalse(rowReader.reachEnd());

  Row row = rowReader.nextRecord();
  assertEquals(7, row.getArity());
  assertEquals(32L, row.getField(0));
  assertEquals(31L, ((Row) row.getField(2)).getField(0));
  assertTrue(rowReader.reachEnd());
}
Example #24
Source File: ParquetRecordReaderTest.java From flink with Apache License 2.0
@Test
public void testReadMultipleSimpleGroup() throws IOException {
  Long[] array = {1L};

  List<IndexedRecord> records = new ArrayList<>();
  for (int i = 0; i < 100; i++) {
    GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
        .set("bar", "test")
        .set("foo", i)
        .set("arr", array).build();
    records.add(record);
  }

  Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, records);
  MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
  ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

  rowReader.initialize(fileReader, testConfig);
  assertTrue(!rowReader.reachEnd());

  for (long i = 0; i < 100; i++) {
    assertFalse(rowReader.reachEnd());
    Row row = rowReader.nextRecord();
    assertEquals(3, row.getArity());
    assertEquals(i, row.getField(0));
    assertEquals("test", row.getField(1));
    assertArrayEquals(array, (Long[]) row.getField(2));
  }
  assertTrue(rowReader.reachEnd());
}
Example #25
Source File: ColumnSizeCommand.java From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
Example #26
Source File: ParquetInputFormat.java From flink with Apache License 2.0
@Override
public void open(FileInputSplit split) throws IOException {
  // reset the flag when open a new split
  this.skipThisSplit = false;
  org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
  InputFile inputFile =
      HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
  ParquetReadOptions options = ParquetReadOptions.builder().build();
  ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
  MessageType fileSchema = fileReader.getFileMetaData().getSchema();
  MessageType readSchema = getReadSchema(fileSchema, split.getPath());
  if (skipThisSplit) {
    LOG.warn(String.format(
        "Escaped the file split [%s] due to mismatch of file schema to expected result schema",
        split.getPath().toString()));
  } else {
    this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema,
        filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate));
    this.parquetRecordReader.initialize(fileReader, configuration);
    this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

    if (this.recordConsumed == null) {
      this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
    }

    LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
  }
}
Example #27
Source File: ParquetColumnarRowSplitReader.java From flink with Apache License 2.0
public ParquetColumnarRowSplitReader(
    boolean utcTimestamp,
    boolean caseSensitive,
    Configuration conf,
    LogicalType[] selectedTypes,
    String[] selectedFieldNames,
    ColumnBatchGenerator generator,
    int batchSize,
    Path path,
    long splitStart,
    long splitLength) throws IOException {
  this.utcTimestamp = utcTimestamp;
  this.selectedTypes = selectedTypes;
  this.batchSize = batchSize;
  // then we need to apply the predicate push down filter
  ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(conf);
  List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

  this.fileSchema = footer.getFileMetaData().getSchema();
  this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
  this.reader = new ParquetFileReader(
      conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

  long totalRowCount = 0;
  for (BlockMetaData block : blocks) {
    totalRowCount += block.getRowCount();
  }
  this.totalRowCount = totalRowCount;
  this.nextRow = 0;
  this.rowsInBatch = 0;
  this.rowsReturned = 0;

  checkSchema();

  this.writableVectors = createWritableVectors();
  this.columnarBatch = generator.generate(createReadableVectors());
  this.row = new ColumnarRowData(columnarBatch);
}
Example #28
Source File: ParquetRecordReader.java From flink with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  // real schema of parquet file
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), readSchema));

  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.recordMaterializer = readSupport.prepareForRead(
      configuration, fileMetadata, readSchema, readContext);
  this.numTotalRecords = reader.getRecordCount();
}
Example #29
Source File: SparkModelParser.java From ignite with Apache License 2.0
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLogRegModel(String pathToMdl, LearningEnvironment learningEnvironment) {
  Vector coefficients = null;
  double interceptor = 0;

  try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup)recordReader.read();
        interceptor = readInterceptor(g);
        coefficients = readCoefficients(g);
      }
    }
  } catch (IOException e) {
    String msg = "Error reading parquet file: " + e.getMessage();
    learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
    e.printStackTrace();
  }

  return new LogisticRegressionModel(coefficients, interceptor);
}
Example #30
Source File: FooterGatherer.java From Bats with Apache License 2.0
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedCallable<Footer>> readers = new ArrayList<>();
  final List<Footer> foundFooters = new ArrayList<>();
  for (FileStatus status : statuses) {

    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);

      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }

      // else we handle as normal file.
      for (FileStatus inStatus : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedCallable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }
  return foundFooters;
}