org.apache.parquet.column.page.PageReadStore Java Examples

Source File: TupleConsumerPerfTest.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Prints {@code message}, builds a tuple record reader over {@code columns} for the given
 * Pig schema, and then invokes the {@code read(recordReader, count, pigSchemaString)} overload
 * once for every entry of a fixed list of record counts. A blank line is printed at the end.
 *
 * @param columns page store the record reader is created on
 * @param pigSchemaString Pig schema, in string form, used for the column IO, metadata and Parquet schema
 * @param message line printed before reading starts
 * @throws ParserException declared for the schema-parsing helpers called here
 */
private static void read(PageReadStore columns, String pigSchemaString, String message) throws ParserException {
    System.out.println(message);

    final MessageColumnIO messageColumnIO = newColumnFactory(pigSchemaString);
    final TupleReadSupport readSupport = new TupleReadSupport();
    final Map<String, String> metaData = pigMetaData(pigSchemaString);
    final MessageType parquetSchema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchemaString));
    final ReadContext readContext = readSupport.init(null, metaData, parquetSchema);
    final RecordMaterializer<Tuple> materializer =
        readSupport.prepareForRead(null, metaData, parquetSchema, readContext);
    final RecordReader<Tuple> tupleReader = messageColumnIO.getRecordReader(columns, materializer);

    // TODO: put this back -- when DEBUG is set, wrap the record consumer in a
    // RecordConsumerLoggingWrapper before reading.

    // Record counts requested from the read(...) overload, in call order.
    final int[] recordCounts = {10000, 10000, 10000, 10000, 10000, 100000, 1000000};
    for (int recordCount : recordCounts) {
        read(tupleReader, recordCount, pigSchemaString);
    }

    System.out.println();
}

Source File: ParquetColumnarRowSplitReader.java From flink with Apache License 2.0

6 votes

/**
 * Reads the next row group from the file reader and creates one column reader per column of
 * the requested schema, then adds the row group's row count to {@code totalCountLoadedSoFar}.
 *
 * @throws IOException if the file reader returns no further row group, or if reading fails
 */
private void readNextRowGroup() throws IOException {
	final PageReadStore rowGroup = reader.readNextRowGroup();
	if (rowGroup == null) {
		throw new IOException("expecting more rows but reached last block. Read "
				+ rowsReturned + " out of " + totalRowCount);
	}

	final List<ColumnDescriptor> descriptors = requestedSchema.getColumns();
	final int columnCount = descriptors.size();
	columnReaders = new AbstractColumnReader[columnCount];

	for (int idx = 0; idx < columnCount; idx++) {
		final ColumnDescriptor descriptor = descriptors.get(idx);
		columnReaders[idx] = createColumnReader(
				utcTimestamp,
				selectedTypes[idx],
				descriptor,
				rowGroup.getPageReader(descriptor));
	}

	totalCountLoadedSoFar += rowGroup.getRowCount();
}

Source File: ParquetReader.java From iceberg with Apache License 2.0

6 votes

/**
 * Skips every row group flagged in {@code shouldSkip}, reads the next row group, updates the
 * row-group bookkeeping and hands the pages to the model.
 *
 * @throws RuntimeIOException wrapping any {@link IOException} raised while reading the row group
 */
private void advance() {
  // Keep the row-group index and the underlying reader moving in lock-step.
  while (shouldSkip[nextRowGroup]) {
    nextRowGroup++;
    reader.skipNextRowGroup();
  }

  final PageReadStore rowGroupPages;
  try {
    rowGroupPages = reader.readNextRowGroup();
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe);
  }

  final long rowsInGroup = rowGroupPages.getRowCount();
  nextRowGroupStart += rowsInGroup;
  nextRowGroup++;

  model.setPageSource(rowGroupPages);
}

Source File: ParquetResolverTest.java From pxf with Apache License 2.0

6 votes

/**
 * Reads every record of a Parquet file found on the class path under {@code parquet/} and
 * asserts that the number of records read equals {@code expectedSize}.
 *
 * @param file         file name, resolved relative to the {@code parquet/} resource directory
 * @param expectedSize number of records the file is expected to contain
 * @param schema       schema used to assemble the records
 * @return all records of the file, in read order
 * @throws IOException if the file cannot be opened or read
 * @throws NullPointerException if the resource does not exist on the class path
 */
@SuppressWarnings("deprecation")
private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException {
    List<Group> result = new ArrayList<>();
    String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath();
    Path path = new Path(parquetFile);

    // The schema is the same for every row group, so the column IO is built only once.
    MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);

    // try-with-resources closes the reader even when reading a row group or a record throws;
    // previously an exception inside the loop leaked the open file.
    try (ParquetFileReader fileReader =
             new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER)) {
        PageReadStore rowGroup;
        while ((rowGroup = fileReader.readNextRowGroup()) != null) {
            RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
            long rowCount = rowGroup.getRowCount();
            for (long i = 0; i < rowCount; i++) {
                result.add(recordReader.read());
            }
        }
    }

    assertEquals(expectedSize, result.size());
    return result;
}

Source File: ParquetReader.java From iceberg with Apache License 2.0

6 votes

/**
 * Skips every row group flagged in {@code shouldSkip}, reads the next row group and hands its
 * pages to the model.
 *
 * <p>{@code nextRowGroup} is the index used to look up {@code shouldSkip}; it is advanced both
 * when a row group is skipped and when one is read, so that it keeps pointing at the row group
 * the underlying reader will return next.
 *
 * @throws RuntimeIOException wrapping any {@link IOException} raised while reading the row group
 */
private void advance() {
  while (shouldSkip[nextRowGroup]) {
    nextRowGroup += 1;
    reader.skipNextRowGroup();
  }

  PageReadStore pages;
  try {
    pages = reader.readNextRowGroup();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }

  nextRowGroupStart += pages.getRowCount();
  // The read above consumed the row group at index nextRowGroup. Without this increment the
  // next call would test shouldSkip for the group that was just read instead of the one the
  // reader is now positioned on.
  nextRowGroup += 1;

  model.setPageSource(pages);
}

Source File: ColumnReadStoreImpl.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Creates a column read store on top of a page store.
 *
 * <p>The {@code createdBy} string is parsed into a writer version; if parsing fails for any
 * reason the writer version is recorded as {@code null}.
 *
 * @param pageReadStore underlying page storage
 * @param recordConverter the user provided converter to materialize records
 * @param schema the schema we are reading
 * @param createdBy writer version string from the Parquet file being read
 */
public ColumnReadStoreImpl(PageReadStore pageReadStore,
                           GroupConverter recordConverter,
                           MessageType schema, String createdBy) {
  super();
  this.schema = schema;
  this.recordConverter = recordConverter;
  this.pageReadStore = pageReadStore;

  // Stays null when the version string cannot be parsed.
  ParsedVersion parsedVersion = null;
  try {
    parsedVersion = VersionParser.parse(createdBy);
  } catch (RuntimeException | VersionParseException ignored) {
    parsedVersion = null;
  }
  this.writerVersion = parsedVersion;
}

Source File: CompressionConverter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Walks through every row group returned by {@code reader} and, for each column chunk listed
 * in the corresponding block metadata, starts a column on {@code writer}, delegates the chunk
 * to {@code processChunk} and ends the column.
 *
 * @param reader source of row groups and chunk data
 * @param writer destination file writer
 * @param meta footer metadata; its block list is indexed in the order row groups are read
 * @param schema schema whose column descriptors are looked up by column path
 * @param createdBy writer version string forwarded to the column read store and to {@code processChunk}
 * @param codecName codec passed to {@code startColumn} and {@code processChunk}
 * @throws IOException if reading or writing fails
 */
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  int rowGroupIndex = 0;
  PageReadStore rowGroup;
  while ((rowGroup = reader.readNextRowGroup()) != null) {
    writer.startBlock(rowGroup.getRowCount());

    List<ColumnChunkMetaData> chunks = meta.getBlocks().get(rowGroupIndex).getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorByPath = schema.getColumns().stream()
      .collect(Collectors.toMap(desc -> ColumnPath.get(desc.getPath()), desc -> desc));

    for (ColumnChunkMetaData chunk : chunks) {
      ColumnReadStoreImpl columnStore = new ColumnReadStoreImpl(rowGroup, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor descriptor = descriptorByPath.get(chunk.getPath());
      writer.startColumn(descriptor, columnStore.getColumnReader(descriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }

    writer.endBlock();
    rowGroupIndex++;
  }
}

Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

6 votes

/** Checks that no page carries a crc when checksum writing and verification are both disabled. */
@Test
public void testWriteOffVerifyOff() throws IOException {
  final Configuration conf = new Configuration();
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);

  final Path file = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader fileReader =
         getParquetFileReader(file, conf, Arrays.asList(colADesc, colBDesc))) {
    final PageReadStore rowGroup = fileReader.readNextRowGroup();

    // Two pages are read from column A, then two from column B.
    DataPageV1 page = readNextPage(colADesc, rowGroup);
    assertCrcNotSet(page);
    page = readNextPage(colADesc, rowGroup);
    assertCrcNotSet(page);

    page = readNextPage(colBDesc, rowGroup);
    assertCrcNotSet(page);
    page = readNextPage(colBDesc, rowGroup);
    assertCrcNotSet(page);
  }
}

Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes a file without page level crc checksums and reads it back with verification enabled.
 * The read must succeed (no exception) and return the expected page contents.
 */
@Test
public void testWriteOffVerifyOn() throws IOException {
  final Configuration conf = new Configuration();
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);

  final Path file = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader fileReader =
         getParquetFileReader(file, conf, Arrays.asList(colADesc, colBDesc))) {
    final PageReadStore rowGroup = fileReader.readNextRowGroup();

    byte[] actual = readNextPage(colADesc, rowGroup).getBytes().toByteArray();
    assertCorrectContent(actual, colAPage1Bytes);

    actual = readNextPage(colADesc, rowGroup).getBytes().toByteArray();
    assertCorrectContent(actual, colAPage2Bytes);

    actual = readNextPage(colBDesc, rowGroup).getBytes().toByteArray();
    assertCorrectContent(actual, colBPage1Bytes);

    actual = readNextPage(colBDesc, rowGroup).getBytes().toByteArray();
    assertCorrectContent(actual, colBPage2Bytes);
  }
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Reads all row groups and the schema of {@code file} and, for every column of every row group,
 * validates the pages in both directions against the slice of {@code expectedValues} that
 * belongs to that row group.
 *
 * @param file Parquet file to validate
 * @param expectedValues expected values for the whole file, in row order
 * @throws IOException if the file cannot be read
 */
public static void validatePages(Path file, List<?> expectedValues) throws IOException {
  final List<PageReadStore> rowGroups = readBlocksFromFile(file);
  final MessageType schema = readSchemaFromFile(file);

  int consumedRows = 0;
  int groupId = 0;
  for (PageReadStore rowGroup : rowGroups) {
    for (ColumnDescriptor column : schema.getColumns()) {
      final List<DataPage> pages = getPageGroupForColumn(rowGroup, column);
      final DictionaryPage dictionary = reusableCopy(getDictionaryPageForColumn(rowGroup, column));

      // Slice of the expected values covered by this row group.
      final int sliceEnd = (int)(consumedRows + rowGroup.getRowCount());
      final List<?> expectedForGroup = expectedValues.subList(consumedRows, sliceEnd);

      validateFirstToLast(groupId, dictionary, pages, column, expectedForGroup);
      validateLastToFirst(groupId, dictionary, pages, column, expectedForGroup);
    }

    consumedRows += rowGroup.getRowCount();
    groupId++;
  }
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Reads the footer of the file at {@code super.fsPath}, then reads every row group and passes
 * it to a {@code PageStatsValidator} together with the file schema.
 *
 * @throws IOException if the footer or a row group cannot be read
 */
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);

  PageStatsValidator validator = new PageStatsValidator();

  // try-with-resources: the reader was previously never closed, leaking the open file
  // (also when validation failed part-way through).
  try (ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns())) {
    PageReadStore pageReadStore;
    while ((pageReadStore = reader.readNextRowGroup()) != null) {
      validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
    }
  }
}

Source File: ParquetRecordReader.java From flink with Apache License 2.0

5 votes

/**
 * Positions the reader on the given block and reads records up to and including the given
 * record of that block.
 *
 * <p>Passing {@code -1} for both arguments marks the split as fully consumed: the counters are
 * set to their end-of-input values and nothing is read.
 *
 * @param block The block to seek to.
 * @param recordInBlock The number of the record in the block to return next.
 * @throws IOException if no row group is available at the target position or reading fails
 */
public void seek(long block, long recordInBlock) throws IOException {

	final List<BlockMetaData> rowGroups = reader.getRowGroups();

	final boolean fullyConsumed = block == -1L && recordInBlock == -1L;
	if (fullyConsumed) {
		// the split was fully consumed
		currentBlock = rowGroups.size() - 1;
		numReadRecords = numTotalRecords;
		numRecordsUpToCurrentBlock = numTotalRecords;
		return;
	}

	// reset every counter to the start of the first block
	currentBlock = 0;
	numRecordsUpToPreviousBlock = 0;
	numRecordsUpToCurrentBlock = rowGroups.get(0).getRowCount();
	numReadRecords = 0;

	// skip whole row groups until the requested block is reached
	while (currentBlock < block) {
		currentBlock++;
		reader.skipNextRowGroup();
		numRecordsUpToPreviousBlock = numRecordsUpToCurrentBlock;
		numRecordsUpToCurrentBlock += rowGroups.get(currentBlock).getRowCount();
		numReadRecords = numRecordsUpToPreviousBlock;
	}

	// load the target block and read records 0..recordInBlock (inclusive)
	final PageReadStore targetRowGroup = reader.readNextRowGroup();
	recordReader = createRecordReader(targetRowGroup);
	int consumed = 0;
	while (consumed <= recordInBlock) {
		readNextRecord();
		consumed++;
	}
}

Source File: ParquetRecordReaderTest.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Reads the next page of the column at {@code path} and asserts its value count and raw bytes.
 *
 * @param schema schema used to resolve the column description for {@code path}
 * @param pages page store to read from
 * @param path column path
 * @param values expected value count of the page
 * @param bytes expected page bytes
 * @throws IOException if the bytes cannot be materialized
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes)
    throws IOException {
  final PageReader columnPages = pages.getPageReader(schema.getColumnDescription(path));
  final DataPageV1 firstPage = (DataPageV1) columnPages.readPage();

  assertEquals(values, firstPage.getValueCount());

  final byte[] expectedBytes = bytes.toByteArray();
  final byte[] actualBytes = firstPage.getBytes().toByteArray();
  assertArrayEquals(expectedBytes, actualBytes);
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads the footer of {@code file} and collects every row group of the file into a list.
 *
 * @param file Parquet file to read
 * @return the row groups of the file, in read order
 * @throws IOException if the footer or a row group cannot be read
 */
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  final ParquetMetadata footer = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  // NOTE(review): the reader is not closed here; whether the returned page stores stay usable
  // after close() cannot be determined from this method -- confirm before changing.
  final ParquetFileReader reader = new ParquetFileReader(configuration, footer.getFileMetaData(), file, footer.getBlocks(),
      footer.getFileMetaData().getSchema().getColumns());

  final List<PageReadStore> collected = new ArrayList<PageReadStore>();
  for (PageReadStore rowGroup = reader.readNextRowGroup();
       rowGroup != null;
       rowGroup = reader.readNextRowGroup()) {
    collected.add(rowGroup);
  }
  return collected;
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Drains the page reader of one column and returns a reusable copy of every data page.
 *
 * @param pageReadStore row group to read from
 * @param columnDescriptor column whose pages are collected
 * @return copies of the column's data pages, in read order
 */
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  final List<DataPage> copies = new ArrayList<DataPage>();
  final PageReader columnPages = pageReadStore.getPageReader(columnDescriptor);

  // readPage() returning null ends the loop.
  for (DataPage current = columnPages.readPage(); current != null; current = columnPages.readPage()) {
    copies.add(reusableCopy(current));
  }

  return copies;
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

/**
 * For every column of {@code schema}, reads the dictionary page and then each data page from
 * {@code store}, validating the statistics of every data page.
 *
 * @param schema schema whose columns are checked
 * @param store row group to read pages from
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    final PageReader columnPages = store.getPageReader(column);
    final DictionaryPage dictionary = columnPages.readDictionaryPage();

    for (DataPage current = columnPages.readPage(); current != null; current = columnPages.readPage()) {
      validateStatsForPage(current, dictionary, column);
    }
  }
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Load Decision Tree model.
 *
 * <p>Every row of the parquet file is converted to a node and stored by node id; the tree is
 * built from those nodes. If reading fails the error is logged and {@code null} is returned.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return the decision tree model, or {@code null} when an {@link IOException} occurred
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader parquetReader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        final MessageType schema = parquetReader.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodesById = new TreeMap<>();

        for (PageReadStore rowGroup = parquetReader.readNextRowGroup();
             rowGroup != null;
             rowGroup = parquetReader.readNextRowGroup()) {
            final long rowCnt = rowGroup.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));

            for (int i = 0; i < rowCnt; i++) {
                final SimpleGroup row = (SimpleGroup)recordReader.read();
                final NodeData node = extractNodeDataFromParquetRow(row);
                nodesById.put(node.id, node);
            }
        }

        return buildDecisionTreeModel(nodesById);
    }
    catch (IOException e) {
        final String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return null;
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Load linear regression model.
 *
 * <p>The intercept and coefficients are taken from the rows of the parquet file; when several
 * rows exist, the values of the last row read are used. If reading fails the error is logged
 * and the model is built from whatever values were read so far (initially {@code null}
 * coefficients and a zero intercept).
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return the linear regression model
 */
private static Model loadLinRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    double interceptor = 0;
    Vector coefficients = null;

    try (ParquetFileReader parquetReader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        final MessageType schema = parquetReader.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        for (PageReadStore rowGroup = parquetReader.readNextRowGroup();
             rowGroup != null;
             rowGroup = parquetReader.readNextRowGroup()) {
            final long rowCnt = rowGroup.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));

            for (int i = 0; i < rowCnt; i++) {
                final SimpleGroup row = (SimpleGroup)recordReader.read();
                interceptor = readLinRegInterceptor(row);
                coefficients = readLinRegCoefficients(row);
            }
        }
    }
    catch (IOException e) {
        final String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Load logistic regression model.
 *
 * <p>The intercept and coefficients are taken from the rows of the parquet file; when several
 * rows exist, the values of the last row read are used. If reading fails the error is logged
 * and the model is built from whatever values were read so far (initially {@code null}
 * coefficients and a zero intercept).
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return the logistic regression model
 */
private static Model loadLogRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    double interceptor = 0;
    Vector coefficients = null;

    try (ParquetFileReader parquetReader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        final MessageType schema = parquetReader.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        for (PageReadStore rowGroup = parquetReader.readNextRowGroup();
             rowGroup != null;
             rowGroup = parquetReader.readNextRowGroup()) {
            final long rowCnt = rowGroup.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));

            for (int i = 0; i < rowCnt; i++) {
                final SimpleGroup row = (SimpleGroup)recordReader.read();
                interceptor = readInterceptor(row);
                coefficients = readCoefficients(row);
            }
        }
    }
    catch (IOException e) {
        final String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}

Source File: InternalParquetRecordReader.java From tajo with Apache License 2.0

5 votes

/**
 * Loads the next row group once every record loaded so far has been consumed
 * ({@code current == totalCountLoadedSoFar}); otherwise does nothing.
 *
 * <p>Before loading, the time spent processing the previous block is accumulated and, when
 * debug logging is on, throughput figures are logged. After loading, a new record reader is
 * created for the row group and the loaded-row and block counters are advanced.
 *
 * @throws IOException if no further row group is available or reading fails
 */
private void checkRead() throws IOException {
  if (current != totalCountLoadedSoFar) {
    // Records of the current block are still pending.
    return;
  }

  if (current != 0) {
    totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
    if (Log.DEBUG) {
      LOG.debug("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount
          + " columns in " + totalTimeSpentProcessingRecords + " ms: "
          + ((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, "
          + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
      final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
      // Guard against dividing by zero when no time has been recorded yet.
      if (totalTime != 0) {
        final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
        final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
        LOG.debug("time spent so far " + percentReading + "% reading (" + totalTimeSpentReadingBytes
            + " ms) and " + percentProcessing + "% processing (" + totalTimeSpentProcessingRecords + " ms)");
      }
    }
  }

  if (Log.DEBUG) {
    LOG.debug("at row " + current + ". reading next block");
  }
  final long readStartedAt = System.currentTimeMillis();
  final PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
  }
  final long timeSpentReading = System.currentTimeMillis() - readStartedAt;
  totalTimeSpentReadingBytes += timeSpentReading;
  BenchmarkCounter.incrementTime(timeSpentReading);
  if (Log.INFO) {
    LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
  }
  if (Log.DEBUG) {
    LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
  }
  final MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
  recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
  startedAssemblingCurrentBlockAt = System.currentTimeMillis();
  totalCountLoadedSoFar += pages.getRowCount();
  ++currentBlock;
}

Source File: ParquetRecordReader.java From flink with Apache License 2.0

5 votes

/**
 * Creates a record reader over the pages of one row group.
 *
 * @param pages pages of the row group to read; must not be {@code null}
 * @return record reader built from the read schema, file schema, materializer and filter
 * @throws IOException if {@code pages} is {@code null}, i.e. no further row group was available
 */
private RecordReader<T> createRecordReader(PageReadStore pages) throws IOException {
	if (pages != null) {
		final MessageColumnIO messageColumnIO = columnIOFactory.getColumnIO(readSchema, fileSchema, true);
		return messageColumnIO.getRecordReader(pages, recordMaterializer, filter);
	}

	throw new IOException(
		"Expecting more rows but reached last block. Read " + numReadRecords + " out of " + numTotalRecords);
}

Source File: ParquetRecordReader.java From flink with Apache License 2.0

5 votes

/**
 * Positions the reader on the given block and reads records up to and including the given
 * record of that block.
 *
 * <p>Passing {@code -1} for both arguments marks the split as fully consumed: the counters are
 * set to their end-of-input values and nothing is read.
 *
 * @param block The block to seek to.
 * @param recordInBlock The number of the record in the block to return next.
 * @throws IOException if no row group is available at the target position or reading fails
 */
public void seek(long block, long recordInBlock) throws IOException {

	final List<BlockMetaData> rowGroups = reader.getRowGroups();

	final boolean fullyConsumed = block == -1L && recordInBlock == -1L;
	if (fullyConsumed) {
		// the split was fully consumed
		currentBlock = rowGroups.size() - 1;
		numReadRecords = numTotalRecords;
		numRecordsUpToCurrentBlock = numTotalRecords;
		return;
	}

	// reset every counter to the start of the first block
	currentBlock = 0;
	numRecordsUpToPreviousBlock = 0;
	numRecordsUpToCurrentBlock = rowGroups.get(0).getRowCount();
	numReadRecords = 0;

	// skip whole row groups until the requested block is reached
	while (currentBlock < block) {
		currentBlock++;
		reader.skipNextRowGroup();
		numRecordsUpToPreviousBlock = numRecordsUpToCurrentBlock;
		numRecordsUpToCurrentBlock += rowGroups.get(currentBlock).getRowCount();
		numReadRecords = numRecordsUpToPreviousBlock;
	}

	// load the target block and read records 0..recordInBlock (inclusive)
	final PageReadStore targetRowGroup = reader.readNextRowGroup();
	recordReader = createRecordReader(targetRowGroup);
	int consumed = 0;
	while (consumed <= recordInBlock) {
		readNextRecord();
		consumed++;
	}
}

Source File: ParquetRecordReader.java From flink with Apache License 2.0

5 votes

/**
 * Creates a record reader over the pages of one row group.
 *
 * @param pages pages of the row group to read; must not be {@code null}
 * @return record reader built from the read schema, file schema, materializer and filter
 * @throws IOException if {@code pages} is {@code null}, i.e. no further row group was available
 */
private RecordReader<T> createRecordReader(PageReadStore pages) throws IOException {
	if (pages != null) {
		final MessageColumnIO messageColumnIO = columnIOFactory.getColumnIO(readSchema, fileSchema, true);
		return messageColumnIO.getRecordReader(pages, recordMaterializer, filter);
	}

	throw new IOException(
		"Expecting more rows but reached last block. Read " + numReadRecords + " out of " + numTotalRecords);
}

Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a dictionary encoded sample file twice -- once without and once with checksums -- and
 * checks that the dictionary page and the data page of the checksummed file carry a correct crc
 * and the same content as the reference file.
 */
@Test
public void testDictionaryEncoding() throws IOException {
  final Configuration conf = new Configuration();

  // Reference file: written through the non-checksum code path; its raw bytes are what the
  // reference crc is calculated from.
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  final Path referenceFile = writeNestedWithNullsSampleParquetFile(conf, true, CompressionCodecName.SNAPPY);

  try (ParquetFileReader referenceReader =
         getParquetFileReader(referenceFile, conf, Collections.singletonList(colDValDesc))) {
    final PageReadStore referenceRowGroup = referenceReader.readNextRowGroup();
    // Dictionary page first (decompressed), then the data page.
    final byte[] expectedDictBytes = readDictPage(colDValDesc, referenceRowGroup).getBytes().toByteArray();
    final byte[] expectedDataBytes = readNextPage(colDValDesc, referenceRowGroup).getBytes().toByteArray();

    // Same sample data again, this time with checksums written and verified.
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    final Path checksummedFile = writeNestedWithNullsSampleParquetFile(conf, true, CompressionCodecName.SNAPPY);

    try (ParquetFileReader checksummedReader =
           getParquetFileReader(checksummedFile, conf, Collections.singletonList(colDValDesc))) {
      final PageReadStore rowGroup = checksummedReader.readNextRowGroup();

      final DictionaryPage dictionary = readDictPage(colDValDesc, rowGroup);
      assertCrcSetAndCorrect(dictionary, snappy(expectedDictBytes));
      assertCorrectContent(dictionary.getBytes().toByteArray(), expectedDictBytes);

      final DataPageV1 dataPage = readNextPage(colDValDesc, rowGroup);
      assertCrcSetAndCorrect(dataPage, snappy(expectedDataBytes));
      assertCorrectContent(dataPage.getBytes().toByteArray(), expectedDataBytes);
    }
  }
}

Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Checks the checksum calculation specification: the crc must be computed over the compressed
 * concatenation of repetition levels, definition levels and the actual data. The sample data
 * uses a nested schema containing nulls, which produces non trivial repetition and definition
 * levels.
 */
@Test
public void testNestedWithNulls() throws IOException {
  final Configuration conf = new Configuration();

  // Reference file: written through the non-checksum code path; its raw bytes are what the
  // reference crc is calculated from.
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  final Path referenceFile = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);

  try (ParquetFileReader referenceReader =
         getParquetFileReader(referenceFile, conf, Arrays.asList(colCIdDesc, colDValDesc))) {
    final PageReadStore referenceRowGroup = referenceReader.readNextRowGroup();
    final byte[] expectedIdBytes = readNextPage(colCIdDesc, referenceRowGroup).getBytes().toByteArray();
    final byte[] expectedValBytes = readNextPage(colDValDesc, referenceRowGroup).getBytes().toByteArray();

    // Same sample data again, this time with checksums written and verified.
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    final Path checksummedFile = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);

    try (ParquetFileReader checksummedReader =
           getParquetFileReader(checksummedFile, conf, Arrays.asList(colCIdDesc, colDValDesc))) {
      final PageReadStore rowGroup = checksummedReader.readNextRowGroup();

      final DataPageV1 idPage = readNextPage(colCIdDesc, rowGroup);
      assertCrcSetAndCorrect(idPage, snappy(expectedIdBytes));
      assertCorrectContent(idPage.getBytes().toByteArray(), expectedIdBytes);

      final DataPageV1 valPage = readNextPage(colDValDesc, rowGroup);
      assertCrcSetAndCorrect(valPage, snappy(expectedValBytes));
      assertCorrectContent(valPage.getBytes().toByteArray(), expectedValBytes);
    }
  }
}

Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Checks that the checksum is computed over the compressed form of the page data and that
 * verification on the read path succeeds for a SNAPPY compressed file.
 */
@Test
public void testCompression() throws IOException {
  final Configuration conf = new Configuration();
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);

  final Path file = writeSimpleParquetFile(conf, CompressionCodecName.SNAPPY);

  try (ParquetFileReader fileReader =
         getParquetFileReader(file, conf, Arrays.asList(colADesc, colBDesc))) {
    final PageReadStore rowGroup = fileReader.readNextRowGroup();

    // Column A, pages 1 and 2.
    DataPageV1 page = readNextPage(colADesc, rowGroup);
    assertCrcSetAndCorrect(page, snappy(colAPage1Bytes));
    assertCorrectContent(page.getBytes().toByteArray(), colAPage1Bytes);

    page = readNextPage(colADesc, rowGroup);
    assertCrcSetAndCorrect(page, snappy(colAPage2Bytes));
    assertCorrectContent(page.getBytes().toByteArray(), colAPage2Bytes);

    // Column B, pages 1 and 2.
    page = readNextPage(colBDesc, rowGroup);
    assertCrcSetAndCorrect(page, snappy(colBPage1Bytes));
    assertCorrectContent(page.getBytes().toByteArray(), colBPage1Bytes);

    page = readNextPage(colBDesc, rowGroup);
    assertCrcSetAndCorrect(page, snappy(colBPage2Bytes));
    assertCorrectContent(page.getBytes().toByteArray(), colBPage2Bytes);
  }
}

Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Enables checksums on both the write and the read path. Checks that the crc is set on every
 * page and that the content read back equals what was written.
 */
@Test
public void testWriteOnVerifyOn() throws IOException {
  final Configuration conf = new Configuration();
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);

  final Path file = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader fileReader =
         getParquetFileReader(file, conf, Arrays.asList(colADesc, colBDesc))) {
    final PageReadStore rowGroup = fileReader.readNextRowGroup();

    // Column A, pages 1 and 2.
    DataPageV1 page = readNextPage(colADesc, rowGroup);
    assertCrcSetAndCorrect(page, colAPage1Bytes);
    assertCorrectContent(page.getBytes().toByteArray(), colAPage1Bytes);

    page = readNextPage(colADesc, rowGroup);
    assertCrcSetAndCorrect(page, colAPage2Bytes);
    assertCorrectContent(page.getBytes().toByteArray(), colAPage2Bytes);

    // Column B, pages 1 and 2.
    page = readNextPage(colBDesc, rowGroup);
    assertCrcSetAndCorrect(page, colBPage1Bytes);
    assertCorrectContent(page.getBytes().toByteArray(), colBPage1Bytes);

    page = readNextPage(colBDesc, rowGroup);
    assertCrcSetAndCorrect(page, colBPage2Bytes);
    assertCorrectContent(page.getBytes().toByteArray(), colBPage2Bytes);
  }
}

Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Loads the next (filtered) row group once every record loaded so far has been consumed
 * ({@code current == totalCountLoadedSoFar}); otherwise does nothing.
 *
 * <p>Before loading, the time spent processing the previous block is accumulated and, when
 * info logging is on, throughput figures are logged. After loading, a new record reader is
 * created for the row group and the loaded-row and block counters are advanced.
 *
 * @throws IOException if no further row group is available or reading fails
 */
private void checkRead() throws IOException {
  if (current != totalCountLoadedSoFar) {
    // Records of the current block are still pending.
    return;
  }

  if (current != 0) {
    totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
    if (LOG.isInfoEnabled()) {
      LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount
          + " columns in " + totalTimeSpentProcessingRecords + " ms: "
          + ((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, "
          + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
      final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
      // Guard against dividing by zero when no time has been recorded yet.
      if (totalTime != 0) {
        final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
        final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
        LOG.info("time spent so far " + percentReading + "% reading (" + totalTimeSpentReadingBytes
            + " ms) and " + percentProcessing + "% processing (" + totalTimeSpentProcessingRecords + " ms)");
      }
    }
  }

  LOG.info("at row " + current + ". reading next block");
  final long readStartedAt = System.currentTimeMillis();
  final PageReadStore pages = reader.readNextFilteredRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
  }
  final long timeSpentReading = System.currentTimeMillis() - readStartedAt;
  totalTimeSpentReadingBytes += timeSpentReading;
  BenchmarkCounter.incrementTime(timeSpentReading);
  if (LOG.isInfoEnabled()) {
    LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
  }
  LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
  final MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
  // Record-level filtering is applied only when filterRecords is set.
  recordReader = columnIO.getRecordReader(pages, recordConverter,
      filterRecords ? filter : FilterCompat.NOOP);
  startedAssemblingCurrentBlockAt = System.currentTimeMillis();
  totalCountLoadedSoFar += pages.getRowCount();
  ++currentBlock;
}

Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes page level crc checksums but disables verification on the read path, then checks the
 * checksums explicitly. This tests that correct checksums are written without the read path
 * verification being able to fail the test first.
 */
@Test
public void testWriteOnVerifyOff() throws IOException {
  final Configuration conf = new Configuration();
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);

  final Path file = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader fileReader =
         getParquetFileReader(file, conf, Arrays.asList(colADesc, colBDesc))) {
    final PageReadStore rowGroup = fileReader.readNextRowGroup();

    // Column A, pages 1 and 2.
    DataPageV1 page = readNextPage(colADesc, rowGroup);
    assertCrcSetAndCorrect(page, colAPage1Bytes);
    assertCorrectContent(page.getBytes().toByteArray(), colAPage1Bytes);

    page = readNextPage(colADesc, rowGroup);
    assertCrcSetAndCorrect(page, colAPage2Bytes);
    assertCorrectContent(page.getBytes().toByteArray(), colAPage2Bytes);

    // Column B, pages 1 and 2.
    page = readNextPage(colBDesc, rowGroup);
    assertCrcSetAndCorrect(page, colBPage1Bytes);
    assertCorrectContent(page.getBytes().toByteArray(), colBPage1Bytes);

    page = readNextPage(colBDesc, rowGroup);
    assertCrcSetAndCorrect(page, colBPage2Bytes);
    assertCorrectContent(page.getBytes().toByteArray(), colBPage2Bytes);
  }
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Load SVM model.
 *
 * <p>The intercept and coefficients are taken from the rows of the parquet file; when several
 * rows exist, the values of the last row read are used. If reading fails the error is logged
 * and the model is built from whatever values were read so far (initially {@code null}
 * coefficients and a zero intercept).
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return the linear SVM classification model
 */
private static Model loadLinearSVMModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    double interceptor = 0;
    Vector coefficients = null;

    try (ParquetFileReader parquetReader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        final MessageType schema = parquetReader.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        for (PageReadStore rowGroup = parquetReader.readNextRowGroup();
             rowGroup != null;
             rowGroup = parquetReader.readNextRowGroup()) {
            final long rowCnt = rowGroup.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));

            for (int i = 0; i < rowCnt; i++) {
                final SimpleGroup row = (SimpleGroup)recordReader.read();
                interceptor = readSVMInterceptor(row);
                coefficients = readSVMCoefficients(row);
            }
        }
    }
    catch (IOException e) {
        final String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new SVMLinearClassificationModel(coefficients, interceptor);
}

org.apache.parquet.column.page.PageReadStore Java Examples