org.apache.parquet.hadoop.api.InitContext Java Examples
The following examples show how to use org.apache.parquet.hadoop.api.InitContext. Each example is taken from an open-source project, named in the source-file line above the code.
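InitContext is not constructed by user code: the framework builds it from the job Configuration, the file schema from the footer(s), and the key/value metadata as a Map<String, Set<String>> multimap (one task may read several files whose footers disagree on a key), then passes it to ReadSupport.init, which returns the ReadContext carrying the requested projection schema. As a minimal sketch before the real-world examples below (the class name PassThroughReadSupport is illustrative, not from any of these projects), a custom ReadSupport that simply requests the full file schema could look like this:

import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.schema.MessageType;

// Illustrative sketch: a ReadSupport that requests every column of the file.
public abstract class PassThroughReadSupport<T> extends ReadSupport<T> {
  @Override
  public ReadContext init(InitContext context) {
    // Hadoop configuration for the job; real implementations usually read
    // a projection or schema key from it.
    Configuration conf = context.getConfiguration();
    // Schema from the file footer(s); during split planning this is the
    // merged schema of all input files.
    MessageType fileSchema = context.getFileSchema();
    // Requested schema = full file schema, with no read-support metadata.
    return new ReadContext(fileSchema, new HashMap<String, String>());
  }
}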
Example #1
Source File: PentahoParquetReadSupport.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public ReadContext init( InitContext context ) {
  String schemaStr = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( schemaStr == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }

  ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( schemaStr );
  converter = new ParquetConverter( schema.getFields() );

  // get all fields from file's schema
  MessageType fileSchema = context.getFileSchema();
  List<Type> newFields = new ArrayList<>();
  // use only required fields
  for ( IParquetInputField f : schema ) {
    Type origField = fileSchema.getFields().get( fileSchema.getFieldIndex( f.getFormatFieldName() ) );
    newFields.add( origField );
  }
  if ( newFields.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }
  MessageType newSchema = new MessageType( fileSchema.getName(), newFields );
  return new ReadContext( newSchema, new HashMap<>() );
}
Example #2
Source File: InternalParquetRecordReader.java From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
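Several of these readers pass the footer's Map<String, String> metadata through a private toSetMultiMap helper, because InitContext expects key/value metadata in multimap form. A sketch of what that helper does (modeled on the parquet-mr version, reproduced here for illustration since the original is private):

// Wrap each single footer value in an unmodifiable one-element set so a
// Map<String, String> fits InitContext's Map<String, Set<String>> parameter.
private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map) {
  Map<K, Set<V>> setMultiMap = new HashMap<K, Set<V>>();
  for (Map.Entry<K, V> entry : map.entrySet()) {
    Set<V> set = new HashSet<V>(Collections.singletonList(entry.getValue()));
    setMultiMap.put(entry.getKey(), Collections.unmodifiableSet(set));
  }
  return Collections.unmodifiableMap(setMultiMap);
}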
Example #3
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #4
Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException if there is an error while reading
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers)
    throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
  if (maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
        + maxSplitSize + "; minSplitSize = " + minSplitSize);
  }
  GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  ReadContext readContext = getReadSupport(configuration).init(new InitContext(
      configuration,
      globalMetaData.getKeyValueMetaData(),
      globalMetaData.getSchema()));
  return new ClientSideMetadataSplitStrategy().getSplits(
      configuration, footers, maxSplitSize, minSplitSize, readContext);
}
Example #5
Source File: TestTupleRecordConsumer.java From parquet-mr with Apache License 2.0
private RecordMaterializer<Tuple> newPigRecordConsumer(String pigSchemaString)
    throws ParserException {
  TupleReadSupport tupleReadSupport = new TupleReadSupport();
  final Configuration configuration = new Configuration(false);
  MessageType parquetSchema = getMessageType(pigSchemaString);
  final Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
  Map<String, Set<String>> globalMetaData = new HashMap<String, Set<String>>();
  for (Entry<String, String> entry : pigMetaData.entrySet()) {
    globalMetaData.put(entry.getKey(), new HashSet<String>(Arrays.asList(entry.getValue())));
  }
  configuration.set(PARQUET_PIG_SCHEMA, pigSchemaString);
  final ReadContext init = tupleReadSupport.init(new InitContext(configuration, globalMetaData, parquetSchema));
  return tupleReadSupport.prepareForRead(configuration, pigMetaData, parquetSchema, init);
}
Example #6
Source File: ParquetReadSupport.java From iceberg with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData,
                        MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.

  MessageType projection = hasIds(fileSchema) ?
      pruneColumns(fileSchema, expectedSchema) :
      pruneColumnsFallback(fileSchema, expectedSchema);

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  AvroReadSupport.setRequestedProjection(configuration,
      AvroSchemaUtil.convert(expectedSchema, projection.getName()));
  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext context = null;
  if (callInit) {
    try {
      context = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException e) {
      // try the InitContext version
      context = wrapped.init(new InitContext(
          configuration, makeMultimap(keyValueMetaData), projection));
    }
  }

  return new ReadContext(projection,
      context != null ? context.getReadSupportMetadata() : ImmutableMap.of());
}
Example #7
Source File: TajoReadSupport.java From tajo with Apache License 2.0
/**
 * Initializes the ReadSupport.
 *
 * @param context The InitContext.
 * @return A ReadContext that defines how to read the file.
 */
@Override
public ReadSupport.ReadContext init(InitContext context) {
  if (requestedSchema == null) {
    throw new RuntimeException("requestedSchema is null.");
  }
  MessageType requestedParquetSchema = new TajoSchemaConverter().convert(requestedSchema);
  LOG.debug("Reading data with projection:\n" + requestedParquetSchema);
  return new ReadContext(requestedParquetSchema);
}
Example #8
Source File: ParquetRecordReader.java From flink with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  // real schema of parquet file
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), readSchema));

  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.recordMaterializer = readSupport.prepareForRead(
      configuration, fileMetadata, readSchema, readContext);
  this.numTotalRecords = reader.getRecordCount();
}
Example #9
Source File: TupleReadSupport.java From parquet-mr with Apache License 2.0
@Override
public ReadContext init(InitContext initContext) {
  Schema pigSchema = getPigSchema(initContext.getConfiguration());
  RequiredFieldList requiredFields = getRequiredFields(initContext.getConfiguration());
  boolean columnIndexAccess = initContext.getConfiguration().getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);

  if (pigSchema == null) {
    return new ReadContext(initContext.getFileSchema());
  } else {
    // project the file schema according to the requested Pig schema
    MessageType parquetRequestedSchema = new PigSchemaConverter(columnIndexAccess)
        .filter(initContext.getFileSchema(), pigSchema, requiredFields);
    return new ReadContext(parquetRequestedSchema);
  }
}
Example #10
Source File: ProtoReadSupport.java From parquet-mr with Apache License 2.0
@Override
public ReadContext init(InitContext context) {
  String requestedProjectionString = context.getConfiguration().get(PB_REQUESTED_PROJECTION);

  if (requestedProjectionString != null && !requestedProjectionString.trim().isEmpty()) {
    MessageType requestedProjection = getSchemaForRead(context.getFileSchema(), requestedProjectionString);
    LOG.debug("Reading data with projection {}", requestedProjection);
    return new ReadContext(requestedProjection);
  } else {
    MessageType fileSchema = context.getFileSchema();
    LOG.debug("Reading data with schema {}", fileSchema);
    return new ReadContext(fileSchema);
  }
}
Example #11
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #12
Source File: ThriftReadSupport.java From parquet-mr with Apache License 2.0
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  final Configuration configuration = context.getConfiguration();
  final MessageType fileMessageType = context.getFileSchema();
  MessageType requestedProjection = fileMessageType;
  String partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);

  FieldProjectionFilter projectionFilter = getFieldProjectionFilter(configuration);

  if (partialSchemaString != null && projectionFilter != null) {
    throw new ThriftProjectionException(
        String.format("You cannot provide both a partial schema and field projection filter."
            + "Only one of (%s, %s, %s) should be set.",
            PARQUET_READ_SCHEMA, STRICT_THRIFT_COLUMN_FILTER_KEY, THRIFT_COLUMN_FILTER_KEY));
  }

  //set requestedProjections only when it's specified
  if (partialSchemaString != null) {
    requestedProjection = getSchemaForRead(fileMessageType, partialSchemaString);
  } else if (projectionFilter != null) {
    try {
      initThriftClassFromMultipleFiles(context.getKeyValueMetadata(), configuration);
      requestedProjection = getProjectedSchema(projectionFilter);
    } catch (ClassNotFoundException e) {
      throw new ThriftProjectionException("can not find thriftClass from configuration", e);
    }
  }

  MessageType schemaForRead = getSchemaForRead(fileMessageType, requestedProjection);
  return new ReadContext(schemaForRead);
}
Example #13
Source File: SimpleReadSupport.java From parquet-mr with Apache License 2.0
@Override
public ReadContext init(InitContext context) {
  return new ReadContext(context.getFileSchema());
}
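For orientation, a ReadSupport like this one is normally handed to a ParquetReader builder, which constructs the InitContext and invokes init internally while opening the file. A hedged usage sketch with the bundled GroupReadSupport (the file path and class name ReadSupportUsage are placeholders):

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ReadSupportUsage {
  public static void main(String[] args) throws Exception {
    // The builder, not user code, creates the InitContext and calls
    // GroupReadSupport.init(...) with the file schema and footer metadata.
    try (ParquetReader<Group> reader =
             ParquetReader.builder(new GroupReadSupport(), new Path("/tmp/data.parquet")).build()) {
      Group record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    }
  }
}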
Example #14
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  Set<String> counts = context.getKeyValueMetadata().get("my.count");
  assertTrue("counts: " + counts, counts.size() > 0);
  return super.init(context);
}
Example #15
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
@Override
public ReadContext init(InitContext context) {
  return new ReadContext(context.getFileSchema());
}
Example #16
Source File: RowReadSupport.java From flink with Apache License 2.0
@Override
public ReadContext init(InitContext initContext) {
  checkNotNull(initContext, "initContext");
  returnTypeInfo = ParquetSchemaConverter.fromParquetType(initContext.getFileSchema());
  return new ReadContext(initContext.getFileSchema());
}
Example #17
Source File: ParquetReadSupport.java From iceberg with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData,
                        MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.

  MessageType projection;
  if (ParquetSchemaUtil.hasIds(fileSchema)) {
    projection = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
  } else if (nameMapping != null) {
    MessageType typeWithIds = ParquetSchemaUtil.applyNameMapping(fileSchema, nameMapping);
    projection = ParquetSchemaUtil.pruneColumns(typeWithIds, expectedSchema);
  } else {
    projection = ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
  }

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  AvroReadSupport.setRequestedProjection(configuration,
      AvroSchemaUtil.convert(expectedSchema, projection.getName()));
  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext context = null;
  if (callInit) {
    try {
      context = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException e) {
      // try the InitContext version
      context = wrapped.init(new InitContext(
          configuration, makeMultimap(keyValueMetaData), projection));
    }
  }

  return new ReadContext(projection,
      context != null ? context.getReadSupportMetadata() : ImmutableMap.of());
}