org.apache.parquet.hadoop.api.InitContext Java Examples
The following examples show how to use org.apache.parquet.hadoop.api.InitContext. Each example is taken from an open-source project, named in the source-file line above the code.
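InitContext is not constructed by user code: the framework builds it from the job Configuration, the file schema from the footer(s), and the key/value metadata as a Map<String, Set<String>> multimap (one task may read several files whose footers disagree on a key), then passes it to ReadSupport.init, which returns the ReadContext carrying the requested projection schema. As a minimal sketch before the real-world examples below (the class name PassThroughReadSupport is illustrative, not from any of these projects), a custom ReadSupport that simply requests the full file schema could look like this:

import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.schema.MessageType;

// Illustrative sketch: a ReadSupport that requests every column of the file.
public abstract class PassThroughReadSupport<T> extends ReadSupport<T> {
  @Override
  public ReadContext init(InitContext context) {
    // Hadoop configuration for the job; real implementations usually read
    // a projection or schema key from it.
    Configuration conf = context.getConfiguration();
    // Schema from the file footer(s); during split planning this is the
    // merged schema of all input files.
    MessageType fileSchema = context.getFileSchema();
    // Requested schema = full file schema, with no read-support metadata.
    return new ReadContext(fileSchema, new HashMap<String, String>());
  }
}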
Example #1
Source File: PentahoParquetReadSupport.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public ReadContext init( InitContext context ) {
  String schemaStr = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( schemaStr == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }

  ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( schemaStr );
  converter = new ParquetConverter( schema.getFields() );

  // get all fields from file's schema
  MessageType fileSchema = context.getFileSchema();
  List<Type> newFields = new ArrayList<>();
  // use only required fields
  for ( IParquetInputField f : schema ) {
    Type origField = fileSchema.getFields().get( fileSchema.getFieldIndex( f.getFormatFieldName() ) );
    newFields.add( origField );
  }
  if ( newFields.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }
  MessageType newSchema = new MessageType( fileSchema.getName(), newFields );
  return new ReadContext( newSchema, new HashMap<>() );
}
Example #2
Source File: InternalParquetRecordReader.java From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
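Several of these readers pass the footer's Map<String, String> metadata through a private toSetMultiMap helper, because InitContext expects key/value metadata in multimap form. A sketch of what that helper does (modeled on the parquet-mr version, reproduced here for illustration since the original is private):

// Wrap each single footer value in an unmodifiable one-element set so a
// Map<String, String> fits InitContext's Map<String, Set<String>> parameter.
private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map) {
  Map<K, Set<V>> setMultiMap = new HashMap<K, Set<V>>();
  for (Map.Entry<K, V> entry : map.entrySet()) {
    Set<V> set = new HashSet<V>(Collections.singletonList(entry.getValue()));
    setMultiMap.put(entry.getKey(), Collections.unmodifiableSet(set));
  }
  return Collections.unmodifiableMap(setMultiMap);
}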
Example #3
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #4
Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException if there is an error while reading
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers)
    throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
  if (maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
        + maxSplitSize + "; minSplitSize = " + minSplitSize);
  }
  GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  ReadContext readContext = getReadSupport(configuration).init(new InitContext(
      configuration,
      globalMetaData.getKeyValueMetaData(),
      globalMetaData.getSchema()));
  return new ClientSideMetadataSplitStrategy().getSplits(
      configuration, footers, maxSplitSize, minSplitSize, readContext);
}
Example #5
Source File: TestTupleRecordConsumer.java From parquet-mr with Apache License 2.0
private RecordMaterializer<Tuple> newPigRecordConsumer(String pigSchemaString)
    throws ParserException {
  TupleReadSupport tupleReadSupport = new TupleReadSupport();
  final Configuration configuration = new Configuration(false);
  MessageType parquetSchema = getMessageType(pigSchemaString);
  final Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
  Map<String, Set<String>> globalMetaData = new HashMap<String, Set<String>>();
  for (Entry<String, String> entry : pigMetaData.entrySet()) {
    globalMetaData.put(entry.getKey(), new HashSet<String>(Arrays.asList(entry.getValue())));
  }
  configuration.set(PARQUET_PIG_SCHEMA, pigSchemaString);
  final ReadContext init = tupleReadSupport.init(new InitContext(configuration, globalMetaData, parquetSchema));
  return tupleReadSupport.prepareForRead(configuration, pigMetaData, parquetSchema, init);
}
Example #6
Source File: ParquetReadSupport.java From iceberg with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData,
                        MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.

  MessageType projection = hasIds(fileSchema) ?
      pruneColumns(fileSchema, expectedSchema) :
      pruneColumnsFallback(fileSchema, expectedSchema);

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  AvroReadSupport.setRequestedProjection(configuration,
      AvroSchemaUtil.convert(expectedSchema, projection.getName()));
  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext context = null;
  if (callInit) {
    try {
      context = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException e) {
      // try the InitContext version
      context = wrapped.init(new InitContext(
          configuration, makeMultimap(keyValueMetaData), projection));
    }
  }

  return new ReadContext(projection,
      context != null ? context.getReadSupportMetadata() : ImmutableMap.of());
}
Example #7
Source File: TajoReadSupport.java From tajo with Apache License 2.0
/**
 * Initializes the ReadSupport.
 *
 * @param context The InitContext.
 * @return A ReadContext that defines how to read the file.
 */
@Override
public ReadSupport.ReadContext init(InitContext context) {
  if (requestedSchema == null) {
    throw new RuntimeException("requestedSchema is null.");
  }
  MessageType requestedParquetSchema = new TajoSchemaConverter().convert(requestedSchema);
  LOG.debug("Reading data with projection:\n" + requestedParquetSchema);
  return new ReadContext(requestedParquetSchema);
}
Example #8
Source File: ParquetRecordReader.java From flink with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  // real schema of parquet file
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), readSchema));

  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.recordMaterializer = readSupport.prepareForRead(
      configuration, fileMetadata, readSchema, readContext);
  this.numTotalRecords = reader.getRecordCount();
}
Example #9
Source File: TupleReadSupport.java From parquet-mr with Apache License 2.0
@Override
public ReadContext init(InitContext initContext) {
  Schema pigSchema = getPigSchema(initContext.getConfiguration());
  RequiredFieldList requiredFields = getRequiredFields(initContext.getConfiguration());
  boolean columnIndexAccess = initContext.getConfiguration().getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);

  if (pigSchema == null) {
    return new ReadContext(initContext.getFileSchema());
  } else {
    // project the file schema according to the requested Pig schema
    MessageType parquetRequestedSchema = new PigSchemaConverter(columnIndexAccess)
        .filter(initContext.getFileSchema(), pigSchema, requiredFields);
    return new ReadContext(parquetRequestedSchema);
  }
}
Example #10
Source File: ProtoReadSupport.java From parquet-mr with Apache License 2.0
@Override
public ReadContext init(InitContext context) {
  String requestedProjectionString = context.getConfiguration().get(PB_REQUESTED_PROJECTION);

  if (requestedProjectionString != null && !requestedProjectionString.trim().isEmpty()) {
    MessageType requestedProjection = getSchemaForRead(context.getFileSchema(), requestedProjectionString);
    LOG.debug("Reading data with projection {}", requestedProjection);
    return new ReadContext(requestedProjection);
  } else {
    MessageType fileSchema = context.getFileSchema();
    LOG.debug("Reading data with schema {}", fileSchema);
    return new ReadContext(fileSchema);
  }
}
Example #11
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #12
Source File: ThriftReadSupport.java From parquet-mr with Apache License 2.0
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  final Configuration configuration = context.getConfiguration();
  final MessageType fileMessageType = context.getFileSchema();
  MessageType requestedProjection = fileMessageType;
  String partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);

  FieldProjectionFilter projectionFilter = getFieldProjectionFilter(configuration);

  if (partialSchemaString != null && projectionFilter != null) {
    throw new ThriftProjectionException(
        String.format("You cannot provide both a partial schema and field projection filter."
            + "Only one of (%s, %s, %s) should be set.",
            PARQUET_READ_SCHEMA, STRICT_THRIFT_COLUMN_FILTER_KEY, THRIFT_COLUMN_FILTER_KEY));
  }

  //set requestedProjections only when it's specified
  if (partialSchemaString != null) {
    requestedProjection = getSchemaForRead(fileMessageType, partialSchemaString);
  } else if (projectionFilter != null) {
    try {
      initThriftClassFromMultipleFiles(context.getKeyValueMetadata(), configuration);
      requestedProjection = getProjectedSchema(projectionFilter);
    } catch (ClassNotFoundException e) {
      throw new ThriftProjectionException("can not find thriftClass from configuration", e);
    }
  }

  MessageType schemaForRead = getSchemaForRead(fileMessageType, requestedProjection);
  return new ReadContext(schemaForRead);
}
Example #13
Source File: SimpleReadSupport.java From parquet-mr with Apache License 2.0
@Override
public ReadContext init(InitContext context) {
  return new ReadContext(context.getFileSchema());
}
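For orientation, a ReadSupport like this one is normally handed to a ParquetReader builder, which constructs the InitContext and invokes init internally while opening the file. A hedged usage sketch with the bundled GroupReadSupport (the file path and class name ReadSupportUsage are placeholders):

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ReadSupportUsage {
  public static void main(String[] args) throws Exception {
    // The builder, not user code, creates the InitContext and calls
    // GroupReadSupport.init(...) with the file schema and footer metadata.
    try (ParquetReader<Group> reader =
             ParquetReader.builder(new GroupReadSupport(), new Path("/tmp/data.parquet")).build()) {
      Group record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    }
  }
}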
Example #14
Source File: TestInputOutputFormat.java From parquet-mr with Apache License 2.0
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  Set<String> counts = context.getKeyValueMetadata().get("my.count");
  assertTrue("counts: " + counts, counts.size() > 0);
  return super.init(context);
}
Example #15
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
@Override
public ReadContext init(InitContext context) {
  return new ReadContext(context.getFileSchema());
}
Example #16
Source File: RowReadSupport.java From flink with Apache License 2.0
@Override
public ReadContext init(InitContext initContext) {
  checkNotNull(initContext, "initContext");
  returnTypeInfo = ParquetSchemaConverter.fromParquetType(initContext.getFileSchema());
  return new ReadContext(initContext.getFileSchema());
}
Example #17
Source File: ParquetReadSupport.java From iceberg with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData,
                        MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.

  MessageType projection;
  if (ParquetSchemaUtil.hasIds(fileSchema)) {
    projection = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
  } else if (nameMapping != null) {
    MessageType typeWithIds = ParquetSchemaUtil.applyNameMapping(fileSchema, nameMapping);
    projection = ParquetSchemaUtil.pruneColumns(typeWithIds, expectedSchema);
  } else {
    projection = ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
  }

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  AvroReadSupport.setRequestedProjection(configuration,
      AvroSchemaUtil.convert(expectedSchema, projection.getName()));
  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext context = null;
  if (callInit) {
    try {
      context = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException e) {
      // try the InitContext version
      context = wrapped.init(new InitContext(
          configuration, makeMultimap(keyValueMetaData), projection));
    }
  }

  return new ReadContext(projection,
      context != null ? context.getReadSupportMetadata() : ImmutableMap.of());
}