parquet.schema.MessageType Java Examples
The following examples show how to use parquet.schema.MessageType. Each snippet comes from an open-source project; the source file and license are noted above it.
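Before the project examples, here is a minimal, self-contained sketch of how a MessageType is typically constructed and inspected. It uses the pre-org.apache package names shown throughout this page; the schema name "user_record" and its columns are made up for illustration.

import parquet.schema.MessageType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Type.Repetition;

public class MessageTypeSketch {
  public static void main(String[] args) {
    // Assemble a two-column schema directly from PrimitiveType instances.
    MessageType schema = new MessageType("user_record",
        new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.INT64, "id"),
        new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "name", OriginalType.UTF8));

    // toString() renders the schema in the standard Parquet "message" syntax.
    System.out.println(schema);
    System.out.println("field count: " + schema.getFieldCount());
  }
}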
Example #1
Source File: PentahoTwitterInputFormat.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
Example #2
Source File: ExaParquetWriterImpl.java From hadoop-etl-udfs with MIT License
private ExaParquetWriterImpl(final MessageType schema,
                             final int numColumns,
                             final Configuration conf,
                             final Path path,
                             final String compressionType,
                             final ExaIterator exa,
                             final int firstColumnIndex,
                             final List<Integer> dynamicPartitionExaColNums) throws Exception {
  System.out.println("Path: " + path.toString());
  System.out.println("Parquet schema:\n" + schema);

  TupleWriteSupport.setSchema(schema, conf);
  this.writer = new ParquetWriter<>(path,
      new TupleWriteSupport(),
      CompressionCodecName.fromConf(compressionType),
      ParquetWriter.DEFAULT_BLOCK_SIZE,
      ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      conf);

  // Create Tuple object with ExaIterator reference.
  this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums);
}
Example #3
Source File: PentahoParquetReadSupport.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public ReadContext init( InitContext context ) {
  String schemaStr = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( schemaStr == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }

  ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( schemaStr );
  converter = new ParquetConverter( schema.getFields() );

  // get all fields from file's schema
  MessageType fileSchema = context.getFileSchema();
  List<Type> newFields = new ArrayList<>();

  // use only required fields
  for ( IParquetInputField f : schema ) {
    Type origField = fileSchema.getFields().get( fileSchema.getFieldIndex( f.getFormatFieldName() ) );
    newFields.add( origField );
  }

  if ( newFields.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }

  MessageType newSchema = new MessageType( fileSchema.getName(), newFields );
  return new ReadContext( newSchema, new HashMap<>() );
}
Example #4
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
  String name = Strings.repeat(".", depth) + type.getName();
  OriginalType otype = type.getOriginalType();
  Repetition rep = type.getRepetition();
  PrimitiveTypeName ptype = type.getPrimitiveTypeName();

  out.format("%s: %s %s", name, rep, ptype);
  if (otype != null) out.format(" O:%s", otype);

  if (container != null) {
    cpath.add(type.getName());
    String[] paths = cpath.toArray(new String[cpath.size()]);
    cpath.remove(cpath.size() - 1);

    ColumnDescriptor desc = container.getColumnDescription(paths);

    int defl = desc.getMaxDefinitionLevel();
    int repl = desc.getMaxRepetitionLevel();
    out.format(" R:%d D:%d", repl, defl);
  }

  out.println();
}
Example #5
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0
private Type buildSchema() {
  JsonArray inputSchema = this.jsonSchema.getDataTypeValues();
  List<Type> parquetTypes = new ArrayList<>();

  for (JsonElement element : inputSchema) {
    JsonObject map = (JsonObject) element;
    JsonSchema elementSchema = new JsonSchema(map);
    String columnName = elementSchema.getColumnName();
    JsonElementConverter converter = JsonElementConversionFactory.getConverter(elementSchema, false);
    Type schemaType = converter.schema();
    this.converters.put(columnName, converter);
    parquetTypes.add(schemaType);
  }

  String docName = this.jsonSchema.getColumnName();
  switch (recordType) {
    case ROOT:
      return new MessageType(docName, parquetTypes);
    case CHILD:
      return new GroupType(optionalOrRequired(this.jsonSchema), docName, parquetTypes);
    default:
      throw new RuntimeException("Unsupported Record type");
  }
}
Example #6
Source File: ExaParquetWriterImpl.java From hadoop-etl-udfs with MIT License
public ExaParquetWriterImpl(final List<ExaParquetTypeInfo> schemaTypes,
                            final Configuration conf,
                            final Path path,
                            final String compressionType,
                            final ExaIterator exa,
                            final int firstColumnIndex,
                            final List<Integer> dynamicPartitionExaColNums) throws Exception {
  // Use the schemaTypes provided since HCat table metadata isn't available.
  // This should normally only be used for testing.
  this(new MessageType("hive_schema", ExaParquetWriterImpl.typeInfoToParquetTypes(schemaTypes)),
      schemaTypes.size(),
      conf,
      path,
      compressionType,
      exa,
      firstColumnIndex,
      dynamicPartitionExaColNums);
}
Example #7
Source File: PentahoParquetWriteSupport.java From pentaho-hadoop-shims with Apache License 2.0
private MessageType createParquetSchema() {
  List<Type> types = new ArrayList<>();

  for ( IParquetOutputField outputField : outputFields ) {
    types.add( convertToPrimitiveType( outputField ) );
  }

  if ( types.isEmpty() ) {
    throw new IllegalArgumentException( "Schema should contain at least one field" );
  }

  return new MessageType( "parquet-schema", types );
}
Example #8
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, GroupType type, int depth, MessageType container, List<String> cpath) {
  String name = Strings.repeat(".", depth) + type.getName();
  Repetition rep = type.getRepetition();
  int fcount = type.getFieldCount();
  out.format("%s: %s F:%d%n", name, rep, fcount);

  cpath.add(type.getName());
  for (Type ftype : type.getFields()) {
    showDetails(out, ftype, depth + 1, container, cpath);
  }
  cpath.remove(cpath.size() - 1);
}
Example #9
Source File: JsonIntermediateToParquetGroupConverterTest.java From incubator-gobblin with Apache License 2.0
@Test(expectedExceptions = RuntimeException.class,
    expectedExceptionsMessageRegExp = "Symbol .* does not belong to set \\[.*?\\]")
public void testEnumTypeBelongsToEnumSet() throws Exception {
  JsonObject test = deepCopy(testCases.get("enum").getAsJsonObject(), JsonObject.class);
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  JsonObject jsonRecord = test.get("record").getAsJsonObject();
  jsonRecord.addProperty("some_enum", "HELL");

  parquetConverter.convertRecord(schema, jsonRecord, workUnit).iterator().next();
}
Example #10
Source File: JsonIntermediateToParquetGroupConverterTest.java From incubator-gobblin with Apache License 2.0
private void testCase(String testCaseName) throws SchemaConversionException, DataConversionException {
  JsonObject test = testCases.get(testCaseName).getAsJsonObject();
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  Group record = parquetConverter.convertRecord(schema, test.get("record").getAsJsonObject(), workUnit).iterator().next();

  assertEqualsIgnoreSpaces(schema.toString(), test.get("expectedSchema").getAsString());
  assertEqualsIgnoreSpaces(record.toString(), test.get("expectedRecord").getAsString());
}
Example #11
Source File: ParquetReader.java From paraflow with Apache License 2.0
public ParquetReader(MessageType fileSchema,
                     MessageType requestedSchema,
                     List<BlockMetaData> blocks,
                     ParquetDataSource dataSource,
                     TypeManager typeManager)
{
  this.fileSchema = fileSchema;
  this.requestedSchema = requestedSchema;
  this.blocks = blocks;
  this.dataSource = dataSource;
  this.typeManager = typeManager;
  initializeColumnReaders();
}
Example #12
Source File: JsonIntermediateToParquetGroupConverter.java From incubator-gobblin with Apache License 2.0
@Override
public MessageType convertSchema(JsonArray inputSchema, WorkUnitState workUnit)
    throws SchemaConversionException {
  String fieldName = workUnit.getExtract().getTable();
  JsonSchema jsonSchema = new JsonSchema(inputSchema);
  jsonSchema.setColumnName(fieldName);
  recordConverter = new RecordConverter(jsonSchema, ROOT);
  return (MessageType) recordConverter.schema();
}
Example #13
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, Type type, int depth, MessageType container, List<String> cpath) {
  if (type instanceof GroupType) {
    showDetails(out, type.asGroupType(), depth, container, cpath);
    return;
  } else if (type instanceof PrimitiveType) {
    showDetails(out, type.asPrimitiveType(), depth, container, cpath);
    return;
  }
}
Example #14
Source File: DumpCommand.java From parquet-tools with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];

  Configuration conf = new Configuration();
  Path inpath = new Path(input);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
  MessageType schema = metaData.getFileMetaData().getSchema();

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                           .withAutoColumn()
                                           .withAutoCrop()
                                           .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
                                           .withColumnPadding(1)
                                           .withMaxBufferedLines(1000000)
                                           .withFlushOnTab()
                                           .build();

  boolean showmd = !options.hasOption('m');
  boolean showdt = !options.hasOption('d');

  Set<String> showColumns = null;
  if (options.hasOption('c')) {
    String[] cols = options.getOptionValues('c');
    showColumns = new HashSet<String>(Arrays.asList(cols));
  }

  dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
Example #15
Source File: HdfsOdpsImportJob.java From aliyun-maxcompute-data-collectors with Apache License 2.0
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {
  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least one.
    if (files.size() > 0) {
      break;
    }
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(), fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }

  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
Example #16
Source File: ParquetConverter.java From pentaho-hadoop-shims with Apache License 2.0
public static List<IParquetInputField> buildInputFields( MessageType schema ) {
  List<IParquetInputField> inputFields = new ArrayList<>();

  for ( Type type : schema.getFields() ) {
    if ( type.isPrimitive() ) {
      inputFields.add( convertField( type ) );
    }
  }

  return inputFields;
}
Example #17
Source File: ParaflowPageSource.java From paraflow with Apache License 2.0
public int getFieldIndex(MessageType fileSchema, String name)
{
  try {
    return fileSchema.getFieldIndex(name);
  }
  catch (InvalidRecordException e) {
    for (parquet.schema.Type type : fileSchema.getFields()) {
      if (type.getName().equalsIgnoreCase(name)) {
        return fileSchema.getFieldIndex(type.getName());
      }
    }
    return -1;
  }
}
Example #18
Source File: ParaflowPageSource.java From paraflow with Apache License 2.0
private parquet.schema.Type getParquetType(ParaflowColumnHandle column, MessageType messageType)
{
  if (messageType.containsField(column.getName())) {
    return messageType.getType(column.getName());
  }
  // parquet is case-insensitive, all hdfs-columns get converted to lowercase
  for (parquet.schema.Type type : messageType.getFields()) {
    if (type.getName().equalsIgnoreCase(column.getName())) {
      return type;
    }
  }
  return null;
}
Example #19
Source File: ParaflowPageSource.java From paraflow with Apache License 2.0
public ParaflowPageSource(
    ParquetReader parquetReader,
    ParquetDataSource dataSource,
    MessageType fileSchema,
    MessageType requestedSchema,
    long totalBytes,
    List<ParaflowColumnHandle> columns,
    TypeManager typeManager)
{
  checkArgument(totalBytes >= 0, "totalBytes is negative");

  this.parquetReader = requireNonNull(parquetReader, "parquetReader is null");
  this.dataSource = requireNonNull(dataSource, "dataSource is null");
  this.fileSchema = requireNonNull(fileSchema, "fileSchema is null");
  this.requestedSchema = requireNonNull(requestedSchema, "requestedSchema is null");
  this.totalBytes = totalBytes;

  this.columnSize = columns.size();
  this.constantBlocks = new Block[columnSize];
  ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
  ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
  for (int columnIndex = 0; columnIndex < columnSize; columnIndex++) {
    ParaflowColumnHandle column = columns.get(columnIndex);
    String name = column.getName();
    Type type = typeManager.getType(column.getType().getTypeSignature());

    namesBuilder.add(name);
    typesBuilder.add(type);

    if (getParquetType(column, fileSchema) == null) {
      constantBlocks[columnIndex] = RunLengthEncodedBlock.create(type, null, MAX_VECTOR_LENGTH);
    }
  }
  columnNames = namesBuilder.build();
  types = typesBuilder.build();
}
Example #20
Source File: ParaflowPageSourceProvider.java From paraflow with Apache License 2.0
private Type getParquetType(ParaflowColumnHandle column, MessageType messageType)
{
  if (messageType.containsField(column.getName())) {
    return messageType.getType(column.getName());
  }
  // parquet is case-insensitive, all hdfs-columns get converted to lowercase
  for (Type type : messageType.getFields()) {
    if (type.getName().equalsIgnoreCase(column.getName())) {
      return type;
    }
  }
  return null;
}
Example #21
Source File: ParquetMetadataReader.java From paraflow with Apache License 2.0
private static MessageType readParquetSchema(List<SchemaElement> schema)
{
  Iterator<SchemaElement> schemaIterator = schema.iterator();
  SchemaElement rootSchema = schemaIterator.next();
  Types.MessageTypeBuilder builder = Types.buildMessage();
  readTypeSchema(builder, schemaIterator, rootSchema.getNum_children());
  return builder.named(rootSchema.name);
}
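Example #21 drives the Types.MessageTypeBuilder from Thrift SchemaElement metadata; the same fluent builder can also be used on its own to assemble a MessageType. A brief sketch, with illustrative column names ("id", "name") and schema name ("user_record") that are not taken from the project above:

import parquet.schema.MessageType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Types;

public class FluentSchemaSketch {
  // Build a small two-column message with the fluent Types API.
  static MessageType buildUserSchema() {
    return Types.buildMessage()
        .required(PrimitiveTypeName.INT64).named("id")
        .optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
        .named("user_record");
  }
}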
Example #22
Source File: ParquetDataWriterBuilder.java From incubator-gobblin with Apache License 2.0
/**
 * Build a version-specific {@link ParquetWriter} for given {@link ParquetWriterConfiguration}
 * @param writerConfiguration
 * @return
 * @throws IOException
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
    throws IOException {

  CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
  ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion
      .fromString(writerConfiguration.getWriterVersion());

  Configuration conf = new Configuration();
  ParquetWriter versionSpecificWriter = null;
  switch (writerConfiguration.getRecordFormat()) {
    case GROUP: {
      GroupWriteSupport.setSchema((MessageType) this.schema, conf);
      WriteSupport support = new GroupWriteSupport();
      versionSpecificWriter = new ParquetWriter<Group>(
          writerConfiguration.getAbsoluteStagingFile(),
          support,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.getDictPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate(),
          writerVersion,
          conf);
      break;
    }
    case AVRO: {
      versionSpecificWriter = new AvroParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Schema) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          conf);
      break;
    }
    case PROTOBUF: {
      versionSpecificWriter = new ProtoParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Class<? extends Message>) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate());
      break;
    }
    default:
      throw new RuntimeException("Record format not supported");
  }

  ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;

  return new ParquetWriterShim() {
    @Override
    public void write(Object record) throws IOException {
      finalVersionSpecificWriter.write(record);
    }

    @Override
    public void close() throws IOException {
      finalVersionSpecificWriter.close();
    }
  };
}
Example #23
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0
public static void showDetails(PrettyPrintWriter out, MessageType type) {
  List<String> cpath = new ArrayList<String>();
  for (Type ftype : type.getFields()) {
    showDetails(out, ftype, 0, type, cpath);
  }
}
Example #24
Source File: SimpleRecordMaterializer.java From parquet-tools with Apache License 2.0
public SimpleRecordMaterializer(MessageType schema) {
  this.root = new SimpleRecordConverter(schema);
}
Example #25
Source File: SimpleReadSupport.java From parquet-tools with Apache License 2.0
@Override
public RecordMaterializer<SimpleRecord> prepareForRead(Configuration conf, Map<String,String> metaData, MessageType schema, ReadContext context) {
  return new SimpleRecordMaterializer(schema);
}
Example #26
Source File: TestReadWriteParquet.java From parquet-examples with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
    return 1;
  }
  String inputFile = args[0];
  String outputFile = args[1];
  String compression = (args.length > 2) ? args[2] : "none";

  Path parquetFilePath = null;
  // Find a file in case a directory was passed
  RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
  while (it.hasNext()) {
    FileStatus fs = it.next();
    if (fs.isFile()) {
      parquetFilePath = fs.getPath();
      break;
    }
  }
  if (parquetFilePath == null) {
    LOG.error("No file found for " + inputFile);
    return 1;
  }

  LOG.info("Getting schema from " + parquetFilePath);
  ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();
  LOG.info(schema);
  GroupWriteSupport.setSchema(schema, getConf());

  Job job = new Job(getConf());
  job.setJarByClass(getClass());
  job.setJobName(getClass().getName());
  job.setMapperClass(ReadRequestMap.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(ExampleInputFormat.class);
  job.setOutputFormatClass(ExampleOutputFormat.class);

  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  if (compression.equalsIgnoreCase("snappy")) {
    codec = CompressionCodecName.SNAPPY;
  } else if (compression.equalsIgnoreCase("gzip")) {
    codec = CompressionCodecName.GZIP;
  }
  LOG.info("Output compression: " + codec);
  ExampleOutputFormat.setCompression(job, codec);

  FileInputFormat.setInputPaths(job, new Path(inputFile));
  FileOutputFormat.setOutputPath(job, new Path(outputFile));

  job.waitForCompletion(true);

  return 0;
}
Example #27
Source File: PentahoParquetReadSupport.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public RecordMaterializer<RowMetaAndData> prepareForRead( Configuration configuration, Map<String, String> keyValueMetaData,
                                                          MessageType fileSchema, ReadContext readContext ) {
  return new ParquetConverter.MyRecordMaterializer( converter );
}
Example #28
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
@Override
public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData, MessageType schema, ReadContext context) {
  return new GroupRecordConverter(schema);
}
Example #29
Source File: JsonIntermediateToParquetGroupConverter.java From incubator-gobblin with Apache License 2.0
@Override
public Iterable<Group> convertRecord(MessageType outputSchema, JsonObject inputRecord, WorkUnitState workUnit)
    throws DataConversionException {
  return new SingleRecordIterable<>((Group) recordConverter.convert(inputRecord));
}
Example #30
Source File: TupleWriteSupport.java From hadoop-etl-udfs with MIT License
public static MessageType getSchema(Configuration configuration) {
  return MessageTypeParser.parseMessageType(configuration.get(PARQUET_SCHEMA_PROPERTY_NAME));
}
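Example #30 relies on MessageTypeParser, which parses the textual "message" syntax that MessageType.toString() produces. A minimal round-trip sketch; the schema text and its column names are made up for illustration:

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class ParseSchemaSketch {
  public static void main(String[] args) {
    // The parser understands the same textual form that MessageType.toString() emits.
    String schemaText =
        "message user_record {\n"
      + "  required int64 id;\n"
      + "  optional binary name (UTF8);\n"
      + "}";
    MessageType schema = MessageTypeParser.parseMessageType(schemaText);
    System.out.println(schema.getFieldCount()); // 2
    System.out.println(schema);                 // renders back in the message syntax
  }
}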