parquet.schema.MessageType Java Examples
The following examples show how to use parquet.schema.MessageType. Each snippet comes from an open-source project; the source file and license are noted above it.
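Before the project examples, here is a minimal, self-contained sketch of how a MessageType is typically constructed and inspected. It uses the pre-org.apache package names shown throughout this page; the schema name "user_record" and its columns are made up for illustration.

import parquet.schema.MessageType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Type.Repetition;

public class MessageTypeSketch {
  public static void main(String[] args) {
    // Assemble a two-column schema directly from PrimitiveType instances.
    MessageType schema = new MessageType("user_record",
        new PrimitiveType(Repetition.REQUIRED, PrimitiveTypeName.INT64, "id"),
        new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "name", OriginalType.UTF8));

    // toString() renders the schema in the standard Parquet "message" syntax.
    System.out.println(schema);
    System.out.println("field count: " + schema.getFieldCount());
  }
}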
Example #1
Source File: PentahoTwitterInputFormat.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
Example #2
Source File: ExaParquetWriterImpl.java From hadoop-etl-udfs with MIT License
private ExaParquetWriterImpl(final MessageType schema,
                             final int numColumns,
                             final Configuration conf,
                             final Path path,
                             final String compressionType,
                             final ExaIterator exa,
                             final int firstColumnIndex,
                             final List<Integer> dynamicPartitionExaColNums) throws Exception {
  System.out.println("Path: " + path.toString());
  System.out.println("Parquet schema:\n" + schema);

  TupleWriteSupport.setSchema(schema, conf);
  this.writer = new ParquetWriter<>(path,
      new TupleWriteSupport(),
      CompressionCodecName.fromConf(compressionType),
      ParquetWriter.DEFAULT_BLOCK_SIZE,
      ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      conf);

  // Create Tuple object with ExaIterator reference.
  this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums);
}
Example #3
Source File: PentahoParquetReadSupport.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public ReadContext init( InitContext context ) {
  String schemaStr = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( schemaStr == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }

  ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( schemaStr );
  converter = new ParquetConverter( schema.getFields() );

  // get all fields from file's schema
  MessageType fileSchema = context.getFileSchema();
  List<Type> newFields = new ArrayList<>();

  // use only required fields
  for ( IParquetInputField f : schema ) {
    Type origField = fileSchema.getFields().get( fileSchema.getFieldIndex( f.getFormatFieldName() ) );
    newFields.add( origField );
  }

  if ( newFields.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }

  MessageType newSchema = new MessageType( fileSchema.getName(), newFields );
  return new ReadContext( newSchema, new HashMap<>() );
}
Example #4
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
  String name = Strings.repeat(".", depth) + type.getName();
  OriginalType otype = type.getOriginalType();
  Repetition rep = type.getRepetition();
  PrimitiveTypeName ptype = type.getPrimitiveTypeName();

  out.format("%s: %s %s", name, rep, ptype);
  if (otype != null) out.format(" O:%s", otype);

  if (container != null) {
    cpath.add(type.getName());
    String[] paths = cpath.toArray(new String[cpath.size()]);
    cpath.remove(cpath.size() - 1);

    ColumnDescriptor desc = container.getColumnDescription(paths);

    int defl = desc.getMaxDefinitionLevel();
    int repl = desc.getMaxRepetitionLevel();
    out.format(" R:%d D:%d", repl, defl);
  }

  out.println();
}
Example #5
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0
private Type buildSchema() {
  JsonArray inputSchema = this.jsonSchema.getDataTypeValues();
  List<Type> parquetTypes = new ArrayList<>();

  for (JsonElement element : inputSchema) {
    JsonObject map = (JsonObject) element;
    JsonSchema elementSchema = new JsonSchema(map);
    String columnName = elementSchema.getColumnName();
    JsonElementConverter converter = JsonElementConversionFactory.getConverter(elementSchema, false);
    Type schemaType = converter.schema();
    this.converters.put(columnName, converter);
    parquetTypes.add(schemaType);
  }

  String docName = this.jsonSchema.getColumnName();
  switch (recordType) {
    case ROOT:
      return new MessageType(docName, parquetTypes);
    case CHILD:
      return new GroupType(optionalOrRequired(this.jsonSchema), docName, parquetTypes);
    default:
      throw new RuntimeException("Unsupported Record type");
  }
}
Example #6
Source File: ExaParquetWriterImpl.java From hadoop-etl-udfs with MIT License
public ExaParquetWriterImpl(final List<ExaParquetTypeInfo> schemaTypes,
                            final Configuration conf,
                            final Path path,
                            final String compressionType,
                            final ExaIterator exa,
                            final int firstColumnIndex,
                            final List<Integer> dynamicPartitionExaColNums) throws Exception {
  // Use the schemaTypes provided since HCat table metadata isn't available.
  // This should normally only be used for testing.
  this(new MessageType("hive_schema", ExaParquetWriterImpl.typeInfoToParquetTypes(schemaTypes)),
      schemaTypes.size(),
      conf,
      path,
      compressionType,
      exa,
      firstColumnIndex,
      dynamicPartitionExaColNums);
}
Example #7
Source File: PentahoParquetWriteSupport.java From pentaho-hadoop-shims with Apache License 2.0
private MessageType createParquetSchema() {
  List<Type> types = new ArrayList<>();

  for ( IParquetOutputField outputField : outputFields ) {
    types.add( convertToPrimitiveType( outputField ) );
  }

  if ( types.isEmpty() ) {
    throw new IllegalArgumentException( "Schema should contain at least one field" );
  }

  return new MessageType( "parquet-schema", types );
}
Example #8
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, GroupType type, int depth, MessageType container, List<String> cpath) {
  String name = Strings.repeat(".", depth) + type.getName();
  Repetition rep = type.getRepetition();
  int fcount = type.getFieldCount();
  out.format("%s: %s F:%d%n", name, rep, fcount);

  cpath.add(type.getName());
  for (Type ftype : type.getFields()) {
    showDetails(out, ftype, depth + 1, container, cpath);
  }
  cpath.remove(cpath.size() - 1);
}
Example #9
Source File: JsonIntermediateToParquetGroupConverterTest.java From incubator-gobblin with Apache License 2.0
@Test(expectedExceptions = RuntimeException.class,
    expectedExceptionsMessageRegExp = "Symbol .* does not belong to set \\[.*?\\]")
public void testEnumTypeBelongsToEnumSet() throws Exception {
  JsonObject test = deepCopy(testCases.get("enum").getAsJsonObject(), JsonObject.class);
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  JsonObject jsonRecord = test.get("record").getAsJsonObject();
  jsonRecord.addProperty("some_enum", "HELL");

  parquetConverter.convertRecord(schema, jsonRecord, workUnit).iterator().next();
}
Example #10
Source File: JsonIntermediateToParquetGroupConverterTest.java From incubator-gobblin with Apache License 2.0
private void testCase(String testCaseName) throws SchemaConversionException, DataConversionException {
  JsonObject test = testCases.get(testCaseName).getAsJsonObject();
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  Group record = parquetConverter.convertRecord(schema, test.get("record").getAsJsonObject(), workUnit).iterator().next();

  assertEqualsIgnoreSpaces(schema.toString(), test.get("expectedSchema").getAsString());
  assertEqualsIgnoreSpaces(record.toString(), test.get("expectedRecord").getAsString());
}
Example #11
Source File: ParquetReader.java From paraflow with Apache License 2.0
public ParquetReader(MessageType fileSchema,
                     MessageType requestedSchema,
                     List<BlockMetaData> blocks,
                     ParquetDataSource dataSource,
                     TypeManager typeManager)
{
  this.fileSchema = fileSchema;
  this.requestedSchema = requestedSchema;
  this.blocks = blocks;
  this.dataSource = dataSource;
  this.typeManager = typeManager;
  initializeColumnReaders();
}
Example #12
Source File: JsonIntermediateToParquetGroupConverter.java From incubator-gobblin with Apache License 2.0
@Override
public MessageType convertSchema(JsonArray inputSchema, WorkUnitState workUnit)
    throws SchemaConversionException {
  String fieldName = workUnit.getExtract().getTable();
  JsonSchema jsonSchema = new JsonSchema(inputSchema);
  jsonSchema.setColumnName(fieldName);
  recordConverter = new RecordConverter(jsonSchema, ROOT);
  return (MessageType) recordConverter.schema();
}
Example #13
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, Type type, int depth, MessageType container, List<String> cpath) {
  if (type instanceof GroupType) {
    showDetails(out, type.asGroupType(), depth, container, cpath);
    return;
  } else if (type instanceof PrimitiveType) {
    showDetails(out, type.asPrimitiveType(), depth, container, cpath);
    return;
  }
}
Example #14
Source File: DumpCommand.java From parquet-tools with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];

  Configuration conf = new Configuration();
  Path inpath = new Path(input);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
  MessageType schema = metaData.getFileMetaData().getSchema();

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                           .withAutoColumn()
                                           .withAutoCrop()
                                           .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
                                           .withColumnPadding(1)
                                           .withMaxBufferedLines(1000000)
                                           .withFlushOnTab()
                                           .build();

  boolean showmd = !options.hasOption('m');
  boolean showdt = !options.hasOption('d');

  Set<String> showColumns = null;
  if (options.hasOption('c')) {
    String[] cols = options.getOptionValues('c');
    showColumns = new HashSet<String>(Arrays.asList(cols));
  }

  dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
Example #15
Source File: HdfsOdpsImportJob.java From aliyun-maxcompute-data-collectors with Apache License 2.0
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {
  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least one.
    if (files.size() > 0) {
      break;
    }
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(), fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }

  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
Example #16
Source File: ParquetConverter.java From pentaho-hadoop-shims with Apache License 2.0
public static List<IParquetInputField> buildInputFields( MessageType schema ) {
  List<IParquetInputField> inputFields = new ArrayList<>();

  for ( Type type : schema.getFields() ) {
    if ( type.isPrimitive() ) {
      inputFields.add( convertField( type ) );
    }
  }

  return inputFields;
}
Example #17
Source File: ParaflowPageSource.java From paraflow with Apache License 2.0
public int getFieldIndex(MessageType fileSchema, String name)
{
  try {
    return fileSchema.getFieldIndex(name);
  }
  catch (InvalidRecordException e) {
    for (parquet.schema.Type type : fileSchema.getFields()) {
      if (type.getName().equalsIgnoreCase(name)) {
        return fileSchema.getFieldIndex(type.getName());
      }
    }
    return -1;
  }
}
Example #18
Source File: ParaflowPageSource.java From paraflow with Apache License 2.0
private parquet.schema.Type getParquetType(ParaflowColumnHandle column, MessageType messageType)
{
  if (messageType.containsField(column.getName())) {
    return messageType.getType(column.getName());
  }
  // parquet is case-insensitive, all hdfs-columns get converted to lowercase
  for (parquet.schema.Type type : messageType.getFields()) {
    if (type.getName().equalsIgnoreCase(column.getName())) {
      return type;
    }
  }
  return null;
}
Example #19
Source File: ParaflowPageSource.java From paraflow with Apache License 2.0
public ParaflowPageSource(
    ParquetReader parquetReader,
    ParquetDataSource dataSource,
    MessageType fileSchema,
    MessageType requestedSchema,
    long totalBytes,
    List<ParaflowColumnHandle> columns,
    TypeManager typeManager)
{
  checkArgument(totalBytes >= 0, "totalBytes is negative");

  this.parquetReader = requireNonNull(parquetReader, "parquetReader is null");
  this.dataSource = requireNonNull(dataSource, "dataSource is null");
  this.fileSchema = requireNonNull(fileSchema, "fileSchema is null");
  this.requestedSchema = requireNonNull(requestedSchema, "requestedSchema is null");
  this.totalBytes = totalBytes;

  this.columnSize = columns.size();
  this.constantBlocks = new Block[columnSize];
  ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
  ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
  for (int columnIndex = 0; columnIndex < columnSize; columnIndex++) {
    ParaflowColumnHandle column = columns.get(columnIndex);
    String name = column.getName();
    Type type = typeManager.getType(column.getType().getTypeSignature());

    namesBuilder.add(name);
    typesBuilder.add(type);

    if (getParquetType(column, fileSchema) == null) {
      constantBlocks[columnIndex] = RunLengthEncodedBlock.create(type, null, MAX_VECTOR_LENGTH);
    }
  }
  columnNames = namesBuilder.build();
  types = typesBuilder.build();
}
Example #20
Source File: ParaflowPageSourceProvider.java From paraflow with Apache License 2.0
private Type getParquetType(ParaflowColumnHandle column, MessageType messageType)
{
  if (messageType.containsField(column.getName())) {
    return messageType.getType(column.getName());
  }
  // parquet is case-insensitive, all hdfs-columns get converted to lowercase
  for (Type type : messageType.getFields()) {
    if (type.getName().equalsIgnoreCase(column.getName())) {
      return type;
    }
  }
  return null;
}
Example #21
Source File: ParquetMetadataReader.java From paraflow with Apache License 2.0
private static MessageType readParquetSchema(List<SchemaElement> schema)
{
  Iterator<SchemaElement> schemaIterator = schema.iterator();
  SchemaElement rootSchema = schemaIterator.next();
  Types.MessageTypeBuilder builder = Types.buildMessage();
  readTypeSchema(builder, schemaIterator, rootSchema.getNum_children());
  return builder.named(rootSchema.name);
}
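Example #21 drives the Types.MessageTypeBuilder from Thrift SchemaElement metadata; the same fluent builder can also be used on its own to assemble a MessageType. A brief sketch, with illustrative column names ("id", "name") and schema name ("user_record") that are not taken from the project above:

import parquet.schema.MessageType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
import parquet.schema.Types;

public class FluentSchemaSketch {
  // Build a small two-column message with the fluent Types API.
  static MessageType buildUserSchema() {
    return Types.buildMessage()
        .required(PrimitiveTypeName.INT64).named("id")
        .optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
        .named("user_record");
  }
}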
Example #22
Source File: ParquetDataWriterBuilder.java From incubator-gobblin with Apache License 2.0
/**
 * Build a version-specific {@link ParquetWriter} for given {@link ParquetWriterConfiguration}
 * @param writerConfiguration
 * @return
 * @throws IOException
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
    throws IOException {

  CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
  ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion
      .fromString(writerConfiguration.getWriterVersion());

  Configuration conf = new Configuration();
  ParquetWriter versionSpecificWriter = null;
  switch (writerConfiguration.getRecordFormat()) {
    case GROUP: {
      GroupWriteSupport.setSchema((MessageType) this.schema, conf);
      WriteSupport support = new GroupWriteSupport();
      versionSpecificWriter = new ParquetWriter<Group>(
          writerConfiguration.getAbsoluteStagingFile(),
          support,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.getDictPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate(),
          writerVersion,
          conf);
      break;
    }
    case AVRO: {
      versionSpecificWriter = new AvroParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Schema) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          conf);
      break;
    }
    case PROTOBUF: {
      versionSpecificWriter = new ProtoParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Class<? extends Message>) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate());
      break;
    }
    default:
      throw new RuntimeException("Record format not supported");
  }

  ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;

  return new ParquetWriterShim() {
    @Override
    public void write(Object record) throws IOException {
      finalVersionSpecificWriter.write(record);
    }

    @Override
    public void close() throws IOException {
      finalVersionSpecificWriter.close();
    }
  };
}
Example #23
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0
public static void showDetails(PrettyPrintWriter out, MessageType type) {
  List<String> cpath = new ArrayList<String>();
  for (Type ftype : type.getFields()) {
    showDetails(out, ftype, 0, type, cpath);
  }
}
Example #24
Source File: SimpleRecordMaterializer.java From parquet-tools with Apache License 2.0
public SimpleRecordMaterializer(MessageType schema) {
  this.root = new SimpleRecordConverter(schema);
}
Example #25
Source File: SimpleReadSupport.java From parquet-tools with Apache License 2.0
@Override
public RecordMaterializer<SimpleRecord> prepareForRead(Configuration conf, Map<String,String> metaData, MessageType schema, ReadContext context) {
  return new SimpleRecordMaterializer(schema);
}
Example #26
Source File: TestReadWriteParquet.java From parquet-examples with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
    return 1;
  }
  String inputFile = args[0];
  String outputFile = args[1];
  String compression = (args.length > 2) ? args[2] : "none";

  Path parquetFilePath = null;
  // Find a file in case a directory was passed
  RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
  while (it.hasNext()) {
    FileStatus fs = it.next();
    if (fs.isFile()) {
      parquetFilePath = fs.getPath();
      break;
    }
  }
  if (parquetFilePath == null) {
    LOG.error("No file found for " + inputFile);
    return 1;
  }

  LOG.info("Getting schema from " + parquetFilePath);
  ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();
  LOG.info(schema);
  GroupWriteSupport.setSchema(schema, getConf());

  Job job = new Job(getConf());
  job.setJarByClass(getClass());
  job.setJobName(getClass().getName());
  job.setMapperClass(ReadRequestMap.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(ExampleInputFormat.class);
  job.setOutputFormatClass(ExampleOutputFormat.class);

  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  if (compression.equalsIgnoreCase("snappy")) {
    codec = CompressionCodecName.SNAPPY;
  } else if (compression.equalsIgnoreCase("gzip")) {
    codec = CompressionCodecName.GZIP;
  }
  LOG.info("Output compression: " + codec);
  ExampleOutputFormat.setCompression(job, codec);

  FileInputFormat.setInputPaths(job, new Path(inputFile));
  FileOutputFormat.setOutputPath(job, new Path(outputFile));

  job.waitForCompletion(true);

  return 0;
}
Example #27
Source File: PentahoParquetReadSupport.java From pentaho-hadoop-shims with Apache License 2.0
@Override
public RecordMaterializer<RowMetaAndData> prepareForRead( Configuration configuration, Map<String, String> keyValueMetaData,
                                                          MessageType fileSchema, ReadContext readContext ) {
  return new ParquetConverter.MyRecordMaterializer( converter );
}
Example #28
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
@Override
public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData, MessageType schema, ReadContext context) {
  return new GroupRecordConverter(schema);
}
Example #29
Source File: JsonIntermediateToParquetGroupConverter.java From incubator-gobblin with Apache License 2.0
@Override
public Iterable<Group> convertRecord(MessageType outputSchema, JsonObject inputRecord, WorkUnitState workUnit)
    throws DataConversionException {
  return new SingleRecordIterable<>((Group) recordConverter.convert(inputRecord));
}
Example #30
Source File: TupleWriteSupport.java From hadoop-etl-udfs with MIT License
public static MessageType getSchema(Configuration configuration) {
  return MessageTypeParser.parseMessageType(configuration.get(PARQUET_SCHEMA_PROPERTY_NAME));
}
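Example #30 relies on MessageTypeParser, which parses the textual "message" syntax that MessageType.toString() produces. A minimal round-trip sketch; the schema text and its column names are made up for illustration:

import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class ParseSchemaSketch {
  public static void main(String[] args) {
    // The parser understands the same textual form that MessageType.toString() emits.
    String schemaText =
        "message user_record {\n"
      + "  required int64 id;\n"
      + "  optional binary name (UTF8);\n"
      + "}";
    MessageType schema = MessageTypeParser.parseMessageType(schemaText);
    System.out.println(schema.getFieldCount()); // 2
    System.out.println(schema);                 // renders back in the message syntax
  }
}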