parquet.hadoop.metadata.CompressionCodecName Java Examples
The following examples show how to use
parquet.hadoop.metadata.CompressionCodecName.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ExaParquetWriterImpl.java From hadoop-etl-udfs with MIT License | 7 votes |
private ExaParquetWriterImpl(final MessageType schema, final int numColumns, final Configuration conf, final Path path, final String compressionType, final ExaIterator exa, final int firstColumnIndex, final List<Integer> dynamicPartitionExaColNums) throws Exception { System.out.println("Path: " + path.toString()); System.out.println("Parquet schema:\n" + schema); TupleWriteSupport.setSchema(schema, conf); this.writer = new ParquetWriter<>(path, new TupleWriteSupport(), CompressionCodecName.fromConf(compressionType), ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, conf); // Create Tuple object with ExaIterator reference. this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums); }
Example #2
Source File: ParquetAvroExample.java From parquet-flinktacular with Apache License 2.0 | 6 votes |
public static void writeAvro(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException { // Set up the Hadoop Input Format Job job = Job.getInstance(); // Set up Hadoop Output Format HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new AvroParquetOutputFormat(), job); FileOutputFormat.setOutputPath(job, new Path(outputPath)); AvroParquetOutputFormat.setSchema(job, Person.getClassSchema()); ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY); ParquetOutputFormat.setEnableDictionary(job, true); // Output & Execute data.output(hadoopOutputFormat); }
Example #3
Source File: ParquetThriftExample.java From parquet-flinktacular with Apache License 2.0 | 6 votes |
public static void writeThrift(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException { // Set up the Hadoop Input Format Job job = Job.getInstance(); // Set up Hadoop Output Format HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job); FileOutputFormat.setOutputPath(job, new Path(outputPath)); ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY); ParquetOutputFormat.setEnableDictionary(job, true); ParquetThriftOutputFormat.setThriftClass(job, Person.class); // Output & Execute data.output(hadoopOutputFormat); }
Example #4
Source File: PentahoTwitterOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
/**
 * Applies the requested compression to the Parquet output of the current job.
 *
 * <p>The codec is resolved and set inside {@code inClassloader}, exactly where the original
 * mapping ran.
 *
 * @param comp requested compression; anything other than SNAPPY, GZIP or LZO results in
 *             uncompressed output
 * @throws Exception propagated from {@code inClassloader}
 */
@Override
public void setCompression( COMPRESSION comp ) throws Exception {
  inClassloader( () -> {
    final CompressionCodecName codec = toCodecName( comp );
    ParquetOutputFormat.setCompression( job, codec );
  } );
}

/** Maps the shim-level compression constant to the matching Parquet codec name. */
private CompressionCodecName toCodecName( COMPRESSION comp ) {
  switch ( comp ) {
    case SNAPPY:
      return CompressionCodecName.SNAPPY;
    case GZIP:
      return CompressionCodecName.GZIP;
    case LZO:
      return CompressionCodecName.LZO;
    default:
      return CompressionCodecName.UNCOMPRESSED;
  }
}
Example #5
Source File: ParquetProtobufExample.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
public static void writeProtobuf(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException { Job job = Job.getInstance(); // Set up Hadoop Output Format HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ProtoParquetOutputFormat(), job); FileOutputFormat.setOutputPath(job, new Path(outputPath)); ProtoParquetOutputFormat.setProtobufClass(job, Person.class); ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY); ParquetOutputFormat.setEnableDictionary(job, true); // Output & Execute data.output(hadoopOutputFormat); }
Example #6
Source File: CSVToParquet.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
/**
 * Converts the line item data set and registers a Thrift Parquet sink for it.
 *
 * <p>Output goes to {@code outputPath + "/lineitems"}, Snappy-compressed, with dictionary
 * encoding enabled.
 *
 * @param env execution environment the source data set is read from
 * @throws IOException if the Hadoop job cannot be created
 */
private static void createLineitems(ExecutionEnvironment env) throws IOException {
    final DataSet<Tuple2<Void,LineitemTable>> rows = getLineitemDataSet(env).map(new LineitemToParquet());

    final Job hadoopJob = Job.getInstance();
    final HadoopOutputFormat parquetSink = new HadoopOutputFormat(new ParquetThriftOutputFormat(), hadoopJob);

    final Path target = new Path(outputPath + "/lineitems");
    ParquetThriftOutputFormat.setOutputPath(hadoopJob, target);
    ParquetThriftOutputFormat.setThriftClass(hadoopJob, LineitemTable.class);
    ParquetThriftOutputFormat.setCompression(hadoopJob, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(hadoopJob, true);
    ParquetThriftOutputFormat.setEnableDictionary(hadoopJob, true);

    rows.output(parquetSink);
}
Example #7
Source File: CSVToParquet.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
/**
 * Converts the orders data set and registers a Thrift Parquet sink for it.
 *
 * <p>Output goes to {@code outputPath + "/orders"}, Snappy-compressed, with dictionary
 * encoding enabled.
 *
 * @param env execution environment the source data set is read from
 * @throws IOException if the Hadoop job cannot be created
 */
private static void createOrders(ExecutionEnvironment env) throws IOException {
    final DataSet<Tuple2<Void,OrderTable>> rows = getOrdersDataSet(env).map(new OrdersToParquet());

    final Job hadoopJob = Job.getInstance();
    final HadoopOutputFormat parquetSink = new HadoopOutputFormat(new ParquetThriftOutputFormat(), hadoopJob);

    final Path target = new Path(outputPath + "/orders");
    ParquetThriftOutputFormat.setOutputPath(hadoopJob, target);
    ParquetThriftOutputFormat.setThriftClass(hadoopJob, OrderTable.class);
    ParquetThriftOutputFormat.setCompression(hadoopJob, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(hadoopJob, true);
    ParquetThriftOutputFormat.setEnableDictionary(hadoopJob, true);

    rows.output(parquetSink);
}
Example #8
Source File: CSVToParquet.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
/**
 * Converts the customer data set and registers a Thrift Parquet sink for it.
 *
 * <p>Output goes to {@code outputPath + "/cust"}, Snappy-compressed, with dictionary
 * encoding enabled.
 *
 * @param env execution environment the source data set is read from
 * @throws IOException if the Hadoop job cannot be created
 */
private static void createCustomers(ExecutionEnvironment env) throws IOException {
    final DataSet<Tuple2<Void,CustomerTable>> rows = getCustomerDataSet(env).map(new CustomerToParquet());

    final Job hadoopJob = Job.getInstance();
    final HadoopOutputFormat parquetSink = new HadoopOutputFormat(new ParquetThriftOutputFormat(), hadoopJob);

    final Path target = new Path(outputPath + "/cust");
    ParquetThriftOutputFormat.setOutputPath(hadoopJob, target);
    ParquetThriftOutputFormat.setThriftClass(hadoopJob, CustomerTable.class);
    ParquetThriftOutputFormat.setCompression(hadoopJob, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(hadoopJob, true);
    ParquetThriftOutputFormat.setEnableDictionary(hadoopJob, true);

    rows.output(parquetSink);
}
Example #9
Source File: CSVToParquet.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
/**
 * Converts the date dimension data set and registers a Thrift Parquet sink for it.
 *
 * <p>Output goes to {@code outputPath + "/datedim"}, Snappy-compressed, with dictionary
 * encoding disabled.
 *
 * @param env execution environment the source data set is read from
 * @throws IOException if the Hadoop job cannot be created
 */
private static void createDateDim(ExecutionEnvironment env) throws IOException {
    final DataSet<Tuple2<Void, DateDimTable>> rows = getDateDimDataSet(env).map(new DateDimToParquet());

    final Job hadoopJob = Job.getInstance();
    final HadoopOutputFormat parquetSink = new HadoopOutputFormat(new ParquetThriftOutputFormat(), hadoopJob);

    final Path target = new Path(outputPath + "/datedim");
    ParquetThriftOutputFormat.setOutputPath(hadoopJob, target);
    ParquetThriftOutputFormat.setThriftClass(hadoopJob, DateDimTable.class);
    ParquetThriftOutputFormat.setCompression(hadoopJob, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(hadoopJob, true);
    ParquetThriftOutputFormat.setEnableDictionary(hadoopJob, false);

    rows.output(parquetSink);
}
Example #10
Source File: CSVToParquet.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
/**
 * Converts the item data set and registers a Thrift Parquet sink for it.
 *
 * <p>Output goes to {@code outputPath + "/item"}, Snappy-compressed, with dictionary
 * encoding disabled.
 *
 * @param env execution environment the source data set is read from
 * @throws IOException if the Hadoop job cannot be created
 */
private static void createItem(ExecutionEnvironment env) throws IOException {
    final DataSet<Tuple2<Void, ItemTable>> rows = getItemDataSet(env).map(new ItemToParquet());

    final Job hadoopJob = Job.getInstance();
    final HadoopOutputFormat parquetSink = new HadoopOutputFormat(new ParquetThriftOutputFormat(), hadoopJob);

    final Path target = new Path(outputPath + "/item");
    ParquetThriftOutputFormat.setOutputPath(hadoopJob, target);
    ParquetThriftOutputFormat.setThriftClass(hadoopJob, ItemTable.class);
    ParquetThriftOutputFormat.setCompression(hadoopJob, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(hadoopJob, true);
    ParquetThriftOutputFormat.setEnableDictionary(hadoopJob, false);

    rows.output(parquetSink);
}
Example #11
Source File: CSVToParquet.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
/**
 * Converts the store sales data set and registers a Thrift Parquet sink for it.
 *
 * <p>Output goes to {@code outputPath + "/storesales"}, Snappy-compressed, with dictionary
 * encoding disabled.
 *
 * @param env execution environment the source data set is read from
 * @throws IOException if the Hadoop job cannot be created
 */
private static void createStoreSales(ExecutionEnvironment env) throws IOException {
    final DataSet<Tuple2<Void, StoreSalesTable>> rows = getStoreSalesDataSet(env).map(new StoreSalesToParquet());

    final Job hadoopJob = Job.getInstance();
    final HadoopOutputFormat parquetSink = new HadoopOutputFormat(new ParquetThriftOutputFormat(), hadoopJob);

    final Path target = new Path(outputPath + "/storesales");
    ParquetThriftOutputFormat.setOutputPath(hadoopJob, target);
    ParquetThriftOutputFormat.setThriftClass(hadoopJob, StoreSalesTable.class);
    ParquetThriftOutputFormat.setCompression(hadoopJob, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(hadoopJob, true);
    ParquetThriftOutputFormat.setEnableDictionary(hadoopJob, false);

    rows.output(parquetSink);
}
Example #12
Source File: ParquetAvroStockWriter.java From hiped2 with Apache License 2.0 | 5 votes |
/**
 * Write the file.
 *
 * <p>Parses the input/output options, reads stocks from the CSV input file and writes each
 * one to a Snappy-compressed Avro Parquet file at the output path.
 *
 * @param args the command-line arguments
 * @return the process exit code: the non-zero CLI result if argument handling fails,
 *         otherwise 0
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

    Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.IOFileOpts.values()).build();
    int result = cli.runCmd();

    if (result != 0) {
        return result;
    }

    File inputFile = new File(cli.getArgValueAsString(CliCommonOpts.IOFileOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.IOFileOpts.OUTPUT));

    // try-with-resources guarantees the writer is closed even when reading the CSV or
    // writing a record throws; previously the writer leaked on any such failure.
    // NOTE(review): the trailing `true` presumably enables dictionary encoding - confirm
    // against the AvroParquetWriter constructor in use.
    try (AvroParquetWriter<Stock> writer = new AvroParquetWriter<Stock>(
            outputPath,
            Stock.SCHEMA$,
            CompressionCodecName.SNAPPY,
            ParquetWriter.DEFAULT_BLOCK_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            true)) {
        for (Stock stock : AvroStockUtils.fromCsvFile(inputFile)) {
            writer.write(stock);
        }
    }

    return 0;
}
Example #13
Source File: ParquetMetadataReader.java From paraflow with Apache License 2.0 | 4 votes |
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file) throws IOException { FileStatus fileStatus = fileSystem.getFileStatus(file); try (FSDataInputStream inputStream = fileSystem.open(file)) { // Parquet File Layout: // // MAGIC // variable: Data // variable: Metadata // 4 bytes: MetadataLength // MAGIC long length = fileStatus.getLen(); validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file); long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length; inputStream.seek(metadataLengthIndex); int metadataLength = readIntLittleEndian(inputStream); byte[] magic = new byte[MAGIC.length]; inputStream.readFully(magic); validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic)); long metadataIndex = metadataLengthIndex - metadataLength; validateParquet( metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex, "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex); inputStream.seek(metadataIndex); FileMetaData fileMetaData = readFileMetaData(inputStream); List<SchemaElement> schema = fileMetaData.getSchema(); validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file); MessageType messageType = readParquetSchema(schema); List<BlockMetaData> blocks = new ArrayList<>(); List<RowGroup> rowGroups = fileMetaData.getRow_groups(); if (rowGroups != null) { for (RowGroup rowGroup : rowGroups) { BlockMetaData blockMetaData = new BlockMetaData(); blockMetaData.setRowCount(rowGroup.getNum_rows()); blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size()); List<ColumnChunk> columns = rowGroup.getColumns(); validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup); String filePath = columns.get(0).getFile_path(); for (ColumnChunk columnChunk : columns) { validateParquet( (filePath == null && 
columnChunk.getFile_path() == null) || (filePath != null && filePath.equals(columnChunk.getFile_path())), "all column chunks of the same row group must be in the same file"); ColumnMetaData metaData = columnChunk.meta_data; String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]); ColumnPath columnPath = ColumnPath.get(path); ColumnChunkMetaData column = ColumnChunkMetaData.get( columnPath, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(), CompressionCodecName.fromParquet(metaData.codec), readEncodings(metaData.encodings), readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()), metaData.data_page_offset, metaData.dictionary_page_offset, metaData.num_values, metaData.total_compressed_size, metaData.total_uncompressed_size); blockMetaData.addColumn(column); } blockMetaData.setPath(filePath); blocks.add(blockMetaData); } } Map<String, String> keyValueMetaData = new HashMap<>(); List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata(); if (keyValueList != null) { for (KeyValue keyValue : keyValueList) { keyValueMetaData.put(keyValue.key, keyValue.value); } } return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks); } }
Example #14
Source File: ParquetDataWriterBuilder.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Build a version-specific {@link ParquetWriter} for given {@link ParquetWriterConfiguration}
 * and wrap it in a {@link ParquetWriterShim}.
 *
 * <p>The record format of the configuration selects the writer: GROUP, AVRO and PROTOBUF are
 * supported.
 *
 * @param writerConfiguration codec, writer version, sizes, flags and staging file of the writer
 * @return a shim whose {@code write} and {@code close} delegate to the created writer
 * @throws IOException declared by the interface; propagated from writer construction
 * @throws RuntimeException if the record format is not one of the supported formats
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
    throws IOException {

  CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
  ParquetProperties.WriterVersion writerVersion =
      ParquetProperties.WriterVersion.fromString(writerConfiguration.getWriterVersion());

  Configuration conf = new Configuration();

  // Assigned exactly once in every supported branch; the default branch throws.
  final ParquetWriter delegate;
  switch (writerConfiguration.getRecordFormat()) {
    case GROUP: {
      GroupWriteSupport.setSchema((MessageType) this.schema, conf);
      WriteSupport support = new GroupWriteSupport();
      delegate = new ParquetWriter<Group>(
          writerConfiguration.getAbsoluteStagingFile(),
          support,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.getDictPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate(),
          writerVersion,
          conf);
      break;
    }
    case AVRO: {
      delegate = new AvroParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Schema) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          conf);
      break;
    }
    case PROTOBUF: {
      delegate = new ProtoParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Class<? extends Message>) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate());
      break;
    }
    default:
      throw new RuntimeException("Record format not supported");
  }

  return new ParquetWriterShim() {
    @Override
    public void write(Object record) throws IOException {
      delegate.write(record);
    }

    @Override
    public void close() throws IOException {
      delegate.close();
    }
  };
}
Example #15
Source File: TestReadWriteParquet.java From parquet-examples with Apache License 2.0 | 4 votes |
public int run(String[] args) throws Exception { if(args.length < 2) { LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]"); return 1; } String inputFile = args[0]; String outputFile = args[1]; String compression = (args.length > 2) ? args[2] : "none"; Path parquetFilePath = null; // Find a file in case a directory was passed RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true); while(it.hasNext()) { FileStatus fs = it.next(); if(fs.isFile()) { parquetFilePath = fs.getPath(); break; } } if(parquetFilePath == null) { LOG.error("No file found for " + inputFile); return 1; } LOG.info("Getting schema from " + parquetFilePath); ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath); MessageType schema = readFooter.getFileMetaData().getSchema(); LOG.info(schema); GroupWriteSupport.setSchema(schema, getConf()); Job job = new Job(getConf()); job.setJarByClass(getClass()); job.setJobName(getClass().getName()); job.setMapperClass(ReadRequestMap.class); job.setNumReduceTasks(0); job.setInputFormatClass(ExampleInputFormat.class); job.setOutputFormatClass(ExampleOutputFormat.class); CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED; if(compression.equalsIgnoreCase("snappy")) { codec = CompressionCodecName.SNAPPY; } else if(compression.equalsIgnoreCase("gzip")) { codec = CompressionCodecName.GZIP; } LOG.info("Output compression: " + codec); ExampleOutputFormat.setCompression(job, codec); FileInputFormat.setInputPaths(job, new Path(inputFile)); FileOutputFormat.setOutputPath(job, new Path(outputFile)); job.waitForCompletion(true); return 0; }