Java Code Examples for org.apache.parquet.hadoop.CodecFactory#createDirectCodecFactory()
The following examples show how to use org.apache.parquet.hadoop.CodecFactory#createDirectCodecFactory().
Each example notes the project and source file it comes from.
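Before the project examples, here is a minimal round-trip sketch of the API. It is not taken from any of the projects below: the class name, payload, and 64 KB size hint are illustrative, and it uses parquet's bundled DirectByteBufferAllocator rather than the Arrow-backed ParquetDirectByteBufferAllocator that the Dremio and Drill examples pass in. It assumes parquet-hadoop (with its snappy-java dependency) and hadoop-common on the classpath.

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.DirectByteBufferAllocator;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class DirectCodecFactoryRoundTrip {
  public static void main(String[] args) throws IOException {
    byte[] payload = "hello parquet codecs".getBytes(StandardCharsets.UTF_8);

    // Writers in the examples below pass their Parquet page size as the third
    // argument; reader-side call sites simply pass 0.
    CompressionCodecFactory codecFactory = CodecFactory.createDirectCodecFactory(
        new Configuration(), new DirectByteBufferAllocator(), 64 * 1024);
    try {
      CompressionCodecFactory.BytesInputCompressor compressor =
          codecFactory.getCompressor(CompressionCodecName.SNAPPY);
      CompressionCodecFactory.BytesInputDecompressor decompressor =
          codecFactory.getDecompressor(CompressionCodecName.SNAPPY);

      // Compress the payload, then decompress it again and print the result.
      BytesInput compressed = compressor.compress(BytesInput.from(payload));
      BytesInput restored = decompressor.decompress(
          BytesInput.from(compressed.toByteArray()), payload.length);

      System.out.println(new String(restored.toByteArray(), StandardCharsets.UTF_8));
    } finally {
      codecFactory.release(); // free codec resources, including pooled direct buffers
    }
  }
}

In the examples that follow, only the writer constructors (Examples 5 and 10) pass a non-zero size hint; all of the reader-side call sites pass 0.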
Example 1
Source File: GlobalDictionaryBuilder.java From dremio-oss with Apache License 2.0
public static void main(String[] args) {
  try (final BufferAllocator bufferAllocator = new RootAllocator(VM.getMaxDirectMemory())) {
    final Path tableDir = Path.of(args[0]);
    final Configuration conf = new Configuration();
    final CompressionCodecFactory codecFactory =
      CodecFactory.createDirectCodecFactory(conf, new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    final FileSystem fs = HadoopFileSystem.get(tableDir, conf);
    if (fs.exists(tableDir) && fs.isDirectory(tableDir)) {
      Map<ColumnDescriptor, Path> dictionaryEncodedColumns =
        createGlobalDictionaries(codecFactory, fs, tableDir, bufferAllocator).getColumnsToDictionaryFiles();
      long version = getDictionaryVersion(fs, tableDir);
      Path dictionaryRootDir = getDictionaryVersionedRootPath(fs, tableDir, version);
      for (ColumnDescriptor columnDescriptor : dictionaryEncodedColumns.keySet()) {
        final VectorContainer data = readDictionary(fs, dictionaryRootDir, columnDescriptor, bufferAllocator);
        System.out.println("Dictionary for column [" + columnDescriptor.toString() + " size " + data.getRecordCount());
        BatchPrinter.printBatch(data);
        data.clear();
      }
    }
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
Example 2
Source File: ParquetFormatPlugin.java From dremio-oss with Apache License 2.0
public PreviewReader(
    OperatorContext context,
    FileSystem fs,
    FileAttributes attributes
) throws IOException {
  super();
  this.context = context;
  this.fs = fs;
  this.attributes = attributes;
  final long maxFooterLen = context.getOptions().getOption(ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR);
  this.streamProvider = new SingleStreamProvider(fs, attributes.getPath(), attributes.size(), maxFooterLen, false, null);
  this.footer = this.streamProvider.getFooter();
  boolean autoCorrectCorruptDates =
    context.getOptions().getOption(ExecConstants.PARQUET_AUTO_CORRECT_DATES_VALIDATOR)
      && getConfig().autoCorrectCorruptDates;
  this.dateStatus = ParquetReaderUtility.detectCorruptDates(footer, GroupScan.ALL_COLUMNS, autoCorrectCorruptDates);
  this.schemaHelper = SchemaDerivationHelper.builder()
    .readInt96AsTimeStamp(context.getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP_VALIDATOR))
    .dateCorruptionStatus(dateStatus)
    .build();
  this.codec = CodecFactory.createDirectCodecFactory(
    new Configuration(), new ParquetDirectByteBufferAllocator(context.getAllocator()), 0);
}
Example 3
Source File: TestGlobalDictionaryPlan.java From dremio-oss with Apache License 2.0
@BeforeClass
public static void setup() throws Exception {
  testRootAllocator = RootAllocatorFactory.newRoot(config);
  testAllocator = testRootAllocator.newChildAllocator("test-glb-dict", 0, testRootAllocator.getLimit());

  testNoResult("alter session set \"store.parquet.enable_dictionary_encoding_binary_type\"=true");
  testNoResult("CREATE TABLE dfs_test.globaldictionary AS SELECT * FROM cp.\"globaldictionary.json\"");
  testNoResult("CREATE TABLE dfs_test.places AS SELECT * FROM cp.\"places.json\"");

  final Configuration conf = new Configuration();
  codec = CodecFactory.createDirectCodecFactory(conf, new ParquetDirectByteBufferAllocator(testAllocator), 0);
  fs = HadoopFileSystem.getLocal(conf);

  tableDirPath1 = Path.of(getDfsTestTmpSchemaLocation() + "/globaldictionary");
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, tableDirPath1, testAllocator);

  tableDirPath2 = Path.of(getDfsTestTmpSchemaLocation() + "/places");
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, tableDirPath2, testAllocator);
}
Example 4
Source File: TestGlobalDictionaryBuilder.java From dremio-oss with Apache License 2.0
@Test
public void testLocalDictionaries() throws IOException {
  try (final BufferAllocator bufferAllocator =
         allocatorRule.newAllocator("test-global-dictionary-builder", 0, Long.MAX_VALUE)) {
    final CompressionCodecFactory codecFactory =
      CodecFactory.createDirectCodecFactory(conf, new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries1 =
      LocalDictionariesReader.readDictionaries(fs, tableDirPath.resolve("phonebook1.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries2 =
      LocalDictionariesReader.readDictionaries(fs, tableDirPath.resolve("phonebook2.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries3 =
      LocalDictionariesReader.readDictionaries(fs, tableDirPath.resolve("phonebook3.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries4 =
      LocalDictionariesReader.readDictionaries(fs, partitionDirPath.resolve("phonebook4.parquet"), codecFactory);

    assertEquals(2, dictionaries1.getKey().size()); // name and kind have dictionaries
    assertEquals(1, dictionaries2.getKey().size());
    assertEquals(1, dictionaries3.getKey().size());
    assertEquals(1, dictionaries4.getKey().size());

    assertEquals(0, dictionaries1.getValue().size());
    assertEquals(1, dictionaries2.getValue().size()); // skip name
    assertEquals(1, dictionaries3.getValue().size()); // skip name
    assertEquals(1, dictionaries4.getValue().size()); // skip name
  }
}
Example 5
Source File: ParquetRecordWriter.java From Bats with Apache License 2.0
public ParquetRecordWriter(FragmentContext context, ParquetWriter writer) throws OutOfMemoryException {
  this.oContext = context.newOperatorContext(writer);
  this.codecFactory = CodecFactory.createDirectCodecFactory(
    writer.getFormatPlugin().getFsConf(),
    new ParquetDirectByteBufferAllocator(oContext.getAllocator()),
    pageSize);
  this.partitionColumns = writer.getPartitionColumns();
  this.hasPartitions = partitionColumns != null && partitionColumns.size() > 0;
  this.extraMetaData.put(DRILL_VERSION_PROPERTY, DrillVersionInfo.getVersion());
  this.extraMetaData.put(WRITER_VERSION_PROPERTY, String.valueOf(ParquetWriter.WRITER_VERSION));
  this.storageStrategy = writer.getStorageStrategy() == null ? StorageStrategy.DEFAULT : writer.getStorageStrategy();
  this.cleanUpLocations = Lists.newArrayList();
  this.conf = new Configuration(writer.getFormatPlugin().getFsConf());
}
Example 6
Source File: LocalDictionariesReader.java From dremio-oss with Apache License 2.0
public static void main(String[] args) {
  try (final BufferAllocator bufferAllocator = new RootAllocator(VM.getMaxDirectMemory())) {
    final Configuration fsConf = new Configuration();
    final FileSystem fs = HadoopFileSystem.getLocal(fsConf);
    final Path filePath = Path.of(args[0]);
    final CompressionCodecFactory codecFactory =
      CodecFactory.createDirectCodecFactory(fsConf, new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries =
      readDictionaries(fs, filePath, codecFactory);
    for (Map.Entry<ColumnDescriptor, Dictionary> entry : dictionaries.getLeft().entrySet()) {
      printDictionary(entry.getKey(), entry.getValue());
    }
    System.out.println("Binary columns which are not dictionary encoded: " + dictionaries.getRight());
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
Example 7
Source File: UnifiedParquetReader.java From dremio-oss with Apache License 2.0
public UnifiedParquetReader(
    OperatorContext context,
    ParquetReaderFactory readerFactory,
    BatchSchema tableSchema,
    ParquetScanProjectedColumns projectedColumns,
    Map<String, GlobalDictionaryFieldInfo> globalDictionaryFieldInfoMap,
    List<ParquetFilterCondition> filterConditions,
    ParquetFilterCreator filterCreator,
    ParquetDictionaryConvertor dictionaryConvertor,
    ParquetDatasetSplitScanXAttr readEntry,
    FileSystem fs,
    ParquetMetadata footer,
    GlobalDictionaries dictionaries,
    SchemaDerivationHelper schemaHelper,
    boolean vectorize,
    boolean enableDetailedTracing,
    boolean supportsColocatedReads,
    InputStreamProvider inputStreamProvider) {
  super();
  this.context = context;
  this.readerFactory = readerFactory;
  this.globalDictionaryFieldInfoMap = globalDictionaryFieldInfoMap;
  this.filterConditions = filterConditions;
  this.filterCreator = filterCreator;
  this.dictionaryConvertor = dictionaryConvertor;
  this.fs = fs;
  this.footer = footer;
  this.readEntry = readEntry;
  this.vectorize = vectorize;
  this.tableSchema = tableSchema;
  this.projectedColumns = projectedColumns;
  this.columnResolver = null;
  this.dictionaries = dictionaries;
  this.codecFactory = CodecFactory.createDirectCodecFactory(
    new Configuration(), new ParquetDirectByteBufferAllocator(context.getAllocator()), 0);
  this.enableDetailedTracing = enableDetailedTracing;
  this.inputStreamProvider = inputStreamProvider;
  this.schemaHelper = schemaHelper;
  this.supportsColocatedReads = supportsColocatedReads;
  this.ignoreSchemaLearning = false;
}
Example 8
Source File: TestTpchDistributedWithGlobalDictionaries.java From dremio-oss with Apache License 2.0
@BeforeClass
public static void setup() throws Exception {
  testRootAllocator = RootAllocatorFactory.newRoot(config);
  testAllocator = testRootAllocator.newChildAllocator("test-tpch-distrib", 0, testRootAllocator.getLimit());

  testNoResult("alter session set \"store.parquet.enable_dictionary_encoding_binary_type\"=true");
  final Configuration conf = new Configuration();
  final CompressionCodecFactory codec =
    CodecFactory.createDirectCodecFactory(conf, new ParquetDirectByteBufferAllocator(testAllocator), 0);
  fs = HadoopFileSystem.getLocal(conf);

  testNoResult("CREATE TABLE dfs_test.tpch_lineitem_gd AS SELECT * FROM cp.\"tpch/lineitem.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_customer_gd AS SELECT * FROM cp.\"tpch/customer.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_part_gd AS SELECT * FROM cp.\"tpch/part.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_partsupp_gd AS SELECT * FROM cp.\"tpch/partsupp.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_region_gd AS SELECT * FROM cp.\"tpch/region.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_nation_gd AS SELECT * FROM cp.\"tpch/nation.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_supplier_gd AS SELECT * FROM cp.\"tpch/supplier.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_orders_gd AS SELECT * FROM cp.\"tpch/orders.parquet\"");

  lineitem = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_lineitem_gd");
  customer = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_customer_gd");
  part = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_part_gd");
  partsupp = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_partsupp_gd");
  region = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_region_gd");
  nation = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_nation_gd");
  supplier = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_supplier_gd");
  orders = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_orders_gd");

  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, lineitem, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, customer, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, part, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, partsupp, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, region, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, nation, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, supplier, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, orders, testAllocator);

  disableGlobalDictionary();
}
Example 9
Source File: ParquetFormatDatasetAccessor.java From dremio-oss with Apache License 2.0
/**
 * Read the records in the first parquet file to generate schema for selected parquet files
 *
 * @param selection parquet file selection
 * @param fs file system wrapper
 * @return schema of selected parquet files
 */
private BatchSchema getBatchSchemaFromReader(final FileSelection selection, final FileSystem fs) throws Exception {
  final SabotContext context = ((ParquetFormatPlugin) formatPlugin).getContext();
  try (
    BufferAllocator sampleAllocator = context.getAllocator().newChildAllocator("sample-alloc", 0, Long.MAX_VALUE);
    OperatorContextImpl operatorContext =
      new OperatorContextImpl(context.getConfig(), sampleAllocator, context.getOptionManager(), 1000);
    SampleMutator mutator = new SampleMutator(sampleAllocator)
  ) {
    final CompressionCodecFactory codec = CodecFactory.createDirectCodecFactory(
      new Configuration(), new ParquetDirectByteBufferAllocator(operatorContext.getAllocator()), 0);
    for (FileAttributes firstFile : selection.getFileAttributesList()) {
      ParquetMetadata footer = SingletonParquetFooterCache.readFooter(
        fsPlugin.getSystemUserFS(), firstFile, ParquetMetadataConverter.NO_FILTER,
        context.getOptionManager().getOption(ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR));

      if (footer.getBlocks().size() == 0) {
        continue;
      }

      final boolean autoCorrectCorruptDates =
        context.getOptionManager().getOption(ExecConstants.PARQUET_AUTO_CORRECT_DATES_VALIDATOR)
          && ((ParquetFormatPlugin) formatPlugin).getConfig().autoCorrectCorruptDates;
      final ParquetReaderUtility.DateCorruptionStatus dateStatus =
        ParquetReaderUtility.detectCorruptDates(footer, GroupScan.ALL_COLUMNS, autoCorrectCorruptDates);
      final SchemaDerivationHelper schemaHelper = SchemaDerivationHelper.builder()
        .readInt96AsTimeStamp(operatorContext.getOptions().getOption(PARQUET_READER_INT96_AS_TIMESTAMP).getBoolVal())
        .dateCorruptionStatus(dateStatus)
        .build();

      boolean isAccelerator = fsPlugin.getId().getName().equals(ACCELERATOR_STORAGEPLUGIN_NAME);
      final ImplicitFilesystemColumnFinder finder =
        new ImplicitFilesystemColumnFinder(context.getOptionManager(), fs, GroupScan.ALL_COLUMNS, isAccelerator);

      final long maxFooterLen = context.getOptionManager().getOption(ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR);
      try (InputStreamProvider streamProvider =
             new SingleStreamProvider(fs, firstFile.getPath(), firstFile.size(), maxFooterLen, false, null);
           RecordReader reader = new AdditionalColumnsRecordReader(
             new ParquetRowiseReader(operatorContext, footer, 0, firstFile.getPath().toString(),
               ParquetScanProjectedColumns.fromSchemaPaths(GroupScan.ALL_COLUMNS),
               fs, schemaHelper, streamProvider, codec, true),
             finder.getImplicitFieldsForSample(selection))) {
        reader.setup(mutator);
        mutator.allocate(100);
        // Read the parquet file to populate inner list types
        reader.next();
        mutator.getContainer().buildSchema(BatchSchema.SelectionVectorMode.NONE);
        return mutator.getContainer().getSchema();
      }
    }
  } catch (Exception e) {
    throw e;
  }
  throw UserException.dataReadError().message("Only empty parquet files found.").build(logger);
}
Example 10
Source File: ParquetRecordWriter.java From dremio-oss with Apache License 2.0
public ParquetRecordWriter(OperatorContext context, ParquetWriter writer, ParquetFormatConfig config) throws OutOfMemoryException {
  this.context = context;
  this.codecAllocator = context.getAllocator().newChildAllocator("ParquetCodecFactory", 0, Long.MAX_VALUE);
  this.columnEncoderAllocator = context.getAllocator().newChildAllocator("ParquetColEncoder", 0, Long.MAX_VALUE);
  this.codecFactory = CodecFactory.createDirectCodecFactory(
    new Configuration(), new ParquetDirectByteBufferAllocator(codecAllocator), pageSize);
  this.extraMetaData.put(DREMIO_VERSION_PROPERTY, DremioVersionInfo.getVersion());
  this.extraMetaData.put(IS_DATE_CORRECT_PROPERTY, "true");
  this.plugin = writer.getFormatPlugin().getFsPlugin();
  this.queryUser = writer.getProps().getUserName();

  FragmentHandle handle = context.getFragmentHandle();
  String fragmentId = String.format("%d_%d", handle.getMajorFragmentId(), handle.getMinorFragmentId());

  this.location = writer.getLocation();
  this.prefix = fragmentId;
  this.extension = config.outputExtension;
  if (writer.getOptions() != null) {
    this.partitionColumns = writer.getOptions().getPartitionColumns();
    this.isIcebergWriter = (writer.getOptions().getIcebergWriterOperation() != WriterOptions.IcebergWriterOperation.NONE);
  } else {
    this.partitionColumns = null;
    this.isIcebergWriter = false;
  }

  if (this.isIcebergWriter && writer.getOptions().getExtendedProperty() != null) {
    initIcebergColumnIDList(writer.getOptions().getExtendedProperty());
  }

  memoryThreshold = (int) context.getOptions().getOption(ExecConstants.PARQUET_MEMORY_THRESHOLD_VALIDATOR);
  blockSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_BLOCK_SIZE_VALIDATOR);
  pageSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_PAGE_SIZE_VALIDATOR);
  final String codecName = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE_VALIDATOR).toLowerCase();
  switch (codecName) {
    case "snappy":
      codec = CompressionCodecName.SNAPPY;
      break;
    case "lzo":
      codec = CompressionCodecName.LZO;
      break;
    case "gzip":
      codec = CompressionCodecName.GZIP;
      break;
    case "none":
    case "uncompressed":
      codec = CompressionCodecName.UNCOMPRESSED;
      break;
    default:
      throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  enableDictionary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_VALIDATOR);
  enableDictionaryForBinary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_BINARY_TYPE_VALIDATOR);
  maxPartitions = context.getOptions().getOption(ExecConstants.PARQUET_MAXIMUM_PARTITIONS_VALIDATOR);
  minRecordsForFlush = context.getOptions().getOption(ExecConstants.PARQUET_MIN_RECORDS_FOR_FLUSH_VALIDATOR);
  parquetFileWriteTimeThresholdMilliSecs = (int) context.getOptions().getOption(ExecConstants.PARQUET_WRITE_TIME_THRESHOLD_MILLI_SECS_VALIDATOR);
  parquetFileWriteIoRateThresholdMbps = context.getOptions().getOption(ExecConstants.PARQUET_WRITE_IO_RATE_THRESHOLD_MBPS_VALIDATOR);
}
Example 11
Source File: HadoopCodecs.java From parquet-mr with Apache License 2.0
public static CompressionCodecFactory newDirectFactory(Configuration conf, ByteBufferAllocator allocator, int sizeHint) {
  return CodecFactory.createDirectCodecFactory(conf, allocator, sizeHint);
}
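Since newDirectFactory simply delegates to CodecFactory#createDirectCodecFactory, callers inside parquet-mr can obtain the factory without touching CodecFactory directly. The sketch below is hypothetical, not from the project: the class name, the GZIP choice, and the 1 MB size hint are illustrative, and it assumes HadoopCodecs lives at org.apache.parquet.hadoop.util as in recent parquet-mr releases.

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.bytes.DirectByteBufferAllocator;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.util.HadoopCodecs;

public class HadoopCodecsDirectFactoryDemo {
  public static void main(String[] args) {
    // Same three arguments as CodecFactory.createDirectCodecFactory: config, allocator, size hint.
    CompressionCodecFactory factory = HadoopCodecs.newDirectFactory(
        new Configuration(), new DirectByteBufferAllocator(), 1024 * 1024);
    try {
      // Compressors and decompressors are created on first request and cached per codec name.
      factory.getDecompressor(CompressionCodecName.GZIP);
    } finally {
      factory.release(); // release codec resources, including any direct buffers
    }
  }
}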