org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat Java Examples
The following examples show how to use
org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.
Each example notes its source file, originating project, and license.
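As a primer, here is a minimal sketch (ours, not from any of the projects below) of the pattern most of these examples revolve around: pointing a Hive metastore StorageDescriptor at the Avro container input/output formats and the Avro serde. The helper class and method names are illustrative; the Hive classes and setters are real.

import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;

public class AvroStorageExample {
  // Configure a metastore StorageDescriptor so the table is read and
  // written as Avro container files.
  public static StorageDescriptor newAvroStorageDescriptor() {
    StorageDescriptor sd = new StorageDescriptor();
    sd.setInputFormat(AvroContainerInputFormat.class.getName());
    sd.setOutputFormat(AvroContainerOutputFormat.class.getName());
    SerDeInfo serDeInfo = new SerDeInfo();
    serDeInfo.setSerializationLib(AvroSerDe.class.getName());
    sd.setSerdeInfo(serDeInfo);
    return sd;
  }
}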
Example #1
Source File: HiveUtil.java, from presto (Apache License 2.0)
public static InputFormat<?, ?> getInputFormat(Configuration configuration, Properties schema, boolean symlinkTarget)
{
    String inputFormatName = getInputFormatName(schema);
    try {
        JobConf jobConf = toJobConf(configuration);
        configureCompressionCodecs(jobConf);

        Class<? extends InputFormat<?, ?>> inputFormatClass = getInputFormatClass(jobConf, inputFormatName);
        if (symlinkTarget && inputFormatClass == SymlinkTextInputFormat.class) {
            // Symlink targets are assumed to be TEXTFILE unless serde indicates otherwise.
            inputFormatClass = TextInputFormat.class;
            if (isDeserializerClass(schema, AvroSerDe.class)) {
                inputFormatClass = AvroContainerInputFormat.class;
            }
        }

        return ReflectionUtils.newInstance(inputFormatClass, jobConf);
    }
    catch (ClassNotFoundException | RuntimeException e) {
        throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Unable to create input format " + inputFormatName, e);
    }
}
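A hedged sketch of a call site for the method above. The property keys are the standard Hive constants ("file.inputformat" and "serialization.lib"); the scenario, a symlink target backed by an Avro serde, is illustrative rather than taken from Presto.

// Illustrative call site: a symlink target whose serde declares Avro.
Properties schema = new Properties();
schema.setProperty("file.inputformat", "org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat");
schema.setProperty("serialization.lib", "org.apache.hadoop.hive.serde2.avro.AvroSerDe");

InputFormat<?, ?> format = HiveUtil.getInputFormat(new Configuration(), schema, true);
// symlinkTarget=true plus an Avro serde resolves to AvroContainerInputFormat.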
Example #2
Source File: HiveMetadataUtils.java, from dremio-oss (Apache License 2.0)
/**
 * Find the row count based on stats in the Hive metastore, or estimate it using
 * file size / file type / record size / split size.
 *
 * @param statsParams parameters controlling the stats calculations
 * @param statsFromMetastore
 * @param sizeRatio ratio of this split's contribution to the total stats in the given <i>statsFromMetastore</i>
 * @param splitSizeInBytes
 * @param format
 * @param estimatedRecordSize
 * @return the row count for this split
 */
public static long findRowCountInSplit(StatsEstimationParameters statsParams, HiveDatasetStats statsFromMetastore,
    final double sizeRatio, final long splitSizeInBytes, InputFormat<?, ?> format, final int estimatedRecordSize) {

  final Class<? extends InputFormat> inputFormat =
      format == null ? null : ((Class<? extends InputFormat>) format.getClass());

  double compressionFactor = 1.0;
  if (MapredParquetInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30;
  } else if (OrcInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30f;
  } else if (AvroContainerInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10f;
  } else if (RCFileInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10f;
  }

  final long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);

  // Metastore stats are for the complete partition. Multiply by the size ratio of this split.
  final long metastoreRowCount = (long) Math.ceil(sizeRatio * statsFromMetastore.getRecordCount());

  logger.trace("Hive stats estimation: compression factor '{}', recordSize '{}', estimated '{}', from metastore '{}'",
      compressionFactor, estimatedRecordSize, estimatedRowCount, metastoreRowCount);

  if (statsParams.useMetastoreStats() && statsFromMetastore.hasContent()) {
    return metastoreRowCount;
  }

  // Return the maximum of the estimate and the metastore count.
  return Math.max(estimatedRowCount, metastoreRowCount);
}
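For intuition, here is a worked instance of the estimate above with illustrative numbers (not from the source): a 128 MB Avro split with a 100-byte estimated record size.

// Illustrative values: 128 MB Avro split, 100-byte records, Avro factor 10.
long splitSizeInBytes = 128L * 1024 * 1024;   // 134217728
double compressionFactor = 10f;               // AvroContainerInputFormat branch above
int estimatedRecordSize = 100;

long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);
// ceil(134217728 * 10 / 100) = 13421773 estimated rows for this split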
Example #3
Source File: HiveAvroCopyEntityHelper.java, from incubator-gobblin (Apache License 2.0)
/**
 * Tell whether a Hive table is actually an Avro table.
 * @param table a Hive {@link Table}
 * @return true if it is an Avro table
 */
public static boolean isHiveTableAvroType(Table table) {
  String serializationLib = table.getTTable().getSd().getSerdeInfo().getSerializationLib();
  String inputFormat = table.getTTable().getSd().getInputFormat();
  String outputFormat = table.getTTable().getSd().getOutputFormat();

  return inputFormat.endsWith(AvroContainerInputFormat.class.getSimpleName())
      || outputFormat.endsWith(AvroContainerOutputFormat.class.getSimpleName())
      || serializationLib.endsWith(AvroSerDe.class.getSimpleName());
}
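A short, assumed call site for the check above; the surrounding copy logic is hypothetical, not from the Gobblin source.

// Hypothetical guard: only run Avro-specific schema handling for Avro tables.
if (HiveAvroCopyEntityHelper.isHiveTableAvroType(table)) {
  // e.g. copy or rewrite the table's Avro schema files here
}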
Example #4
Source File: HiveMetaStoreUtils.java, from incubator-gobblin (Apache License 2.0)
private static StorageDescriptor getStorageDescriptor(HiveRegistrationUnit unit) {
  State props = unit.getStorageProps();
  StorageDescriptor sd = new StorageDescriptor();
  sd.setParameters(getParameters(props));
  // Treat AVRO and other formats differently. Details can be found in GOBBLIN-877.
  if (unit.isRegisterSchema() ||
      (unit.getInputFormat().isPresent() && !unit.getInputFormat().get().equals(AvroContainerInputFormat.class.getName()))) {
    sd.setCols(getFieldSchemas(unit));
  }
  if (unit.getLocation().isPresent()) {
    sd.setLocation(unit.getLocation().get());
  }
  if (unit.getInputFormat().isPresent()) {
    sd.setInputFormat(unit.getInputFormat().get());
  }
  if (unit.getOutputFormat().isPresent()) {
    sd.setOutputFormat(unit.getOutputFormat().get());
  }
  if (unit.getIsCompressed().isPresent()) {
    sd.setCompressed(unit.getIsCompressed().get());
  }
  if (unit.getNumBuckets().isPresent()) {
    sd.setNumBuckets(unit.getNumBuckets().get());
  }
  if (unit.getBucketColumns().isPresent()) {
    sd.setBucketCols(unit.getBucketColumns().get());
  }
  if (unit.getIsStoredAsSubDirs().isPresent()) {
    sd.setStoredAsSubDirectories(unit.getIsStoredAsSubDirs().get());
  }
  sd.setSerdeInfo(getSerDeInfo(unit));
  return sd;
}
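Note the design choice in the first branch: for an Avro table whose schema is not being explicitly registered, the column list is deliberately left off the StorageDescriptor, apparently so that the Avro schema carried in the serde properties (avro.schema.literal or avro.schema.url) remains the single source of truth. GOBBLIN-877, referenced in the comment, has the details.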
Example #5
Source File: HiveMetaStoreUtilsTest.java, from incubator-gobblin (Apache License 2.0)
@Test
public void testGetTableAvro() {
  final String databaseName = "testdb";
  final String tableName = "testtable";

  HiveTable.Builder builder = new HiveTable.Builder();
  builder.withDbName(databaseName).withTableName(tableName);

  State serdeProps = new State();
  serdeProps.setProp("avro.schema.literal", "{\"type\": \"record\", \"name\": \"TestEvent\","
      + " \"namespace\": \"test.namespace\", \"fields\": [{\"name\":\"a\","
      + " \"type\": \"int\"}]}");
  builder.withSerdeProps(serdeProps);

  HiveTable hiveTable = builder.build();
  hiveTable.setInputFormat(AvroContainerInputFormat.class.getName());
  hiveTable.setOutputFormat(AvroContainerOutputFormat.class.getName());
  hiveTable.setSerDeType(AvroSerDe.class.getName());

  Table table = HiveMetaStoreUtils.getTable(hiveTable);
  Assert.assertEquals(table.getDbName(), databaseName);
  Assert.assertEquals(table.getTableName(), tableName);

  StorageDescriptor sd = table.getSd();
  Assert.assertEquals(sd.getInputFormat(), AvroContainerInputFormat.class.getName());
  Assert.assertEquals(sd.getOutputFormat(), AvroContainerOutputFormat.class.getName());
  Assert.assertNotNull(sd.getSerdeInfo());
  Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), AvroSerDe.class.getName());

  List<FieldSchema> fields = sd.getCols();
  Assert.assertTrue(fields != null && fields.size() == 1);
  FieldSchema fieldA = fields.get(0);
  Assert.assertEquals(fieldA.getName(), "a");
  Assert.assertEquals(fieldA.getType(), "int");
}
Example #6
Source File: HiveMetaStoreUtilsTest.java, from incubator-gobblin (Apache License 2.0)
@Test
public void testGetTableAvroInvalidSchema() {
  final String databaseName = "testdb";
  final String tableName = "testtable";

  HiveTable.Builder builder = new HiveTable.Builder();
  builder.withDbName(databaseName).withTableName(tableName);

  State serdeProps = new State();
  serdeProps.setProp("avro.schema.literal", "invalid schema");
  builder.withSerdeProps(serdeProps);

  HiveTable hiveTable = builder.build();
  hiveTable.setInputFormat(AvroContainerInputFormat.class.getName());
  hiveTable.setOutputFormat(AvroContainerOutputFormat.class.getName());
  hiveTable.setSerDeType(AvroSerDe.class.getName());

  Table table = HiveMetaStoreUtils.getTable(hiveTable);
  Assert.assertEquals(table.getDbName(), databaseName);
  Assert.assertEquals(table.getTableName(), tableName);

  StorageDescriptor sd = table.getSd();
  Assert.assertEquals(sd.getInputFormat(), AvroContainerInputFormat.class.getName());
  Assert.assertEquals(sd.getOutputFormat(), AvroContainerOutputFormat.class.getName());
  Assert.assertNotNull(sd.getSerdeInfo());
  Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), AvroSerDe.class.getName());

  // An unparseable avro.schema.literal yields no columns.
  List<FieldSchema> fields = sd.getCols();
  Assert.assertTrue(fields != null && fields.size() == 0);
}
Example #7
Source File: HiveMetaStoreUtilsTest.java, from incubator-gobblin (Apache License 2.0)
@Test
public void testGetHiveTable() throws Exception {
  final String databaseName = "testdb";
  final String tableName = "testtable";
  final String tableSdLoc = "/tmp/testtable";
  final String partitionName = "partitionName";

  State serdeProps = new State();
  serdeProps.setProp("avro.schema.literal", "{\"type\": \"record\", \"name\": \"TestEvent\","
      + " \"namespace\": \"test.namespace\", \"fields\": [{\"name\":\"testName\","
      + " \"type\": \"int\"}]}");

  List<FieldSchema> fieldSchemas = new ArrayList<>();
  fieldSchemas.add(new FieldSchema("testName", "int", "testContent"));
  SerDeInfo si = new SerDeInfo();
  si.setParameters(getParameters(serdeProps));
  si.setName(tableName);

  StorageDescriptor sd = new StorageDescriptor(fieldSchemas, tableSdLoc,
      AvroContainerInputFormat.class.getName(), AvroContainerOutputFormat.class.getName(),
      false, 0, si, null, Lists.<Order>newArrayList(), null);
  sd.setParameters(getParameters(serdeProps));

  Table table = new Table(tableName, databaseName, "testOwner", 0, 0, 0, sd,
      Lists.<FieldSchema>newArrayList(), Maps.<String, String>newHashMap(), "", "", "");
  table.addToPartitionKeys(new FieldSchema(partitionName, "string", "some comment"));

  HiveTable hiveTable = HiveMetaStoreUtils.getHiveTable(table);
  Assert.assertEquals(hiveTable.getDbName(), databaseName);
  Assert.assertEquals(hiveTable.getTableName(), tableName);
  Assert.assertTrue(hiveTable.getInputFormat().isPresent());
  Assert.assertTrue(hiveTable.getOutputFormat().isPresent());
  Assert.assertEquals(hiveTable.getInputFormat().get(), AvroContainerInputFormat.class.getName());
  Assert.assertEquals(hiveTable.getOutputFormat().get(), AvroContainerOutputFormat.class.getName());
  Assert.assertNotNull(hiveTable.getSerDeType());

  List<HiveRegistrationUnit.Column> fields = hiveTable.getColumns();
  Assert.assertTrue(fields != null && fields.size() == 1);
  HiveRegistrationUnit.Column fieldA = fields.get(0);
  Assert.assertEquals(fieldA.getName(), "testName");
  Assert.assertEquals(fieldA.getType(), "int");
}