org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat Java Examples
The following examples show how to use
org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat.
Each example notes its source file, originating project, and license.
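As a primer, here is a minimal sketch (ours, not from any of the projects below) of the pattern most of these examples revolve around: pointing a Hive metastore StorageDescriptor at the Avro container input/output formats and the Avro serde. The helper class and method names are illustrative; the Hive classes and setters are real.

import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;

public class AvroStorageExample {
  // Configure a metastore StorageDescriptor so the table is read and
  // written as Avro container files.
  public static StorageDescriptor newAvroStorageDescriptor() {
    StorageDescriptor sd = new StorageDescriptor();
    sd.setInputFormat(AvroContainerInputFormat.class.getName());
    sd.setOutputFormat(AvroContainerOutputFormat.class.getName());
    SerDeInfo serDeInfo = new SerDeInfo();
    serDeInfo.setSerializationLib(AvroSerDe.class.getName());
    sd.setSerdeInfo(serDeInfo);
    return sd;
  }
}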
Example #1
Source File: HiveUtil.java, from presto (Apache License 2.0)
public static InputFormat<?, ?> getInputFormat(Configuration configuration, Properties schema, boolean symlinkTarget)
{
    String inputFormatName = getInputFormatName(schema);
    try {
        JobConf jobConf = toJobConf(configuration);
        configureCompressionCodecs(jobConf);

        Class<? extends InputFormat<?, ?>> inputFormatClass = getInputFormatClass(jobConf, inputFormatName);
        if (symlinkTarget && inputFormatClass == SymlinkTextInputFormat.class) {
            // Symlink targets are assumed to be TEXTFILE unless serde indicates otherwise.
            inputFormatClass = TextInputFormat.class;
            if (isDeserializerClass(schema, AvroSerDe.class)) {
                inputFormatClass = AvroContainerInputFormat.class;
            }
        }

        return ReflectionUtils.newInstance(inputFormatClass, jobConf);
    }
    catch (ClassNotFoundException | RuntimeException e) {
        throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Unable to create input format " + inputFormatName, e);
    }
}
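A hedged sketch of a call site for the method above. The property keys are the standard Hive constants ("file.inputformat" and "serialization.lib"); the scenario, a symlink target backed by an Avro serde, is illustrative rather than taken from Presto.

// Illustrative call site: a symlink target whose serde declares Avro.
Properties schema = new Properties();
schema.setProperty("file.inputformat", "org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat");
schema.setProperty("serialization.lib", "org.apache.hadoop.hive.serde2.avro.AvroSerDe");

InputFormat<?, ?> format = HiveUtil.getInputFormat(new Configuration(), schema, true);
// symlinkTarget=true plus an Avro serde resolves to AvroContainerInputFormat.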
Example #2
Source File: HiveMetadataUtils.java, from dremio-oss (Apache License 2.0)
/**
 * Find the row count based on stats in the Hive metastore, or estimate it using
 * file size / file type / record size / split size.
 *
 * @param statsParams parameters controlling the stats calculations
 * @param statsFromMetastore
 * @param sizeRatio ratio of this split's contribution to the total stats in the given <i>statsFromMetastore</i>
 * @param splitSizeInBytes
 * @param format
 * @param estimatedRecordSize
 * @return the row count for this split
 */
public static long findRowCountInSplit(StatsEstimationParameters statsParams, HiveDatasetStats statsFromMetastore,
    final double sizeRatio, final long splitSizeInBytes, InputFormat<?, ?> format, final int estimatedRecordSize) {

  final Class<? extends InputFormat> inputFormat =
      format == null ? null : ((Class<? extends InputFormat>) format.getClass());

  double compressionFactor = 1.0;
  if (MapredParquetInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30;
  } else if (OrcInputFormat.class.equals(inputFormat)) {
    compressionFactor = 30f;
  } else if (AvroContainerInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10f;
  } else if (RCFileInputFormat.class.equals(inputFormat)) {
    compressionFactor = 10f;
  }

  final long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);

  // Metastore stats are for the complete partition. Multiply by the size ratio of this split.
  final long metastoreRowCount = (long) Math.ceil(sizeRatio * statsFromMetastore.getRecordCount());

  logger.trace("Hive stats estimation: compression factor '{}', recordSize '{}', estimated '{}', from metastore '{}'",
      compressionFactor, estimatedRecordSize, estimatedRowCount, metastoreRowCount);

  if (statsParams.useMetastoreStats() && statsFromMetastore.hasContent()) {
    return metastoreRowCount;
  }

  // Return the maximum of the estimate and the metastore count.
  return Math.max(estimatedRowCount, metastoreRowCount);
}
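For intuition, here is a worked instance of the estimate above with illustrative numbers (not from the source): a 128 MB Avro split with a 100-byte estimated record size.

// Illustrative values: 128 MB Avro split, 100-byte records, Avro factor 10.
long splitSizeInBytes = 128L * 1024 * 1024;   // 134217728
double compressionFactor = 10f;               // AvroContainerInputFormat branch above
int estimatedRecordSize = 100;

long estimatedRowCount = (long) Math.ceil(splitSizeInBytes * compressionFactor / estimatedRecordSize);
// ceil(134217728 * 10 / 100) = 13421773 estimated rows for this split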
Example #3
Source File: HiveAvroCopyEntityHelper.java, from incubator-gobblin (Apache License 2.0)
/**
 * Tell whether a Hive table is actually an Avro table.
 * @param table a Hive {@link Table}
 * @return true if it is an Avro table
 */
public static boolean isHiveTableAvroType(Table table) {
  String serializationLib = table.getTTable().getSd().getSerdeInfo().getSerializationLib();
  String inputFormat = table.getTTable().getSd().getInputFormat();
  String outputFormat = table.getTTable().getSd().getOutputFormat();

  return inputFormat.endsWith(AvroContainerInputFormat.class.getSimpleName())
      || outputFormat.endsWith(AvroContainerOutputFormat.class.getSimpleName())
      || serializationLib.endsWith(AvroSerDe.class.getSimpleName());
}
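A short, assumed call site for the check above; the surrounding copy logic is hypothetical, not from the Gobblin source.

// Hypothetical guard: only run Avro-specific schema handling for Avro tables.
if (HiveAvroCopyEntityHelper.isHiveTableAvroType(table)) {
  // e.g. copy or rewrite the table's Avro schema files here
}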
Example #4
Source File: HiveMetaStoreUtils.java, from incubator-gobblin (Apache License 2.0)
private static StorageDescriptor getStorageDescriptor(HiveRegistrationUnit unit) {
  State props = unit.getStorageProps();
  StorageDescriptor sd = new StorageDescriptor();
  sd.setParameters(getParameters(props));
  // Treat AVRO and other formats differently. Details can be found in GOBBLIN-877.
  if (unit.isRegisterSchema() ||
      (unit.getInputFormat().isPresent() && !unit.getInputFormat().get().equals(AvroContainerInputFormat.class.getName()))) {
    sd.setCols(getFieldSchemas(unit));
  }
  if (unit.getLocation().isPresent()) {
    sd.setLocation(unit.getLocation().get());
  }
  if (unit.getInputFormat().isPresent()) {
    sd.setInputFormat(unit.getInputFormat().get());
  }
  if (unit.getOutputFormat().isPresent()) {
    sd.setOutputFormat(unit.getOutputFormat().get());
  }
  if (unit.getIsCompressed().isPresent()) {
    sd.setCompressed(unit.getIsCompressed().get());
  }
  if (unit.getNumBuckets().isPresent()) {
    sd.setNumBuckets(unit.getNumBuckets().get());
  }
  if (unit.getBucketColumns().isPresent()) {
    sd.setBucketCols(unit.getBucketColumns().get());
  }
  if (unit.getIsStoredAsSubDirs().isPresent()) {
    sd.setStoredAsSubDirectories(unit.getIsStoredAsSubDirs().get());
  }
  sd.setSerdeInfo(getSerDeInfo(unit));
  return sd;
}
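Note the design choice in the first branch: for an Avro table whose schema is not being explicitly registered, the column list is deliberately left off the StorageDescriptor, apparently so that the Avro schema carried in the serde properties (avro.schema.literal or avro.schema.url) remains the single source of truth. GOBBLIN-877, referenced in the comment, has the details.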
Example #5
Source File: HiveMetaStoreUtilsTest.java, from incubator-gobblin (Apache License 2.0)
@Test
public void testGetTableAvro() {
  final String databaseName = "testdb";
  final String tableName = "testtable";

  HiveTable.Builder builder = new HiveTable.Builder();
  builder.withDbName(databaseName).withTableName(tableName);

  State serdeProps = new State();
  serdeProps.setProp("avro.schema.literal", "{\"type\": \"record\", \"name\": \"TestEvent\","
      + " \"namespace\": \"test.namespace\", \"fields\": [{\"name\":\"a\","
      + " \"type\": \"int\"}]}");
  builder.withSerdeProps(serdeProps);

  HiveTable hiveTable = builder.build();
  hiveTable.setInputFormat(AvroContainerInputFormat.class.getName());
  hiveTable.setOutputFormat(AvroContainerOutputFormat.class.getName());
  hiveTable.setSerDeType(AvroSerDe.class.getName());

  Table table = HiveMetaStoreUtils.getTable(hiveTable);
  Assert.assertEquals(table.getDbName(), databaseName);
  Assert.assertEquals(table.getTableName(), tableName);

  StorageDescriptor sd = table.getSd();
  Assert.assertEquals(sd.getInputFormat(), AvroContainerInputFormat.class.getName());
  Assert.assertEquals(sd.getOutputFormat(), AvroContainerOutputFormat.class.getName());
  Assert.assertNotNull(sd.getSerdeInfo());
  Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), AvroSerDe.class.getName());

  List<FieldSchema> fields = sd.getCols();
  Assert.assertTrue(fields != null && fields.size() == 1);
  FieldSchema fieldA = fields.get(0);
  Assert.assertEquals(fieldA.getName(), "a");
  Assert.assertEquals(fieldA.getType(), "int");
}
Example #6
Source File: HiveMetaStoreUtilsTest.java, from incubator-gobblin (Apache License 2.0)
@Test
public void testGetTableAvroInvalidSchema() {
  final String databaseName = "testdb";
  final String tableName = "testtable";

  HiveTable.Builder builder = new HiveTable.Builder();
  builder.withDbName(databaseName).withTableName(tableName);

  State serdeProps = new State();
  serdeProps.setProp("avro.schema.literal", "invalid schema");
  builder.withSerdeProps(serdeProps);

  HiveTable hiveTable = builder.build();
  hiveTable.setInputFormat(AvroContainerInputFormat.class.getName());
  hiveTable.setOutputFormat(AvroContainerOutputFormat.class.getName());
  hiveTable.setSerDeType(AvroSerDe.class.getName());

  Table table = HiveMetaStoreUtils.getTable(hiveTable);
  Assert.assertEquals(table.getDbName(), databaseName);
  Assert.assertEquals(table.getTableName(), tableName);

  StorageDescriptor sd = table.getSd();
  Assert.assertEquals(sd.getInputFormat(), AvroContainerInputFormat.class.getName());
  Assert.assertEquals(sd.getOutputFormat(), AvroContainerOutputFormat.class.getName());
  Assert.assertNotNull(sd.getSerdeInfo());
  Assert.assertEquals(sd.getSerdeInfo().getSerializationLib(), AvroSerDe.class.getName());

  // An unparseable avro.schema.literal yields no columns.
  List<FieldSchema> fields = sd.getCols();
  Assert.assertTrue(fields != null && fields.size() == 0);
}
Example #7
Source File: HiveMetaStoreUtilsTest.java, from incubator-gobblin (Apache License 2.0)
@Test
public void testGetHiveTable() throws Exception {
  final String databaseName = "testdb";
  final String tableName = "testtable";
  final String tableSdLoc = "/tmp/testtable";
  final String partitionName = "partitionName";

  State serdeProps = new State();
  serdeProps.setProp("avro.schema.literal", "{\"type\": \"record\", \"name\": \"TestEvent\","
      + " \"namespace\": \"test.namespace\", \"fields\": [{\"name\":\"testName\","
      + " \"type\": \"int\"}]}");

  List<FieldSchema> fieldSchemas = new ArrayList<>();
  fieldSchemas.add(new FieldSchema("testName", "int", "testContent"));
  SerDeInfo si = new SerDeInfo();
  si.setParameters(getParameters(serdeProps));
  si.setName(tableName);

  StorageDescriptor sd = new StorageDescriptor(fieldSchemas, tableSdLoc,
      AvroContainerInputFormat.class.getName(), AvroContainerOutputFormat.class.getName(),
      false, 0, si, null, Lists.<Order>newArrayList(), null);
  sd.setParameters(getParameters(serdeProps));

  Table table = new Table(tableName, databaseName, "testOwner", 0, 0, 0, sd,
      Lists.<FieldSchema>newArrayList(), Maps.<String, String>newHashMap(), "", "", "");
  table.addToPartitionKeys(new FieldSchema(partitionName, "string", "some comment"));

  HiveTable hiveTable = HiveMetaStoreUtils.getHiveTable(table);
  Assert.assertEquals(hiveTable.getDbName(), databaseName);
  Assert.assertEquals(hiveTable.getTableName(), tableName);
  Assert.assertTrue(hiveTable.getInputFormat().isPresent());
  Assert.assertTrue(hiveTable.getOutputFormat().isPresent());
  Assert.assertEquals(hiveTable.getInputFormat().get(), AvroContainerInputFormat.class.getName());
  Assert.assertEquals(hiveTable.getOutputFormat().get(), AvroContainerOutputFormat.class.getName());
  Assert.assertNotNull(hiveTable.getSerDeType());

  List<HiveRegistrationUnit.Column> fields = hiveTable.getColumns();
  Assert.assertTrue(fields != null && fields.size() == 1);
  HiveRegistrationUnit.Column fieldA = fields.get(0);
  Assert.assertEquals(fieldA.getName(), "testName");
  Assert.assertEquals(fieldA.getType(), "int");
}