Java Code Examples for org.apache.parquet.schema.OriginalType#UTF8

The following examples show how to use org.apache.parquet.schema.OriginalType#UTF8. You can vote up the ones you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: TestMetadataReader.java    From presto with Apache License 2.0 6 votes vote down vote up
@Test(dataProvider = "allCreatedBy")
public void testReadStatsBinaryUtf8(Optional<String> fileCreatedBy)
{
    // An OPTIONAL BINARY column annotated as UTF8, i.e. a varchar column.
    PrimitiveType varchar = new PrimitiveType(OPTIONAL, BINARY, "Test column", OriginalType.UTF8);

    // Expected raw bytes: 'a' is single-byte ASCII; 'é' encodes to 0xC3 0xA9 in UTF-8.
    byte[] expectedMin = new byte[] {'a'};
    byte[] expectedMax = new byte[] {(byte) 0xC3, (byte) 0xA9};

    // Stats written by Parquet after https://issues.apache.org/jira/browse/PARQUET-1025
    Statistics stats = new Statistics();
    stats.setNull_count(13);
    stats.setMin_value("a".getBytes(UTF_8));
    stats.setMax_value("é".getBytes(UTF_8));

    // Every accessor pair (typed, raw-bytes, generic) must surface the same min/max.
    assertThat(MetadataReader.readStats(fileCreatedBy, Optional.of(stats), varchar))
            .isInstanceOfSatisfying(BinaryStatistics.class, columnStatistics -> {
                assertEquals(columnStatistics.getNumNulls(), 13);
                assertEquals(columnStatistics.getMin().getBytes(), expectedMin);
                assertEquals(columnStatistics.getMax().getBytes(), expectedMax);
                assertEquals(columnStatistics.getMinBytes(), expectedMin);
                assertEquals(columnStatistics.getMaxBytes(), expectedMax);
                assertEquals(columnStatistics.genericGetMin().getBytes(), expectedMin);
                assertEquals(columnStatistics.genericGetMax().getBytes(), expectedMax);
            });
}
 
Example 2
Source File: MetadataReader.java    From presto with Apache License 2.0 4 votes vote down vote up
/**
 * Maps a Thrift-level {@code ConvertedType} (as read from the Parquet file footer)
 * to the corresponding parquet-mr {@code OriginalType} enum constant.
 *
 * <p>This is a strict 1:1 name-for-name mapping; there is no semantic conversion.
 *
 * @param type the converted type read from the file metadata
 * @return the matching {@code OriginalType}
 * @throws IllegalArgumentException if {@code type} has no known mapping
 *         (e.g. a converted type added to the format after this code was written)
 */
private static OriginalType getOriginalType(ConvertedType type)
{
    switch (type) {
        case UTF8:
            return OriginalType.UTF8;
        case MAP:
            return OriginalType.MAP;
        case MAP_KEY_VALUE:
            return OriginalType.MAP_KEY_VALUE;
        case LIST:
            return OriginalType.LIST;
        case ENUM:
            return OriginalType.ENUM;
        case DECIMAL:
            return OriginalType.DECIMAL;
        case DATE:
            return OriginalType.DATE;
        case TIME_MILLIS:
            return OriginalType.TIME_MILLIS;
        case TIMESTAMP_MILLIS:
            return OriginalType.TIMESTAMP_MILLIS;
        case INTERVAL:
            return OriginalType.INTERVAL;
        case INT_8:
            return OriginalType.INT_8;
        case INT_16:
            return OriginalType.INT_16;
        case INT_32:
            return OriginalType.INT_32;
        case INT_64:
            return OriginalType.INT_64;
        case UINT_8:
            return OriginalType.UINT_8;
        case UINT_16:
            return OriginalType.UINT_16;
        case UINT_32:
            return OriginalType.UINT_32;
        case UINT_64:
            return OriginalType.UINT_64;
        case JSON:
            return OriginalType.JSON;
        case BSON:
            return OriginalType.BSON;
        case TIMESTAMP_MICROS:
            return OriginalType.TIMESTAMP_MICROS;
        case TIME_MICROS:
            return OriginalType.TIME_MICROS;
        default:
            throw new IllegalArgumentException("Unknown converted type " + type);
    }
}
 
Example 3
Source File: ParquetResolver.java    From pxf with Apache License 2.0 4 votes vote down vote up
/**
 * Writes a single field value into the given Parquet {@code Group} at {@code index},
 * dispatching on the column's Parquet primitive type. A {@code null} value is a no-op
 * (the field is simply omitted, which Parquet records as null for OPTIONAL columns).
 *
 * <p>NOTE(review): the expected runtime type of {@code field.val} per branch
 * (String for BINARY/UTF8, byte[] for raw BINARY, Short/Integer for INT32, etc.)
 * is implied by the casts below and must be guaranteed by the caller — confirm
 * against the resolver that produces {@code OneField}.
 *
 * @param index position of the field within the group's schema
 * @param field the source value holder; {@code field.val} may be null
 * @param group the Parquet group being populated
 * @param type  the Parquet schema type of this field (must be a primitive type)
 * @throws IOException if the primitive type is not supported
 */
private void fillGroup(int index, OneField field, Group group, Type type) throws IOException {
    if (field.val == null)
        return;
    switch (type.asPrimitiveType().getPrimitiveTypeName()) {
        case BINARY:
            // UTF8-annotated binary is a text column; otherwise treat as raw bytes.
            if (type.getOriginalType() == OriginalType.UTF8)
                group.add(index, (String) field.val);
            else
                group.add(index, Binary.fromReusedByteArray((byte[]) field.val));
            break;
        case INT32:
            // INT_16 original type means the value arrived as a Short.
            if (type.getOriginalType() == OriginalType.INT_16)
                group.add(index, (Short) field.val);
            else
                group.add(index, (Integer) field.val);
            break;
        case INT64:
            group.add(index, (Long) field.val);
            break;
        case DOUBLE:
            group.add(index, (Double) field.val);
            break;
        case FLOAT:
            group.add(index, (Float) field.val);
            break;
        case FIXED_LEN_BYTE_ARRAY:
            // Decimal encoding, ported from:
            // From org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter#decimalToBinary
            String value = (String) field.val;
            // Clamp declared precision/scale to Hive's supported maximums.
            int precision = Math.min(HiveDecimal.MAX_PRECISION, type.asPrimitiveType().getDecimalMetadata().getPrecision());
            int scale = Math.min(HiveDecimal.MAX_SCALE, type.asPrimitiveType().getDecimalMetadata().getScale());
            HiveDecimal hiveDecimal = HiveDecimal.enforcePrecisionScale(
                    HiveDecimal.create(value),
                    precision,
                    scale);

            if (hiveDecimal == null) {
                // When precision is higher than HiveDecimal.MAX_PRECISION
                // and enforcePrecisionScale returns null, it means we
                // cannot store the value in Parquet because we have
                // exceeded the precision. To make the behavior consistent
                // with Hive's behavior when storing on a Parquet-backed
                // table, we store the value as null.
                return;
            }

            // Two's-complement big-endian representation of the unscaled value.
            byte[] decimalBytes = hiveDecimal.bigIntegerBytesScaled(scale);

            // Estimated number of bytes needed.
            int precToBytes = ParquetFileAccessor.PRECISION_TO_BYTE_COUNT[precision - 1];
            if (precToBytes == decimalBytes.length) {
                // No padding needed.
                group.add(index, Binary.fromReusedByteArray(decimalBytes));
            } else {
                // The fixed-length slot is wider than the value: sign-extend on the left.
                byte[] tgt = new byte[precToBytes];
                if (hiveDecimal.signum() == -1) {
                    // For negative number, initializing bits to 1
                    for (int i = 0; i < precToBytes; i++) {
                        tgt[i] |= 0xFF;
                    }
                }
                System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, decimalBytes.length); // Padding leading zeroes/ones.
                group.add(index, Binary.fromReusedByteArray(tgt));
            }
            // end -- org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter#decimalToBinary
            break;
        case INT96:  // SQL standard timestamp string value with or without time zone literals: https://www.postgresql.org/docs/9.4/datatype-datetime.html
            String timestamp = (String) field.val;
            if (TIMESTAMP_PATTERN.matcher(timestamp).find()) {
                // Note: this conversion convert type "timestamp with time zone" will lose timezone information
                // while preserving the correct value. (as Parquet doesn't support timestamp with time zone.
                group.add(index, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone(timestamp));
            } else {
                group.add(index, ParquetTypeConverter.getBinaryFromTimestamp(timestamp));
            }
            break;
        case BOOLEAN:
            group.add(index, (Boolean) field.val);
            break;
        default:
            throw new IOException("Not supported type " + type.asPrimitiveType().getPrimitiveTypeName());
    }
}
 
Example 4
Source File: ParquetTypeHelper.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Returns an arrow vector field for a parquet primitive field.
 *
 * @param colPath       schema path of the column
 * @param primitiveType parquet primitive type
 * @param originalType  parquet original type (may be null when the column carries no annotation)
 * @param schemaHelper  schema helper used for type conversions
 * @return arrow vector field
 * @throws UserException (unsupportedError) if the primitive/original type combination is unknown
 */
public static Field createField(SchemaPath colPath,
                                PrimitiveType primitiveType,
                                OriginalType originalType,
                                SchemaDerivationHelper schemaHelper) {
  final String colName = colPath.getAsNamePart().getName();
  switch (primitiveType.getPrimitiveTypeName()) {
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      if (originalType == OriginalType.UTF8) {
        return CompleteType.VARCHAR.toField(colName);
      }
      if (originalType == OriginalType.DECIMAL) {
        return decimalField(primitiveType, colName);
      }
      // No annotation: fall back to the helper's per-column varchar hint, else raw bytes.
      if (schemaHelper.isVarChar(colPath)) {
        return CompleteType.VARCHAR.toField(colName);
      }
      return CompleteType.VARBINARY.toField(colName);
    case BOOLEAN:
      return CompleteType.BIT.toField(colName);
    case DOUBLE:
      return CompleteType.DOUBLE.toField(colName);
    case FLOAT:
      return CompleteType.FLOAT.toField(colName);
    case INT32:
      if (originalType == OriginalType.DATE) {
        return CompleteType.DATE.toField(colName);
      } else if (originalType == OriginalType.TIME_MILLIS) {
        return CompleteType.TIME.toField(colName);
      } else if (originalType == OriginalType.DECIMAL) {
        return decimalField(primitiveType, colName);
      }
      return CompleteType.INT.toField(colName);
    case INT64:
      if (originalType == OriginalType.TIMESTAMP_MILLIS) {
        return CompleteType.TIMESTAMP.toField(colName);
      } else if (originalType == OriginalType.DECIMAL) {
        return decimalField(primitiveType, colName);
      }
      return CompleteType.BIGINT.toField(colName);
    case INT96:
      // INT96 is the legacy Impala/Hive timestamp encoding; otherwise keep raw bytes.
      if (schemaHelper.readInt96AsTimeStamp()) {
        return CompleteType.TIMESTAMP.toField(colName);
      }
      return CompleteType.VARBINARY.toField(colName);
    default:
      throw UserException.unsupportedError()
        .message("Parquet Primitive Type '%s', Original Type '%s' combination not supported. Column '%s'",
          primitiveType.toString(), originalType != null ? originalType : "Not Available", colName)
        .build();
  }
}

/**
 * Builds an arrow decimal field from the parquet decimal metadata (precision/scale)
 * attached to the primitive type. Shared by the BINARY/FIXED_LEN_BYTE_ARRAY, INT32
 * and INT64 decimal branches, which previously duplicated this expression.
 */
private static Field decimalField(PrimitiveType primitiveType, String colName) {
  return CompleteType.fromDecimalPrecisionScale(
      primitiveType.getDecimalMetadata().getPrecision(),
      primitiveType.getDecimalMetadata().getScale())
    .toField(colName);
}
 
Example 5
Source File: PentahoParquetWriteSupport.java    From pentaho-hadoop-shims with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the Parquet schema primitive type for one output field.
 *
 * <p>Repetition is OPTIONAL when the field allows nulls, REQUIRED otherwise.
 * The three DECIMAL variants previously repeated an optional/required builder
 * if/else three times; that logic is now shared in {@link #decimalPrimitiveType}.
 *
 * @param f the output field descriptor (name, logical type, precision/scale, nullability)
 * @return the Parquet primitive type for this field
 * @throws RuntimeException if the field's logical type has no Parquet mapping
 */
private PrimitiveType convertToPrimitiveType( IParquetOutputField f ) {
  Type.Repetition rep = f.getAllowNull() ? Type.Repetition.OPTIONAL : Type.Repetition.REQUIRED;
  String formatFieldName = f.getFormatFieldName();
  switch ( f.getParquetType() ) {
    case BINARY:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.BINARY, formatFieldName );
    case BOOLEAN:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.BOOLEAN, formatFieldName );
    case DOUBLE:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.DOUBLE, formatFieldName );
    case FLOAT:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.FLOAT, formatFieldName );
    case INT_32:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT32, formatFieldName );
    case UTF8:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.BINARY, formatFieldName, OriginalType.UTF8 );
    case INT_64:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT64, formatFieldName, OriginalType.INT_64 );
    case INT_96:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT96, formatFieldName );
    case DATE:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT32, formatFieldName, OriginalType.DATE );
    case DECIMAL:
      return decimalPrimitiveType( f, rep, PrimitiveType.PrimitiveTypeName.BINARY, formatFieldName );
    case DECIMAL_INT_32:
      return decimalPrimitiveType( f, rep, PrimitiveType.PrimitiveTypeName.INT32, formatFieldName );
    case DECIMAL_INT_64:
      return decimalPrimitiveType( f, rep, PrimitiveType.PrimitiveTypeName.INT64, formatFieldName );
    case TIMESTAMP_MILLIS:
      return new PrimitiveType( rep, PrimitiveType.PrimitiveTypeName.INT64, formatFieldName,
        OriginalType.TIMESTAMP_MILLIS );
    default:
      throw new RuntimeException( "Unsupported output type: " + f.getParquetType() );
  }
}

/**
 * Builds a DECIMAL-annotated primitive type with the field's precision and scale,
 * using the given physical type name and repetition.
 */
private PrimitiveType decimalPrimitiveType( IParquetOutputField f, Type.Repetition rep,
                                            PrimitiveType.PrimitiveTypeName typeName, String formatFieldName ) {
  Types.PrimitiveBuilder<PrimitiveType> builder =
    rep == Type.Repetition.OPTIONAL ? Types.optional( typeName ) : Types.required( typeName );
  return builder.as( OriginalType.DECIMAL )
    .precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName );
}