org.apache.parquet.example.data.Group#add

Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Creates the temporary Parquet file used by the benchmark.
 *
 * <p>The generated data is arranged by the current {@code characteristic} and
 * written as one {@link SimpleGroup} per value: column 0 holds the value and
 * columns 1 to 5 each hold a string from {@code dummyGenerator}.
 *
 * @throws IOException if the temporary file cannot be created or written
 */
@Setup
public void writeFile() throws IOException {
  WriteConfigurator configurator = getWriteConfigurator();
  String tempPrefix = "benchmark-filtering_" + characteristic + '_' + configurator + '_';
  file = new Path(Files.createTempFile(tempPrefix, ".parquet").toAbsolutePath().toString());

  long[] values = generateData();
  characteristic.arrangeData(values);

  try (ParquetWriter<Group> out = configurator.configureBuilder(
      ExampleParquetWriter.builder(file)
          .config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString())
          .withRowGroupSize(Integer.MAX_VALUE) // Ensure to have one row-group per file only
          .withWriteMode(OVERWRITE))
      .build()) {
    for (int i = 0; i < values.length; i++) {
      Group row = new SimpleGroup(SCHEMA);
      row.add(0, values[i]);
      // Columns 1..5 are filled with generated strings, one generator call per column.
      for (int column = 1; column <= 5; column++) {
        row.add(column, Binary.fromString(dummyGenerator.nextString()));
      }
      out.write(row);
    }
  }
}

Source File: TestTupleRecordConsumer.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Builds one nested document (DocId, Links with Backward/Forward bags, and a
 * Name bag containing a Language bag and a Url) for the Pig schema below and
 * hands it to {@code testFromGroups}.
 */
@Test
public void testArtSchema() throws ExecException, ParserException {
  final String pigSchemaString =
      "DocId:long, "
          + "Links:(Backward:{(long)}, Forward:{(long)}), "
          + "Name:{(Language:{(Code:chararray,Country:chararray)}, Url:chararray)}";

  SimpleGroup document = new SimpleGroup(getMessageType(pigSchemaString));
  document.add("DocId", 1L);

  // Links: one entry in each of the Backward and Forward bags.
  Group links = document.addGroup("Links");
  Group backwardBag = links.addGroup("Backward").addGroup("bag");
  backwardBag.add(0, 1L);
  Group forwardBag = links.addGroup("Forward").addGroup("bag");
  forwardBag.add(0, 1L);

  // Name: a single bag entry holding one Language entry and a Url.
  Group nameBag = document.addGroup("Name").addGroup("bag");
  Group languageBag = nameBag.addGroup("Language").addGroup("bag");
  languageBag.append("Code", "en").append("Country", "US");
  nameBag.add("Url", "http://foo/bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(document));
}

Source File: TestConstants.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Copies the payload, sequence and partition of a {@code TestRecord} into a
 * new {@link SimpleGroup} built on {@code PARQUET_SCHEMA}.
 *
 * @param record the record to convert
 * @return a new group carrying the three record fields
 */
@Override
public Group convertToParquetGroup(TestRecord record) {
  final Group parquetGroup = new SimpleGroup(PARQUET_SCHEMA);

  parquetGroup.add(PAYLOAD_FIELD_NAME, record.getPayload());
  parquetGroup.add(SEQUENCE_FIELD_NAME, Long.valueOf(record.getSequence()));
  parquetGroup.add(PARTITION_FIELD_NAME, record.getPartition());

  return parquetGroup;
}

Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes {@code numRecord} identical records to a new temporary Parquet file.
 *
 * <p>Each record has DocId = 1, Name = "foo", Gender = "male" and a Links group
 * with one Backward value (2) and one Forward value (3). The schema is also
 * published to {@code conf} under {@link GroupWriteSupport#PARQUET_EXAMPLE_SCHEMA}.
 *
 * @param prefix prefix passed to {@code createTempFile}
 * @return the path of the written file
 * @throws IOException if the file cannot be created or written
 */
private String createParquetFile(String prefix) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  // Parameterized writer: avoids the raw type and the unchecked write(...) call.
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", 1L);
      g.add("Name", "foo");
      g.add("Gender", "male");
      Group links = g.addGroup("Links");
      links.add(0, 2L); // Backward
      links.add(1, 3L); // Forward
      writer.write(g);
    }
  }

  return file;
}

Source File: TestParquetWriterAppendBlocks.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates the two source files used by the tests.
 *
 * <p>Each file receives {@code FILE_SIZE} records with an "id" and a random
 * UUID "string". Ids in the first file are {@code 0..FILE_SIZE-1}; ids in the
 * second are {@code FILE_SIZE..2*FILE_SIZE-1}. Every written group is also
 * recorded in {@code file1content} / {@code file2content}.
 *
 * @throws IOException if a file cannot be created or written
 */
@Before
public void createSourceData() throws IOException {
  this.file1 = newTemp();
  this.file2 = newTemp();

  // try-with-resources closes both writers even when building the second
  // writer or writing a record fails; previously they leaked on any exception.
  try (ParquetWriter<Group> writer1 = ExampleParquetWriter.builder(file1)
           .withType(FILE_SCHEMA)
           .build();
       ParquetWriter<Group> writer2 = ExampleParquetWriter.builder(file2)
           .withType(FILE_SCHEMA)
           .build()) {
    for (int i = 0; i < FILE_SIZE; i += 1) {
      Group group1 = GROUP_FACTORY.newGroup();
      group1.add("id", i);
      group1.add("string", UUID.randomUUID().toString());
      writer1.write(group1);
      file1content.add(group1);

      Group group2 = GROUP_FACTORY.newGroup();
      group2.add("id", FILE_SIZE+i);
      group2.add("string", UUID.randomUUID().toString());
      writer2.write(group2);
      file2content.add(group2);
    }
  }
}

Source File: TestInputOutputFormatWithPadding.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Emits one group per character of the input line: field 0 is a random UUID
 * string, field 1 is the single character. The output key is always null.
 */
@Override
protected void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  final String text = value.toString();
  final int length = text.length();
  int pos = 0;
  while (pos < length) {
    Group row = GROUP_FACTORY.newGroup();
    Binary uuid = Binary.fromString(UUID.randomUUID().toString());
    row.add(0, uuid);
    Binary character = Binary.fromString(text.substring(pos, pos + 1));
    row.add(1, character);
    context.write(null, row);
    pos += 1;
  }
}

Source File: TestInputFormatColumnProjection.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Splits the incoming line into single characters and writes each one, paired
 * with a freshly generated UUID string, as a two-field group under a null key.
 */
@Override
protected void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  final String content = value.toString();
  for (int start = 0, end = 1; start < content.length(); start++, end++) {
    final Group record = GROUP_FACTORY.newGroup();
    // field 0: random UUID
    record.add(0, Binary.fromString(UUID.randomUUID().toString()));
    // field 1: the character at this position
    record.add(1, Binary.fromString(content.substring(start, end)));
    context.write(null, record);
  }
}

Source File: TestMultipleWriteRead.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Generates one random record.
 *
 * <p>The record has a random "id", a random "name", a "phone_numbers" group
 * with fewer than {@code PHONE_NUMBERS_MAX_SIZE} entries, and a "comment" that
 * is omitted when the random draw falls below {@code COMMENT_NULL_RATIO}.
 *
 * @return the generated group
 */
@Override
public Group get() {
  Group group = factory.newGroup();
  group.add("id", random.nextInt());
  group.add("name", getString(NAME_MIN_SIZE, NAME_MAX_SIZE));
  Group phoneNumbers = group.addGroup("phone_numbers");
  final long phoneNumberRange = MAX_PHONE_NUMBER - MIN_PHONE_NUMBER;
  for (int i = 0, n = random.nextInt(PHONE_NUMBERS_MAX_SIZE); i < n; ++i) {
    Group phoneNumber = phoneNumbers.addGroup(0);
    // nextLong() may be negative and '%' keeps the dividend's sign, which let
    // values fall below MIN_PHONE_NUMBER. floorMod yields [0, range), so the
    // result stays within [MIN_PHONE_NUMBER, MAX_PHONE_NUMBER).
    long number = Math.floorMod(random.nextLong(), phoneNumberRange) + MIN_PHONE_NUMBER;
    phoneNumber.add(0, number);
  }
  if (random.nextDouble() >= COMMENT_NULL_RATIO) {
    group.add("comment", getString(0, COMMENT_MAX_SIZE));
  }
  return group;
}

Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes the first {@code numRecord} entries of {@code testDocs} to a new
 * temporary Parquet file using the given writer settings.
 *
 * <p>The schema is also published to {@code conf} under
 * {@link GroupWriteSupport#PARQUET_EXAMPLE_SCHEMA}.
 *
 * @param conf          configuration handed to the writer (mutated: the schema is set on it)
 * @param extraMeta     extra key/value metadata passed to the writer builder
 * @param numRecord     number of records to write; indexes into the {@code testDocs} arrays
 * @param prefix        prefix passed to {@code createTempFile}
 * @param codec         name of a {@link CompressionCodecName} constant
 * @param writerVersion writer version passed to the builder
 * @param pageSize      page size passed to the builder
 * @param testDocs      source of the per-record field values
 * @return the path of the written file
 * @throws IOException if the file cannot be created or written
 */
private String createParquetFile(Configuration conf, Map<String, String> extraMeta, int numRecord, String prefix, String codec,
                                       ParquetProperties.WriterVersion writerVersion, int pageSize, TestDocs testDocs) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, BINARY, "Backward"),
      new PrimitiveType(REPEATED, BINARY, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file))
    .withConf(conf)
    .withWriterVersion(writerVersion)
    .withExtraMetaData(extraMeta)
    .withDictionaryEncoding("DocId", true)
    .withValidation(true)
    .enablePageWriteChecksum()
    .withPageSize(pageSize)
    .withCompressionCodec(CompressionCodecName.valueOf(codec));
  // Parameterized writer: avoids the raw type and the unchecked write(...) call.
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", testDocs.docId[i]);
      g.add("Name", testDocs.name[i]);
      g.add("Gender", testDocs.gender[i]);
      Group links = g.addGroup("Links");
      links.add(0, testDocs.linkBackward[i]); // Backward
      links.add(1, testDocs.linkForward[i]);  // Forward
      writer.write(g);
    }
  }

  return file;
}

Source File: TestFiltersWithMissingColumns.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates {@code test.parquet} in the temporary folder with 1000 rows of the
 * form {@code (id = i, data = "data-" + i)} for {@code i} in {@code [0, 1000)}
 * and stores its location in {@code path}.
 *
 * @throws Exception if the file cannot be created or written
 */
@Before
public void createDataFile() throws Exception {
  File file = temp.newFile("test.parquet");
  this.path = new Path(file.toString());

  MessageType type = Types.buildMessage()
      .required(INT64).named("id")
      .required(BINARY).as(UTF8).named("data")
      .named("test");

  SimpleGroupFactory factory = new SimpleGroupFactory(type);

  // try-with-resources keeps the original write failure as the primary
  // exception if close() fails as well (the manual finally block replaced it).
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withType(type)
      .build()) {
    for (long i = 0; i < 1000; i += 1) {
      Group g = factory.newGroup();
      g.add(0, i);
      g.add(1, "data-" + i);
      writer.write(g);
    }
  }
}

Source File: ParquetResolver.java From pxf with Apache License 2.0

4 votes

/**
 * Adds the value of {@code field} to {@code group} at position {@code index},
 * converting it according to the Parquet primitive type of {@code type}.
 * A {@code null} field value adds nothing.
 *
 * @param index position of the field inside the group
 * @param field holder of the value to write
 * @param group target group
 * @param type  Parquet type of the field; must be a primitive type
 * @throws IOException if the primitive type is not supported
 */
private void fillGroup(int index, OneField field, Group group, Type type) throws IOException {
    if (field.val == null) {
        return;
    }

    switch (type.asPrimitiveType().getPrimitiveTypeName()) {
        case BINARY:
            // UTF8-annotated binaries arrive as String, everything else as raw bytes
            if (type.getOriginalType() != OriginalType.UTF8) {
                group.add(index, Binary.fromReusedByteArray((byte[]) field.val));
            } else {
                group.add(index, (String) field.val);
            }
            break;
        case INT32:
            if (type.getOriginalType() != OriginalType.INT_16) {
                group.add(index, (Integer) field.val);
            } else {
                group.add(index, (Short) field.val);
            }
            break;
        case INT64:
            group.add(index, (Long) field.val);
            break;
        case DOUBLE:
            group.add(index, (Double) field.val);
            break;
        case FLOAT:
            group.add(index, (Float) field.val);
            break;
        case FIXED_LEN_BYTE_ARRAY: {
            // From org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter#decimalToBinary
            String decimalText = (String) field.val;
            int precision = Math.min(HiveDecimal.MAX_PRECISION, type.asPrimitiveType().getDecimalMetadata().getPrecision());
            int scale = Math.min(HiveDecimal.MAX_SCALE, type.asPrimitiveType().getDecimalMetadata().getScale());
            HiveDecimal decimal = HiveDecimal.enforcePrecisionScale(HiveDecimal.create(decimalText), precision, scale);

            if (decimal == null) {
                // When precision is higher than HiveDecimal.MAX_PRECISION
                // and enforcePrecisionScale returns null, the value cannot
                // be stored in Parquet because the precision is exceeded.
                // To stay consistent with Hive's behavior on Parquet-backed
                // tables, the value is stored as null (nothing is added).
                return;
            }

            byte[] unscaledBytes = decimal.bigIntegerBytesScaled(scale);

            // Estimated number of bytes needed.
            int byteCount = ParquetFileAccessor.PRECISION_TO_BYTE_COUNT[precision - 1];
            byte[] stored;
            if (byteCount == unscaledBytes.length) {
                // No padding needed.
                stored = unscaledBytes;
            } else {
                stored = new byte[byteCount];
                if (decimal.signum() == -1) {
                    // Negative number: start from all bits set so the padding is ones.
                    for (int pos = 0; pos < byteCount; pos++) {
                        stored[pos] = (byte) 0xFF;
                    }
                }
                // Right-align the value; the leading bytes keep the zero/one padding.
                System.arraycopy(unscaledBytes, 0, stored, byteCount - unscaledBytes.length, unscaledBytes.length);
            }
            group.add(index, Binary.fromReusedByteArray(stored));
            // end -- org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter#decimalToBinary
            break;
        }
        case INT96: { // SQL standard timestamp string value with or without time zone literals: https://www.postgresql.org/docs/9.4/datatype-datetime.html
            String timestampText = (String) field.val;
            if (!TIMESTAMP_PATTERN.matcher(timestampText).find()) {
                group.add(index, ParquetTypeConverter.getBinaryFromTimestamp(timestampText));
            } else {
                // Note: converting "timestamp with time zone" loses the time zone information
                // while preserving the correct value (Parquet doesn't support timestamp with time zone).
                group.add(index, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone(timestampText));
            }
            break;
        }
        case BOOLEAN:
            group.add(index, (Boolean) field.val);
            break;
        default:
            throw new IOException("Not supported type " + type.asPrimitiveType().getPrimitiveTypeName());
    }
}

Source File: ParquetResolverTest.java From pxf with Apache License 2.0

4 votes

/**
 * Resolves a hand-built group with repeated primitive columns and checks that
 * every column is rendered as the expected TEXT array; column 1 is deliberately
 * left empty.
 */
@Test
public void testGetFields_Primitive_Repeated_Synthetic() {
    // no Parquet file is read here; the Group object is assembled synthetically
    schema = getParquetSchemaForPrimitiveTypes(Type.Repetition.REPEATED, true);
    // the schema has changed, so metadata and tuple description are set again
    context.setMetadata(schema);
    context.setTupleDescription(getColumnDescriptorsFromSchema(schema));
    resolver.initialize(context);

    /*
    Corresponding DB column types are:
    TEXT,TEXT,INTEGER, DOUBLE PRECISION,NUMERIC,TIMESTAMP,REAL,BIGINT,BOOLEAN,SMALLINT,SMALLINT,VARCHAR(5),CHAR(3),BYTEA
     */

    Group row = new SimpleGroup(schema);

    row.add(0, "row1-1");
    row.add(0, "row1-2");

    // column 1 (t2) is left unset as part of the test

    row.add(2, 1);
    row.add(2, 2);
    row.add(2, 3);

    row.add(3, 6.0d);
    row.add(3, -16.34d);

    // 16-byte big-endian decimal: sign-fill bytes first, then the unscaled value
    BigDecimal decimal = new BigDecimal("12345678.9012345987654321"); // place of dot doesn't matter
    byte[] unscaledBytes = decimal.unscaledValue().toByteArray();
    byte signFill = (byte) (decimal.signum() < 0 ? 0xFF : 0x00);
    byte[] decimalBytes = new byte[16];
    int padLength = decimalBytes.length - unscaledBytes.length;
    for (int pos = 0; pos < decimalBytes.length; pos++) {
        if (pos < padLength) {
            decimalBytes[pos] = signFill;
        } else {
            decimalBytes[pos] = unscaledBytes[pos - padLength];
        }
    }
    row.add(4, Binary.fromReusedByteArray(decimalBytes));

    row.add(5, ParquetTypeConverter.getBinaryFromTimestamp("2019-03-14 14:10:28"));
    row.add(5, ParquetTypeConverter.getBinaryFromTimestamp("1969-12-30 05:42:23.211211"));

    row.add(6, 7.7f);
    row.add(6, -12345.35354646f);

    row.add(7, 23456789L);
    row.add(7, -123456789012345L);

    row.add(8, true);
    row.add(8, false);

    row.add(9, (short) 1);
    row.add(9, (short) -3);

    row.add(10, (short) 269);
    row.add(10, (short) -313);

    row.add(11, Binary.fromString("Hello"));
    row.add(11, Binary.fromString("World"));

    row.add(12, Binary.fromString("foo"));
    row.add(12, Binary.fromString("bar"));

    byte[] firstBytes = new byte[]{(byte) 49, (byte) 50, (byte) 51};
    row.add(13, Binary.fromReusedByteArray(firstBytes, 0, 3));
    byte[] secondBytes = new byte[]{(byte) 52, (byte) 53, (byte) 54};
    row.add(13, Binary.fromReusedByteArray(secondBytes, 0, 3));

    // Expected timestamps are the same instants rendered in the system default zone.
    DateTimeFormatter localFormat = DateTimeFormatter.ofPattern("[yyyy-MM-dd HH:mm:ss]");

    row.add(14, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone("2019-03-14 14:10:28+07"));
    String expectedLocal1 = OffsetDateTime.parse("2019-03-14T14:10:28+07:00")
            .atZoneSameInstant(ZoneId.systemDefault())
            .format(localFormat);

    row.add(15, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone("2019-03-14 14:10:28-07:30"));
    String expectedLocal2 = OffsetDateTime.parse("2019-03-14T14:10:28-07:30")
            .atZoneSameInstant(ZoneId.systemDefault())
            .format(localFormat);

    List<Group> rows = new ArrayList<>();
    rows.add(row);
    List<OneField> fields = assertRow(rows, 0, 16);

    assertField(fields, 0, "[\"row1-1\",\"row1-2\"]", DataType.TEXT);
    assertField(fields, 1, "[]", DataType.TEXT);
    assertField(fields, 2, "[1,2,3]", DataType.TEXT);
    assertField(fields, 3, "[6.0,-16.34]", DataType.TEXT);
    assertField(fields, 4, "[123456.789012345987654321]", DataType.TEXT); // scale fixed to 18 in schema
    assertField(fields, 5, "[\"2019-03-14 14:10:28\",\"1969-12-30 05:42:23.211211\"]", DataType.TEXT);
    assertField(fields, 6, "[7.7,-12345.354]", DataType.TEXT); // rounded to the precision of 8
    assertField(fields, 7, "[23456789,-123456789012345]", DataType.TEXT);
    assertField(fields, 8, "[true,false]", DataType.TEXT);
    assertField(fields, 9, "[1,-3]", DataType.TEXT);
    assertField(fields, 10, "[269,-313]", DataType.TEXT);
    assertField(fields, 11, "[\"Hello\",\"World\"]", DataType.TEXT);
    assertField(fields, 12, "[\"foo\",\"bar\"]", DataType.TEXT); // 3 chars only
    // byte arrays are Base64 encoded into strings
    Base64.Encoder base64 = Base64.getEncoder();
    String expectedByteArrays =
            "[\"" + base64.encodeToString(firstBytes) + "\",\"" + base64.encodeToString(secondBytes) + "\"]";
    assertField(fields, 13, expectedByteArrays, DataType.TEXT);
    assertField(fields, 14, "[\"" + expectedLocal1 + "\"]", DataType.TEXT);
    assertField(fields, 15, "[\"" + expectedLocal2 + "\"]", DataType.TEXT);
}

Source File: ApacheParquet.java From sylph with Apache License 2.0

4 votes

/**
 * Adds {@code value} to {@code group} at position {@code index}, converting it
 * according to {@code dataType}.
 *
 * <p>{@code null} and empty-string values add nothing. Numeric types are parsed
 * from {@code value.toString()}. A {@code Map} value is written as a group of
 * "key_value" entries whose key and value are both stored as strings; entries
 * with a {@code null} value are skipped. Any other type is stored as its
 * string form.
 *
 * @param dataType declared type of the column
 * @param group    target group
 * @param index    position of the field inside the group
 * @param value    value to convert and add; may be {@code null}
 */
private void addValueToGroup(Class<?> dataType, Group group, int index, Object value)
{
    if (value == null || "".equals(value)) {
        return;
    }
    if (dataType == Binary.class) {
        group.add(index, value.toString());
    }
    else if (dataType == byte.class) {
        group.add(index, Byte.parseByte(value.toString()));
    }
    else if (dataType == short.class) {
        group.add(index, Short.parseShort(value.toString()));
    }
    else if (dataType == int.class) {
        group.add(index, Integer.parseInt(value.toString()));
    }
    else if (dataType == long.class) {
        group.add(index, Long.parseLong(value.toString()));
    }
    else if (dataType == double.class) {
        group.add(index, Double.parseDouble(value.toString()));
    }
    else if (dataType == float.class) {
        group.add(index, Float.parseFloat(value.toString()));
    }
    else if (dataType == Map.class) {
        Group mapFieldGroup = new SimpleGroup(mapTopSchema);
        // The column is declared as Map; the key/value types are assumed by the cast.
        @SuppressWarnings("unchecked")
        Map<String, Object> entries = (Map<String, Object>) value;
        for (Map.Entry<String, Object> entry : entries.entrySet()) {
            final String key = entry.getKey();
            final Object entryValue = entry.getValue();
            if (entryValue == null) {
                continue;
            }
            Group keyValueGroup = new SimpleGroup(kvSchema);
            keyValueGroup.add("key", key);
            keyValueGroup.add("value", entryValue.toString());
            mapFieldGroup.add("key_value", keyValueGroup);
        }
        group.add(index, mapFieldGroup);
    }
    else {
        group.add(index, value.toString());
    }
}

Java Code Examples for org.apache.parquet.example.data.Group#add()