parquet.example.data.Group Java Examples
The following examples show how to use parquet.example.data.Group.
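As a quick orientation before the examples: Group is the mutable record class of parquet-mr's example object model, typically built through a GroupFactory against a MessageType schema. Below is a minimal sketch of constructing one by hand; the schema and values are made up for illustration, and it uses the same pre-namespace-change parquet.* packages the examples below use.

import parquet.example.data.Group;
import parquet.example.data.simple.SimpleGroupFactory;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class GroupSketch {
  public static void main(String[] args) {
    // Parse a message type and build a single record against it.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message stock { required binary symbol (UTF8); required double avg; }");
    SimpleGroupFactory factory = new SimpleGroupFactory(schema);
    Group group = factory.newGroup()
        .append("symbol", "GOOG")  // append(...) returns the Group, so calls chain
        .append("avg", 571.6);
    System.out.println(group);     // prints one "name: value" line per field
  }
}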
Example #1
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
public String toString(String indent) {
  StringBuilder result = new StringBuilder();
  int i = 0;
  for (Type field : this.schema.getFields()) {
    String name = field.getName();
    List<Object> values = this.data[i];
    for (Object value : values) {
      result.append(indent).append(name);
      if (value == null) {
        result.append(": NULL\n");
      } else if (value instanceof Group) {
        result.append("\n").append(((ParquetGroup) value).toString(indent + " "));
      } else {
        result.append(": ").append(value.toString()).append("\n");
      }
    }
    i++;
  }
  return result.toString();
}
Example #2
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
protected List<TestRecord> readParquetFilesGroup(File outputFile) throws IOException {
  ParquetReader<Group> reader = null;
  List<Group> records = new ArrayList<>();
  try {
    reader = new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport());
    for (Group value = reader.read(); value != null; value = reader.read()) {
      records.add(value);
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records.stream().map(value -> new TestRecord(
      value.getInteger(TestConstants.PARTITION_FIELD_NAME, 0),
      value.getInteger(TestConstants.SEQUENCE_FIELD_NAME, 0),
      value.getString(TestConstants.PAYLOAD_FIELD_NAME, 0)
  )).collect(Collectors.toList());
}
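Since ParquetReader implements Closeable, the explicit close/finally block above can also be collapsed with try-with-resources. A sketch under the same test setup (SimpleReadSupport is the helper shown in Example #13 below):

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import parquet.example.data.Group;
import parquet.hadoop.ParquetReader;

static List<Group> readAllGroups(File outputFile) throws IOException {
  List<Group> records = new ArrayList<>();
  try (ParquetReader<Group> reader =
      new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport())) {
    // read() returns null once the file is exhausted.
    for (Group value = reader.read(); value != null; value = reader.read()) {
      records.add(value);
    }
  }
  return records;
}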
Example #3
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
/**
 * Add any object of {@link PrimitiveType} or {@link Group} type with a String key.
 * @param key
 * @param object
 */
public void add(String key, Object object) {
  int fieldIndex = getIndex(key);
  if (object.getClass() == ParquetGroup.class) {
    this.addGroup(key, (Group) object);
  } else {
    this.add(fieldIndex, (Primitive) object);
  }
}
Example #4
Source File: JsonIntermediateToParquetGroupConverterTest.java From incubator-gobblin with Apache License 2.0
private void testCase(String testCaseName) throws SchemaConversionException, DataConversionException {
  JsonObject test = testCases.get(testCaseName).getAsJsonObject();
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  Group record = parquetConverter.convertRecord(schema, test.get("record").getAsJsonObject(), workUnit)
      .iterator().next();

  assertEqualsIgnoreSpaces(schema.toString(), test.get("expectedSchema").getAsString());
  assertEqualsIgnoreSpaces(record.toString(), test.get("expectedRecord").getAsString());
}
Example #5
Source File: TestConstants.java From incubator-gobblin with Apache License 2.0
@Override
public Group convertToParquetGroup(TestRecord record) {
  Group group = new SimpleGroup(PARQUET_SCHEMA);
  group.add(PAYLOAD_FIELD_NAME, record.getPayload());
  group.add(SEQUENCE_FIELD_NAME, Long.valueOf(record.getSequence()).intValue());
  group.add(PARTITION_FIELD_NAME, record.getPartition());
  return group;
}
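For the reverse direction, Example #2 above reads the same three fields back with Group's typed getters. A hypothetical inverse helper (not in the original test), reusing the TestConstants field names:

import parquet.example.data.Group;

// Hypothetical inverse of convertToParquetGroup, mirroring how
// ParquetHdfsDataWriterTest (Example #2) reads the fields back.
public static TestRecord convertFromParquetGroup(Group group) {
  // The second argument is the repetition index; 0 selects the single
  // value of a non-repeated field.
  return new TestRecord(
      group.getInteger(TestConstants.PARTITION_FIELD_NAME, 0),
      group.getInteger(TestConstants.SEQUENCE_FIELD_NAME, 0),
      group.getString(TestConstants.PAYLOAD_FIELD_NAME, 0));
}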
Example #6
Source File: ExampleParquetMapReduce.java From hiped2 with Apache License 2.0
@Override
public void map(Void key, Group value, Context context) throws IOException, InterruptedException {
  context.write(new Text(value.getString("symbol", 0)),
      new DoubleWritable(Double.valueOf(value.getValueToString(2, 0))));
}
Example #7
Source File: ExampleParquetMapReduce.java From hiped2 with Apache License 2.0
@Override
protected void reduce(Text key, Iterable<DoubleWritable> values, Context context)
    throws IOException, InterruptedException {
  Mean mean = new Mean();
  for (DoubleWritable val : values) {
    mean.increment(val.get());
  }
  Group group = factory.newGroup()
      .append("symbol", key.toString())
      .append("avg", mean.getResult());
  context.write(null, group);
}
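For context, mappers and reducers like Examples #6 and #7 are wired to Parquet through the example input/output formats in parquet.hadoop.example. A hedged sketch of the job setup; the output schema string is illustrative, not taken from hiped2:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import parquet.hadoop.example.ExampleInputFormat;
import parquet.hadoop.example.ExampleOutputFormat;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.schema.MessageTypeParser;

static Job configureJob(Configuration conf) throws IOException {
  // The reducer's factory.newGroup() builds records against this output schema.
  GroupWriteSupport.setSchema(
      MessageTypeParser.parseMessageType(
          "message avg { required binary symbol (UTF8); required double avg; }"),
      conf);
  Job job = Job.getInstance(conf);
  job.setInputFormatClass(ExampleInputFormat.class);   // hands the mapper Group values
  job.setOutputFormatClass(ExampleOutputFormat.class); // writes Group values back out
  return job;
}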
Example #8
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
@Override
public Group addGroup(int fieldIndex) {
  ParquetGroup g = new ParquetGroup(this.schema.getType(fieldIndex).asGroupType());
  this.data[fieldIndex].add(g);
  return g;
}
Example #9
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
public Group getGroup(int fieldIndex, int index) {
  return (Group) this.getValue(fieldIndex, index);
}
Example #10
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
/**
 * Add a {@link Group} given a String key.
 * @param key
 * @param object
 */
private void addGroup(String key, Group object) {
  int fieldIndex = getIndex(key);
  // asGroupType() is called for its side effect only: it throws if the field is not a group.
  this.schema.getType(fieldIndex).asGroupType();
  this.data[fieldIndex].add(object);
}
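Examples #8-#10 are the building blocks for nested records: addGroup creates and attaches a child Group, and getGroup retrieves one by field and repetition index. A small sketch against a made-up two-level schema, using the stock SimpleGroupFactory rather than Gobblin's ParquetGroup:

import parquet.example.data.Group;
import parquet.example.data.simple.SimpleGroupFactory;
import parquet.schema.MessageTypeParser;

public class NestedGroupSketch {
  public static void main(String[] args) {
    // Made-up nested schema: a person with a repeated phone group.
    SimpleGroupFactory factory = new SimpleGroupFactory(MessageTypeParser.parseMessageType(
        "message person { required binary name (UTF8); repeated group phone { required binary number (UTF8); } }"));
    Group person = factory.newGroup().append("name", "Ada");
    // addGroup attaches and returns the child, so it can be populated in place.
    person.addGroup("phone").append("number", "555-0100");
    // getGroup takes the field (name or index) plus the repetition index.
    String number = person.getGroup("phone", 0).getString("number", 0);
    System.out.println(number);
  }
}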
Example #11
Source File: JsonIntermediateToParquetGroupConverter.java From incubator-gobblin with Apache License 2.0
@Override
public Iterable<Group> convertRecord(MessageType outputSchema, JsonObject inputRecord, WorkUnitState workUnit)
    throws DataConversionException {
  return new SingleRecordIterable<>((Group) recordConverter.convert(inputRecord));
}
Example #12
Source File: ParquetDataWriterBuilder.java From incubator-gobblin with Apache License 2.0
/**
 * Build a version-specific {@link ParquetWriter} for given {@link ParquetWriterConfiguration}
 * @param writerConfiguration
 * @return
 * @throws IOException
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
    throws IOException {

  CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
  ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion
      .fromString(writerConfiguration.getWriterVersion());
  Configuration conf = new Configuration();
  ParquetWriter versionSpecificWriter = null;

  switch (writerConfiguration.getRecordFormat()) {
    case GROUP: {
      GroupWriteSupport.setSchema((MessageType) this.schema, conf);
      WriteSupport support = new GroupWriteSupport();
      versionSpecificWriter = new ParquetWriter<Group>(
          writerConfiguration.getAbsoluteStagingFile(),
          support,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.getDictPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate(),
          writerVersion,
          conf);
      break;
    }
    case AVRO: {
      versionSpecificWriter = new AvroParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Schema) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          conf);
      break;
    }
    case PROTOBUF: {
      versionSpecificWriter = new ProtoParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Class<? extends Message>) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate());
      break;
    }
    default:
      throw new RuntimeException("Record format not supported");
  }

  ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;

  return new ParquetWriterShim() {
    @Override
    public void write(Object record) throws IOException {
      finalVersionSpecificWriter.write(record);
    }

    @Override
    public void close() throws IOException {
      finalVersionSpecificWriter.close();
    }
  };
}
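Stripped of Gobblin's configuration plumbing and shim, the GROUP branch above reduces to opening a plain ParquetWriter<Group> over a GroupWriteSupport, using the same ten-argument constructor. A minimal sketch with illustrative settings (SNAPPY compression, parquet-mr's default sizes):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.column.ParquetProperties;
import parquet.example.data.Group;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.example.GroupWriteSupport;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.schema.MessageType;

static ParquetWriter<Group> openGroupWriter(Path file, MessageType schema) throws IOException {
  Configuration conf = new Configuration();
  // GroupWriteSupport reads the schema back out of the Configuration in init().
  GroupWriteSupport.setSchema(schema, conf);
  return new ParquetWriter<Group>(
      file,
      new GroupWriteSupport(),
      CompressionCodecName.SNAPPY,
      ParquetWriter.DEFAULT_BLOCK_SIZE,
      ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_PAGE_SIZE,   // dictionary page size
      true,                              // enable dictionary encoding
      false,                             // skip schema validation
      ParquetProperties.WriterVersion.PARQUET_1_0,
      conf);
}

Each Group is then written with writer.write(group), and writer.close() finalizes the file footer.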
Example #13
Source File: ParquetHdfsDataWriterTest.java From incubator-gobblin with Apache License 2.0
@Override
public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData,
    MessageType schema, ReadContext context) {
  return new GroupRecordConverter(schema);
}
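prepareForRead is only half of a ReadSupport; the other hook is init, which picks the projection schema. The SimpleReadSupport helper used in Example #2 is presumably little more than the following sketch (an assumption, not Gobblin's actual class):

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import parquet.example.data.Group;
import parquet.example.data.simple.convert.GroupRecordConverter;
import parquet.hadoop.api.ReadSupport;
import parquet.io.api.RecordMaterializer;
import parquet.schema.MessageType;

public class SimpleReadSupport extends ReadSupport<Group> {
  @Override
  public ReadContext init(Configuration conf, Map<String, String> keyValueMetaData, MessageType fileSchema) {
    // Request the full file schema; a real ReadSupport could prune columns here.
    return new ReadContext(fileSchema);
  }

  @Override
  public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData,
      MessageType schema, ReadContext context) {
    return new GroupRecordConverter(schema);
  }
}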
Example #14
Source File: TestReadParquet.java From parquet-examples with Apache License 2.0
@Override
public void map(LongWritable key, Group value, Context context) throws IOException, InterruptedException {
  NullWritable outKey = NullWritable.get();
  if (expectedFields == null) {
    // Get the file schema (which may be different from the fields in a particular record) from the input split
    String fileSchema = ((ParquetInputSplit) context.getInputSplit()).getFileSchema();
    // System.err.println("file schema from context: " + fileSchema);
    RecordSchema schema = new RecordSchema(fileSchema);
    expectedFields = schema.getFields();
    // System.err.println("inferred schema: " + expectedFields.toString());
  }

  // No public accessor to the column values in a Group, so extract them from the string representation
  String line = value.toString();
  String[] fields = line.split("\n");

  StringBuilder csv = new StringBuilder();
  boolean hasContent = false;
  int i = 0;

  // Look for each expected column
  Iterator<FieldDescription> it = expectedFields.iterator();
  while (it.hasNext()) {
    if (hasContent) {
      csv.append(',');
    }
    String name = it.next().name;
    if (fields.length > i) {
      String[] parts = fields[i].split(": ");
      // We assume proper order, but there may be fields missing
      if (parts[0].equals(name)) {
        boolean mustQuote = (parts[1].contains(",") || parts[1].contains("'"));
        if (mustQuote) {
          csv.append('"');
        }
        csv.append(parts[1]);
        if (mustQuote) {
          csv.append('"');
        }
        hasContent = true;
        i++;
      }
    }
  }
  context.write(outKey, new Text(csv.toString()));
}
Example #15
Source File: TestReadWriteParquet.java From parquet-examples with Apache License 2.0
@Override
public void map(LongWritable key, Group value, Context context) throws IOException, InterruptedException {
  context.write(null, value);
}