org.apache.parquet.example.data.simple.SimpleGroup Java Examples

Source File: ColumnSizeCommandTest.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes a Parquet file filled with random rows and returns its location.
 *
 * <p>The schema has two required columns: an INT64 {@code DocId} and an INT32 {@code Num}.
 * The schema string is also stored in {@code conf} under
 * {@link GroupWriteSupport#PARQUET_EXAMPLE_SCHEMA}; {@code numRecord} rows are written.
 *
 * @return absolute path of the file obtained from {@code randomParquetFile()}
 * @throws IOException if building the writer or writing a row fails
 */
private String createParquetFile() throws IOException {
  PrimitiveType docIdType = new PrimitiveType(REQUIRED, INT64, "DocId");
  PrimitiveType numType = new PrimitiveType(REQUIRED, INT32, "Num");
  MessageType schema = new MessageType("schema", docIdType, numType);

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  ExampleParquetWriter.Builder writerBuilder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  Random random = new Random();
  try (ParquetWriter writer = writerBuilder.build()) {
    for (int written = 0; written < numRecord; written++) {
      SimpleGroup row = new SimpleGroup(schema);
      row.add("DocId", random.nextLong());
      row.add("Num", random.nextInt());
      writer.write(row);
    }
  }

  return file;
}

Source File: TestColumnSizeCommand.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Produces a test Parquet file containing {@code numRecord} randomly populated rows.
 *
 * <p>Rows follow a two-column schema (required INT64 {@code DocId}, required INT32
 * {@code Num}); the schema text is registered in {@code conf} before the writer is built.
 *
 * @return absolute path of the generated file
 * @throws IOException if the writer cannot be created or a row cannot be written
 */
private String createParquetFile() throws IOException {
  MessageType schema = new MessageType(
    "schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, INT32, "Num"));
  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  Path target = new Path(file);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(target).withConf(conf);
  Random generator = new Random();

  try (ParquetWriter writer = builder.build()) {
    for (int rowIdx = 0; rowIdx < numRecord; rowIdx++) {
      SimpleGroup record = new SimpleGroup(schema);
      record.add("DocId", generator.nextLong());
      record.add("Num", generator.nextInt());
      writer.write(record);
    }
  }

  return file;
}

Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Benchmark setup: writes the generated data set to a fresh temporary Parquet file.
 *
 * <p>The temp file name embeds the current {@code characteristic} and write configurator.
 * Each record stores the long value in field 0 and a freshly generated string in each of
 * fields 1 through 5.
 *
 * @throws IOException if the temp file cannot be created or writing fails
 */
@Setup
public void writeFile() throws IOException {
  WriteConfigurator writeConfigurator = getWriteConfigurator();
  String prefix = "benchmark-filtering_" + characteristic + '_' + writeConfigurator + '_';
  String tempLocation = Files.createTempFile(prefix, ".parquet").toAbsolutePath().toString();
  file = new Path(tempLocation);

  long[] data = generateData();
  characteristic.arrangeData(data);

  try (ParquetWriter<Group> writer = writeConfigurator.configureBuilder(ExampleParquetWriter.builder(file)
      .config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString())
      .withRowGroupSize(Integer.MAX_VALUE) // Ensure to have one row-group per file only
      .withWriteMode(OVERWRITE))
      .build()) {
    for (int idx = 0; idx < data.length; idx++) {
      Group record = new SimpleGroup(SCHEMA);
      record.add(0, data[idx]);
      // Fields 1..5 each receive their own generated dummy string, in field order.
      for (int dummyField = 1; dummyField <= 5; dummyField++) {
        record.add(dummyField, Binary.fromString(dummyGenerator.nextString()));
      }
      writer.write(record);
    }
  }
}

Source File: ParquetAsTextInputFormat.java From iow-hadoop-streaming with Apache License 2.0

6 votes

/**
 * Advances to the next record and copies its key and value into the supplied holders.
 *
 * <p>The first record is expected to have been read already (see the original comments:
 * it is read in the constructor), so the first call only clears {@code firstRecord}
 * and serves the already-parsed fields.
 *
 * @param key   holder for the key; left untouched when {@code null}
 * @param value holder for the value; left untouched when {@code null}
 * @return {@code true} if a record was delivered, {@code false} at end of input
 * @throws IOException if the underlying reader fails or the thread is interrupted; in the
 *                     latter case the thread's interrupt status is restored before throwing
 */
@Override
public boolean next(Text key, Text value) throws IOException {

    if (eof) {  // The case where there are no records at all (and first record is being read in constructor)
        return false;
    }

    try {
        if (firstRecord) {
            // Remember, that we've already read the first record
            firstRecord = false;
        }
        else {
            if (!realReader.nextKeyValue()) {
                return false;   // eof
            }

            SimpleGroup g = realReader.getCurrentValue();
            ls = groupToStrings(g);
        }

        if (key != null) {
            key.set(fetchKey());
        }
        if (value != null) {
            value.set(fetchValue());
        }
        return true;
    } catch (InterruptedException e) {
        // Wrapping alone would hide the interruption from callers further up the stack.
        Thread.currentThread().interrupt();
        throw new IOException(e);
    }
}

Source File: TestTupleRecordConsumer.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Builds one record for a nested Pig schema (DocId, Links with Backward/Forward bags,
 * Name bag with a nested Language bag and Url) and runs it through {@code testFromGroups}.
 */
@Test
public void testArtSchema() throws ExecException, ParserException {

  String pigSchemaString =
          "DocId:long, " +
          "Links:(Backward:{(long)}, Forward:{(long)}), " +
          "Name:{(Language:{(Code:chararray,Country:chararray)}, Url:chararray)}";

  SimpleGroup record = new SimpleGroup(getMessageType(pigSchemaString));
  record.add("DocId", 1L);

  Group links = record.addGroup("Links");
  Group backwardBag = links.addGroup("Backward").addGroup("bag");
  backwardBag.add(0, 1L);
  Group forwardBag = links.addGroup("Forward").addGroup("bag");
  forwardBag.add(0, 1L);

  Group nameBag = record.addGroup("Name").addGroup("bag");
  Group languageBag = nameBag.addGroup("Language").addGroup("bag");
  languageBag.append("Code", "en");
  languageBag.append("Country", "US");
  nameBag.add("Url", "http://foo/bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(record));
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

6 votes

/**
 * Builds a {@link NodeData} from one parquet row of a Spark decision tree model.
 *
 * <p>Reads the id (field 0), prediction (field 1) and left/right child ids (fields 5 and 6).
 * When both child ids are {@code -1} the node is marked as a leaf and gets {@code -1} for
 * feature index and threshold; otherwise both are taken from the nested group in field 7.
 *
 * @param g The given group presenting the node data from Spark DT model.
 * @return The populated node data.
 */
@NotNull private static SparkModelParser.NodeData extractNodeDataFromParquetRow(SimpleGroup g) {
    NodeData node = new NodeData();

    node.id = g.getInteger(0, 0);
    node.prediction = g.getDouble(1, 0);
    node.leftChildId = g.getInteger(5, 0);
    node.rightChildId = g.getInteger(6, 0);

    boolean hasNoChildren = node.leftChildId == -1 && node.rightChildId == -1;

    if (!hasNoChildren) {
        // Inner node: the split description lives in the nested group at field 7.
        final SimpleGroup split = (SimpleGroup)g.getGroup(7, 0);

        node.featureIdx = split.getInteger(0, 0);
        node.threshold = split.getGroup(1, 0).getGroup(0, 0).getDouble(0, 0);

        return node;
    }

    node.featureIdx = -1;
    node.threshold = -1;
    node.isLeafNode = true;

    return node;
}

Source File: ParquetFileLineFetcher.java From hugegraph-loader with Apache License 2.0

6 votes

/**
 * Reads the next record and converts it into a {@link Line}.
 *
 * <p>A new page set is fetched first when none is loaded or the current one is exhausted.
 * Each field's first value is rendered with {@code getValueToString} and the raw line is
 * those values joined with {@code Constants.COMMA_STR}.
 *
 * @return the next line, or {@code null} when {@code fetchNextPage()} reports no more data
 */
@Override
public Line fetch() {
    // Read next row group
    if (this.pages == null || this.currRowOffset >= this.pagesRowCount) {
        if (!this.fetchNextPage()) {
            return null;
        }
    }

    int columnCount = this.schema.getFields().size();
    SimpleGroup record = (SimpleGroup) this.recordReader.read();
    Object[] values = new Object[columnCount];
    int column = 0;
    while (column < columnCount) {
        values[column] = record.getValueToString(column, 0);
        column++;
    }

    this.currRowOffset++;
    this.increaseOffset();

    String rawLine = StringUtils.join(values, Constants.COMMA_STR);
    /*
     * NOTE: parquet file actually corresponds to a table structure,
     * doesn't need to skip line or match header
     */
    return new Line(rawLine, this.source().header(), values);
}

Source File: TestTupleRecordConsumer.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Round-trips a record whose field {@code a} is a bag holding two tuples
 * ({@code b = "foo"} and {@code b = "bar"}).
 */
@Test
public void testBags() throws ExecException, ParserException {
  String pigSchemaString = "a: {(b: chararray)}";

  SimpleGroup record = new SimpleGroup(getMessageType(pigSchemaString));
  Group bagHolder = record.addGroup("a");

  Group firstTuple = bagHolder.addGroup("bag");
  firstTuple.append("b", "foo");
  Group secondTuple = bagHolder.addGroup("bag");
  secondTuple.append("b", "bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(record));
}

Source File: TestTupleRecordConsumer.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Round-trips a record whose field {@code a} is a map with two entries:
 * key {@code "foo"} with value {@code (b = "foo")} and key {@code "bar"} with value {@code (b = "bar")}.
 */
@Test
public void testMaps() throws ExecException, ParserException {
  String pigSchemaString = "a: [(b: chararray)]";

  SimpleGroup record = new SimpleGroup(getMessageType(pigSchemaString));
  Group mapHolder = record.addGroup("a");

  Group fooEntry = mapHolder.addGroup("map");
  fooEntry.append("key", "foo");
  fooEntry.addGroup("value").append("b", "foo");

  Group barEntry = mapHolder.addGroup("map");
  barEntry.append("key", "bar");
  barEntry.addGroup("value").append("b", "bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(record));
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Read coefficient matrix from parquet.
 *
 * <p>The values are taken from the nested group at field 3 / field 5 of the row: one
 * coefficient per repetition of that group's field 0.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readCoefficients(SimpleGroup g) {
    final Group valuesGrp = g.getGroup(3, 0).getGroup(5, 0);
    final int size = valuesGrp.getFieldRepetitionCount(0);

    Vector res = new DenseVector(size);

    int idx = 0;

    while (idx < size) {
        res.set(idx, valuesGrp.getGroup(0, idx).getDouble(0, 0));
        idx++;
    }

    return res;
}

Source File: ParquetAsJsonOutputFormat.java From iow-hadoop-streaming with Apache License 2.0

5 votes

/**
 * Creates the record writer for this output format: a {@code JsonRecordWriterWrapper}
 * constructed from the given Parquet writer, file system, job configuration, name and
 * progress callback.
 *
 * @throws IOException declared for the wrapper construction
 */
@Override
protected RecordWriter<Text, Text>
    createRecordWriter(ParquetRecordWriter<SimpleGroup> w, FileSystem fs, JobConf job, String name, Progressable p)
        throws IOException {

    RecordWriter<Text, Text> jsonWriter = new JsonRecordWriterWrapper(w, fs, job, name, p);
    return jsonWriter;
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes {@code recordCount} records whose column values come from the per-column
 * random generators.
 *
 * <p>For an OPTIONAL column the generator may decide to produce a null, in which case
 * the column is simply left unset in that record. Primitive types not listed in the
 * switch are left unset as well.
 *
 * @param writer destination for the generated records
 * @throws IOException if writing a record fails
 */
@Override
public void write(ParquetWriter<Group> writer) throws IOException {
  for (int recordIdx = 0; recordIdx < recordCount; recordIdx++) {
    Group record = new SimpleGroup(super.schema);

    final int columnCnt = schema.getFieldCount();
    for (int column = 0; column < columnCnt; ++column) {
      Type type = schema.getType(column);
      RandomValueGenerator<?> generator = randomGenerators.get(column);

      boolean leaveNull = type.isRepetition(OPTIONAL) && generator.shouldGenerateNull();
      if (!leaveNull) {
        String fieldName = type.getName();
        switch (type.asPrimitiveType().getPrimitiveTypeName()) {
        case BINARY:
        case FIXED_LEN_BYTE_ARRAY:
        case INT96:
          // All binary-backed types share the binary generator API.
          record.append(fieldName, ((RandomBinaryBase<?>) generator).nextBinaryValue());
          break;
        case INT32:
          record.append(fieldName, (Integer) generator.nextValue());
          break;
        case INT64:
          record.append(fieldName, (Long) generator.nextValue());
          break;
        case FLOAT:
          record.append(fieldName, (Float) generator.nextValue());
          break;
        case DOUBLE:
          record.append(fieldName, (Double) generator.nextValue());
          break;
        case BOOLEAN:
          record.append(fieldName, (Boolean) generator.nextValue());
          break;
        }
      }
    }
    writer.write(record);
  }
}

Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Converts a {@code User} into a {@link SimpleGroup} built on the phone book schema.
 *
 * <p>The {@code id} is always written. {@code name}, the {@code phoneNumbers} group
 * (one {@code phone} sub-group per number, with {@code kind} only when set) and the
 * {@code location} group (with {@code lon}/{@code lat} only when set) are written
 * only when the corresponding user property is non-null.
 *
 * @param user the user to convert
 * @return the populated group
 */
public static SimpleGroup groupFromUser(User user) {
  SimpleGroup record = new SimpleGroup(schema);
  record.append("id", user.getId());

  if (null != user.getName()) {
    record.append("name", user.getName());
  }

  if (null != user.getPhoneNumbers()) {
    Group phonesGroup = record.addGroup("phoneNumbers");
    for (PhoneNumber entry : user.getPhoneNumbers()) {
      Group phoneGroup = phonesGroup.addGroup("phone");
      phoneGroup.append("number", entry.getNumber());
      if (null != entry.getKind()) {
        phoneGroup.append("kind", entry.getKind());
      }
    }
  }

  if (null != user.getLocation()) {
    Group locationGroup = record.addGroup("location");
    if (null != user.getLocation().getLon()) {
      locationGroup.append("lon", user.getLocation().getLon());
    }
    if (null != user.getLocation().getLat()) {
      locationGroup.append("lat", user.getLocation().getLat());
    }
  }

  return record;
}

Source File: TestParquetFileWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a single record with no value set and verifies that the column statistics
 * read back from the footer are non-empty and report exactly one null.
 *
 * <p>The test is skipped when statistics would be ignored for BINARY columns with
 * the current writer version.
 */
@Test
public void testWriteReadStatisticsAllNulls() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));

  // Only the unique temp name is needed; the file itself is removed before writing.
  File testFile = temp.newFile();
  testFile.delete();

  writeSchema = "message example {\n" +
          "required binary content (UTF8);\n" +
          "}";

  Path path = new Path(testFile.toURI());

  MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  GroupWriteSupport.setSchema(schema, configuration);

  // try-with-resources closes the writer even when write() throws, so a failing
  // test does not leak the open file.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport())) {
    Group r1 = new SimpleGroup(schema);
    writer.write(r1);
  }

  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);

  // assert the statistics object is not empty
  org.apache.parquet.column.statistics.Statistics stats = readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
  assertFalse("is empty: " + stats, stats.isEmpty());
  // assert the number of nulls are correct for the first block
  assertEquals("nulls: " + stats, 1, stats.getNumNulls());
}

Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a Parquet file populated from {@code testDocs} and returns its path.
 *
 * <p>Schema: required INT64 {@code DocId}, required BINARY {@code Name} and {@code Gender},
 * and an optional {@code Links} group with repeated BINARY {@code Backward} and
 * {@code Forward}. The schema text is also stored in {@code conf}. The writer enables
 * dictionary encoding for {@code DocId}, validation and page write checksums.
 *
 * @param conf          configuration passed to the writer; receives the schema string
 * @param extraMeta     extra key/value metadata for the file footer
 * @param numRecord     number of rows to write; row {@code i} reads index {@code i} of every
 *                      {@code testDocs} array (presumably each holds at least this many entries)
 * @param prefix        prefix handed to {@code createTempFile}
 * @param codec         name of a {@link CompressionCodecName} constant
 * @param writerVersion writer version to use
 * @param pageSize      page size to use
 * @param testDocs      source of the column values
 * @return path of the written file, as returned by {@code createTempFile}
 * @throws IOException if the file cannot be created or written
 */
private String createParquetFile(Configuration conf, Map<String, String> extraMeta, int numRecord, String prefix, String codec,
                                       ParquetProperties.WriterVersion writerVersion, int pageSize, TestDocs testDocs) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, BINARY, "Backward"),
      new PrimitiveType(REPEATED, BINARY, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file))
    .withConf(conf)
    .withWriterVersion(writerVersion)
    .withExtraMetaData(extraMeta)
    .withDictionaryEncoding("DocId", true)
    .withValidation(true)
    .enablePageWriteChecksum()
    .withPageSize(pageSize)
    .withCompressionCodec(CompressionCodecName.valueOf(codec));
  // Typed writer instead of the raw ParquetWriter, so write() is checked at compile time.
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", testDocs.docId[i]);
      g.add("Name", testDocs.name[i]);
      g.add("Gender", testDocs.gender[i]);
      Group links = g.addGroup("Links");
      links.add(0, testDocs.linkBackward[i]);
      links.add(1, testDocs.linkForward[i]);
      writer.write(g);
    }
  }

  return file;
}

Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a Parquet file of {@code numRecord} identical rows and returns its path.
 *
 * <p>Schema: required INT64 {@code DocId}, required BINARY {@code Name} and {@code Gender},
 * and an optional {@code Links} group with repeated INT64 {@code Backward} and
 * {@code Forward}. Every row is {@code (1, "foo", "male", Links(2, 3))}.
 *
 * @param prefix prefix handed to {@code createTempFile}
 * @return path of the written file, as returned by {@code createTempFile}
 * @throws IOException if the file cannot be created or written
 */
private String createParquetFile(String prefix) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  // Typed writer instead of the raw ParquetWriter, so write() is checked at compile time.
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      // Upper-case L suffix: a lower-case 'l' is easily misread as the digit 1.
      g.add("DocId", 1L);
      g.add("Name", "foo");
      g.add("Gender", "male");
      Group links = g.addGroup("Links");
      links.add(0, 2L);
      links.add(1, 3L);
      writer.write(g);
    }
  }

  return file;
}

Source File: ParquetAsTextInputFormat.java From iow-hadoop-streaming with Apache License 2.0

5 votes

/**
 * Creates the wrapper, initializes the underlying Parquet record reader on the wrapped
 * split and eagerly reads the first record.
 *
 * <p>The path of the real split is published in the job configuration under
 * {@code map.input.file} and {@code mapreduce.map.input.file}. If the first read yields
 * a record, it is stored and {@code firstRecord} is set; otherwise {@code eof} is set.
 *
 * @param newInputFormat the Parquet input format (not used in this constructor body)
 * @param oldSplit       split to read; must be a {@code StreamingParquetInputSplitWrapper}
 * @param oldJobConf     job configuration; also receives the input file path entries
 * @param reporter       reporter passed to the reader's {@code initialize}
 * @throws IOException if reading fails or the thread is interrupted; in the latter case
 *                     the thread's interrupt status is restored before throwing
 */
public TextRecordReaderWrapper(ParquetInputFormat<SimpleGroup> newInputFormat,
                           InputSplit oldSplit,
                           JobConf oldJobConf,
                           Reporter reporter) throws IOException {

    splitLen = oldSplit.getLength();

    try {
        StreamingParquetInputSplitWrapper splitWrapper = (StreamingParquetInputSplitWrapper) oldSplit;

        ReadSupport<SimpleGroup> rs = ParquetInputFormat.getReadSupportInstance(oldJobConf);
        realReader = new ParquetRecordReader<>(rs);
        realReader.initialize(splitWrapper.realSplit, oldJobConf, reporter);

        String inputPath = splitWrapper.realSplit.getPath().toString();
        oldJobConf.set("map.input.file", inputPath);
        oldJobConf.set("mapreduce.map.input.file", inputPath);

        // read once to gain access to key and value objects
        if (realReader.nextKeyValue()) {

          firstRecord = true;
          valueContainer = new Container<>();
          SimpleGroup v = realReader.getCurrentValue();
          valueContainer.set(v);
          ls = groupToStrings(v);
        } else {

          eof = true;
        }
    } catch (InterruptedException e) {
        // Thread.interrupted() would CLEAR the flag; re-assert it so callers can observe it.
        Thread.currentThread().interrupt();
        throw new IOException(e);
    }
}

Source File: TestConstants.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Converts a test record into a Parquet {@link Group} on {@code PARQUET_SCHEMA},
 * filling the payload, sequence and partition fields from the record.
 *
 * @param record the record to convert
 * @return the populated group
 */
@Override
public Group convertToParquetGroup(TestRecord record) {
  Group parquetGroup = new SimpleGroup(PARQUET_SCHEMA);

  parquetGroup.add(PAYLOAD_FIELD_NAME, record.getPayload());
  parquetGroup.add(SEQUENCE_FIELD_NAME, Long.valueOf(record.getSequence()));
  parquetGroup.add(PARTITION_FIELD_NAME, record.getPartition());

  return parquetGroup;
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Read interceptor value from parquet.
 *
 * <p>The value is the double found by descending through field 2, then field 3, then
 * field 0 of the given row.
 *
 * @param g Interceptor group.
 * @return Interceptor value.
 */
private static double readInterceptor(SimpleGroup g) {
    final SimpleGroup vector = (SimpleGroup)g.getGroup(2, 0);
    final SimpleGroup vectorValues = (SimpleGroup)vector.getGroup(3, 0);
    final SimpleGroup firstElement = (SimpleGroup)vectorValues.getGroup(0, 0);

    return firstElement.getDouble(0, 0);
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Read coefficient matrix from parquet.
 *
 * <p>The values are taken from the nested group at field 1 / field 3 of the row: one
 * coefficient per repetition of that group's field 0.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readLinRegCoefficients(SimpleGroup g) {
    final Group valuesGrp = g.getGroup(1, 0).getGroup(3, 0);
    final int size = valuesGrp.getFieldRepetitionCount(0);

    Vector res = new DenseVector(size);

    int idx = 0;

    while (idx < size) {
        res.set(idx, valuesGrp.getGroup(0, idx).getDouble(0, 0));
        idx++;
    }

    return res;
}

Source File: ParquetAsTextInputFormat.java From iow-hadoop-streaming with Apache License 2.0

5 votes

/**
 * Flattens a group into a list of strings, one entry per primitive field.
 *
 * <ul>
 *   <li>A nested group is flattened recursively (only its first occurrence) and its
 *       entries are appended in place.</li>
 *   <li>A repeated primitive becomes a single {@code [v1, v2, ...]} entry; BINARY values
 *       are wrapped in double quotes.</li>
 *   <li>Any other primitive contributes its first value as a string.</li>
 *   <li>An OPTIONAL field whose lookup fails with a "not found" message contributes
 *       an empty string.</li>
 * </ul>
 *
 * @param grp the group to flatten
 * @return the string form of the group's fields, in field order
 * @throws RuntimeException any lookup failure other than a missing optional field
 */
protected List<String> groupToStrings(SimpleGroup grp) {

    List<String> s = new ArrayList<>();

    for (int n = 0; n < grp.getType().getFieldCount(); n++) {

        Type field = grp.getType().getType(n);
        try {
            if (!field.isPrimitive()) {
                s.addAll(groupToStrings((SimpleGroup) grp.getGroup(n, 0))); // array of groups not (yet) supported
            }
            else if (field.getRepetition() == Type.Repetition.REPEATED) {

                boolean isBinary =
                    field.asPrimitiveType().getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.BINARY;
                List<String> arr = new ArrayList<>();
                for (int i = 0; i < grp.getFieldRepetitionCount(n); i++) {
                    String v = grp.getValueToString(n, i);
                    arr.add(isBinary ? "\"" + v + "\"" : v);
                }

                s.add("[" + Joiner.on(", ").join(arr) + "]");
            }
            else {
                s.add(grp.getValueToString(n, 0));
            }
        }
        catch (RuntimeException e) {
            // getMessage() may be null (e.g. a bare NullPointerException); dereferencing it
            // unchecked would replace the real failure with an unrelated NPE from this handler.
            String message = e.getMessage();
            boolean missingOptional = message != null
                && message.startsWith("not found")
                && field.getRepetition() == Type.Repetition.OPTIONAL;
            if (!missingOptional) {
                throw e;
            }
            s.add("");
        }
    }

    return s;
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Read coefficient matrix from parquet.
 *
 * <p>The values are taken from the nested group at field 0 / field 3 of the row: one
 * coefficient per repetition of that group's field 0.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readSVMCoefficients(SimpleGroup g) {
    final Group valuesGrp = g.getGroup(0, 0).getGroup(3, 0);
    final int size = valuesGrp.getFieldRepetitionCount(0);

    Vector res = new DenseVector(size);

    int idx = 0;

    while (idx < size) {
        res.set(idx, valuesGrp.getGroup(0, idx).getDouble(0, 0));
        idx++;
    }

    return res;
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Load logistic regression model.
 *
 * <p>Every row of every row group is read; the interceptor and coefficients of the last
 * row read are the ones used. On an {@link IOException} the error is logged and printed,
 * and the model is still built from whatever was read so far (initially {@code null}
 * coefficients and a zero interceptor).
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return Logistic regression model.
 */
private static Model loadLogRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        for (PageReadStore pages = r.readNextRowGroup(); pages != null; pages = r.readNextRowGroup()) {
            final RecordReader rowReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            final long rowCnt = pages.getRowCount();

            for (int rowIdx = 0; rowIdx < rowCnt; rowIdx++) {
                final SimpleGroup row = (SimpleGroup)rowReader.read();

                interceptor = readInterceptor(row);
                coefficients = readCoefficients(row);
            }
        }
    }
    catch (IOException e) {
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH,
            "Error reading parquet file: " + e.getMessage());
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Load linear regression model.
 *
 * <p>Every row of every row group is read; the interceptor and coefficients of the last
 * row read are the ones used. On an {@link IOException} the error is logged and printed,
 * and the model is still built from whatever was read so far (initially {@code null}
 * coefficients and a zero interceptor).
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return Linear regression model.
 */
private static Model loadLinRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        for (PageReadStore pages = r.readNextRowGroup(); pages != null; pages = r.readNextRowGroup()) {
            final RecordReader rowReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            final long rowCnt = pages.getRowCount();

            for (int rowIdx = 0; rowIdx < rowCnt; rowIdx++) {
                final SimpleGroup row = (SimpleGroup)rowReader.read();

                interceptor = readLinRegInterceptor(row);
                coefficients = readLinRegCoefficients(row);
            }
        }
    }
    catch (IOException e) {
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH,
            "Error reading parquet file: " + e.getMessage());
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Load SVM model.
 *
 * <p>Every row of every row group is read; the interceptor and coefficients of the last
 * row read are the ones used. On an {@link IOException} the error is logged and printed,
 * and the model is still built from whatever was read so far (initially {@code null}
 * coefficients and a zero interceptor).
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return Linear SVM classification model.
 */
private static Model loadLinearSVMModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        for (PageReadStore pages = r.readNextRowGroup(); pages != null; pages = r.readNextRowGroup()) {
            final RecordReader rowReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            final long rowCnt = pages.getRowCount();

            for (int rowIdx = 0; rowIdx < rowCnt; rowIdx++) {
                final SimpleGroup row = (SimpleGroup)rowReader.read();

                interceptor = readSVMInterceptor(row);
                coefficients = readSVMCoefficients(row);
            }
        }
    }
    catch (IOException e) {
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH,
            "Error reading parquet file: " + e.getMessage());
        e.printStackTrace();
    }

    return new SVMLinearClassificationModel(coefficients, interceptor);
}

Source File: SparkModelParser.java From ignite with Apache License 2.0

5 votes

/**
 * Load Decision Tree model.
 *
 * <p>Each parquet row is converted to a node and stored in a map keyed by node id; the
 * tree is then assembled by {@code buildDecisionTreeModel}. On an {@link IOException}
 * the error is logged and printed, and {@code null} is returned.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return Decision tree model, or {@code null} if the file could not be read.
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodesById = new TreeMap<>();

        for (PageReadStore pages = r.readNextRowGroup(); pages != null; pages = r.readNextRowGroup()) {
            final RecordReader rowReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            final long rowCnt = pages.getRowCount();

            for (int rowIdx = 0; rowIdx < rowCnt; rowIdx++) {
                NodeData node = extractNodeDataFromParquetRow((SimpleGroup)rowReader.read());

                nodesById.put(node.id, node);
            }
        }

        return buildDecisionTreeModel(nodesById);
    }
    catch (IOException e) {
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH,
            "Error reading parquet file: " + e.getMessage());
        e.printStackTrace();
    }

    return null;
}

Source File: ParquetAsJsonInputFormat.java From iow-hadoop-streaming with Apache License 2.0

4 votes

/**
 * Writes one Parquet group as a JSON object to the given generator.
 *
 * <p>Each field becomes a JSON field; a REPEATED field becomes a JSON array with one
 * element per repetition. Primitive values are written according to their Parquet type,
 * nested groups are written recursively. When reading an OPTIONAL field fails with a
 * message starting with "not found" (presumably how the group reports an unset value —
 * confirm against SimpleGroup), JSON {@code null} is written instead.
 *
 * @param currentGenerator generator receiving the JSON output
 * @param grp              group to serialize
 * @throws IOException      declared for the generator calls made outside the per-field handler
 * @throws RuntimeException wrapping any per-field failure other than a missing optional
 *                          value, including unsupported primitive types
 */
private void groupToJson(JsonGenerator currentGenerator, SimpleGroup grp)
      throws IOException {

    GroupType gt = grp.getType();

    currentGenerator.writeStartObject();
    for(int i = 0; i < gt.getFieldCount(); i ++) {

        String field = gt.getFieldName(i);
        try {
            Type t = gt.getType(i);
            int repetition = 1;
            boolean repeated = false;
            if (t.getRepetition() == Type.Repetition.REPEATED) {
                repeated = true;
                repetition = grp.getFieldRepetitionCount(i);
                currentGenerator.writeArrayFieldStart(field);
            }
            else
                currentGenerator.writeFieldName(field);

            for(int j = 0; j < repetition; j ++) {

                if (t.isPrimitive()) {
                    switch (t.asPrimitiveType().getPrimitiveTypeName()) {
                        case BINARY:
                            currentGenerator.writeString(grp.getString(i, j));
                            break;
                        case INT32:
                            currentGenerator.writeNumber(grp.getInteger(i, j));
                            break;
                        case INT96:
                        case INT64:
                            // clumsy way - TODO - Subclass SimpleGroup or something like that
                            currentGenerator.writeNumber(Long.parseLong(grp.getValueToString(i, j)));
                            break;
                        case DOUBLE:
                        case FLOAT:
                            currentGenerator.writeNumber(Double.parseDouble(grp.getValueToString(i, j)));
                            break;
                        case BOOLEAN:
                            currentGenerator.writeBoolean(grp.getBoolean(i, j));
                            break;
                        default:
                            throw new RuntimeException("Can't handle type " + gt.getType(i));
                    }
                } else {
                    groupToJson(currentGenerator, (SimpleGroup) grp.getGroup(i, j));
                }
            }

            if (repeated)
                currentGenerator.writeEndArray();
        }
        catch (Exception e) {
            // getMessage() may be null; dereferencing it unchecked would throw an NPE from
            // this handler and hide the original failure.
            String message = e.getMessage();
            boolean missingOptional = message != null
                && message.startsWith("not found")
                && gt.getType(i).getRepetition() == Type.Repetition.OPTIONAL;
            if (missingOptional)
                // The field name has already been written above; complete it with null.
                currentGenerator.writeNull();
            else
                 throw new RuntimeException(e);
        }
    }
    currentGenerator.writeEndObject();
}

Source File: ParquetAsJsonInputFormat.java From iow-hadoop-streaming with Apache License 2.0

4 votes

/**
 * Creates the JSON record reader wrapper; all arguments are forwarded unchanged to the
 * superclass constructor, which does the actual setup.
 *
 * @param newInputFormat Parquet input format, passed to the superclass
 * @param oldSplit       input split, passed to the superclass
 * @param oldJobConf     job configuration, passed to the superclass
 * @param reporter       reporter, passed to the superclass
 * @throws IOException if the superclass constructor fails
 */
public JsonRecordReaderWrapper(ParquetInputFormat<SimpleGroup> newInputFormat, InputSplit oldSplit,
        JobConf oldJobConf, Reporter reporter) throws IOException {
    super(newInputFormat, oldSplit, oldJobConf, reporter);
}

Source File: ParquetResolverTest.java From pxf with Apache License 2.0

4 votes

/**
 * Verifies how the resolver renders REPEATED primitive columns.
 *
 * <p>A single {@link Group} is built by hand with several values per column (column 1 is
 * deliberately left empty), resolved through {@code assertRow}, and every one of the 16
 * resulting fields is expected to be a TEXT field holding a JSON-style array string.
 */
@Test
public void testGetFields_Primitive_Repeated_Synthetic() {
    // this test does not read the actual Parquet file, but rather construct Group object synthetically
    schema = getParquetSchemaForPrimitiveTypes(Type.Repetition.REPEATED, true);
    // schema has changed, set metadata again
    context.setMetadata(schema);
    context.setTupleDescription(getColumnDescriptorsFromSchema(schema));
    resolver.initialize(context);

    /*
    Corresponding DB column types are:
    TEXT,TEXT,INTEGER, DOUBLE PRECISION,NUMERIC,TIMESTAMP,REAL,BIGINT,BOOLEAN,SMALLINT,SMALLINT,VARCHAR(5),CHAR(3),BYTEA
    NOTE(review): the list above names 14 types while columns 0..15 are populated/asserted below;
    columns 14 and 15 receive timestamp-with-time-zone values.
     */

    Group group = new SimpleGroup(schema);

    group.add(0, "row1-1");
    group.add(0, "row1-2");

    // leave column 1 (t2) unset as part of the test

    group.add(2, 1);
    group.add(2, 2);
    group.add(2, 3);

    group.add(3, 6.0d);
    group.add(3, -16.34d);

    // Encode the decimal's unscaled value big-endian into a 16-byte array, left-padded
    // with the sign byte (0xFF for negative, 0x00 otherwise).
    BigDecimal value = new BigDecimal("12345678.9012345987654321"); // place of dot doesn't matter
    byte fillByte = (byte) (value.signum() < 0 ? 0xFF : 0x00);
    byte[] unscaled = value.unscaledValue().toByteArray();
    byte[] bytes = new byte[16];
    int offset = bytes.length - unscaled.length;
    for (int i = 0; i < bytes.length; i += 1) {
        bytes[i] = (i < offset) ? fillByte : unscaled[i - offset];
    }
    group.add(4, Binary.fromReusedByteArray(bytes));

    group.add(5, ParquetTypeConverter.getBinaryFromTimestamp("2019-03-14 14:10:28"));
    group.add(5, ParquetTypeConverter.getBinaryFromTimestamp("1969-12-30 05:42:23.211211"));

    group.add(6, 7.7f);
    group.add(6, -12345.35354646f);

    group.add(7, 23456789L);
    group.add(7, -123456789012345L);

    group.add(8, true);
    group.add(8, false);

    group.add(9, (short) 1);
    group.add(9, (short) -3);

    group.add(10, (short) 269);
    group.add(10, (short) -313);

    group.add(11, Binary.fromString("Hello"));
    group.add(11, Binary.fromString("World"));

    group.add(12, Binary.fromString("foo"));
    group.add(12, Binary.fromString("bar"));

    byte[] byteArray1 = new byte[]{(byte) 49, (byte) 50, (byte) 51};
    group.add(13, Binary.fromReusedByteArray(byteArray1, 0, 3));
    byte[] byteArray2 = new byte[]{(byte) 52, (byte) 53, (byte) 54};
    group.add(13, Binary.fromReusedByteArray(byteArray2, 0, 3));

    // Columns 14 and 15: the expected text is the same instant converted to the JVM's
    // default time zone, so it is computed here rather than hard-coded.
    group.add(14, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone("2019-03-14 14:10:28+07"));
    OffsetDateTime offsetDateTime1 = OffsetDateTime.parse("2019-03-14T14:10:28+07:00");
    ZonedDateTime localDateTime1 = offsetDateTime1.atZoneSameInstant(ZoneId.systemDefault());
    String localDateTimeString1 = localDateTime1.format(DateTimeFormatter.ofPattern("[yyyy-MM-dd HH:mm:ss]"));

    group.add(15, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone("2019-03-14 14:10:28-07:30"));
    OffsetDateTime offsetDateTime2 = OffsetDateTime.parse("2019-03-14T14:10:28-07:30");
    ZonedDateTime localDateTime2 = offsetDateTime2.atZoneSameInstant(ZoneId.systemDefault());
    String localDateTimeString2 = localDateTime2.format(DateTimeFormatter.ofPattern("[yyyy-MM-dd HH:mm:ss]"));


    List<Group> groups = new ArrayList<>();
    groups.add(group);
    List<OneField> fields = assertRow(groups, 0, 16);

    assertField(fields, 0, "[\"row1-1\",\"row1-2\"]", DataType.TEXT);
    // Column 1 had no values added, so an empty array is expected.
    assertField(fields, 1, "[]", DataType.TEXT);
    assertField(fields, 2, "[1,2,3]", DataType.TEXT);
    assertField(fields, 3, "[6.0,-16.34]", DataType.TEXT);
    assertField(fields, 4, "[123456.789012345987654321]", DataType.TEXT); // scale fixed to 18 in schema
    assertField(fields, 5, "[\"2019-03-14 14:10:28\",\"1969-12-30 05:42:23.211211\"]", DataType.TEXT);
    assertField(fields, 6, "[7.7,-12345.354]", DataType.TEXT); // rounded to the precision of 8
    assertField(fields, 7, "[23456789,-123456789012345]", DataType.TEXT);
    assertField(fields, 8, "[true,false]", DataType.TEXT);
    assertField(fields, 9, "[1,-3]", DataType.TEXT);
    assertField(fields, 10, "[269,-313]", DataType.TEXT);
    assertField(fields, 11, "[\"Hello\",\"World\"]", DataType.TEXT);
    assertField(fields, 12, "[\"foo\",\"bar\"]", DataType.TEXT); // 3 chars only
    Base64.Encoder encoder = Base64.getEncoder(); // byte arrays are Base64 encoded into strings
    String expectedByteArrays = "[\"" + encoder.encodeToString(byteArray1) + "\",\"" + encoder.encodeToString(byteArray2) + "\"]";
    assertField(fields, 13, expectedByteArrays, DataType.TEXT);
    assertField(fields, 14, "[\"" + localDateTimeString1 + "\"]", DataType.TEXT);
    assertField(fields, 15, "[\"" + localDateTimeString2 + "\"]", DataType.TEXT);
}

Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0

4 votes

/**
 * Creates the record writer for this output format: a {@code TextRecordWriterWrapper}
 * constructed from the given Parquet writer, file system, job configuration, name and
 * progress callback.
 *
 * @throws IOException declared for the wrapper construction
 */
protected RecordWriter<Text, Text>
    createRecordWriter(ParquetRecordWriter<SimpleGroup> w, FileSystem fs, JobConf job, String name, Progressable p)
        throws IOException {

    RecordWriter<Text, Text> textWriter = new TextRecordWriterWrapper(w, fs, job, name, p);
    return textWriter;
}

org.apache.parquet.example.data.simple.SimpleGroup Java Examples