Java Code Examples for org.apache.beam.sdk.values.PCollection#getSchema()
The following examples show how to use org.apache.beam.sdk.values.PCollection#getSchema().
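To set the stage, here is a minimal sketch of the method itself. getSchema() returns the Schema attached to a PCollection and throws IllegalStateException when none is attached, so schema-dependent code usually checks hasSchema() first. The pipeline, schema, and values below are illustrative and not taken from any example on this page:

// A minimal sketch, assuming a Pipeline named `pipeline`; the field names
// and values are illustrative.
Schema personSchema =
    Schema.builder().addStringField("name").addInt32Field("age").build();

PCollection<Row> people =
    pipeline.apply(
        Create.of(Row.withSchema(personSchema).addValues("alice", 42).build())
            .withRowSchema(personSchema));

if (people.hasSchema()) {
  // Returns the Schema attached above; on a schema-less PCollection this
  // call would throw IllegalStateException.
  Schema schema = people.getSchema();
}

Several of the usage sketches below reuse this illustrative people collection.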
Example 1
Source File: BeamSqlDslAggregationNullableTest.java From beam with Apache License 2.0
@Test
public void testAvgGroupByNullable() {
  String sql = "SELECT AVG(f_int1), f_int2 FROM PCOLLECTION GROUP BY f_int2";

  PCollection<Row> out = boundedInput.apply(SqlTransform.query(sql));
  // The SQL planner infers the output schema; fetch it to build the expected rows.
  Schema schema = out.getSchema();

  PAssert.that(out)
      .containsInAnyOrder(
          Row.withSchema(schema).addValues(null, null).build(),
          Row.withSchema(schema).addValues(2, 1).build(),
          Row.withSchema(schema).addValues(1, 5).build(),
          Row.withSchema(schema).addValues(3, 2).build());

  pipeline.run();
}
Example 2
Source File: BeamSqlDslAggregationNullableTest.java From beam with Apache License 2.0
@Test
public void testCountGroupByNullable() {
  String sql = "SELECT COUNT(f_int1) as c, f_int2 FROM PCOLLECTION GROUP BY f_int2";

  PCollection<Row> out = boundedInput.apply(SqlTransform.query(sql));
  Schema schema = out.getSchema();

  PAssert.that(out)
      .containsInAnyOrder(
          Row.withSchema(schema).addValues(0L, null).build(),
          Row.withSchema(schema).addValues(1L, 1).build(),
          Row.withSchema(schema).addValues(1L, 5).build(),
          Row.withSchema(schema).addValues(1L, 2).build());

  assertEquals(
      Schema.builder()
          // COUNT() is never nullable, and Calcite knows it
          .addInt64Field("c")
          .addNullableField("f_int2", Schema.FieldType.INT32)
          .build(),
      schema);

  pipeline.run();
}
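Both tests above query a pre-built boundedInput whose schema is declared elsewhere in the test file. A plausible shape for it, assumed here rather than copied from that file, is a pair of nullable INT32 fields, which is what allows AVG(f_int1) to produce null for an all-null group:

// Assumed shape of boundedInput's schema (not shown in the snippets above):
// nullable INT32 fields, so aggregates over an all-null group can yield null.
Schema inputSchema =
    Schema.builder()
        .addNullableField("f_int1", Schema.FieldType.INT32)
        .addNullableField("f_int2", Schema.FieldType.INT32)
        .build();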
Example 3
Source File: ToJson.java From beam with Apache License 2.0
@Override
public PCollection<String> expand(PCollection<T> rows) {
  Schema inputSchema = rows.getSchema();
  // Throw exception if this schema is not supported by RowJson
  RowJson.verifySchemaSupported(inputSchema);
  SerializableFunction<T, Row> toRow = rows.getToRowFunction();
  return rows.apply(
      ParDo.of(
          new DoFn<T, String>() {
            @ProcessElement
            public void processElement(ProcessContext context) {
              context.output(
                  rowToJson(objectMapper(inputSchema), toRow.apply(context.element())));
            }
          }));
}
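The expand() above is the body of Beam's ToJson transform; getSchema() lets it validate the input schema before any elements flow. A hedged call-site sketch, reusing the illustrative people collection from the top of the page:

// Assumes `people` is a schema-aware PCollection<Row>, as sketched earlier.
PCollection<String> json = people.apply(ToJson.of());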
Example 4
Source File: Select.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  FieldAccessDescriptor resolved = getFieldAccessDescriptor().resolve(inputSchema);
  Schema outputSchema = getOutputSchema();
  if (outputSchema == null) {
    outputSchema = SelectHelpers.getOutputSchema(inputSchema, resolved);
  } else {
    inputSchema = uniquifyNames(inputSchema);
    Schema inferredSchema = SelectHelpers.getOutputSchema(inputSchema, resolved);
    Preconditions.checkArgument(
        outputSchema.typesEqual(inferredSchema),
        "Types not equal. provided output schema: "
            + outputSchema
            + " Schema inferred from select: "
            + inferredSchema
            + " from input type: "
            + input.getSchema());
  }
  return input
      .apply(ParDo.of(new SelectDoFn<>(resolved, inputSchema, outputSchema)))
      .setRowSchema(outputSchema);
}
Example 5
Source File: Select.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  FieldAccessDescriptor fieldAccessDescriptor =
      SelectHelpers.allLeavesDescriptor(
          inputSchema,
          n ->
              MoreObjects.firstNonNull(
                  getNameOverrides().get(String.join(".", n)), getNameFn().apply(n)));
  Schema inferredOutputSchema =
      SelectHelpers.getOutputSchema(inputSchema, fieldAccessDescriptor);
  Schema outputSchema = getOutputSchema();
  if (outputSchema != null) {
    Preconditions.checkArgument(outputSchema.typesEqual(inferredOutputSchema));
  } else {
    outputSchema = inferredOutputSchema;
  }
  return input
      .apply(ParDo.of(new SelectDoFn<>(fieldAccessDescriptor, inputSchema, outputSchema)))
      .setRowSchema(outputSchema);
}
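Examples 4 and 5 are two variants of the internals of Beam's Select transform. From user code the same machinery is typically reached as below; the field name is illustrative:

// "name" must exist in the input schema, or the resolve() call above fails.
PCollection<Row> names = people.apply(Select.fieldNames("name"));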
Example 6
Source File: Group.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<InputT> input) {
  Schema schema = input.getSchema();
  Schema keySchema = getKeySchema(schema);
  Schema outputSchema =
      Schema.builder()
          .addRowField(getKeyField(), keySchema)
          .addIterableField(getValueField(), FieldType.row(schema))
          .build();

  return input
      .apply("ToKvs", getToKvs())
      .apply(
          "ToRow",
          ParDo.of(
              new DoFn<KV<Row, Iterable<Row>>, Row>() {
                @ProcessElement
                public void process(@Element KV<Row, Iterable<Row>> e, OutputReceiver<Row> o) {
                  o.output(
                      Row.withSchema(outputSchema)
                          .attachValues(Lists.newArrayList(e.getKey(), e.getValue())));
                }
              }))
      .setRowSchema(outputSchema);
}
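The output schema built here nests the key as a row field and the grouped values as an iterable field. A hedged usage sketch; the grouping field is illustrative and the default key/value field names are assumed:

// Each output Row carries the key as a row field and the grouped Rows as an
// iterable field ("key" and "value" by default).
PCollection<Row> grouped = people.apply(Group.byFieldNames("name"));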
Example 7
Source File: RenameFields.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  List<RenamePair> pairs =
      renames.stream().map(r -> r.resolve(inputSchema)).collect(Collectors.toList());
  final Schema outputSchema = renameSchema(inputSchema, pairs);
  return input
      .apply(
          ParDo.of(
              new DoFn<T, Row>() {
                @ProcessElement
                public void processElement(@Element Row row, OutputReceiver<Row> o) {
                  o.output(Row.withSchema(outputSchema).attachValues(row.getValues()));
                }
              }))
      .setRowSchema(outputSchema);
}
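A hedged call-site sketch for the transform above; the old and new field names are illustrative:

// Values are untouched; only the field name in the schema changes.
PCollection<Row> renamed =
    people.apply(RenameFields.<Row>create().rename("name", "full_name"));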
Example 8
Source File: JdbcIO.java From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  // fixme: validate invalid table input
  if (input.hasSchema() && !hasStatementAndSetter()) {
    checkArgument(inner.getTable() != null, "table cannot be null if statement is not provided");
    Schema schema = input.getSchema();
    List<SchemaUtil.FieldWithIndex> fields = getFilteredFields(schema);
    inner =
        inner.withStatement(
            JdbcUtil.generateStatement(
                inner.getTable(),
                fields.stream()
                    .map(SchemaUtil.FieldWithIndex::getField)
                    .collect(Collectors.toList())));
    inner =
        inner.withPreparedStatementSetter(
            new AutoGeneratedPreparedStatementSetter(fields, input.getToRowFunction()));
  }
  inner.expand(input);
  return PDone.in(input.getPipeline());
}
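This is the branch of JdbcIO's schema-aware write that derives the INSERT statement and parameter setter from the input schema. A sketch of a call site that would take that branch; the driver, URL, and table name are placeholder assumptions:

// Placeholder connection details. Supplying a table but no statement/setter
// triggers the schema-driven statement generation shown above.
people.apply(
    JdbcIO.<Row>write()
        .withDataSourceConfiguration(
            JdbcIO.DataSourceConfiguration.create(
                "org.postgresql.Driver", "jdbc:postgresql://localhost/mydb"))
        .withTable("people"));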
Example 9
Source File: BeamPCollectionTable.java From beam with Apache License 2.0
public BeamPCollectionTable(PCollection<InputT> upstream) {
  // getSchema() runs inside the super() call, so it executes before the
  // hasSchema() check below and itself throws if no schema is attached.
  super(upstream.getSchema());
  if (!upstream.hasSchema()) {
    throw new IllegalArgumentException("SQL can only run over PCollections that have schemas.");
  }
  this.upstream = upstream;
}
Example 10
Source File: BeamZetaSqlCalcRel.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  // Guava's Preconditions only substitutes %s, so the second placeholder is
  // %s here rather than %d.
  Preconditions.checkArgument(
      pinput.size() == 1,
      "%s expected a single input PCollection, but received %s.",
      BeamZetaSqlCalcRel.class.getSimpleName(),
      pinput.size());
  PCollection<Row> upstream = pinput.get(0);

  final RexBuilder rexBuilder = getCluster().getRexBuilder();
  RexNode rex = rexBuilder.makeCall(SqlStdOperatorTable.ROW, getProgram().getProjectList());

  final RexNode condition = getProgram().getCondition();
  if (condition != null) {
    rex =
        rexBuilder.makeCall(
            SqlStdOperatorTable.CASE, condition, rex, rexBuilder.makeNullLiteral(getRowType()));
  }

  boolean verifyRowValues =
      pinput.getPipeline().getOptions().as(BeamSqlPipelineOptions.class).getVerifyRowValues();
  Schema outputSchema = CalciteUtils.toSchema(getRowType());
  CalcFn calcFn =
      new CalcFn(
          context.toSql(getProgram(), rex).toSqlString(DIALECT).getSql(),
          upstream.getSchema(),
          outputSchema,
          verifyRowValues);

  // validate prepared expressions
  calcFn.setup();

  return upstream.apply(ParDo.of(calcFn)).setRowSchema(outputSchema);
}
Example 11
Source File: Group.java From beam with Apache License 2.0
@Override
public PCollection<KV<Row, Iterable<Row>>> expand(PCollection<InputT> input) {
  Schema schema = input.getSchema();
  FieldAccessDescriptor resolved = getFieldAccessDescriptor().resolve(schema);
  rowSelector = new RowSelectorContainer(schema, resolved, true);
  Schema keySchema = getKeySchema(schema);

  return input
      .apply("toRow", Convert.toRows())
      .apply(
          "selectKeys",
          WithKeys.of((Row e) -> rowSelector.select(e)).withKeyType(TypeDescriptors.rows()))
      .setCoder(KvCoder.of(SchemaCoder.of(keySchema), SchemaCoder.of(schema)))
      .apply("GroupByKey", GroupByKey.create());
}
Example 12
Source File: DropFields.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  FieldAccessDescriptor selectDescriptor =
      complement(inputSchema, fieldsToDrop.resolve(inputSchema));
  return input.apply(Select.fieldAccess(selectDescriptor));
}
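As the lines above show, DropFields is just Select over the complement of the dropped fields. A hedged usage sketch with an illustrative field name:

// Drops the illustrative "age" field; every other field is kept.
PCollection<Row> withoutAge = people.apply(DropFields.fields("age"));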
Example 13
Source File: BigQueryChangeApplier.java From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<Row> input) {
  Pipeline p = input.getPipeline();
  Schema inputCollectionSchema = input.getSchema();

  PCollection<KV<String, KV<Schema, Schema>>> tableSchemaCollection =
      buildTableSchemaCollection(input);
  PCollectionView<Map<String, KV<Schema, Schema>>> schemaMapView =
      tableSchemaCollection.apply(View.asMap());

  PCollection<TableRow> updatesToWrite = formatIntoTableRows(input);

  updatesToWrite.apply(
      BigQueryIO.writeTableRows()
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND)
          .withMethod(Method.STREAMING_INSERTS)
          .to(new ChangelogTableDynamicDestinations(changeLogDataset, gcpProjectId, schemaMapView)));

  String jobPrefix =
      String.format(
          "beam_cdc_%s_%s_", gcpProjectId.replace(':', '_').replace('.', '_'), replicaDataset);

  // If the input collection does not have a primary key field, then we do not need to issue
  // periodic merge requests.
  if (inputCollectionSchema.hasField(DataflowCdcRowFormat.PRIMARY_KEY)) {
    p.apply("MergeHeartbeat",
            GenerateSequence.from(0)
                .withRate(1, Duration.standardSeconds(updateFrequencySeconds)))
        .apply("KeyByTable",
            ParDo.of(new KeySchemasByTableFn(schemaMapView)).withSideInputs(schemaMapView))
        .apply("BuildMergeStatements",
            ParDo.of(new MergeStatementBuildingFn(changeLogDataset, replicaDataset, gcpProjectId)))
        .setCoder(
            SerializableCoder.of(
                TypeDescriptors.kvs(
                    TypeDescriptors.strings(), TypeDescriptor.of(BigQueryAction.class))))
        .apply("IssueMergeStatements", ParDo.of(new BigQueryStatementIssuingFn(jobPrefix)));
  }
  return PDone.in(p);
}
Example 14
Source File: CoGroup.java From beam with Apache License 2.0
private static JoinInformation from(
    PCollectionTuple input,
    Function<String, FieldAccessDescriptor> getFieldAccessDescriptor,
    Function<String, Boolean> getIsSideInput) {
  KeyedPCollectionTuple<Row> keyedPCollectionTuple =
      KeyedPCollectionTuple.empty(input.getPipeline());

  List<String> sortedTags =
      input.getAll().keySet().stream()
          .map(TupleTag::getId)
          .sorted()
          .collect(Collectors.toList());

  // Keep this in a TreeMap so that it's sorted. This way we get a deterministic output
  // schema.
  TreeMap<String, Schema> componentSchemas = Maps.newTreeMap();
  Map<Integer, SerializableFunction<Object, Row>> toRows = Maps.newHashMap();

  Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs = Maps.newHashMap();
  Map<Integer, String> tagToKeyedTag = Maps.newHashMap();
  Schema keySchema = null;
  for (Map.Entry<TupleTag<?>, PCollection<?>> entry : input.getAll().entrySet()) {
    String tag = entry.getKey().getId();
    int tagIndex = sortedTags.indexOf(tag);
    PCollection<?> pc = entry.getValue();
    Schema schema = pc.getSchema();
    componentSchemas.put(tag, schema);
    toRows.put(tagIndex, (SerializableFunction<Object, Row>) pc.getToRowFunction());
    FieldAccessDescriptor fieldAccessDescriptor = getFieldAccessDescriptor.apply(tag);
    if (fieldAccessDescriptor == null) {
      throw new IllegalStateException("No fields were set for input " + tag);
    }
    // Resolve the key schema, keeping the fields in the order specified by the user.
    // Otherwise, if different field names are specified for different PCollections, they
    // might not match up.
    // The key schema contains the field names from the first PCollection specified.
    FieldAccessDescriptor resolved = fieldAccessDescriptor.resolve(schema);
    Schema currentKeySchema = SelectHelpers.getOutputSchema(schema, resolved);
    if (keySchema == null) {
      keySchema = currentKeySchema;
    } else {
      keySchema = SchemaUtils.mergeWideningNullable(keySchema, currentKeySchema);
    }

    // Create a new tag for the output.
    TupleTag randomTag = new TupleTag<>();
    String keyedTag = tag + "_" + randomTag;
    tagToKeyedTag.put(tagIndex, keyedTag);
    PCollection<KV<Row, Row>> keyedPCollection =
        extractKey(pc, schema, keySchema, resolved, tag);
    if (getIsSideInput.apply(tag)) {
      sideInputs.put(
          keyedTag, keyedPCollection.apply("computeSideInputView" + tag, View.asMultimap()));
    } else {
      keyedPCollectionTuple = keyedPCollectionTuple.and(keyedTag, keyedPCollection);
    }
  }
  return new JoinInformation(
      keyedPCollectionTuple,
      sideInputs,
      keySchema,
      componentSchemas,
      toRows,
      sortedTags,
      tagToKeyedTag);
}
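This helper collects each input's schema via pc.getSchema() and widens the per-input key schemas into a single join key schema. From user code, CoGroup is typically invoked as below; the tuple tags and join field are illustrative assumptions:

// Hypothetical inputs: two schema-aware PCollection<Row>s that share a
// "userId" field to join on.
PCollection<Row> joined =
    PCollectionTuple.of("lhs", lhs)
        .and("rhs", rhs)
        .apply(CoGroup.join(CoGroup.By.fieldNames("userId")));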