Java Code Examples for org.apache.beam.sdk.values.PCollection#getSchema()
The following examples show how to use org.apache.beam.sdk.values.PCollection#getSchema().
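To set the stage, here is a minimal sketch of the method itself. getSchema() returns the Schema attached to a PCollection and throws IllegalStateException when none is attached, so schema-dependent code usually checks hasSchema() first. The pipeline, schema, and values below are illustrative and not taken from any example on this page:

// A minimal sketch, assuming a Pipeline named `pipeline`; the field names
// and values are illustrative.
Schema personSchema =
    Schema.builder().addStringField("name").addInt32Field("age").build();

PCollection<Row> people =
    pipeline.apply(
        Create.of(Row.withSchema(personSchema).addValues("alice", 42).build())
            .withRowSchema(personSchema));

if (people.hasSchema()) {
  // Returns the Schema attached above; on a schema-less PCollection this
  // call would throw IllegalStateException.
  Schema schema = people.getSchema();
}

Several of the usage sketches below reuse this illustrative people collection.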
Example 1
Source File: BeamSqlDslAggregationNullableTest.java From beam with Apache License 2.0
@Test
public void testAvgGroupByNullable() {
  String sql = "SELECT AVG(f_int1), f_int2 FROM PCOLLECTION GROUP BY f_int2";

  PCollection<Row> out = boundedInput.apply(SqlTransform.query(sql));
  // The SQL planner infers the output schema; fetch it to build the expected rows.
  Schema schema = out.getSchema();

  PAssert.that(out)
      .containsInAnyOrder(
          Row.withSchema(schema).addValues(null, null).build(),
          Row.withSchema(schema).addValues(2, 1).build(),
          Row.withSchema(schema).addValues(1, 5).build(),
          Row.withSchema(schema).addValues(3, 2).build());

  pipeline.run();
}
Example 2
Source File: BeamSqlDslAggregationNullableTest.java From beam with Apache License 2.0
@Test
public void testCountGroupByNullable() {
  String sql = "SELECT COUNT(f_int1) as c, f_int2 FROM PCOLLECTION GROUP BY f_int2";

  PCollection<Row> out = boundedInput.apply(SqlTransform.query(sql));
  Schema schema = out.getSchema();

  PAssert.that(out)
      .containsInAnyOrder(
          Row.withSchema(schema).addValues(0L, null).build(),
          Row.withSchema(schema).addValues(1L, 1).build(),
          Row.withSchema(schema).addValues(1L, 5).build(),
          Row.withSchema(schema).addValues(1L, 2).build());

  assertEquals(
      Schema.builder()
          // COUNT() is never nullable, and Calcite knows it
          .addInt64Field("c")
          .addNullableField("f_int2", Schema.FieldType.INT32)
          .build(),
      schema);

  pipeline.run();
}
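Both tests above query a pre-built boundedInput whose schema is declared elsewhere in the test file. A plausible shape for it, assumed here rather than copied from that file, is a pair of nullable INT32 fields, which is what allows AVG(f_int1) to produce null for an all-null group:

// Assumed shape of boundedInput's schema (not shown in the snippets above):
// nullable INT32 fields, so aggregates over an all-null group can yield null.
Schema inputSchema =
    Schema.builder()
        .addNullableField("f_int1", Schema.FieldType.INT32)
        .addNullableField("f_int2", Schema.FieldType.INT32)
        .build();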
Example 3
Source File: ToJson.java From beam with Apache License 2.0
@Override
public PCollection<String> expand(PCollection<T> rows) {
  Schema inputSchema = rows.getSchema();
  // Throw exception if this schema is not supported by RowJson
  RowJson.verifySchemaSupported(inputSchema);
  SerializableFunction<T, Row> toRow = rows.getToRowFunction();
  return rows.apply(
      ParDo.of(
          new DoFn<T, String>() {
            @ProcessElement
            public void processElement(ProcessContext context) {
              context.output(
                  rowToJson(objectMapper(inputSchema), toRow.apply(context.element())));
            }
          }));
}
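The expand() above is the body of Beam's ToJson transform; getSchema() lets it validate the input schema before any elements flow. A hedged call-site sketch, reusing the illustrative people collection from the top of the page:

// Assumes `people` is a schema-aware PCollection<Row>, as sketched earlier.
PCollection<String> json = people.apply(ToJson.of());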
Example 4
Source File: Select.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  FieldAccessDescriptor resolved = getFieldAccessDescriptor().resolve(inputSchema);
  Schema outputSchema = getOutputSchema();
  if (outputSchema == null) {
    outputSchema = SelectHelpers.getOutputSchema(inputSchema, resolved);
  } else {
    inputSchema = uniquifyNames(inputSchema);
    Schema inferredSchema = SelectHelpers.getOutputSchema(inputSchema, resolved);
    Preconditions.checkArgument(
        outputSchema.typesEqual(inferredSchema),
        "Types not equal. provided output schema: "
            + outputSchema
            + " Schema inferred from select: "
            + inferredSchema
            + " from input type: "
            + input.getSchema());
  }
  return input
      .apply(ParDo.of(new SelectDoFn<>(resolved, inputSchema, outputSchema)))
      .setRowSchema(outputSchema);
}
Example 5
Source File: Select.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  FieldAccessDescriptor fieldAccessDescriptor =
      SelectHelpers.allLeavesDescriptor(
          inputSchema,
          n ->
              MoreObjects.firstNonNull(
                  getNameOverrides().get(String.join(".", n)), getNameFn().apply(n)));
  Schema inferredOutputSchema =
      SelectHelpers.getOutputSchema(inputSchema, fieldAccessDescriptor);
  Schema outputSchema = getOutputSchema();
  if (outputSchema != null) {
    Preconditions.checkArgument(outputSchema.typesEqual(inferredOutputSchema));
  } else {
    outputSchema = inferredOutputSchema;
  }
  return input
      .apply(ParDo.of(new SelectDoFn<>(fieldAccessDescriptor, inputSchema, outputSchema)))
      .setRowSchema(outputSchema);
}
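Examples 4 and 5 are two variants of the internals of Beam's Select transform. From user code the same machinery is typically reached as below; the field name is illustrative:

// "name" must exist in the input schema, or the resolve() call above fails.
PCollection<Row> names = people.apply(Select.fieldNames("name"));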
Example 6
Source File: Group.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<InputT> input) {
  Schema schema = input.getSchema();
  Schema keySchema = getKeySchema(schema);
  Schema outputSchema =
      Schema.builder()
          .addRowField(getKeyField(), keySchema)
          .addIterableField(getValueField(), FieldType.row(schema))
          .build();

  return input
      .apply("ToKvs", getToKvs())
      .apply(
          "ToRow",
          ParDo.of(
              new DoFn<KV<Row, Iterable<Row>>, Row>() {
                @ProcessElement
                public void process(@Element KV<Row, Iterable<Row>> e, OutputReceiver<Row> o) {
                  o.output(
                      Row.withSchema(outputSchema)
                          .attachValues(Lists.newArrayList(e.getKey(), e.getValue())));
                }
              }))
      .setRowSchema(outputSchema);
}
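The output schema built here nests the key as a row field and the grouped values as an iterable field. A hedged usage sketch; the grouping field is illustrative and the default key/value field names are assumed:

// Each output Row carries the key as a row field and the grouped Rows as an
// iterable field ("key" and "value" by default).
PCollection<Row> grouped = people.apply(Group.byFieldNames("name"));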
Example 7
Source File: RenameFields.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  List<RenamePair> pairs =
      renames.stream().map(r -> r.resolve(inputSchema)).collect(Collectors.toList());
  final Schema outputSchema = renameSchema(inputSchema, pairs);
  return input
      .apply(
          ParDo.of(
              new DoFn<T, Row>() {
                @ProcessElement
                public void processElement(@Element Row row, OutputReceiver<Row> o) {
                  o.output(Row.withSchema(outputSchema).attachValues(row.getValues()));
                }
              }))
      .setRowSchema(outputSchema);
}
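A hedged call-site sketch for the transform above; the old and new field names are illustrative:

// Values are untouched; only the field name in the schema changes.
PCollection<Row> renamed =
    people.apply(RenameFields.<Row>create().rename("name", "full_name"));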
Example 8
Source File: JdbcIO.java From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  // fixme: validate invalid table input
  if (input.hasSchema() && !hasStatementAndSetter()) {
    checkArgument(inner.getTable() != null, "table cannot be null if statement is not provided");
    Schema schema = input.getSchema();
    List<SchemaUtil.FieldWithIndex> fields = getFilteredFields(schema);
    inner =
        inner.withStatement(
            JdbcUtil.generateStatement(
                inner.getTable(),
                fields.stream()
                    .map(SchemaUtil.FieldWithIndex::getField)
                    .collect(Collectors.toList())));
    inner =
        inner.withPreparedStatementSetter(
            new AutoGeneratedPreparedStatementSetter(fields, input.getToRowFunction()));
  }
  inner.expand(input);
  return PDone.in(input.getPipeline());
}
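This is the branch of JdbcIO's schema-aware write that derives the INSERT statement and parameter setter from the input schema. A sketch of a call site that would take that branch; the driver, URL, and table name are placeholder assumptions:

// Placeholder connection details. Supplying a table but no statement/setter
// triggers the schema-driven statement generation shown above.
people.apply(
    JdbcIO.<Row>write()
        .withDataSourceConfiguration(
            JdbcIO.DataSourceConfiguration.create(
                "org.postgresql.Driver", "jdbc:postgresql://localhost/mydb"))
        .withTable("people"));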
Example 9
Source File: BeamPCollectionTable.java From beam with Apache License 2.0
public BeamPCollectionTable(PCollection<InputT> upstream) {
  // getSchema() runs inside the super() call, so it executes before the
  // hasSchema() check below and itself throws if no schema is attached.
  super(upstream.getSchema());
  if (!upstream.hasSchema()) {
    throw new IllegalArgumentException("SQL can only run over PCollections that have schemas.");
  }
  this.upstream = upstream;
}
Example 10
Source File: BeamZetaSqlCalcRel.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  // Guava's Preconditions only substitutes %s, so the second placeholder is
  // %s here rather than %d.
  Preconditions.checkArgument(
      pinput.size() == 1,
      "%s expected a single input PCollection, but received %s.",
      BeamZetaSqlCalcRel.class.getSimpleName(),
      pinput.size());
  PCollection<Row> upstream = pinput.get(0);

  final RexBuilder rexBuilder = getCluster().getRexBuilder();
  RexNode rex = rexBuilder.makeCall(SqlStdOperatorTable.ROW, getProgram().getProjectList());

  final RexNode condition = getProgram().getCondition();
  if (condition != null) {
    rex =
        rexBuilder.makeCall(
            SqlStdOperatorTable.CASE, condition, rex, rexBuilder.makeNullLiteral(getRowType()));
  }

  boolean verifyRowValues =
      pinput.getPipeline().getOptions().as(BeamSqlPipelineOptions.class).getVerifyRowValues();
  Schema outputSchema = CalciteUtils.toSchema(getRowType());
  CalcFn calcFn =
      new CalcFn(
          context.toSql(getProgram(), rex).toSqlString(DIALECT).getSql(),
          upstream.getSchema(),
          outputSchema,
          verifyRowValues);

  // validate prepared expressions
  calcFn.setup();

  return upstream.apply(ParDo.of(calcFn)).setRowSchema(outputSchema);
}
Example 11
Source File: Group.java From beam with Apache License 2.0
@Override
public PCollection<KV<Row, Iterable<Row>>> expand(PCollection<InputT> input) {
  Schema schema = input.getSchema();
  FieldAccessDescriptor resolved = getFieldAccessDescriptor().resolve(schema);
  rowSelector = new RowSelectorContainer(schema, resolved, true);
  Schema keySchema = getKeySchema(schema);

  return input
      .apply("toRow", Convert.toRows())
      .apply(
          "selectKeys",
          WithKeys.of((Row e) -> rowSelector.select(e)).withKeyType(TypeDescriptors.rows()))
      .setCoder(KvCoder.of(SchemaCoder.of(keySchema), SchemaCoder.of(schema)))
      .apply("GroupByKey", GroupByKey.create());
}
Example 12
Source File: DropFields.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  FieldAccessDescriptor selectDescriptor =
      complement(inputSchema, fieldsToDrop.resolve(inputSchema));
  return input.apply(Select.fieldAccess(selectDescriptor));
}
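As the lines above show, DropFields is just Select over the complement of the dropped fields. A hedged usage sketch with an illustrative field name:

// Drops the illustrative "age" field; every other field is kept.
PCollection<Row> withoutAge = people.apply(DropFields.fields("age"));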
Example 13
Source File: BigQueryChangeApplier.java From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<Row> input) {
  Pipeline p = input.getPipeline();
  Schema inputCollectionSchema = input.getSchema();

  PCollection<KV<String, KV<Schema, Schema>>> tableSchemaCollection =
      buildTableSchemaCollection(input);
  PCollectionView<Map<String, KV<Schema, Schema>>> schemaMapView =
      tableSchemaCollection.apply(View.asMap());

  PCollection<TableRow> updatesToWrite = formatIntoTableRows(input);

  updatesToWrite.apply(
      BigQueryIO.writeTableRows()
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND)
          .withMethod(Method.STREAMING_INSERTS)
          .to(new ChangelogTableDynamicDestinations(changeLogDataset, gcpProjectId, schemaMapView)));

  String jobPrefix =
      String.format(
          "beam_cdc_%s_%s_", gcpProjectId.replace(':', '_').replace('.', '_'), replicaDataset);

  // If the input collection does not have a primary key field, then we do not need to issue
  // periodic merge requests.
  if (inputCollectionSchema.hasField(DataflowCdcRowFormat.PRIMARY_KEY)) {
    p.apply("MergeHeartbeat",
            GenerateSequence.from(0)
                .withRate(1, Duration.standardSeconds(updateFrequencySeconds)))
        .apply("KeyByTable",
            ParDo.of(new KeySchemasByTableFn(schemaMapView)).withSideInputs(schemaMapView))
        .apply("BuildMergeStatements",
            ParDo.of(new MergeStatementBuildingFn(changeLogDataset, replicaDataset, gcpProjectId)))
        .setCoder(
            SerializableCoder.of(
                TypeDescriptors.kvs(
                    TypeDescriptors.strings(), TypeDescriptor.of(BigQueryAction.class))))
        .apply("IssueMergeStatements", ParDo.of(new BigQueryStatementIssuingFn(jobPrefix)));
  }
  return PDone.in(p);
}
Example 14
Source File: CoGroup.java From beam with Apache License 2.0
private static JoinInformation from(
    PCollectionTuple input,
    Function<String, FieldAccessDescriptor> getFieldAccessDescriptor,
    Function<String, Boolean> getIsSideInput) {
  KeyedPCollectionTuple<Row> keyedPCollectionTuple =
      KeyedPCollectionTuple.empty(input.getPipeline());

  List<String> sortedTags =
      input.getAll().keySet().stream()
          .map(TupleTag::getId)
          .sorted()
          .collect(Collectors.toList());

  // Keep this in a TreeMap so that it's sorted. This way we get a deterministic output
  // schema.
  TreeMap<String, Schema> componentSchemas = Maps.newTreeMap();
  Map<Integer, SerializableFunction<Object, Row>> toRows = Maps.newHashMap();

  Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs = Maps.newHashMap();
  Map<Integer, String> tagToKeyedTag = Maps.newHashMap();
  Schema keySchema = null;
  for (Map.Entry<TupleTag<?>, PCollection<?>> entry : input.getAll().entrySet()) {
    String tag = entry.getKey().getId();
    int tagIndex = sortedTags.indexOf(tag);
    PCollection<?> pc = entry.getValue();
    Schema schema = pc.getSchema();
    componentSchemas.put(tag, schema);
    toRows.put(tagIndex, (SerializableFunction<Object, Row>) pc.getToRowFunction());
    FieldAccessDescriptor fieldAccessDescriptor = getFieldAccessDescriptor.apply(tag);
    if (fieldAccessDescriptor == null) {
      throw new IllegalStateException("No fields were set for input " + tag);
    }
    // Resolve the key schema, keeping the fields in the order specified by the user.
    // Otherwise, if different field names are specified for different PCollections, they
    // might not match up.
    // The key schema contains the field names from the first PCollection specified.
    FieldAccessDescriptor resolved = fieldAccessDescriptor.resolve(schema);
    Schema currentKeySchema = SelectHelpers.getOutputSchema(schema, resolved);
    if (keySchema == null) {
      keySchema = currentKeySchema;
    } else {
      keySchema = SchemaUtils.mergeWideningNullable(keySchema, currentKeySchema);
    }

    // Create a new tag for the output.
    TupleTag randomTag = new TupleTag<>();
    String keyedTag = tag + "_" + randomTag;
    tagToKeyedTag.put(tagIndex, keyedTag);
    PCollection<KV<Row, Row>> keyedPCollection =
        extractKey(pc, schema, keySchema, resolved, tag);
    if (getIsSideInput.apply(tag)) {
      sideInputs.put(
          keyedTag, keyedPCollection.apply("computeSideInputView" + tag, View.asMultimap()));
    } else {
      keyedPCollectionTuple = keyedPCollectionTuple.and(keyedTag, keyedPCollection);
    }
  }
  return new JoinInformation(
      keyedPCollectionTuple,
      sideInputs,
      keySchema,
      componentSchemas,
      toRows,
      sortedTags,
      tagToKeyedTag);
}
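This helper collects each input's schema via pc.getSchema() and widens the per-input key schemas into a single join key schema. From user code, CoGroup is typically invoked as below; the tuple tags and join field are illustrative assumptions:

// Hypothetical inputs: two schema-aware PCollection<Row>s that share a
// "userId" field to join on.
PCollection<Row> joined =
    PCollectionTuple.of("lhs", lhs)
        .and("rhs", rhs)
        .apply(CoGroup.join(CoGroup.By.fieldNames("userId")));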