org.apache.beam.sdk.transforms.Flatten Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.Flatten.
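Before the project-sourced examples, here is a minimal, self-contained sketch of the two Flatten entry points that recur throughout this page: Flatten.pCollections(), which merges the PCollections in a PCollectionList into a single PCollection, and Flatten.iterables(), which explodes a PCollection<Iterable<T>> into a PCollection<T>. The class name FlattenSketch and the sample data are hypothetical; the Beam calls themselves are the ones used in the examples below.

import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.IterableCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

public class FlattenSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Flatten.pCollections(): merge several PCollections of the same type into one.
    PCollection<String> first = pipeline.apply("First", Create.of("a", "b"));
    PCollection<String> second = pipeline.apply("Second", Create.of("c", "d"));
    PCollection<String> merged =
        PCollectionList.of(first).and(second).apply(Flatten.pCollections());

    // Flatten.iterables(): explode each Iterable element into individual elements.
    // The coder is set explicitly, mirroring the AtomicInsertTest example below.
    PCollection<Iterable<String>> nested =
        pipeline.apply(
            "Nested",
            Create.<Iterable<String>>of(Arrays.asList("e", "f"))
                .withCoder(IterableCoder.of(StringUtf8Coder.of())));
    PCollection<String> flattened = nested.apply(Flatten.iterables());

    pipeline.run().waitUntilFinish();
  }
}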
Example #1
Source File: DirectGraphVisitorTest.java From beam with Apache License 2.0
@Test
public void getValueToConsumersWithDuplicateInputSucceeds() {
  PCollection<String> created = p.apply(Create.of("1", "2", "3"));
  PCollection<String> flattened =
      PCollectionList.of(created).and(created).apply(Flatten.pCollections());

  p.traverseTopologically(visitor);

  DirectGraph graph = visitor.getGraph();
  AppliedPTransform<?, ?, ?> flattenedProducer = graph.getProducer(flattened);
  assertThat(
      graph.getPerElementConsumers(created),
      Matchers.containsInAnyOrder(new Object[] {flattenedProducer, flattenedProducer}));
  assertThat(graph.getPerElementConsumers(flattened), emptyIterable());
}
Example #2
Source File: FlattenTranslatorBatch.java From twister2 with Apache License 2.0
@Override
public void translateNode(
    Flatten.PCollections<T> transform, Twister2BatchTranslationContext context) {
  Collection<PValue> pcs = context.getInputs().values();
  List<BatchTSetImpl<WindowedValue<T>>> tSets = new ArrayList<>();
  BatchTSetImpl<WindowedValue<T>> unionTSet = null;

  if (pcs.isEmpty()) {
    // TODO: create empty TSet
    throw new UnsupportedOperationException("Operation not implemented yet");
  } else {
    for (PValue pc : pcs) {
      BatchTSetImpl<WindowedValue<T>> curr = context.getInputDataSet(pc);
      tSets.add(curr);
    }

    BatchTSetImpl<WindowedValue<T>> first = tSets.remove(0);
    Collection<TSet<WindowedValue<T>>> others = new ArrayList<>();
    others.addAll(tSets);
    if (tSets.size() > 0) {
      unionTSet = first.union(others);
    } else {
      unionTSet = first;
    }
  }
  context.setOutputDataSet(context.getOutput(transform), unionTSet);
}
Example #3
Source File: FileIndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0
/**
 * @param indexes
 * @return
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
    PCollection<ContentIndexSummary> indexes, Float ratio) {

  PCollectionTuple splitAB = indexes
      .apply(ParDo.of(new SplitAB(ratio))
          .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

  PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
  PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);

  PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
      ParDo.of(new EnrichWithCNLPEntities()));

  // Merge all collections with WebResource table records
  PCollectionList<ContentIndexSummary> contentIndexSummariesList =
      PCollectionList.of(branchACol).and(enrichedBCol);
  PCollection<ContentIndexSummary> allIndexSummaries =
      contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

  indexes = allIndexSummaries;
  return indexes;
}
Example #4
Source File: IndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0
/**
 * @param filteredIndexes
 * @return
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
    PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {

  PCollectionTuple splitAB = filteredIndexes
      .apply(ParDo.of(new SplitAB(ratio))
          .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

  PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
  PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);

  PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
      ParDo.of(new EnrichWithCNLPEntities()));

  // Merge all collections with WebResource table records
  PCollectionList<ContentIndexSummary> contentIndexSummariesList =
      PCollectionList.of(branchACol).and(enrichedBCol);
  PCollection<ContentIndexSummary> allIndexSummaries =
      contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

  filteredIndexes = allIndexSummaries;
  return filteredIndexes;
}
Example #5
Source File: SideInputLoadTest.java From beam with Apache License 2.0
private void performTestWithMap(
    PCollection<KV<byte[], byte[]>> input, Optional<SyntheticStep> syntheticStep) {
  applyStepIfPresent(input, "Synthetic step", syntheticStep);
  PCollectionView<Map<byte[], byte[]>> sideInput =
      applyWindowingIfPresent(input).apply(View.asMap());
  PCollectionView<List<byte[]>> randomKeys =
      pipeline
          .apply(Create.of(0))
          .apply(
              ParDo.of(new GetRandomKeyList(sideInput, options.getAccessPercentage()))
                  .withSideInputs(sideInput))
          .apply(Flatten.iterables())
          .apply(View.asList());
  input
      .apply(
          ParDo.of(new SideInputTestWithMap(sideInput, randomKeys))
              .withSideInputs(sideInput, randomKeys))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
Example #6
Source File: Window.java From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  applicableTo(input);

  WindowingStrategy<?, ?> outputStrategy =
      getOutputStrategyInternal(input.getWindowingStrategy());

  if (getWindowFn() == null) {
    // A new PCollection must be created in case input is reused in a different location as the
    // two PCollections will, in general, have a different windowing strategy.
    return PCollectionList.of(input)
        .apply(Flatten.pCollections())
        .setWindowingStrategyInternal(outputStrategy);
  } else {
    // This is the AssignWindows primitive
    return input.apply(new Assign<>(this, outputStrategy));
  }
}
Example #7
Source File: PAssert.java From beam with Apache License 2.0
@Override
public PCollectionView<ActualT> expand(PBegin input) {
  final Coder<T> coder = actual.getCoder();
  return actual
      .apply("FilterActuals", rewindowActuals.prepareActuals())
      .apply("GatherPanes", GatherAllPanes.globally())
      .apply("ExtractPane", MapElements.via(extractPane))
      .setCoder(IterableCoder.of(actual.getCoder()))
      .apply(Flatten.iterables())
      .apply("RewindowActuals", rewindowActuals.windowActuals())
      .apply(
          ParDo.of(
              new DoFn<T, T>() {
                @ProcessElement
                public void processElement(ProcessContext context) throws CoderException {
                  context.output(CoderUtils.clone(coder, context.element()));
                }
              }))
      .apply(actualView);
}
Example #8
Source File: TestStreamTest.java From beam with Apache License 2.0
@Test
@Category({NeedsRunner.class, UsesTestStream.class})
public void testElementsAtAlmostPositiveInfinity() {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  TestStream<String> stream =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(stream)
          .apply(into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(endOfGlobalWindow))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
Example #9
Source File: PTransformMatchers.java From beam with Apache License 2.0
/**
 * A {@link PTransformMatcher} which matches a {@link Flatten.PCollections} which consumes a
 * single input {@link PCollection} multiple times.
 */
public static PTransformMatcher flattenWithDuplicateInputs() {
  return new PTransformMatcher() {
    @Override
    public boolean matches(AppliedPTransform<?, ?, ?> application) {
      if (application.getTransform() instanceof Flatten.PCollections) {
        Set<PValue> observed = new HashSet<>();
        for (PValue pvalue : application.getInputs().values()) {
          boolean firstInstance = observed.add(pvalue);
          if (!firstInstance) {
            return true;
          }
        }
      }
      return false;
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper("FlattenWithDuplicateInputsMatcher").toString();
    }
  };
}
Example #10
Source File: DeduplicatedFlattenFactoryTest.java From beam with Apache License 2.0
@Test
public void duplicatesInsertsMultipliers() {
  PTransform<PCollectionList<String>, PCollection<String>> replacement =
      new DeduplicatedFlattenFactory.FlattenWithoutDuplicateInputs<>();
  final PCollectionList<String> inputList =
      PCollectionList.of(first).and(second).and(first).and(first);
  inputList.apply(replacement);
  pipeline.traverseTopologically(
      new Defaults() {
        @Override
        public void visitPrimitiveTransform(TransformHierarchy.Node node) {
          if (node.getTransform() instanceof Flatten.PCollections) {
            assertThat(node.getInputs(), not(equalTo(inputList.expand())));
          }
        }
      });
}
Example #11
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void emptyFlattenWithNonEmptyFlatten() {
  AppliedPTransform application =
      AppliedPTransform.of(
          "Flatten",
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Flatten.pCollections(),
          p);
  assertThat(PTransformMatchers.emptyFlatten().matches(application), is(false));
}
Example #12
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void flattenWithDuplicateInputsWithoutDuplicates() {
  AppliedPTransform application =
      AppliedPTransform.of(
          "Flatten",
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Flatten.pCollections(),
          p);
  assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(false));
}
Example #13
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void flattenWithDuplicateInputsNonFlatten() {
  AppliedPTransform application =
      AppliedPTransform
          .<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
              "EmptyFlatten",
              Collections.emptyMap(),
              Collections.singletonMap(
                  new TupleTag<Integer>(),
                  PCollection.createPrimitiveOutputInternal(
                      p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
              /* This isn't actually possible to construct, but for the sake of example */
              Flatten.iterables(),
              p);
  assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(false));
}
Example #14
Source File: UnconsumedReadsTest.java From beam with Apache License 2.0
@Test
public void doesNotConsumeAlreadyConsumedRead() {
  Unbounded<Long> transform = Read.from(CountingSource.unbounded());
  final PCollection<Long> output = pipeline.apply(transform);
  final Flatten.PCollections<Long> consumer = Flatten.pCollections();
  PCollectionList.of(output).apply(consumer);
  UnconsumedReads.ensureAllReadsConsumed(pipeline);
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          // The output should only be consumed by a single consumer
          if (node.getInputs().values().contains(output)) {
            assertThat(node.getTransform(), Matchers.is(consumer));
          }
        }
      });
}
Example #15
Source File: EmptyFlattenAsCreateFactoryTest.java From beam with Apache License 2.0
@Test
public void getInputNonEmptyThrows() {
  PCollectionList<Long> nonEmpty =
      PCollectionList.of(pipeline.apply("unbounded", GenerateSequence.from(0)))
          .and(pipeline.apply("bounded", GenerateSequence.from(0).to(100)));
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(nonEmpty.expand().toString());
  thrown.expectMessage(EmptyFlattenAsCreateFactory.class.getSimpleName());
  factory.getReplacementTransform(
      AppliedPTransform.of(
          "nonEmptyInput",
          nonEmpty.expand(),
          Collections.emptyMap(),
          Flatten.pCollections(),
          pipeline));
}
Example #16
Source File: EmptyFlattenAsCreateFactoryTest.java From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testOverride() {
  PCollectionList<Long> empty = PCollectionList.empty(pipeline);
  PCollection<Long> emptyFlattened =
      empty.apply(
          factory
              .getReplacementTransform(
                  AppliedPTransform.of(
                      "nonEmptyInput",
                      Collections.emptyMap(),
                      Collections.emptyMap(),
                      Flatten.pCollections(),
                      pipeline))
              .getTransform());
  PAssert.that(emptyFlattened).empty();
  pipeline.run();
}
Example #17
Source File: QueryablePipelineTest.java From beam with Apache License 2.0
/**
 * Tests that {@link QueryablePipeline#getPerElementConsumers(PCollectionNode)} returns a
 * transform that consumes the node more than once.
 */
@Test
public void perElementConsumersWithConsumingMultipleTimes() {
  Pipeline p = Pipeline.create();
  PCollection<Long> longs = p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  PCollectionList.of(longs).and(longs).and(longs).apply("flatten", Flatten.pCollections());

  Components components = PipelineTranslation.toProto(p).getComponents();
  // This breaks if the way that IDs are assigned to PTransforms changes in PipelineTranslation
  String readOutput =
      getOnlyElement(components.getTransformsOrThrow("BoundedRead").getOutputsMap().values());
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);
  Set<PTransformNode> consumers =
      qp.getPerElementConsumers(
          PipelineNode.pCollection(readOutput, components.getPcollectionsOrThrow(readOutput)));

  assertThat(consumers.size(), equalTo(1));
  assertThat(
      getOnlyElement(consumers).getTransform().getSpec().getUrn(),
      equalTo(PTransformTranslation.FLATTEN_TRANSFORM_URN));
}
Example #18
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void emptyFlattenWithNonFlatten() {
  AppliedPTransform application =
      AppliedPTransform
          .<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
              "EmptyFlatten",
              Collections.emptyMap(),
              Collections.singletonMap(
                  new TupleTag<Integer>(),
                  PCollection.createPrimitiveOutputInternal(
                      p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
              /* This isn't actually possible to construct, but for the sake of example */
              Flatten.iterables(),
              p);
  assertThat(PTransformMatchers.emptyFlatten().matches(application), is(false));
}
Example #19
Source File: CreateStreamTest.java From beam with Apache License 2.0
@Test
public void testElementsAtAlmostPositiveInfinity() throws IOException {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  CreateStream<String> source =
      CreateStream.of(StringUtf8Coder.of(), batchDuration())
          .nextBatch(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceNextBatchWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(source)
          .apply(Window.into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(GlobalWindow.INSTANCE.maxTimestamp()))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
Example #20
Source File: TrackStreamingSourcesTest.java From beam with Apache License 2.0
@Test
public void testTrackFlattened() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> queueStream1 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();
  CreateStream<Integer> queueStream2 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  PCollection<Integer> pcol1 = p.apply(queueStream1);
  PCollection<Integer> pcol2 = p.apply(queueStream2);
  PCollection<Integer> flattened =
      PCollectionList.of(pcol1).and(pcol2).apply(Flatten.pCollections());
  flattened.apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0, 1));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
Example #21
Source File: Sink.java From gcp-ingestion with Mozilla Public License 2.0
/** Execute an Apache Beam pipeline and return the {@code PipelineResult}. */
public static PipelineResult run(SinkOptions.Parsed options) {
  final Pipeline pipeline = Pipeline.create(options);
  final List<PCollection<PubsubMessage>> failureCollections = new ArrayList<>();

  pipeline //
      .apply(options.getInputType().read(options)) //
      .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) //
      .apply(options.getOutputType().write(options)) //
      .failuresTo(failureCollections);

  PCollectionList.of(failureCollections) //
      .apply("FlattenFailureCollections", Flatten.pCollections()) //
      .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) //
      .output();

  return pipeline.run();
}
Example #22
Source File: WatermarkManagerTest.java From beam with Apache License 2.0
@Before
public void setup() {
  createdInts = p.apply("createdInts", Create.of(1, 2, 3));

  filtered = createdInts.apply("filtered", Filter.greaterThan(1));
  filteredTimesTwo =
      filtered.apply(
          "timesTwo",
          ParDo.of(
              new DoFn<Integer, Integer>() {
                @ProcessElement
                public void processElement(ProcessContext c) throws Exception {
                  c.output(c.element() * 2);
                }
              }));

  keyed = createdInts.apply("keyed", WithKeys.of("MyKey"));

  intsToFlatten = p.apply("intsToFlatten", Create.of(-1, 256, 65535));
  PCollectionList<Integer> preFlatten = PCollectionList.of(createdInts).and(intsToFlatten);
  flattened = preFlatten.apply("flattened", Flatten.pCollections());

  clock = MockClock.fromInstant(new Instant(1000));
  DirectGraphs.performDirectOverrides(p);
  graph = DirectGraphs.getGraph(p);

  manager = WatermarkManager.create(clock, graph, AppliedPTransform::getFullName);
  bundleFactory = ImmutableListBundleFactory.create();
}
Example #23
Source File: FlattenTranslatorBatch.java From beam with Apache License 2.0
@Override
public void translateNode(
    Flatten.PCollections<T> transform, Twister2BatchTranslationContext context) {
  Collection<PValue> pcs = context.getInputs().values();
  List<BatchTSetImpl<WindowedValue<T>>> tSets = new ArrayList<>();
  BatchTSetImpl<WindowedValue<T>> unionTSet;

  if (pcs.isEmpty()) {
    final TSetEnvironment tsetEnv = context.getEnvironment();
    unionTSet =
        ((BatchTSetEnvironment) tsetEnv)
            .createSource(new Twister2EmptySource(), context.getOptions().getParallelism());
  } else {
    for (PValue pc : pcs) {
      BatchTSetImpl<WindowedValue<T>> curr = context.getInputDataSet(pc);
      tSets.add(curr);
    }

    BatchTSetImpl<WindowedValue<T>> first = tSets.remove(0);
    Collection<TSet<WindowedValue<T>>> others = new ArrayList<>();
    others.addAll(tSets);
    if (tSets.size() > 0) {
      unionTSet = first.union(others);
    } else {
      unionTSet = first;
    }
  }
  context.setOutputDataSet(context.getOutput(transform), unionTSet);
}
Example #24
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void emptyFlattenWithEmptyFlatten() {
  AppliedPTransform application =
      AppliedPTransform.of(
          "EmptyFlatten",
          Collections.emptyMap(),
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Flatten.pCollections(),
          p);
  assertThat(PTransformMatchers.emptyFlatten().matches(application), is(true));
}
Example #25
Source File: AtomicInsertTest.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PBegin input) {
  Schema schema = Schema.of(Schema.Field.of("f0", Schema.FieldType.INT64));
  Iterable<Row> bundle =
      IntStream.range(0, size)
          .mapToObj(x -> Row.withSchema(schema).addValue((long) x).build())
          .collect(Collectors.toList());

  // make sure we get one big bundle
  return input
      .getPipeline()
      .apply(Create.<Iterable<Row>>of(bundle).withCoder(IterableCoder.of(RowCoder.of(schema))))
      .apply(Flatten.iterables())
      .setRowSchema(schema);
}
Example #26
Source File: BatchLoads.java From beam with Apache License 2.0
PCollection<WriteBundlesToFiles.Result<DestinationT>> writeDynamicallyShardedFiles(
    PCollection<KV<DestinationT, ElementT>> input, PCollectionView<String> tempFilePrefix) {
  TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag =
      new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {};
  TupleTag<KV<ShardedKey<DestinationT>, ElementT>> unwrittedRecordsTag =
      new TupleTag<KV<ShardedKey<DestinationT>, ElementT>>("unwrittenRecords") {};
  PCollectionTuple writeBundlesTuple =
      input.apply(
          "WriteBundlesToFiles",
          ParDo.of(
                  new WriteBundlesToFiles<>(
                      tempFilePrefix,
                      unwrittedRecordsTag,
                      maxNumWritersPerBundle,
                      maxFileSize,
                      rowWriterFactory))
              .withSideInputs(tempFilePrefix)
              .withOutputTags(writtenFilesTag, TupleTagList.of(unwrittedRecordsTag)));
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles =
      writeBundlesTuple
          .get(writtenFilesTag)
          .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
  PCollection<KV<ShardedKey<DestinationT>, ElementT>> unwrittenRecords =
      writeBundlesTuple
          .get(unwrittedRecordsTag)
          .setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), elementCoder));

  // If the bundles contain too many output tables to be written inline to files (due to memory
  // limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
  // Group these records by key, and write the files after grouping. Since the record is grouped
  // by key, we can ensure that only one file is open at a time in each bundle.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped =
      writeShardedRecords(unwrittenRecords, tempFilePrefix);

  // PCollection of filename, file byte size, and table destination.
  return PCollectionList.of(writtenFiles)
      .and(writtenFilesGrouped)
      .apply("FlattenFiles", Flatten.pCollections())
      .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
Example #27
Source File: KafkaIOTest.java From beam with Apache License 2.0
@Test
public void testUnboundedSourceSplits() throws Exception {
  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
Example #28
Source File: UnionTranslator.java From beam with Apache License 2.0
@Override
public PCollection<InputT> translate(Union<InputT> operator, PCollectionList<InputT> inputs) {
  final TypeDescriptor<InputT> outputType = operator.getOutputType().orElse(null);
  return operator
      .getName()
      .map(name -> inputs.apply(name, Flatten.pCollections()).setTypeDescriptor(outputType))
      .orElseGet(() -> inputs.apply(Flatten.pCollections()).setTypeDescriptor(outputType));
}
Example #29
Source File: FlattenTest.java From beam with Apache License 2.0
@Test
public void testFlatten() {
  PCollection<Integer> input1 = pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
  PCollection<Integer> input2 =
      pipeline.apply(Create.of(11, 12, 13, 14, 15, 16, 17, 18, 19, 20));
  PCollectionList<Integer> pcs = PCollectionList.of(input1).and(input2);
  PCollection<Integer> input = pcs.apply(Flatten.pCollections());
  PAssert.that(input)
      .containsInAnyOrder(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20);
  pipeline.run();
}
Example #30
Source File: PTransformMatchers.java From beam with Apache License 2.0
/**
 * A {@link PTransformMatcher} which matches a {@link Flatten.PCollections} which consumes no
 * input {@link PCollection PCollections}.
 */
public static PTransformMatcher emptyFlatten() {
  return new PTransformMatcher() {
    @Override
    public boolean matches(AppliedPTransform<?, ?, ?> application) {
      return (application.getTransform() instanceof Flatten.PCollections)
          && application.getInputs().isEmpty();
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper("EmptyFlattenMatcher").toString();
    }
  };
}