org.apache.beam.sdk.transforms.GroupByKey Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.GroupByKey.
Each example is taken from an open-source project; the source file and its license are noted above each snippet.
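Before the project snippets, here is a minimal, self-contained sketch of the core pattern (the class name, keys, and values are illustrative, not taken from any of the projects below): GroupByKey turns a PCollection<KV<K, V>> into a PCollection<KV<K, Iterable<V>>>, gathering all values that share a key, per window.

import java.util.Arrays;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class GroupByKeyExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // A small keyed input; the default coder registry infers KvCoder<String, Integer>.
    PCollection<KV<String, Integer>> scores =
        pipeline.apply(
            Create.of(
                Arrays.asList(KV.of("alice", 1), KV.of("bob", 2), KV.of("alice", 3))));

    // Group values by key: one output element per key (per window),
    // e.g. "alice" -> [1, 3] and "bob" -> [2]. Downstream transforms would
    // consume `grouped`; none are attached in this sketch.
    PCollection<KV<String, Iterable<Integer>>> grouped =
        scores.apply(GroupByKey.create());

    pipeline.run().waitUntilFinish();
  }
}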
Example #1
Source File: DataflowRunner.java From beam with Apache License 2.0

@Override
public PCollection<KV<K, Iterable<V>>> expand(PCollection<KV<K, V>> input) {
  return input
      .apply("GroupAll", GroupByKey.create())
      .apply(
          "SplitIntoBatches",
          ParDo.of(
              new DoFn<KV<K, Iterable<V>>, KV<K, Iterable<V>>>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  // Iterators.partition lazily creates the partitions as they are accessed
                  // allowing it to partition very large iterators.
                  Iterator<List<V>> iterator =
                      Iterators.partition(c.element().getValue().iterator(), (int) batchSize);

                  // Note that GroupIntoBatches only outputs when the batch is non-empty.
                  while (iterator.hasNext()) {
                    c.output(KV.of(c.element().getKey(), iterator.next()));
                  }
                }
              }));
}
Example #2
Source File: CacheTest.java From beam with Apache License 2.0

@Test
public void shouldCacheTest() {
  SparkPipelineOptions options = createOptions();
  options.setCacheDisabled(true);
  Pipeline pipeline = Pipeline.create(options);

  Values<String> valuesTransform = Create.of("foo", "bar");
  PCollection pCollection = mock(PCollection.class);

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  ctxt.getCacheCandidates().put(pCollection, 2L);

  assertFalse(ctxt.shouldCache(valuesTransform, pCollection));

  options.setCacheDisabled(false);
  assertTrue(ctxt.shouldCache(valuesTransform, pCollection));

  GroupByKey<String, String> gbkTransform = GroupByKey.create();
  assertFalse(ctxt.shouldCache(gbkTransform, pCollection));
}
Example #3
Source File: WriteFiles.java From beam with Apache License 2.0

@Override
public PCollection<FileResult<DestinationT>> expand(PCollection<UserT> input) {
  List<PCollectionView<?>> shardingSideInputs = Lists.newArrayList(getSideInputs());
  if (numShardsView != null) {
    shardingSideInputs.add(numShardsView);
  }

  ShardingFunction<UserT, DestinationT> shardingFunction =
      getShardingFunction() == null
          ? new RandomShardingFunction(destinationCoder)
          : getShardingFunction();

  return input
      .apply(
          "ApplyShardingKey",
          ParDo.of(new ApplyShardingFunctionFn(shardingFunction, numShardsView))
              .withSideInputs(shardingSideInputs))
      .setCoder(KvCoder.of(ShardedKeyCoder.of(VarIntCoder.of()), input.getCoder()))
      .apply("GroupIntoShards", GroupByKey.create())
      .apply(
          "WriteShardsIntoTempFiles",
          ParDo.of(new WriteShardsIntoTempFilesFn()).withSideInputs(getSideInputs()))
      .setCoder(fileResultCoder);
}
Example #4
Source File: CreateStreamTest.java From beam with Apache License 2.0

@Test
public void testElementsAtAlmostPositiveInfinity() throws IOException {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  CreateStream<String> source =
      CreateStream.of(StringUtf8Coder.of(), batchDuration())
          .nextBatch(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceNextBatchWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(source)
          .apply(Window.into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(GlobalWindow.INSTANCE.maxTimestamp()))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
Example #5
Source File: NonMergingGroupByKeyTest.java From beam with Apache License 2.0

@Test
public void testDisabledReIterationThrowsAnException() {
  // If output during closing is not supported, we can not chain DoFns and results
  // are therefore materialized during output serialization.
  Assume.assumeTrue(FlinkCapabilities.supportsOutputDuringClosing());
  final Pipeline p = FlinkTestPipeline.createForBatch();
  p.apply(Create.of(Arrays.asList(KV.of("a", 1), KV.of("b", 2), KV.of("c", 3))))
      .apply(GroupByKey.create())
      .apply(ParDo.of(new ReiterateDoFn<>()));
  Pipeline.PipelineExecutionException resultException = null;
  try {
    p.run().waitUntilFinish();
  } catch (Pipeline.PipelineExecutionException exception) {
    resultException = exception;
  }
  Assert.assertEquals(
      IllegalStateException.class,
      Objects.requireNonNull(resultException).getCause().getClass());
  Assert.assertTrue(
      resultException.getCause().getMessage().contains("GBK result is not re-iterable."));
}
Example #6
Source File: UnboundedWrite.java From components with Apache License 2.0

@Override
public PDone expand(PCollection<KV<K, V>> in) {
  // Make sure that a window has been applied.
  in = ofDefaultWindow(in);

  // Add an artificial GroupByKey to collect the window results together.
  PCollection<KV<Instant, KV<K, V>>> pc2 =
      in.apply("GroupToOneShard", ParDo.of(new GroupToOneShard<KV<K, V>>()))
          .setCoder(KvCoder.of(InstantCoder.of(), in.getCoder()));

  PCollection<KV<Instant, Iterable<KV<K, V>>>> pc3 =
      pc2.apply(GroupByKey.<Instant, KV<K, V>>create());

  pc3.apply("UnboundedWrite", ParDo.of(new UnboundedWriteToFile<K, V>(sink)));

  return PDone.in(in.getPipeline());
}
Example #7
Source File: GatherAllPanes.java From beam with Apache License 2.0

@Override
public PCollection<Iterable<ValueInSingleWindow<T>>> expand(PCollection<T> input) {
  WindowFn<?, ?> originalWindowFn = input.getWindowingStrategy().getWindowFn();

  return input
      .apply(Reify.windows())
      .apply(
          WithKeys.<Integer, ValueInSingleWindow<T>>of(0)
              .withKeyType(new TypeDescriptor<Integer>() {}))
      .apply(
          Window.into(
                  new IdentityWindowFn<KV<Integer, ValueInSingleWindow<T>>>(
                      originalWindowFn.windowCoder()))
              .triggering(Never.ever())
              .withAllowedLateness(input.getWindowingStrategy().getAllowedLateness())
              .discardingFiredPanes())
      // all values have the same key so they all appear as a single output element
      .apply(GroupByKey.create())
      .apply(Values.create())
      .setWindowingStrategyInternal(input.getWindowingStrategy());
}
Example #8
Source File: GroupByKeyTranslator.java From beam with Apache License 2.0

@SuppressWarnings("unchecked")
private static <K, InputT, OutputT>
    SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> getSystemReduceFn(
        PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
        Pipeline pipeline,
        KvCoder<K, InputT> kvInputCoder) {
  if (transform instanceof GroupByKey) {
    return (SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow>)
        SystemReduceFn.buffering(kvInputCoder.getValueCoder());
  } else if (transform instanceof Combine.PerKey) {
    final CombineFnBase.GlobalCombineFn<? super InputT, ?, OutputT> combineFn =
        ((Combine.PerKey) transform).getFn();
    return SystemReduceFn.combining(
        kvInputCoder.getKeyCoder(),
        AppliedCombineFn.withInputCoder(combineFn, pipeline.getCoderRegistry(), kvInputCoder));
  } else {
    throw new RuntimeException("Transform " + transform + " cannot be translated as GroupByKey.");
  }
}
Example #9
Source File: GroupByKeyLoadTest.java From beam with Apache License 2.0

@Override
void loadTest() throws IOException {
  Optional<SyntheticStep> syntheticStep = createStep(options.getStepOptions());

  PCollection<KV<byte[], byte[]>> input =
      pipeline
          .apply("Read input", readFromSource(sourceOptions))
          .apply("Collect start time metrics", ParDo.of(runtimeMonitor))
          .apply(
              "Total bytes monitor",
              ParDo.of(new ByteMonitor(METRICS_NAMESPACE, "totalBytes.count")));

  input = applyWindowing(input);

  for (int branch = 0; branch < options.getFanout(); branch++) {
    applyStepIfPresent(input, format("Synthetic step (%s)", branch), syntheticStep)
        .apply(format("Group by key (%s)", branch), GroupByKey.create())
        .apply(
            format("Ungroup and reiterate (%s)", branch),
            ParDo.of(new UngroupAndReiterate(options.getIterations())))
        .apply(format("Collect end time metrics (%s)", branch), ParDo.of(runtimeMonitor));
  }
}
Example #10
Source File: GroupByKeyTest.java From beam with Apache License 2.0

@Test
public void testGroupByKey() {
  List<KV<Integer, Integer>> elems = new ArrayList<>();
  elems.add(KV.of(1, 1));
  elems.add(KV.of(1, 3));
  elems.add(KV.of(1, 5));
  elems.add(KV.of(2, 2));
  elems.add(KV.of(2, 4));
  elems.add(KV.of(2, 6));

  PCollection<KV<Integer, Iterable<Integer>>> input =
      pipeline.apply(Create.of(elems)).apply(GroupByKey.create());

  PAssert.thatMap(input)
      .satisfies(
          results -> {
            assertThat(results.get(1), containsInAnyOrder(1, 3, 5));
            assertThat(results.get(2), containsInAnyOrder(2, 4, 6));
            return null;
          });
  pipeline.run();
}
Example #11
Source File: TestStreamTest.java From beam with Apache License 2.0

@Test
@Category({NeedsRunner.class, UsesTestStream.class})
public void testElementsAtAlmostPositiveInfinity() {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  TestStream<String> stream =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(stream)
          .apply(into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(endOfGlobalWindow))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
Example #12
Source File: SortValuesTest.java From beam with Apache License 2.0

@Test
public void testSecondaryKeySorting() {
  // Create a PCollection of <Key, <SecondaryKey, Value>> pairs.
  PCollection<KV<String, KV<String, Integer>>> input =
      p.apply(
          Create.of(
              Arrays.asList(
                  KV.of("key1", KV.of("secondaryKey2", 20)),
                  KV.of("key2", KV.of("secondaryKey2", 200)),
                  KV.of("key1", KV.of("secondaryKey3", 30)),
                  KV.of("key1", KV.of("secondaryKey1", 10)),
                  KV.of("key2", KV.of("secondaryKey1", 100)))));

  // Group by Key, bringing <SecondaryKey, Value> pairs for the same Key together.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> grouped =
      input.apply(GroupByKey.create());

  // For every Key, sort the iterable of <SecondaryKey, Value> pairs by SecondaryKey.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> groupedAndSorted =
      grouped.apply(SortValues.create(BufferedExternalSorter.options()));

  PAssert.that(groupedAndSorted)
      .satisfies(new AssertThatHasExpectedContentsForTestSecondaryKeySorting());
  p.run();
}
Example #13
Source File: DataflowGroupByKeyTest.java From beam with Apache License 2.0

@Test
public void testInvalidWindowsService() {
  Pipeline p = createTestServiceRunner();

  List<KV<String, Integer>> ungroupedPairs = Arrays.asList();

  PCollection<KV<String, Integer>> input =
      p.apply(
              Create.of(ungroupedPairs)
                  .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())))
          .apply(Window.into(Sessions.withGapDuration(Duration.standardMinutes(1))));

  thrown.expect(IllegalStateException.class);
  thrown.expectMessage("GroupByKey must have a valid Window merge function");
  input.apply("GroupByKey", GroupByKey.create()).apply("GroupByKeyAgain", GroupByKey.create());
}
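A note on this example, inferred from the expected error message: the first GroupByKey is allowed to consume the merging Sessions WindowFn, but grouping leaves the output with an invalid window function (InvalidWindows), so applying a second GroupByKey without re-windowing first triggers the IllegalStateException the test expects.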
Example #14
Source File: IndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0

/**
 * @param indexes the document index summaries
 * @return a POJO containing 2 PCollections: unique docs, and duplicates
 */
private static ContentDuplicateOrNot filterSoftDuplicates(
    PCollection<ContentIndexSummary> indexes) {

  PCollectionTuple dedupeOrNot =
      indexes
          .apply("Extract Text grouping key", ParDo.of(new GetContentIndexSummaryKeyFn()))
          .apply(
              "Group by Text grouping key",
              GroupByKey.<ContentSoftDeduplicationKey, ContentIndexSummary>create())
          .apply(
              "Eliminate Text dupes",
              ParDo.of(new EliminateTextDupes())
                  .withOutputTags(
                      PipelineTags.indexedContentNotToDedupeTag,
                      TupleTagList.of(PipelineTags.indexedContentToDedupeTag)));

  PCollection<TableRow> dedupedWebresources =
      dedupeOrNot
          .get(PipelineTags.indexedContentToDedupeTag)
          .apply(ParDo.of(new CreateWebresourceTableRowFromDupeIndexSummaryFn()));

  ContentDuplicateOrNot contentDuplicateOrNot =
      new ContentDuplicateOrNot(
          dedupeOrNot.get(PipelineTags.indexedContentNotToDedupeTag), dedupedWebresources);

  return contentDuplicateOrNot;
}
Example #15
Source File: DataflowPTransformMatchersTest.java From beam with Apache License 2.0

/** Creates a simple pipeline with a {@link Combine.GroupedValues} with side inputs. */
private static TestPipeline createCombineGroupedValuesWithSideInputsPipeline() {
  TestPipeline pipeline = TestPipeline.create().enableAbandonedNodeEnforcement(false);
  PCollection<KV<String, Integer>> input =
      pipeline
          .apply(Create.of(KV.of("key", 1)))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));
  PCollection<String> sideInput = pipeline.apply(Create.of("side input"));
  PCollectionView<String> sideInputView = sideInput.apply(View.asSingleton());

  input
      .apply(GroupByKey.create())
      .apply(
          Combine.<String, Integer, Integer>groupedValues(new SumCombineFnWithContext())
              .withSideInputs(sideInputView));

  return pipeline;
}
Example #16
Source File: CloningBundleFactoryTest.java From beam with Apache License 2.0

@Test
public void keyedBundleWorkingCoderSucceedsClonesOutput() {
  PCollection<Integer> created = p.apply(Create.of(1, 3).withCoder(VarIntCoder.of()));

  PCollection<KV<String, Iterable<Integer>>> keyed =
      created
          .apply(WithKeys.of("foo"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()))
          .apply(GroupByKey.create());
  WindowedValue<KV<String, Iterable<Integer>>> foos =
      WindowedValue.valueInGlobalWindow(
          KV.<String, Iterable<Integer>>of("foo", ImmutableList.of(1, 3)));
  CommittedBundle<KV<String, Iterable<Integer>>> keyedBundle =
      factory
          .createKeyedBundle(StructuralKey.of("foo", StringUtf8Coder.of()), keyed)
          .add(foos)
          .commit(Instant.now());

  assertThat(keyedBundle.getElements(), containsInAnyOrder(foos));
  assertThat(
      Iterables.getOnlyElement(keyedBundle.getElements()).getValue(),
      not(theInstance(foos.getValue())));
  assertThat(keyedBundle.getPCollection(), equalTo(keyed));
  assertThat(keyedBundle.getKey(), equalTo(StructuralKey.of("foo", StringUtf8Coder.of())));
}
Example #17
Source File: BigQueryMerger.java From DataflowTemplates with Apache License 2.0

@Override
public PCollection<KV<K, V>> expand(PCollection<KV<K, V>> input) {
  return input
      .apply(
          Window.<KV<K, V>>into(new GlobalWindows())
              .discardingFiredPanes()
              .triggering(
                  Repeatedly.forever(
                      AfterProcessingTime.pastFirstElementInPane()
                          .plusDelayOf(Duration.ZERO)
                          .alignedTo(intervalDuration, org.joda.time.Instant.now()))))
      .apply(GroupByKey.create())
      .apply(
          ParDo.of(
              new DoFn<KV<K, Iterable<V>>, KV<K, V>>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  LOG.debug(
                      "TS: {} | Element: {} | Pane: {}", c.timestamp(), c.element(), c.pane());
                  Iterator<V> it = c.element().getValue().iterator();
                  if (it.hasNext()) {
                    c.output(KV.of(c.element().getKey(), it.next()));
                  }
                }
              }));
}
Example #18
Source File: GroupByKeyTest.java From beam with Apache License 2.0

@Test
public void testGroupByKeyPreservesWindowing() {
  pipeline
      .apply(
          Create.timestamped(
              TimestampedValue.of(KV.of(1, 1), new Instant(1)),
              TimestampedValue.of(KV.of(1, 3), new Instant(2)),
              TimestampedValue.of(KV.of(1, 5), new Instant(11)),
              TimestampedValue.of(KV.of(2, 2), new Instant(3)),
              TimestampedValue.of(KV.of(2, 4), new Instant(11)),
              TimestampedValue.of(KV.of(2, 6), new Instant(12))))
      .apply(Window.into(FixedWindows.of(Duration.millis(10))))
      .apply(GroupByKey.create())
      // assert manually per window, because PAssert does not support multiple KVs
      // with the same key (one KV per window)
      .apply(
          ParDo.of(
              new DoFn<KV<Integer, Iterable<Integer>>, KV<Integer, Iterable<Integer>>>() {
                @ProcessElement
                public void processElement(ProcessContext context) {
                  KV<Integer, Iterable<Integer>> element = context.element();
                  if (element.getKey() == 1) {
                    if (Iterables.size(element.getValue()) == 2) {
                      assertThat(element.getValue(), containsInAnyOrder(1, 3)); // window [0-10)
                    } else {
                      assertThat(element.getValue(), containsInAnyOrder(5)); // window [10-20)
                    }
                  } else { // key == 2
                    if (Iterables.size(element.getValue()) == 2) {
                      assertThat(element.getValue(), containsInAnyOrder(4, 6)); // window [10-20)
                    } else {
                      assertThat(element.getValue(), containsInAnyOrder(2)); // window [0-10)
                    }
                  }
                  context.output(element);
                }
              }));
  pipeline.run();
}
Example #19
Source File: MultiStepCombine.java From beam with Apache License 2.0

@Override
public PCollection<KV<K, OutputT>> expand(PCollection<KV<K, InputT>> input) {
  checkArgument(
      input.getCoder() instanceof KvCoder,
      "Expected input to have a %s of type %s, got %s",
      Coder.class.getSimpleName(),
      KvCoder.class.getSimpleName(),
      input.getCoder());
  KvCoder<K, InputT> inputCoder = (KvCoder<K, InputT>) input.getCoder();
  Coder<InputT> inputValueCoder = inputCoder.getValueCoder();
  Coder<AccumT> accumulatorCoder;
  try {
    accumulatorCoder =
        combineFn.getAccumulatorCoder(input.getPipeline().getCoderRegistry(), inputValueCoder);
  } catch (CannotProvideCoderException e) {
    throw new IllegalStateException(
        String.format(
            "Could not construct an Accumulator Coder with the provided %s %s",
            CombineFn.class.getSimpleName(), combineFn),
        e);
  }
  return input
      .apply(
          ParDo.of(
              new CombineInputs<>(
                  combineFn,
                  input.getWindowingStrategy().getTimestampCombiner(),
                  inputCoder.getKeyCoder())))
      .setCoder(KvCoder.of(inputCoder.getKeyCoder(), accumulatorCoder))
      .apply(GroupByKey.create())
      .apply(new MergeAndExtractAccumulatorOutput<>(combineFn, outputCoder));
}
Example #20
Source File: DirectGroupByKey.java From beam with Apache License 2.0

@Override
public PCollection<KeyedWorkItem<K, V>> expand(PCollection<KV<K, V>> input) {
  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(),
      WindowingStrategy.globalDefault(),
      input.isBounded(),
      KeyedWorkItemCoder.of(
          GroupByKey.getKeyCoder(input.getCoder()),
          GroupByKey.getInputValueCoder(input.getCoder()),
          input.getWindowingStrategy().getWindowFn().windowCoder()));
}
Example #21
Source File: ViewOverrideFactory.java From beam with Apache License 2.0

@Override
public PCollection<ElemT> expand(final PCollection<ElemT> input) {
  input
      .apply(WithKeys.of((Void) null))
      .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()))
      .apply(GroupByKey.create())
      .apply(Values.create())
      .apply(new WriteView<>(view));
  return input;
}
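The WithKeys((Void) null) → GroupByKey → Values sequence here appears to be a materialization idiom: every element gets the same null key, so grouping gathers the whole input (per window) into a single iterable that WriteView can persist as the view's contents.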
Example #22
Source File: DataflowPTransformMatchersTest.java From beam with Apache License 2.0

/** Creates a simple pipeline with a {@link Combine.GroupedValues}. */
private static TestPipeline createCombineGroupedValuesPipeline() {
  TestPipeline pipeline = TestPipeline.create().enableAbandonedNodeEnforcement(false);
  PCollection<KV<String, Integer>> input =
      pipeline
          .apply(Create.of(KV.of("key", 1)))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));
  input.apply(GroupByKey.create()).apply(Combine.groupedValues(new SumCombineFn()));

  return pipeline;
}
Example #23
Source File: ViewEvaluatorFactoryTest.java From beam with Apache License 2.0

@Test
public void testInMemoryEvaluator() throws Exception {
  PCollection<String> input = p.apply(Create.of("foo", "bar"));
  PCollectionView<Iterable<String>> pCollectionView = input.apply(View.asIterable());
  PCollection<Iterable<String>> concat =
      input
          .apply(WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
          .apply(GroupByKey.create())
          .apply(Values.create());
  PCollection<Iterable<String>> view =
      concat.apply(new ViewOverrideFactory.WriteView<>(pCollectionView));

  EvaluationContext context = mock(EvaluationContext.class);
  TestViewWriter<String, Iterable<String>> viewWriter = new TestViewWriter<>();
  when(context.createPCollectionViewWriter(concat, pCollectionView)).thenReturn(viewWriter);

  CommittedBundle<String> inputBundle = bundleFactory.createBundle(input).commit(Instant.now());
  AppliedPTransform<?, ?, ?> producer = DirectGraphs.getProducer(view);
  TransformEvaluator<Iterable<String>> evaluator =
      new ViewEvaluatorFactory(context).forApplication(producer, inputBundle);

  evaluator.processElement(WindowedValue.valueInGlobalWindow(ImmutableList.of("foo", "bar")));
  assertThat(viewWriter.latest, nullValue());

  evaluator.finishBundle();
  assertThat(
      viewWriter.latest,
      containsInAnyOrder(
          WindowedValue.valueInGlobalWindow("foo"), WindowedValue.valueInGlobalWindow("bar")));
}
Example #24
Source File: KafkaExactlyOnceSink.java From beam with Apache License 2.0

@Override
public PCollection<Void> expand(PCollection<ProducerRecord<K, V>> input) {
  int numShards = spec.getNumShards();
  if (numShards <= 0) {
    try (Consumer<?, ?> consumer = openConsumer(spec)) {
      numShards = consumer.partitionsFor(spec.getTopic()).size();
      LOG.info(
          "Using {} shards for exactly-once writer, matching number of partitions "
              + "for topic '{}'",
          numShards,
          spec.getTopic());
    }
  }
  checkState(numShards > 0, "Could not set number of shards");

  return input
      .apply(
          Window.<ProducerRecord<K, V>>into(new GlobalWindows()) // Everything into global window.
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes())
      .apply(
          String.format("Shuffle across %d shards", numShards),
          ParDo.of(new Reshard<>(numShards)))
      .apply("Persist sharding", GroupByKey.create())
      .apply("Assign sequential ids", ParDo.of(new Sequencer<>()))
      .apply("Persist ids", GroupByKey.create())
      .apply(
          String.format("Write to Kafka topic '%s'", spec.getTopic()),
          ParDo.of(new ExactlyOnceWriter<>(spec, input.getCoder())));
}
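Judging by the step names, the two GroupByKey applications here serve as persistence barriers rather than logical groupings: "Persist sharding" stabilizes the shard assignment and "Persist ids" stabilizes the sequence numbers, so that retries of the final write stage see the same inputs and the sink can remain exactly-once.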
Example #25
Source File: StreamingTransformTranslator.java From beam with Apache License 2.0

private static <K, V, W extends BoundedWindow>
    TransformEvaluator<GroupByKey<K, V>> groupByKey() {
  return new TransformEvaluator<GroupByKey<K, V>>() {
    @Override
    public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      UnboundedDataset<KV<K, V>> inputDataset =
          (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
      List<Integer> streamSources = inputDataset.getStreamSources();
      JavaDStream<WindowedValue<KV<K, V>>> dStream = inputDataset.getDStream();
      final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> windowingStrategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

      // --- coders.
      final WindowedValue.WindowedValueCoder<V> wvCoder =
          WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());

      JavaDStream<WindowedValue<KV<K, Iterable<V>>>> outStream =
          SparkGroupAlsoByWindowViaWindowSet.groupByKeyAndWindow(
              dStream,
              coder.getKeyCoder(),
              wvCoder,
              windowingStrategy,
              context.getSerializableOptions(),
              streamSources,
              context.getCurrentTransform().getFullName());

      context.putDataset(transform, new UnboundedDataset<>(outStream, streamSources));
    }

    @Override
    public String toNativeString() {
      return "groupByKey()";
    }
  };
}
Example #26
Source File: GroupByKeyTranslationTest.java From beam with Apache License 2.0

/**
 * Tests that the translator is registered so the URN can be retrieved (the only thing you can
 * meaningfully do with a {@link GroupByKey}).
 */
@Test
public void testUrnRetrievable() throws Exception {
  assertThat(
      PTransformTranslation.urnForTransform(GroupByKey.create()),
      equalTo(GROUP_BY_KEY_TRANSFORM_URN));
}
Example #27
Source File: PipelineTranslationTest.java From beam with Apache License 2.0

@Parameters(name = "{index}")
public static Iterable<Pipeline> testPipelines() {
  Pipeline trivialPipeline = Pipeline.create();
  trivialPipeline.apply(Create.of(1, 2, 3));

  Pipeline sideInputPipeline = Pipeline.create();
  final PCollectionView<String> singletonView =
      sideInputPipeline.apply(Create.of("foo")).apply(View.asSingleton());
  sideInputPipeline
      .apply(Create.of("main input"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                      // actually never executed and no effect on translation
                      c.sideInput(singletonView);
                    }
                  })
              .withSideInputs(singletonView));

  Pipeline complexPipeline = Pipeline.create();
  BigEndianLongCoder customCoder = BigEndianLongCoder.of();
  PCollection<Long> elems = complexPipeline.apply(GenerateSequence.from(0L).to(207L));
  PCollection<Long> counted = elems.apply(Count.globally()).setCoder(customCoder);
  PCollection<Long> windowed =
      counted.apply(
          Window.<Long>into(FixedWindows.of(Duration.standardMinutes(7)))
              .triggering(
                  AfterWatermark.pastEndOfWindow()
                      .withLateFirings(AfterPane.elementCountAtLeast(19)))
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.standardMinutes(3L)));
  final WindowingStrategy<?, ?> windowedStrategy = windowed.getWindowingStrategy();
  PCollection<KV<String, Long>> keyed = windowed.apply(WithKeys.of("foo"));
  PCollection<KV<String, Iterable<Long>>> grouped = keyed.apply(GroupByKey.create());

  return ImmutableList.of(trivialPipeline, sideInputPipeline, complexPipeline);
}
Example #28
Source File: GroupByKeyTranslatorBatch.java From beam with Apache License 2.0

@Override
public void translateNode(GroupByKey<K, V> transform, Twister2BatchTranslationContext context) {
  PCollection<KV<K, V>> input = context.getInput(transform);
  BatchTSetImpl<WindowedValue<KV<K, V>>> inputTTSet = context.getInputDataSet(input);

  final KvCoder<K, V> coder = (KvCoder<K, V>) input.getCoder();
  Coder<K> inputKeyCoder = coder.getKeyCoder();
  WindowingStrategy windowingStrategy = input.getWindowingStrategy();
  WindowFn<KV<K, V>, BoundedWindow> windowFn =
      (WindowFn<KV<K, V>, BoundedWindow>) windowingStrategy.getWindowFn();
  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());

  KeyedTSet<byte[], byte[]> keyedTSet =
      inputTTSet.mapToTuple(new MapToTupleFunction<K, V>(inputKeyCoder, wvCoder));

  // todo add support for a partition function to be specified, this would use
  // todo keyedPartition function instead of KeyedGather
  ComputeTSet<KV<K, Iterable<WindowedValue<V>>>, Iterator<Tuple<byte[], Iterator<byte[]>>>>
      groupedbyKeyTset =
          keyedTSet.keyedGather().map(new ByteToWindowFunction(inputKeyCoder, wvCoder));

  // --- now group also by window.
  SystemReduceFnBuffering reduceFnBuffering =
      new SystemReduceFnBuffering(coder.getValueCoder());
  ComputeTSet<WindowedValue<KV<K, Iterable<V>>>, Iterable<KV<K, Iterator<WindowedValue<V>>>>>
      outputTset =
          groupedbyKeyTset
              .direct()
              .<WindowedValue<KV<K, Iterable<V>>>>flatmap(
                  new GroupByWindowFunction(
                      windowingStrategy, reduceFnBuffering, context.getOptions()));

  PCollection output = context.getOutput(transform);
  context.setOutputDataSet(output, outputTset);
}
Example #29
Source File: BreakFusionTransform.java From dataflow-java with Apache License 2.0

@Override
public PCollection<T> expand(PCollection<T> input) {
  return input
      .apply("Break fusion mapper", ParDo.of(new DummyMapFn<T>()))
      .apply(GroupByKey.<T, Integer>create())
      .apply(Keys.<T>create());
}
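This transform breaks fusion by forcing a shuffle: DummyMapFn pairs each element with a synthetic integer value, GroupByKey shuffles using the element itself as the key, and Keys drops the dummy values. Note that grouping emits one output per distinct key, so the idiom assumes the input elements are unique (or that deduplication per window is acceptable).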