org.apache.beam.sdk.transforms.join.CoGroupByKey Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.join.CoGroupByKey.
The original project, source file, and license are noted above each example.
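Before looking at the individual examples, it may help to see the basic shape they all share: assign a TupleTag to each keyed input, bundle the inputs into a KeyedPCollectionTuple, apply CoGroupByKey.create(), and then unpack the per-tag iterables from the resulting CoGbkResult in a downstream ParDo. The following is a minimal sketch of that pattern only; the method name joinByKey, the emails and phones inputs, and the output formatting are hypothetical illustrations, not taken from any example on this page.

// Minimal sketch of the common CoGroupByKey pattern.
// Assumes two keyed inputs of type PCollection<KV<String, String>> (hypothetical names).
static PCollection<String> joinByKey(
    PCollection<KV<String, String>> emails, PCollection<KV<String, String>> phones) {
  final TupleTag<String> emailsTag = new TupleTag<>();
  final TupleTag<String> phonesTag = new TupleTag<>();

  // Bundle both keyed collections under their tags and co-group them by key.
  PCollection<KV<String, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(emailsTag, emails)
          .and(phonesTag, phones)
          .apply(CoGroupByKey.create());

  // For each key, read the per-tag iterables back out of the CoGbkResult.
  return grouped.apply(
      ParDo.of(
          new DoFn<KV<String, CoGbkResult>, String>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
              String key = c.element().getKey();
              Iterable<String> emailValues = c.element().getValue().getAll(emailsTag);
              Iterable<String> phoneValues = c.element().getValue().getAll(phonesTag);
              // Relies on Iterable.toString() for brevity; real code would format the values.
              c.output(key + ": emails=" + emailValues + ", phones=" + phoneValues);
            }
          }));
}

The examples that follow are variations on this shape, differing mainly in how the CoGbkResult is consumed (joins, comparisons, model updates, and so on).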
Example #1
Source File: MultinomialLogisticRegression.java From incubator-nemo with Apache License 2.0
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // Model as a view.
  final PCollectionView<Map<Integer, List<Double>>> modelView = model.apply(View.asMap());

  // Find gradient.
  final PCollection<KV<Integer, List<Double>>> gradient = readInput
      .apply(ParDo.of(
          new CalculateGradient(modelView, numClasses, numFeatures)).withSideInputs(modelView))
      .apply(Combine.perKey(new CombineFunction()));

  // Tags for CoGroupByKey.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();
  final KeyedPCollectionTuple<Integer> coGbkInput = KeyedPCollectionTuple
      .of(gradientTag, gradient)
      .and(modelTag, model);

  final PCollection<KV<Integer, CoGbkResult>> groupResult =
      coGbkInput.apply(CoGroupByKey.create());

  // Update the model
  return groupResult
      .apply(ParDo.of(new ApplyGradient(numFeatures, numClasses, iterationNum,
          gradientTag, modelTag)));
}
Example #2
Source File: MusicBrainzTransforms.java From bigquery-etl-dataflow-sample with Apache License 2.0
private static PCollection<KV<Long, CoGbkResult>> group(String name,
    PCollection<KV<Long, MusicBrainzDataObject>> first,
    PCollection<KV<Long, MusicBrainzDataObject>> second,
    TupleTag<MusicBrainzDataObject> firstTag,
    TupleTag<MusicBrainzDataObject> secondTag) {
  final CoGroupByKey<Long> grouper = CoGroupByKey.create();
  PCollection<KV<Long, CoGbkResult>> joinedResult;
  try {
    joinedResult = KeyedPCollectionTuple
        .of(firstTag, first)
        .and(secondTag, second)
        .apply("joinResult_" + name, CoGroupByKey.<Long>create());
  } catch (Exception e) {
    logger.error("exception grouping.", e);
    return null;
  }
  return joinedResult;
}
Example #3
Source File: MultinomialLogisticRegression.java From nemo with Apache License 2.0
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // Model as a view.
  final PCollectionView<Map<Integer, List<Double>>> modelView = model.apply(View.asMap());

  // Find gradient.
  final PCollection<KV<Integer, List<Double>>> gradient = readInput
      .apply(ParDo.of(
          new CalculateGradient(modelView, numClasses, numFeatures)).withSideInputs(modelView))
      .apply(Combine.perKey(new CombineFunction()));

  // Tags for CoGroupByKey.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();
  final KeyedPCollectionTuple<Integer> coGbkInput = KeyedPCollectionTuple
      .of(gradientTag, gradient)
      .and(modelTag, model);

  final PCollection<KV<Integer, CoGbkResult>> groupResult =
      coGbkInput.apply(CoGroupByKey.create());

  // Update the model
  return groupResult
      .apply(ParDo.of(new ApplyGradient(numFeatures, numClasses, iterationNum,
          gradientTag, modelTag)));
}
Example #4
Source File: JoinTranslator.java From beam with Apache License 2.0
@Override
PCollection<KV<KeyT, OutputT>> translate(
    Join<LeftT, RightT, KeyT, OutputT> operator,
    PCollection<LeftT> left,
    PCollection<KV<KeyT, LeftT>> leftKeyed,
    PCollection<RightT> right,
    PCollection<KV<KeyT, RightT>> rightKeyed) {
  final AccumulatorProvider accumulators =
      new LazyAccumulatorProvider(AccumulatorProvider.of(leftKeyed.getPipeline()));
  final TupleTag<LeftT> leftTag = new TupleTag<>();
  final TupleTag<RightT> rightTag = new TupleTag<>();
  final JoinFn<LeftT, RightT, KeyT, OutputT> joinFn =
      getJoinFn(operator, leftTag, rightTag, accumulators);
  return KeyedPCollectionTuple.of(leftTag, leftKeyed)
      .and(rightTag, rightKeyed)
      .apply("co-group-by-key", CoGroupByKey.create())
      .apply(joinFn.getFnName(), ParDo.of(joinFn));
}
Example #5
Source File: VerifyBamId.java From dataflow-java with Apache License 2.0
/**
 * Filter, pile up, and sample reads, then join against reference statistics.
 *
 * @param reads A PCollection of reads
 * @param samplingFraction Fraction of reads to keep
 * @param samplingPrefix A prefix used in generating hashes used in sampling
 * @param refFreq A PCollection mapping position to counts of alleles in
 *     a reference population.
 * @return A PCollection mapping Position to a ReadCounts proto
 */
static PCollection<KV<Position, ReadCounts>> combineReads(PCollection<Read> reads,
    double samplingFraction, String samplingPrefix,
    PCollection<KV<Position, AlleleFreq>> refFreq) {
  // Runs filters on input Reads, splits into individual aligned bases (emitting the
  // base and quality) and grabs a sample of them based on a hash mod of Position.
  PCollection<KV<Position, ReadBaseQuality>> joinReadCounts =
      reads.apply("IsOnChromosome", Filter.by(ReadFunctions.IS_ON_CHROMOSOME))
          .apply("IsNotQCFailure", Filter.by(ReadFunctions.IS_NOT_QC_FAILURE))
          .apply("IsNotDuplicate", Filter.by(ReadFunctions.IS_NOT_DUPLICATE))
          .apply("IsProperPlacement", Filter.by(ReadFunctions.IS_PROPER_PLACEMENT))
          .apply(ParDo.of(new SplitReads()))
          .apply(Filter.by(new SampleReads(samplingFraction, samplingPrefix)));

  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  // Pile up read counts, then join against reference stats.
  PCollection<KV<Position, CoGbkResult>> joined = KeyedPCollectionTuple
      .of(readCountsTag, joinReadCounts)
      .and(refFreqTag, refFreq)
      .apply(CoGroupByKey.<Position>create());

  return joined.apply(ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));
}
Example #6
Source File: CoGroup.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionTuple input) {
  verify(input, joinArgs);
  JoinInformation joinInformation =
      JoinInformation.from(
          input, joinArgs::getFieldAccessDescriptor, joinArgs::getSideInputSource);
  Collection<PCollectionView<Map<Row, Iterable<Row>>>> views =
      joinInformation.sideInputs.values();
  Schema outputSchema = Result.getUnexandedOutputSchema(keyFieldName, joinInformation);
  return joinInformation
      .keyedPCollectionTuple
      .apply("CoGroupByKey", CoGroupByKey.create())
      .apply(
          ParDo.of(
                  new ConvertCoGbkResult(
                      joinInformation, joinArgs, ConvertType.UNEXPANDED, outputSchema))
              .withSideInputs(views))
      .setRowSchema(outputSchema);
}
Example #7
Source File: TestExpansionService.java From beam with Apache License 2.0
@Override
public PCollection<KV<Long, Iterable<String>>> expand(KeyedPCollectionTuple<Long> input) {
  Set<String> tagSet = ImmutableSet.of("col1", "col2");
  return input
      .apply(CoGroupByKey.create())
      .apply(
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, KV<Long, Iterable<String>>>() {
                @ProcessElement
                public void processElement(
                    @Element KV<Long, CoGbkResult> kv,
                    OutputReceiver<KV<Long, Iterable<String>>> out) {
                  Iterable<String> iter =
                      () ->
                          tagSet.stream()
                              .flatMap(
                                  (String t) ->
                                      StreamSupport.stream(
                                          kv.getValue().<String>getAll(t).spliterator(), false))
                              .iterator();
                  out.output(KV.of(kv.getKey(), iter));
                }
              }));
}
Example #8
Source File: CompareDatabases.java From DataflowTemplates with Apache License 2.0
@Override
public PCollection<Long> expand(PBegin begin) {
  final TupleTag<Struct> oneTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsOne = begin.apply("Read one", new ReadAllRows(one));
  final TupleTag<Struct> twoTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsTwo = begin.apply("Read two", new ReadAllRows(two));

  PCollection<KV<String, CoGbkResult>> cogroup =
      KeyedPCollectionTuple.of(oneTag, rowsOne).and(twoTag, rowsTwo).apply(CoGroupByKey.create());

  PCollection<String> fails =
      cogroup.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> element = c.element();
                  CoGbkResult gbk = element.getValue();
                  ArrayList<Struct> oneRows = Lists.newArrayList(gbk.getAll(oneTag));
                  ArrayList<Struct> twoRows = Lists.newArrayList(gbk.getAll(twoTag));

                  if (oneRows.size() != 1 || twoRows.size() != 1) {
                    c.output(element.getKey());
                    return;
                  }

                  Struct sOne = oneRows.get(0);
                  Struct sTwo = twoRows.get(0);
                  if (!sOne.equals(sTwo)) {
                    c.output(element.getKey());
                  }
                }
              }));

  return fails.apply(Count.globally());
}
Example #9
Source File: VerifyBamIdTest.java From dataflow-java with Apache License 2.0
@Test
public void testPileupAndJoinReadsWithChrPrefix() throws Exception {
  ReadBaseQuality srq = new ReadBaseQuality("A", 10);
  PCollection<KV<Position, ReadBaseQuality>> readCounts = p.apply(
      "createInput", Create.of(KV.of(position1chrPrefix, srq)));
  PAssert.that(readCounts).containsInAnyOrder(KV.of(position1chrPrefix, srq));

  PCollection<KV<Position, AlleleFreq>> refFreq = p.apply(Create.of(refCountList));
  PAssert.that(refFreq).containsInAnyOrder(refCountList);

  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> joined = KeyedPCollectionTuple
      .of(readCountsTag, readCounts)
      .and(refFreqTag, refFreq)
      .apply(CoGroupByKey.<Position>create());

  PCollection<KV<Position, ReadCounts>> result = joined.apply(
      ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));

  KV<Position, ReadCounts> expectedResult1 = KV.of(position1, rc1);
  KV<Position, ReadCounts> expectedResult2 = KV.of(position2, rc2);
  KV<Position, ReadCounts> expectedResult3 = KV.of(position3, rc3);
  PAssert.that(result).containsInAnyOrder(expectedResult1, expectedResult2, expectedResult3);

  p.run();
}
Example #10
Source File: VerifyBamIdTest.java From dataflow-java with Apache License 2.0
@Test
public void testPileupAndJoinReads() throws Exception {
  final ReadBaseQuality srq = new ReadBaseQuality("A", 10);
  PCollection<KV<Position, ReadBaseQuality>> readCounts = p.apply(
      "createInput", Create.of(KV.of(position1, srq)));
  PAssert.that(readCounts).containsInAnyOrder(KV.of(position1, srq));

  PCollection<KV<Position, AlleleFreq>> refFreq = p.apply(Create.of(refCountList));
  PAssert.that(refFreq).containsInAnyOrder(refCountList);

  final TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> joined = KeyedPCollectionTuple
      .of(readCountsTag, readCounts)
      .and(refFreqTag, refFreq)
      .apply(CoGroupByKey.<Position>create());

  PCollection<KV<Position, ReadCounts>> result = joined.apply(
      ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));

  KV<Position, ReadCounts> expectedResult1 = KV.of(position1, rc1);
  KV<Position, ReadCounts> expectedResult2 = KV.of(position2, rc2);
  KV<Position, ReadCounts> expectedResult3 = KV.of(position3, rc3);
  PAssert.that(result).containsInAnyOrder(expectedResult1, expectedResult2, expectedResult3);

  p.run();
}
Example #11
Source File: CoGroup.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionTuple input) {
  verify(input, joinArgs);
  JoinInformation joinInformation =
      JoinInformation.from(
          input, joinArgs::getFieldAccessDescriptor, joinArgs::getSideInputSource);
  Result.verifyExpandedArgs(joinInformation, joinArgs);
  Schema outputSchema = Result.getExpandedOutputSchema(joinInformation, joinArgs);
  Collection<PCollectionView<Map<Row, Iterable<Row>>>> views =
      joinInformation.sideInputs.values();
  PCollection<Row> expanded;
  if (joinInformation.keyedPCollectionTuple.getKeyedCollections().size() > 1) {
    expanded =
        joinInformation
            .keyedPCollectionTuple
            .apply("CoGroupByKey", CoGroupByKey.create())
            .apply(
                ParDo.of(
                        new ConvertCoGbkResult(
                            joinInformation, joinArgs, ConvertType.EXPANDED, outputSchema))
                    .withSideInputs(views));
  } else {
    TaggedKeyedPCollection<Row, Row> tpc =
        (TaggedKeyedPCollection<Row, Row>)
            Iterables.getOnlyElement(
                joinInformation.keyedPCollectionTuple.getKeyedCollections());
    expanded =
        tpc.getCollection()
            .apply(
                ParDo.of(new ExpandRowResult(joinInformation, joinArgs, outputSchema))
                    .withSideInputs(views));
  }
  return expanded.setRowSchema(outputSchema);
}
Example #12
Source File: CoGroupByKeyLoadTest.java From beam with Apache License 2.0
@Override
void loadTest() throws IOException {
  SyntheticSourceOptions coSourceOptions =
      fromJsonString(options.getCoSourceOptions(), SyntheticSourceOptions.class);

  Optional<SyntheticStep> syntheticStep = createStep(options.getStepOptions());

  PCollection<KV<byte[], byte[]>> input =
      pipeline.apply("Read input", readFromSource(sourceOptions));
  input = input.apply("Collect start time metrics (input)", ParDo.of(runtimeMonitor));
  input = applyWindowing(input);
  input = applyStepIfPresent(input, "Synthetic step for input", syntheticStep);

  PCollection<KV<byte[], byte[]>> coInput =
      pipeline.apply("Read co-input", readFromSource(coSourceOptions));
  coInput = coInput.apply("Collect start time metrics (co-input)", ParDo.of(runtimeMonitor));
  coInput = applyWindowing(coInput, options.getCoInputWindowDurationSec());
  coInput = applyStepIfPresent(coInput, "Synthetic step for co-input", syntheticStep);

  KeyedPCollectionTuple.of(INPUT_TAG, input)
      .and(CO_INPUT_TAG, coInput)
      .apply("CoGroupByKey", CoGroupByKey.create())
      .apply("Ungroup and reiterate", ParDo.of(new UngroupAndReiterate(options.getIterations())))
      .apply(
          "Collect total bytes", ParDo.of(new ByteMonitor(METRICS_NAMESPACE, "totalBytes.count")))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
Example #13
Source File: Join.java From beam with Apache License 2.0
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  for (V1 leftValue : leftValuesIterable) {
                    for (V2 rightValue : rightValuesIterable) {
                      c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}
Example #14
Source File: Task.java From beam with Apache License 2.0
static PCollection<String> applyTransform(
    PCollection<String> fruits, PCollection<String> countries) {
  TupleTag<String> fruitsTag = new TupleTag<>();
  TupleTag<String> countriesTag = new TupleTag<>();

  MapElements<String, KV<String, String>> mapToAlphabetKv =
      MapElements.into(kvs(strings(), strings()))
          .via(word -> KV.of(word.substring(0, 1), word));

  PCollection<KV<String, String>> fruitsPColl = fruits.apply("Fruit to KV", mapToAlphabetKv);
  PCollection<KV<String, String>> countriesPColl =
      countries.apply("Country to KV", mapToAlphabetKv);

  return KeyedPCollectionTuple
      .of(fruitsTag, fruitsPColl)
      .and(countriesTag, countriesPColl)
      .apply(CoGroupByKey.create())
      .apply(ParDo.of(new DoFn<KV<String, CoGbkResult>, String>() {

        @ProcessElement
        public void processElement(
            @Element KV<String, CoGbkResult> element, OutputReceiver<String> out) {
          String alphabet = element.getKey();
          CoGbkResult coGbkResult = element.getValue();

          String fruit = coGbkResult.getOnly(fruitsTag);
          String country = coGbkResult.getOnly(countriesTag);

          out.output(new WordsAlphabet(alphabet, fruit, country).toString());
        }
      }));
}
Example #15
Source File: Snippets.java From beam with Apache License 2.0
/** Using a CoGroupByKey transform. */
public static PCollection<String> coGroupByKeyTuple(
    TupleTag<String> emailsTag,
    TupleTag<String> phonesTag,
    PCollection<KV<String, String>> emails,
    PCollection<KV<String, String>> phones) {

  // [START CoGroupByKeyTuple]
  PCollection<KV<String, CoGbkResult>> results =
      KeyedPCollectionTuple.of(emailsTag, emails)
          .and(phonesTag, phones)
          .apply(CoGroupByKey.create());

  PCollection<String> contactLines =
      results.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> e = c.element();
                  String name = e.getKey();
                  Iterable<String> emailsIter = e.getValue().getAll(emailsTag);
                  Iterable<String> phonesIter = e.getValue().getAll(phonesTag);
                  String formattedResult =
                      Snippets.formatCoGbkResults(name, emailsIter, phonesIter);
                  c.output(formattedResult);
                }
              }));
  // [END CoGroupByKeyTuple]
  return contactLines;
}
Example #16
Source File: WinningBids.java From beam with Apache License 2.0
@Override
public PCollection<AuctionBid> expand(PCollection<Event> events) {
  // Window auctions and bids into custom auction windows. New people events will be discarded.
  // This will allow us to bring bids and auctions together irrespective of how long
  // each auction is open for.
  events = events.apply("Window", Window.into(auctionOrBidWindowFn));

  // Key auctions by their id.
  PCollection<KV<Long, Auction>> auctionsById =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply("AuctionById:", NexmarkQueryUtil.AUCTION_BY_ID);

  // Key bids by their auction id.
  PCollection<KV<Long, Bid>> bidsByAuctionId =
      events
          .apply(NexmarkQueryUtil.JUST_BIDS)
          .apply("BidByAuction", NexmarkQueryUtil.BID_BY_AUCTION);

  // Find the highest price valid bid for each closed auction.
  return
  // Join auctions and bids.
  KeyedPCollectionTuple.of(NexmarkQueryUtil.AUCTION_TAG, auctionsById)
      .and(NexmarkQueryUtil.BID_TAG, bidsByAuctionId)
      .apply(CoGroupByKey.create())
      // Filter and select.
      .apply(
          name + ".Join",
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, AuctionBid>() {
                private final Counter noAuctionCounter = Metrics.counter(name, "noAuction");
                private final Counter underReserveCounter =
                    Metrics.counter(name, "underReserve");
                private final Counter noValidBidsCounter = Metrics.counter(name, "noValidBids");

                @ProcessElement
                public void processElement(ProcessContext c) {
                  @Nullable
                  Auction auction =
                      c.element().getValue().getOnly(NexmarkQueryUtil.AUCTION_TAG, null);
                  if (auction == null) {
                    // We have bids without a matching auction. Give up.
                    noAuctionCounter.inc();
                    return;
                  }
                  // Find the current winning bid for auction.
                  // The earliest bid with the maximum price above the reserve wins.
                  Bid bestBid = null;
                  for (Bid bid : c.element().getValue().getAll(NexmarkQueryUtil.BID_TAG)) {
                    // Bids too late for their auction will have been
                    // filtered out by the window merge function.
                    checkState(bid.dateTime.compareTo(auction.expires) < 0);
                    if (bid.price < auction.reserve) {
                      // Bid price is below auction reserve.
                      underReserveCounter.inc();
                      continue;
                    }
                    if (bestBid == null
                        || Bid.PRICE_THEN_DESCENDING_TIME.compare(bid, bestBid) > 0) {
                      bestBid = bid;
                    }
                  }
                  if (bestBid == null) {
                    // We don't have any valid bids for auction.
                    noValidBidsCounter.inc();
                    return;
                  }
                  c.output(new AuctionBid(auction, bestBid));
                }
              }));
}
Example #17
Source File: Query8.java From beam with Apache License 2.0
@Override
public PCollection<IdNameReserve> expand(PCollection<Event> events) {
  // Window and key new people by their id.
  PCollection<KV<Long, Person>> personsById =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_PERSONS)
          .apply(
              "Query8.WindowPersons",
              Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec))))
          .apply("PersonById", NexmarkQueryUtil.PERSON_BY_ID);

  // Window and key new auctions by their id.
  PCollection<KV<Long, Auction>> auctionsBySeller =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply(
              "Query8.WindowAuctions",
              Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec))))
          .apply("AuctionBySeller", NexmarkQueryUtil.AUCTION_BY_SELLER);

  // Join people and auctions and project the person id, name and auction reserve price.
  return KeyedPCollectionTuple.of(NexmarkQueryUtil.PERSON_TAG, personsById)
      .and(NexmarkQueryUtil.AUCTION_TAG, auctionsBySeller)
      .apply(CoGroupByKey.create())
      .apply(
          name + ".Select",
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, IdNameReserve>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  @Nullable
                  Person person =
                      c.element().getValue().getOnly(NexmarkQueryUtil.PERSON_TAG, null);
                  if (person == null) {
                    // Person was not created in last window period.
                    return;
                  }
                  for (Auction auction :
                      c.element().getValue().getAll(NexmarkQueryUtil.AUCTION_TAG)) {
                    c.output(new IdNameReserve(person.id, person.name, auction.reserve));
                  }
                }
              }));
}
Example #18
Source File: Join.java From beam with Apache License 2.0
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  for (V2 rightValue : rightValuesIterable) {
                    if (leftValuesIterable.iterator().hasNext()) {
                      for (V1 leftValue : leftValuesIterable) {
                        c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                      }
                    } else {
                      c.output(KV.of(e.getKey(), KV.of(nullValue, rightValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}
Example #19
Source File: Join.java From beam with Apache License 2.0
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  for (V1 leftValue : leftValuesIterable) {
                    if (rightValuesIterable.iterator().hasNext()) {
                      for (V2 rightValue : rightValuesIterable) {
                        c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                      }
                    } else {
                      c.output(KV.of(e.getKey(), KV.of(leftValue, nullValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}