org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MusicBrainzTransforms.java From bigquery-etl-dataflow-sample with Apache License 2.0 | 6 votes |
/**
 * Co-groups two keyed collections by their {@code Long} key.
 *
 * @param name suffix appended to {@code "joinResult_"} to form the transform name
 * @param first the first keyed collection
 * @param second the second keyed collection
 * @param firstTag tag paired with {@code first} in the keyed tuple
 * @param secondTag tag paired with {@code second} in the keyed tuple
 * @return the co-grouped collection, or {@code null} if building the join threw an exception
 */
private static PCollection<KV<Long, CoGbkResult>> group(
    String name,
    PCollection<KV<Long, MusicBrainzDataObject>> first,
    PCollection<KV<Long, MusicBrainzDataObject>> second,
    TupleTag<MusicBrainzDataObject> firstTag,
    TupleTag<MusicBrainzDataObject> secondTag) {
  try {
    return KeyedPCollectionTuple.of(firstTag, first)
        .and(secondTag, second)
        .apply("joinResult_" + name, CoGroupByKey.<Long>create());
  } catch (Exception e) {
    // Existing contract: failures are logged and reported to the caller as null.
    logger.error("exception grouping.", e);
    return null;
  }
}
Example #2
Source File: MultinomialLogisticRegression.java From nemo with Apache License 2.0 | 6 votes |
@Override public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) { // Model as a view. final PCollectionView<Map<Integer, List<Double>>> modelView = model.apply(View.asMap()); // Find gradient. final PCollection<KV<Integer, List<Double>>> gradient = readInput .apply(ParDo.of( new CalculateGradient(modelView, numClasses, numFeatures)).withSideInputs(modelView)) .apply(Combine.perKey(new CombineFunction())); // Tags for CoGroupByKey. final TupleTag<List<Double>> gradientTag = new TupleTag<>(); final TupleTag<List<Double>> modelTag = new TupleTag<>(); final KeyedPCollectionTuple<Integer> coGbkInput = KeyedPCollectionTuple .of(gradientTag, gradient) .and(modelTag, model); final PCollection<KV<Integer, CoGbkResult>> groupResult = coGbkInput.apply(CoGroupByKey.create()); // Update the model return groupResult .apply(ParDo.of(new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag))); }
Example #3
Source File: VerifyBamId.java From dataflow-java with Apache License 2.0 | 6 votes |
/** * Filter, pile up, and sample reads, then join against reference statistics. * * @param reads A PCollection of reads * @param samplingFraction Fraction of reads to keep * @param samplingPrefix A prefix used in generating hashes used in sampling * @param refCounts A PCollection mapping position to counts of alleles in * a reference population. * @return A PCollection mapping Position to a ReadCounts proto */ static PCollection<KV<Position, ReadCounts>> combineReads(PCollection<Read> reads, double samplingFraction, String samplingPrefix, PCollection<KV<Position, AlleleFreq>> refFreq) { // Runs filters on input Reads, splits into individual aligned bases (emitting the // base and quality) and grabs a sample of them based on a hash mod of Position. PCollection<KV<Position, ReadBaseQuality>> joinReadCounts = reads.apply("IsOnChromosome", Filter.by(ReadFunctions.IS_ON_CHROMOSOME)) .apply("IsNotQCFailure", Filter.by(ReadFunctions.IS_NOT_QC_FAILURE)) .apply("IsNotDuplicate", Filter.by(ReadFunctions.IS_NOT_DUPLICATE)) .apply("IsProperPlacement", Filter.by(ReadFunctions.IS_PROPER_PLACEMENT)) .apply(ParDo.of(new SplitReads())) .apply(Filter.by(new SampleReads(samplingFraction, samplingPrefix))); TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>(); TupleTag<AlleleFreq> refFreqTag = new TupleTag<>(); // Pile up read counts, then join against reference stats. PCollection<KV<Position, CoGbkResult>> joined = KeyedPCollectionTuple .of(readCountsTag, joinReadCounts) .and(refFreqTag, refFreq) .apply(CoGroupByKey.<Position>create()); return joined.apply(ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag))); }
Example #4
Source File: JoinTranslator.java From beam with Apache License 2.0 | 6 votes |
/**
 * Co-groups the two keyed inputs and runs the join function obtained from {@code getJoinFn}
 * over the grouped result.
 *
 * <p>Only the keyed inputs are read here; {@code left} and {@code reight} are not used.
 */
@Override
PCollection<KV<KeyT, OutputT>> translate(
    Join<LeftT, RightT, KeyT, OutputT> operator,
    PCollection<LeftT> left,
    PCollection<KV<KeyT, LeftT>> leftKeyed,
    PCollection<RightT> reight,
    PCollection<KV<KeyT, RightT>> rightKeyed) {
  final AccumulatorProvider accumulatorProvider =
      new LazyAccumulatorProvider(AccumulatorProvider.of(leftKeyed.getPipeline()));

  final TupleTag<LeftT> leftTag = new TupleTag<>();
  final TupleTag<RightT> rightTag = new TupleTag<>();
  final JoinFn<LeftT, RightT, KeyT, OutputT> fn =
      getJoinFn(operator, leftTag, rightTag, accumulatorProvider);

  final KeyedPCollectionTuple<KeyT> joinInputs =
      KeyedPCollectionTuple.of(leftTag, leftKeyed).and(rightTag, rightKeyed);

  return joinInputs
      .apply("co-group-by-key", CoGroupByKey.create())
      .apply(fn.getFnName(), ParDo.of(fn));
}
Example #5
Source File: ValidateRunnerXlangTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Applies the external transform identified by {@code TEST_CGBK_URN} to two keyed inputs and
 * asserts on the per-key values, rendered as sorted, comma-joined strings.
 */
@Test
@Category({ValidatesRunner.class, UsesCrossLanguageTransforms.class})
public void coGroupByKeyTest() {
  PCollection<KV<Long, String>> firstInput =
      testPipeline.apply(
          "createCol1", Create.of(KV.of(0L, "1"), KV.of(0L, "2"), KV.of(1L, "3")));
  PCollection<KV<Long, String>> secondInput =
      testPipeline.apply(
          "createCol2", Create.of(KV.of(0L, "4"), KV.of(1L, "5"), KV.of(1L, "6")));

  PCollection<KV<Long, Iterable<String>>> grouped =
      KeyedPCollectionTuple.of("col1", firstInput)
          .and("col2", secondInput)
          .apply(External.of(TEST_CGBK_URN, new byte[] {}, expansionAddr));

  PCollection<String> rendered =
      grouped.apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  (KV<Long, Iterable<String>> entry) -> {
                    // Sort so the rendered string does not depend on grouping order.
                    String[] sortedValues = Iterables.toArray(entry.getValue(), String.class);
                    Arrays.sort(sortedValues);
                    return String.format(
                        "%s:%s", entry.getKey(), String.join(",", sortedValues));
                  }));

  PAssert.that(rendered).containsInAnyOrder("0:1,2,4", "1:3,5,6");
}
Example #6
Source File: CoGroup.java From beam with Apache License 2.0 | 6 votes |
/**
 * Stores each argument in the field of the same name; no validation or copying is performed.
 */
private JoinInformation(
    KeyedPCollectionTuple<Row> keyedPCollectionTuple,
    Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs,
    Schema keySchema,
    Map<String, Schema> componentSchemas,
    Map<Integer, SerializableFunction<Object, Row>> toRows,
    List<String> sortedTags,
    Map<Integer, String> tagToKeyedTag) {
  // Join inputs.
  this.keyedPCollectionTuple = keyedPCollectionTuple;
  this.sideInputs = sideInputs;

  // Schemas.
  this.keySchema = keySchema;
  this.componentSchemas = componentSchemas;

  // Per-tag lookups.
  this.toRows = toRows;
  this.sortedTags = sortedTags;
  this.tagToKeyedTag = tagToKeyedTag;
}
Example #7
Source File: TestExpansionService.java From beam with Apache License 2.0 | 6 votes |
/**
 * Co-groups the input by key and, for every key, emits one iterable that concatenates the
 * values found under the tags {@code "col1"} and {@code "col2"}.
 */
@Override
public PCollection<KV<Long, Iterable<String>>> expand(KeyedPCollectionTuple<Long> input) {
  Set<String> tags = ImmutableSet.of("col1", "col2");

  DoFn<KV<Long, CoGbkResult>, KV<Long, Iterable<String>>> concatenateTaggedValues =
      new DoFn<KV<Long, CoGbkResult>, KV<Long, Iterable<String>>>() {
        @ProcessElement
        public void processElement(
            @Element KV<Long, CoGbkResult> kv,
            OutputReceiver<KV<Long, Iterable<String>>> out) {
          // The iterable is lazy: a new stream over the tags is built on every iterator() call.
          Iterable<String> allValues =
              () ->
                  tags.stream()
                      .flatMap(
                          (String tag) ->
                              StreamSupport.stream(
                                  kv.getValue().<String>getAll(tag).spliterator(), false))
                      .iterator();
          out.output(KV.of(kv.getKey(), allValues));
        }
      };

  PCollection<KV<Long, CoGbkResult>> grouped = input.apply(CoGroupByKey.create());
  return grouped.apply(ParDo.of(concatenateTaggedValues));
}
Example #8
Source File: MultinomialLogisticRegression.java From incubator-nemo with Apache License 2.0 | 6 votes |
@Override public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) { // Model as a view. final PCollectionView<Map<Integer, List<Double>>> modelView = model.apply(View.asMap()); // Find gradient. final PCollection<KV<Integer, List<Double>>> gradient = readInput .apply(ParDo.of( new CalculateGradient(modelView, numClasses, numFeatures)).withSideInputs(modelView)) .apply(Combine.perKey(new CombineFunction())); // Tags for CoGroupByKey. final TupleTag<List<Double>> gradientTag = new TupleTag<>(); final TupleTag<List<Double>> modelTag = new TupleTag<>(); final KeyedPCollectionTuple<Integer> coGbkInput = KeyedPCollectionTuple .of(gradientTag, gradient) .and(modelTag, model); final PCollection<KV<Integer, CoGbkResult>> groupResult = coGbkInput.apply(CoGroupByKey.create()); // Update the model return groupResult .apply(ParDo.of(new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag))); }
Example #9
Source File: TestExpansionService.java From beam with Apache License 2.0 | 5 votes |
/**
 * Builds a keyed tuple from the given inputs, tagging each collection with a {@code TupleTag}
 * created from its map key.
 *
 * <p>Raw types are used because the element types of the inputs are not known here.
 */
@Override
public KeyedPCollectionTuple<Long> createInput(Pipeline p, Map<String, PCollection<?>> inputs) {
  KeyedPCollectionTuple tuple = KeyedPCollectionTuple.empty(p);
  for (Map.Entry<String, PCollection<?>> taggedInput : inputs.entrySet()) {
    TupleTag tag = new TupleTag(taggedInput.getKey());
    tuple = tuple.and(tag, taggedInput.getValue());
  }
  return tuple;
}
Example #10
Source File: VerifyBamIdTest.java From dataflow-java with Apache License 2.0 | 5 votes |
/**
 * Joins a single read base keyed by {@code position1chrPrefix} against {@code refCountList}
 * and asserts on the output of {@code PileupAndJoinReads}.
 */
@Test
public void testPileupAndJoinReadsWithChrPrefix() throws Exception {
  ReadBaseQuality baseQuality = new ReadBaseQuality("A", 10);

  PCollection<KV<Position, ReadBaseQuality>> readCounts =
      p.apply("createInput", Create.of(KV.of(position1chrPrefix, baseQuality)));
  PAssert.that(readCounts).containsInAnyOrder(KV.of(position1chrPrefix, baseQuality));

  PCollection<KV<Position, AlleleFreq>> refFreq = p.apply(Create.of(refCountList));
  PAssert.that(refFreq).containsInAnyOrder(refCountList);

  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();

  PCollection<KV<Position, ReadCounts>> result =
      KeyedPCollectionTuple.of(readCountsTag, readCounts)
          .and(refFreqTag, refFreq)
          .apply(CoGroupByKey.<Position>create())
          .apply(ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));

  PAssert.that(result)
      .containsInAnyOrder(
          KV.of(position1, rc1), KV.of(position2, rc2), KV.of(position3, rc3));
  p.run();
}
Example #11
Source File: VerifyBamIdTest.java From dataflow-java with Apache License 2.0 | 5 votes |
/**
 * Joins a single read base keyed by {@code position1} against {@code refCountList} and asserts
 * on the output of {@code PileupAndJoinReads}.
 */
@Test
public void testPileupAndJoinReads() throws Exception {
  final ReadBaseQuality baseQuality = new ReadBaseQuality("A", 10);

  PCollection<KV<Position, ReadBaseQuality>> readCounts =
      p.apply("createInput", Create.of(KV.of(position1, baseQuality)));
  PAssert.that(readCounts).containsInAnyOrder(KV.of(position1, baseQuality));

  PCollection<KV<Position, AlleleFreq>> refFreq = p.apply(Create.of(refCountList));
  PAssert.that(refFreq).containsInAnyOrder(refCountList);

  final TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  final TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();

  PCollection<KV<Position, ReadCounts>> result =
      KeyedPCollectionTuple.of(readCountsTag, readCounts)
          .and(refFreqTag, refFreq)
          .apply(CoGroupByKey.<Position>create())
          .apply(ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));

  PAssert.that(result)
      .containsInAnyOrder(
          KV.of(position1, rc1), KV.of(position2, rc2), KV.of(position3, rc3));
  p.run();
}
Example #12
Source File: GroupWithoutRepartition.java From beam with Apache License 2.0 | 5 votes |
/**
 * Applies the wrapped {@code transform} to the input.
 *
 * @throws RuntimeException if the input is neither a {@code PCollection} nor a
 *     {@code KeyedPCollectionTuple}
 */
@Override
@SuppressWarnings("unchecked")
public OutputT expand(InputT input) {
  if (input instanceof PCollection) {
    return (OutputT) ((PCollection) input).apply(transform);
  }
  if (input instanceof KeyedPCollectionTuple) {
    return (OutputT) ((KeyedPCollectionTuple) input).apply(transform);
  }
  final String message =
      transform.getName()
          + " is not supported with "
          + GroupWithoutRepartition.class.getSimpleName();
  throw new RuntimeException(message);
}
Example #13
Source File: CoGroupByKeyLoadTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Builds the load-test pipeline: reads an input and a co-input from sources, applies the
 * runtime monitor, windowing and the optional synthetic step to each, then co-groups them and
 * runs the ungroup, byte-count and end-time steps over the result.
 */
@Override
void loadTest() throws IOException {
  SyntheticSourceOptions coSourceOptions =
      fromJsonString(options.getCoSourceOptions(), SyntheticSourceOptions.class);
  Optional<SyntheticStep> syntheticStep = createStep(options.getStepOptions());

  // Main input branch.
  PCollection<KV<byte[], byte[]>> rawInput =
      pipeline.apply("Read input", readFromSource(sourceOptions));
  PCollection<KV<byte[], byte[]>> timedInput =
      rawInput.apply("Collect start time metrics (input)", ParDo.of(runtimeMonitor));
  PCollection<KV<byte[], byte[]>> windowedInput = applyWindowing(timedInput);
  PCollection<KV<byte[], byte[]>> input =
      applyStepIfPresent(windowedInput, "Synthetic step for input", syntheticStep);

  // Co-input branch; it is windowed with its own duration option.
  PCollection<KV<byte[], byte[]>> rawCoInput =
      pipeline.apply("Read co-input", readFromSource(coSourceOptions));
  PCollection<KV<byte[], byte[]>> timedCoInput =
      rawCoInput.apply("Collect start time metrics (co-input)", ParDo.of(runtimeMonitor));
  PCollection<KV<byte[], byte[]>> windowedCoInput =
      applyWindowing(timedCoInput, options.getCoInputWindowDurationSec());
  PCollection<KV<byte[], byte[]>> coInput =
      applyStepIfPresent(windowedCoInput, "Synthetic step for co-input", syntheticStep);

  KeyedPCollectionTuple.of(INPUT_TAG, input)
      .and(CO_INPUT_TAG, coInput)
      .apply("CoGroupByKey", CoGroupByKey.create())
      .apply("Ungroup and reiterate", ParDo.of(new UngroupAndReiterate(options.getIterations())))
      .apply(
          "Collect total bytes",
          ParDo.of(new ByteMonitor(METRICS_NAMESPACE, "totalBytes.count")))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
Example #14
Source File: Join.java From beam with Apache License 2.0 | 5 votes |
/**
 * Joins {@code leftCollection} with {@code rightCollection} by key, emitting one output per
 * (left value, right value) pair that shares a key. Keys with no value on either side produce
 * no output.
 *
 * <p>The output coder is assembled from the key and value coders of the two inputs, which are
 * cast to {@code KvCoder}.
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>> pairUpValues =
      new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          KV<K, CoGbkResult> grouped = c.element();
          Iterable<V1> lefts = grouped.getValue().getAll(v1Tuple);
          Iterable<V2> rights = grouped.getValue().getAll(v2Tuple);

          for (V1 left : lefts) {
            for (V2 right : rights) {
              c.output(KV.of(grouped.getKey(), KV.of(left, right)));
            }
          }
        }
      };

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply("Join", ParDo.of(pairUpValues))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}
Example #15
Source File: Task.java From beam with Apache License 2.0 | 5 votes |
/**
 * Keys both inputs by the first character of each word, co-groups them, and emits the string
 * form of a {@code WordsAlphabet} per key.
 *
 * <p>{@code getOnly} is used for both tags, so the DoFn relies on {@code CoGbkResult.getOnly}
 * accepting what each key holds for each tag.
 */
static PCollection<String> applyTransform(
    PCollection<String> fruits, PCollection<String> countries) {
  TupleTag<String> fruitsTag = new TupleTag<>();
  TupleTag<String> countriesTag = new TupleTag<>();

  MapElements<String, KV<String, String>> keyByFirstLetter =
      MapElements.into(kvs(strings(), strings()))
          .via(word -> KV.of(word.substring(0, 1), word));

  PCollection<KV<String, String>> keyedFruits = fruits.apply("Fruit to KV", keyByFirstLetter);
  PCollection<KV<String, String>> keyedCountries =
      countries.apply("Country to KV", keyByFirstLetter);

  DoFn<KV<String, CoGbkResult>, String> describeLetter =
      new DoFn<KV<String, CoGbkResult>, String>() {
        @ProcessElement
        public void processElement(
            @Element KV<String, CoGbkResult> element, OutputReceiver<String> out) {
          CoGbkResult wordsForLetter = element.getValue();
          out.output(
              new WordsAlphabet(
                      element.getKey(),
                      wordsForLetter.getOnly(fruitsTag),
                      wordsForLetter.getOnly(countriesTag))
                  .toString());
        }
      };

  return KeyedPCollectionTuple.of(fruitsTag, keyedFruits)
      .and(countriesTag, keyedCountries)
      .apply(CoGroupByKey.create())
      .apply(ParDo.of(describeLetter));
}
Example #16
Source File: Snippets.java From beam with Apache License 2.0 | 5 votes |
/** Using a CoGroupByKey transform. */ public static PCollection<String> coGroupByKeyTuple( TupleTag<String> emailsTag, TupleTag<String> phonesTag, PCollection<KV<String, String>> emails, PCollection<KV<String, String>> phones) { // [START CoGroupByKeyTuple] PCollection<KV<String, CoGbkResult>> results = KeyedPCollectionTuple.of(emailsTag, emails) .and(phonesTag, phones) .apply(CoGroupByKey.create()); PCollection<String> contactLines = results.apply( ParDo.of( new DoFn<KV<String, CoGbkResult>, String>() { @ProcessElement public void processElement(ProcessContext c) { KV<String, CoGbkResult> e = c.element(); String name = e.getKey(); Iterable<String> emailsIter = e.getValue().getAll(emailsTag); Iterable<String> phonesIter = e.getValue().getAll(phonesTag); String formattedResult = Snippets.formatCoGbkResults(name, emailsIter, phonesIter); c.output(formattedResult); } })); // [END CoGroupByKeyTuple] return contactLines; }
Example #17
Source File: CompareDatabases.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/**
 * Reads all rows from {@code one} and {@code two}, co-groups them by key, and counts the keys
 * that do not have exactly one row on each side with the two rows equal.
 *
 * @return a collection holding the global count of mismatching keys
 */
@Override
public PCollection<Long> expand(PBegin begin) {
  final TupleTag<Struct> oneTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsOne = begin.apply("Read one", new ReadAllRows(one));
  final TupleTag<Struct> twoTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsTwo = begin.apply("Read two", new ReadAllRows(two));

  DoFn<KV<String, CoGbkResult>, String> emitMismatchedKey =
      new DoFn<KV<String, CoGbkResult>, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          KV<String, CoGbkResult> element = c.element();
          CoGbkResult rowsForKey = element.getValue();
          ArrayList<Struct> fromOne = Lists.newArrayList(rowsForKey.getAll(oneTag));
          ArrayList<Struct> fromTwo = Lists.newArrayList(rowsForKey.getAll(twoTag));

          // Short-circuiting keeps get(0) from running unless both sides hold exactly one row.
          boolean mismatch =
              fromOne.size() != 1
                  || fromTwo.size() != 1
                  || !fromOne.get(0).equals(fromTwo.get(0));
          if (mismatch) {
            c.output(element.getKey());
          }
        }
      };

  PCollection<KV<String, CoGbkResult>> cogroup =
      KeyedPCollectionTuple.of(oneTag, rowsOne).and(twoTag, rowsTwo).apply(CoGroupByKey.create());
  PCollection<String> fails = cogroup.apply(ParDo.of(emitMismatchedKey));
  return fails.apply(Count.globally());
}
Example #18
Source File: Query8.java From beam with Apache License 2.0 | 4 votes |
@Override public PCollection<IdNameReserve> expand(PCollection<Event> events) { // Window and key new people by their id. PCollection<KV<Long, Person>> personsById = events .apply(NexmarkQueryUtil.JUST_NEW_PERSONS) .apply( "Query8.WindowPersons", Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec)))) .apply("PersonById", NexmarkQueryUtil.PERSON_BY_ID); // Window and key new auctions by their id. PCollection<KV<Long, Auction>> auctionsBySeller = events .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS) .apply( "Query8.WindowAuctions", Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec)))) .apply("AuctionBySeller", NexmarkQueryUtil.AUCTION_BY_SELLER); // Join people and auctions and project the person id, name and auction reserve price. return KeyedPCollectionTuple.of(NexmarkQueryUtil.PERSON_TAG, personsById) .and(NexmarkQueryUtil.AUCTION_TAG, auctionsBySeller) .apply(CoGroupByKey.create()) .apply( name + ".Select", ParDo.of( new DoFn<KV<Long, CoGbkResult>, IdNameReserve>() { @ProcessElement public void processElement(ProcessContext c) { @Nullable Person person = c.element().getValue().getOnly(NexmarkQueryUtil.PERSON_TAG, null); if (person == null) { // Person was not created in last window period. return; } for (Auction auction : c.element().getValue().getAll(NexmarkQueryUtil.AUCTION_TAG)) { c.output(new IdNameReserve(person.id, person.name, auction.reserve)); } } })); }
Example #19
Source File: WinningBids.java From beam with Apache License 2.0 | 4 votes |
@Override public PCollection<AuctionBid> expand(PCollection<Event> events) { // Window auctions and bids into custom auction windows. New people events will be discarded. // This will allow us to bring bids and auctions together irrespective of how long // each auction is open for. events = events.apply("Window", Window.into(auctionOrBidWindowFn)); // Key auctions by their id. PCollection<KV<Long, Auction>> auctionsById = events .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS) .apply("AuctionById:", NexmarkQueryUtil.AUCTION_BY_ID); // Key bids by their auction id. PCollection<KV<Long, Bid>> bidsByAuctionId = events .apply(NexmarkQueryUtil.JUST_BIDS) .apply("BidByAuction", NexmarkQueryUtil.BID_BY_AUCTION); // Find the highest price valid bid for each closed auction. return // Join auctions and bids. KeyedPCollectionTuple.of(NexmarkQueryUtil.AUCTION_TAG, auctionsById) .and(NexmarkQueryUtil.BID_TAG, bidsByAuctionId) .apply(CoGroupByKey.create()) // Filter and select. .apply( name + ".Join", ParDo.of( new DoFn<KV<Long, CoGbkResult>, AuctionBid>() { private final Counter noAuctionCounter = Metrics.counter(name, "noAuction"); private final Counter underReserveCounter = Metrics.counter(name, "underReserve"); private final Counter noValidBidsCounter = Metrics.counter(name, "noValidBids"); @ProcessElement public void processElement(ProcessContext c) { @Nullable Auction auction = c.element().getValue().getOnly(NexmarkQueryUtil.AUCTION_TAG, null); if (auction == null) { // We have bids without a matching auction. Give up. noAuctionCounter.inc(); return; } // Find the current winning bid for auction. // The earliest bid with the maximum price above the reserve wins. Bid bestBid = null; for (Bid bid : c.element().getValue().getAll(NexmarkQueryUtil.BID_TAG)) { // Bids too late for their auction will have been // filtered out by the window merge function. checkState(bid.dateTime.compareTo(auction.expires) < 0); if (bid.price < auction.reserve) { // Bid price is below auction reserve. 
underReserveCounter.inc(); continue; } if (bestBid == null || Bid.PRICE_THEN_DESCENDING_TIME.compare(bid, bestBid) > 0) { bestBid = bid; } } if (bestBid == null) { // We don't have any valid bids for auction. noValidBidsCounter.inc(); return; } c.output(new AuctionBid(auction, bestBid)); } })); }
Example #20
Source File: TestExpansionService.java From beam with Apache License 2.0 | 4 votes |
/**
 * Returns a new {@code TestCoGroupByKeyTransform}; the given {@code spec} is not consulted.
 */
@Override
public PTransform<KeyedPCollectionTuple<Long>, PCollection<KV<Long, Iterable<String>>>>
    getTransform(RunnerApi.FunctionSpec spec) {
  final TestCoGroupByKeyTransform coGroupByKeyTransform = new TestCoGroupByKeyTransform();
  return coGroupByKeyTransform;
}
Example #21
Source File: Join.java From beam with Apache License 2.0 | 4 votes |
/**
 * Joins {@code leftCollection} with {@code rightCollection} by key, keeping every right value.
 *
 * <p>For each key, every right value is paired with every left value. When a key has no left
 * values, each right value is paired with {@code nullValue} instead. Keys with no right values
 * produce no output.
 *
 * <p>The output coder is assembled from the key and value coders of the two inputs, which are
 * cast to {@code KvCoder}.
 *
 * @param leftCollection the left side of the join; must not be null
 * @return the joined collection
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  // Whether the left side is empty does not change while iterating the right
                  // side, so check it once instead of opening an iterator per right value.
                  final boolean leftIsEmpty = !leftValuesIterable.iterator().hasNext();

                  for (V2 rightValue : rightValuesIterable) {
                    if (leftIsEmpty) {
                      c.output(KV.of(e.getKey(), KV.of(nullValue, rightValue)));
                      continue;
                    }
                    for (V1 leftValue : leftValuesIterable) {
                      c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}
Example #22
Source File: CoGroup.java From beam with Apache License 2.0 | 4 votes |
private static JoinInformation from( PCollectionTuple input, Function<String, FieldAccessDescriptor> getFieldAccessDescriptor, Function<String, Boolean> getIsSideInput) { KeyedPCollectionTuple<Row> keyedPCollectionTuple = KeyedPCollectionTuple.empty(input.getPipeline()); List<String> sortedTags = input.getAll().keySet().stream() .map(TupleTag::getId) .sorted() .collect(Collectors.toList()); // Keep this in a TreeMap so that it's sorted. This way we get a deterministic output // schema. TreeMap<String, Schema> componentSchemas = Maps.newTreeMap(); Map<Integer, SerializableFunction<Object, Row>> toRows = Maps.newHashMap(); Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs = Maps.newHashMap(); Map<Integer, String> tagToKeyedTag = Maps.newHashMap(); Schema keySchema = null; for (Map.Entry<TupleTag<?>, PCollection<?>> entry : input.getAll().entrySet()) { String tag = entry.getKey().getId(); int tagIndex = sortedTags.indexOf(tag); PCollection<?> pc = entry.getValue(); Schema schema = pc.getSchema(); componentSchemas.put(tag, schema); toRows.put(tagIndex, (SerializableFunction<Object, Row>) pc.getToRowFunction()); FieldAccessDescriptor fieldAccessDescriptor = getFieldAccessDescriptor.apply(tag); if (fieldAccessDescriptor == null) { throw new IllegalStateException("No fields were set for input " + tag); } // Resolve the key schema, keeping the fields in the order specified by the user. // Otherwise, if different field names are specified for different PCollections, they // might not match up. // The key schema contains the field names from the first PCollection specified. FieldAccessDescriptor resolved = fieldAccessDescriptor.resolve(schema); Schema currentKeySchema = SelectHelpers.getOutputSchema(schema, resolved); if (keySchema == null) { keySchema = currentKeySchema; } else { keySchema = SchemaUtils.mergeWideningNullable(keySchema, currentKeySchema); } // Create a new tag for the output. 
TupleTag randomTag = new TupleTag<>(); String keyedTag = tag + "_" + randomTag; tagToKeyedTag.put(tagIndex, keyedTag); PCollection<KV<Row, Row>> keyedPCollection = extractKey(pc, schema, keySchema, resolved, tag); if (getIsSideInput.apply(tag)) { sideInputs.put( keyedTag, keyedPCollection.apply("computeSideInputView" + tag, View.asMultimap())); } else { keyedPCollectionTuple = keyedPCollectionTuple.and(keyedTag, keyedPCollection); } } return new JoinInformation( keyedPCollectionTuple, sideInputs, keySchema, componentSchemas, toRows, sortedTags, tagToKeyedTag); }
Example #23
Source File: Join.java From beam with Apache License 2.0 | 4 votes |
/**
 * Joins {@code leftCollection} with {@code rightCollection} by key, keeping every left value.
 *
 * <p>For each key, every left value is paired with every right value. When a key has no right
 * values, each left value is paired with {@code nullValue} instead. Keys with no left values
 * produce no output.
 *
 * <p>The output coder is assembled from the key and value coders of the two inputs, which are
 * cast to {@code KvCoder}.
 *
 * @param leftCollection the left side of the join; must not be null
 * @return the joined collection
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  // Whether the right side is empty does not change while iterating the left
                  // side, so check it once instead of opening an iterator per left value.
                  final boolean rightIsEmpty = !rightValuesIterable.iterator().hasNext();

                  for (V1 leftValue : leftValuesIterable) {
                    if (rightIsEmpty) {
                      c.output(KV.of(e.getKey(), KV.of(leftValue, nullValue)));
                      continue;
                    }
                    for (V2 rightValue : rightValuesIterable) {
                      c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}