org.apache.beam.sdk.transforms.join.CoGbkResult Java Examples

Source File: VerifyBamId.java From dataflow-java with Apache License 2.0

6 votes

/**
 * Filters, splits, and samples the input reads, then co-groups the sampled bases with the
 * reference allele frequencies by position and piles them up.
 *
 * @param reads A PCollection of reads
 * @param samplingFraction Fraction of reads to keep
 * @param samplingPrefix A prefix used in generating hashes used in sampling
 * @param refFreq A PCollection mapping position to allele frequencies in a reference
 *   population.
 * @return A PCollection mapping Position to a ReadCounts proto
 */
static PCollection<KV<Position, ReadCounts>> combineReads(PCollection<Read> reads,
    double samplingFraction, String samplingPrefix,
    PCollection<KV<Position, AlleleFreq>> refFreq) {
  // Keep only the reads that pass all four read-level predicates.
  PCollection<Read> usableReads = reads
      .apply("IsOnChromosome", Filter.by(ReadFunctions.IS_ON_CHROMOSOME))
      .apply("IsNotQCFailure", Filter.by(ReadFunctions.IS_NOT_QC_FAILURE))
      .apply("IsNotDuplicate", Filter.by(ReadFunctions.IS_NOT_DUPLICATE))
      .apply("IsProperPlacement", Filter.by(ReadFunctions.IS_PROPER_PLACEMENT));

  // Split each read into per-position base/quality pairs and keep a sample of them.
  PCollection<KV<Position, ReadBaseQuality>> sampledBases = usableReads
      .apply(ParDo.of(new SplitReads()))
      .apply(Filter.by(new SampleReads(samplingFraction, samplingPrefix)));

  TupleTag<ReadBaseQuality> basesTag = new TupleTag<>();
  TupleTag<AlleleFreq> freqTag = new TupleTag<>();

  // Co-group bases and reference frequencies by position, then pile up and join each group.
  return KeyedPCollectionTuple
      .of(basesTag, sampledBases)
      .and(freqTag, refFreq)
      .apply(CoGroupByKey.<Position>create())
      .apply(ParDo.of(new PileupAndJoinReads(basesTag, freqTag)));
}

Source File: MusicBrainzTransforms.java From bigquery-etl-dataflow-sample with Apache License 2.0

6 votes

/**
 * Co-groups two keyed collections of {@code MusicBrainzDataObject} by their {@code Long} key.
 *
 * <p>The transform is applied under the name {@code "joinResult_" + name}.
 *
 * @param name suffix used to build the transform name
 * @param first the first keyed collection
 * @param second the second keyed collection
 * @param firstTag tag under which values of {@code first} appear in each result
 * @param secondTag tag under which values of {@code second} appear in each result
 * @return the co-grouped collection, or {@code null} if applying the transform threw an
 *     exception (the exception is logged, not rethrown)
 */
private static PCollection<KV<Long, CoGbkResult>> group(String name,
                                                        PCollection<KV<Long, MusicBrainzDataObject>> first,
                                                        PCollection<KV<Long, MusicBrainzDataObject>> second,
                                                        TupleTag<MusicBrainzDataObject> firstTag,
                                                        TupleTag<MusicBrainzDataObject> secondTag
) {
  try {
    return KeyedPCollectionTuple
        .of(firstTag, first)
        .and(secondTag, second)
        .apply("joinResult_" + name, CoGroupByKey.<Long>create());
  } catch (Exception e) {
    // Existing contract: callers receive null on failure, so log and keep returning null.
    logger.error("exception grouping.", e);
    return null;
  }
}

Source File: MultinomialLogisticRegression.java From nemo with Apache License 2.0

6 votes

/**
 * Applies one update step to the model: computes a gradient from {@code readInput} using the
 * current model as a side input, combines it per key, co-groups it with the model by key, and
 * hands each group to {@code ApplyGradient}.
 *
 * @param model the current model, keyed by integer
 * @return the collection produced by {@code ApplyGradient}
 */
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // The gradient computation reads the whole model as a map-valued side input.
  final PCollectionView<Map<Integer, List<Double>>> modelAsMap = model.apply(View.asMap());

  final PCollection<KV<Integer, List<Double>>> combinedGradient = readInput
      .apply(ParDo
          .of(new CalculateGradient(modelAsMap, numClasses, numFeatures))
          .withSideInputs(modelAsMap))
      .apply(Combine.perKey(new CombineFunction()));

  // Tags that distinguish gradient values from model values inside each CoGbkResult.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();

  // Co-group gradient and model by key, then apply the gradient to the model.
  return KeyedPCollectionTuple
      .of(gradientTag, combinedGradient)
      .and(modelTag, model)
      .apply(CoGroupByKey.create())
      .apply(ParDo.of(new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag)));
}

Source File: TestExpansionService.java From beam with Apache License 2.0

6 votes

/**
 * Co-groups the tagged input collections by key and, for each key, emits the values stored
 * under the string tags {@code "col1"} and {@code "col2"} as a single iterable.
 *
 * @param input the keyed, tagged collections to co-group
 * @return one element per co-grouped key, pairing the key with an iterable over its values
 */
@Override
public PCollection<KV<Long, Iterable<String>>> expand(KeyedPCollectionTuple<Long> input) {
  // Tag ids that are looked up in every CoGbkResult below.
  Set<String> tagSet = ImmutableSet.of("col1", "col2");
  return input
      .apply(CoGroupByKey.create())
      .apply(
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, KV<Long, Iterable<String>>>() {
                @ProcessElement
                public void processElement(
                    @Element KV<Long, CoGbkResult> kv,
                    OutputReceiver<KV<Long, Iterable<String>>> out) {
                  // Lazy view: every iterator() call streams tagSet and flattens the values
                  // returned by getAll(tag) for each tag.
                  Iterable<String> iter =
                      () ->
                          tagSet.stream()
                              .flatMap(
                                  (String t) ->
                                      StreamSupport.stream(
                                          kv.getValue().<String>getAll(t).spliterator(),
                                          false))
                              .iterator();
                  out.output(KV.of(kv.getKey(), iter));
                }
              }));
}

Source File: MultinomialLogisticRegression.java From incubator-nemo with Apache License 2.0

6 votes

/**
 * Produces the next model from the current one. A gradient is calculated from
 * {@code readInput} (with the model supplied as a side input) and combined per key; gradient and
 * model are then co-grouped by key and passed to {@code ApplyGradient}.
 *
 * @param model the current model, keyed by integer
 * @return the output of {@code ApplyGradient}
 */
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // Side-input view of the model.
  final PCollectionView<Map<Integer, List<Double>>> modelSideInput = model.apply(View.asMap());

  // Per-key combined gradient.
  final PCollection<KV<Integer, List<Double>>> perKeyGradient = readInput
    .apply(ParDo.of(
      new CalculateGradient(modelSideInput, numClasses, numFeatures)).withSideInputs(modelSideInput))
    .apply(Combine.perKey(new CombineFunction()));

  // One tag per co-grouped input.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();

  final PCollection<KV<Integer, CoGbkResult>> gradientWithModel = KeyedPCollectionTuple
    .of(gradientTag, perKeyGradient)
    .and(modelTag, model)
    .apply(CoGroupByKey.create());

  // Apply the gradient to the model.
  final ApplyGradient updateStep =
    new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag);
  return gradientWithModel.apply(ParDo.of(updateStep));
}

Source File: Join.java From beam with Apache License 2.0

5 votes

/**
 * Inner-joins {@code leftCollection} with {@code rightCollection} by key.
 *
 * <p>Both collections are co-grouped by key; for every key, each combination of a left value
 * and a right value is emitted. Keys with no value on either side produce no output.
 *
 * @param leftCollection the left side of the join
 * @return the joined pairs, with a coder assembled from the key and value coders of the inputs
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);

  final TupleTag<V1> leftTag = new TupleTag<>();
  final TupleTag<V2> rightTag = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(leftTag, leftCollection)
          .and(rightTag, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  PCollection<KV<K, KV<V1, V2>>> pairs =
      grouped.apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> group = c.element();
                  Iterable<V1> lefts = group.getValue().getAll(leftTag);
                  Iterable<V2> rights = group.getValue().getAll(rightTag);

                  // Cross product of the two sides for this key.
                  for (V1 left : lefts) {
                    for (V2 right : rights) {
                      c.output(KV.of(group.getKey(), KV.of(left, right)));
                    }
                  }
                }
              }));

  // The output coder is built from the input coders: key coder and left value coder from the
  // left collection, right value coder from the right collection.
  return pairs.setCoder(
      KvCoder.of(
          ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getValueCoder(),
              ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}

Source File: VerifyBamIdTest.java From dataflow-java with Apache License 2.0

5 votes

/**
 * Runs {@code PileupAndJoinReads} over a single base at {@code position1chrPrefix} co-grouped
 * with {@code refCountList}, and expects the results keyed by {@code position1},
 * {@code position2} and {@code position3}.
 */
@Test
public void testPileupAndJoinReadsWithChrPrefix() throws Exception {
  ReadBaseQuality baseA = new ReadBaseQuality("A", 10);
  PCollection<KV<Position, ReadBaseQuality>> observedBases =
      p.apply("createInput", Create.of(KV.of(position1chrPrefix, baseA)));
  PAssert.that(observedBases).containsInAnyOrder(KV.of(position1chrPrefix, baseA));

  PCollection<KV<Position, AlleleFreq>> referenceFreqs = p.apply(Create.of(refCountList));
  PAssert.that(referenceFreqs).containsInAnyOrder(refCountList);

  TupleTag<ReadBaseQuality> basesTag = new TupleTag<>();
  TupleTag<AlleleFreq> freqTag = new TupleTag<>();

  // Co-group by position, then pile up and join each group.
  PCollection<KV<Position, ReadCounts>> piledUp = KeyedPCollectionTuple
      .of(basesTag, observedBases)
      .and(freqTag, referenceFreqs)
      .apply(CoGroupByKey.<Position>create())
      .apply(ParDo.of(new PileupAndJoinReads(basesTag, freqTag)));

  PAssert.that(piledUp)
      .containsInAnyOrder(
          KV.of(position1, rc1),
          KV.of(position2, rc2),
          KV.of(position3, rc3));
  p.run();
}

Source File: VerifyBamIdTest.java From dataflow-java with Apache License 2.0

5 votes

/**
 * Runs {@code PileupAndJoinReads} over a single base at {@code position1} co-grouped with
 * {@code refCountList}, and expects the results keyed by {@code position1}, {@code position2}
 * and {@code position3}.
 */
@Test
public void testPileupAndJoinReads() throws Exception {
  final ReadBaseQuality baseA = new ReadBaseQuality("A", 10);
  PCollection<KV<Position, ReadBaseQuality>> observedBases =
      p.apply("createInput", Create.of(KV.of(position1, baseA)));
  PAssert.that(observedBases).containsInAnyOrder(KV.of(position1, baseA));

  PCollection<KV<Position, AlleleFreq>> referenceFreqs = p.apply(Create.of(refCountList));
  PAssert.that(referenceFreqs).containsInAnyOrder(refCountList);

  final TupleTag<ReadBaseQuality> basesTag = new TupleTag<>();
  TupleTag<AlleleFreq> freqTag = new TupleTag<>();

  PCollection<KV<Position, CoGbkResult>> groupedByPosition = KeyedPCollectionTuple
      .of(basesTag, observedBases)
      .and(freqTag, referenceFreqs)
      .apply(CoGroupByKey.<Position>create());

  PCollection<KV<Position, ReadCounts>> piledUp =
      groupedByPosition.apply(ParDo.of(new PileupAndJoinReads(basesTag, freqTag)));

  PAssert.that(piledUp)
      .containsInAnyOrder(
          KV.of(position1, rc1),
          KV.of(position2, rc2),
          KV.of(position3, rc3));
  p.run();
}

Source File: CoGroup.java From beam with Apache License 2.0

5 votes

/**
 * Builds a {@code Result} from one co-grouped element and writes it to the output receiver,
 * either as a single unexpanded row or as expanded rows depending on {@code convertType}.
 *
 * @param element the join key together with its co-grouped values
 * @param c the process context, forwarded to {@code Result.from}
 * @param o receiver for the output rows
 */
@ProcessElement
public void process(
    @Element KV<Row, CoGbkResult> element, ProcessContext c, OutputReceiver<Row> o) {
  Row joinKey = element.getKey();
  CoGbkResult grouped = element.getValue();
  Result joined = Result.from(joinInformation, joinArgs, joinKey, outputSchema, grouped, c);

  if (convertType != ConvertType.UNEXPANDED) {
    joined.outputExpandedRows(o);
    return;
  }
  joined.outputUnexpandedRow(outputSchema, o);
}

Source File: CoGroup.java From beam with Apache License 2.0

5 votes

/**
 * Convenience overload that builds a {@code Result} from a {@code CoGbkResult}.
 *
 * <p>Delegates to the overload that takes a lookup function, passing
 * {@code coGbkResult::getAll} as that function; all other arguments are forwarded unchanged.
 *
 * @param joinInformation forwarded to the delegate
 * @param joinArgs forwarded to the delegate
 * @param key the key row of the co-grouped element
 * @param outputSchema forwarded to the delegate
 * @param coGbkResult the co-grouped values whose {@code getAll} is used for lookups
 * @param processContext forwarded to the delegate
 * @return the {@code Result} produced by the delegate overload
 */
static Result from(
    JoinInformation joinInformation,
    JoinArguments joinArgs,
    Row key,
    Schema outputSchema,
    CoGbkResult coGbkResult,
    DoFn<?, Row>.ProcessContext processContext) {
  return from(
      joinInformation, joinArgs, key, outputSchema, coGbkResult::getAll, processContext);
}

Source File: JoinTranslator.java From beam with Apache License 2.0

5 votes

/**
 * Hands the left and right values of one co-grouped element to {@code doJoin}, after binding
 * the collector to the current process context.
 *
 * @param element the key with its co-grouped values; the value must not be null
 * @param ctx the current process context
 */
@ProcessElement
@SuppressWarnings("unused")
public final void processElement(@Element KV<KeyT, CoGbkResult> element, ProcessContext ctx) {
  getCollector().setProcessContext(ctx);
  final CoGbkResult grouped = requireNonNull(element.getValue());
  doJoin(grouped.getAll(leftTag), grouped.getAll(rightTag));
}

Source File: Task.java From beam with Apache License 2.0

5 votes

/**
 * Keys fruits and countries by their first character, co-groups the two collections on that
 * key, and emits the string form of a {@code WordsAlphabet} for every key.
 *
 * <p>{@code getOnly} is used for both sides of each group.
 *
 * @param fruits fruit names
 * @param countries country names
 * @return one formatted string per first-character key
 */
static PCollection<String> applyTransform(
    PCollection<String> fruits, PCollection<String> countries) {

  TupleTag<String> fruitsTag = new TupleTag<>();
  TupleTag<String> countriesTag = new TupleTag<>();

  // Shared mapping: word -> (first character, word).
  MapElements<String, KV<String, String>> keyByFirstLetter =
      MapElements.into(kvs(strings(), strings()))
          .via(word -> KV.of(word.substring(0, 1), word));

  PCollection<KV<String, String>> fruitsByLetter = fruits.apply("Fruit to KV", keyByFirstLetter);
  PCollection<KV<String, String>> countriesByLetter =
      countries.apply("Country to KV", keyByFirstLetter);

  PCollection<KV<String, CoGbkResult>> groupedByLetter =
      KeyedPCollectionTuple
          .of(fruitsTag, fruitsByLetter)
          .and(countriesTag, countriesByLetter)
          .apply(CoGroupByKey.create());

  return groupedByLetter.apply(ParDo.of(new DoFn<KV<String, CoGbkResult>, String>() {

    @ProcessElement
    public void processElement(
        @Element KV<String, CoGbkResult> element, OutputReceiver<String> out) {
      String letter = element.getKey();
      CoGbkResult group = element.getValue();
      WordsAlphabet entry =
          new WordsAlphabet(letter, group.getOnly(fruitsTag), group.getOnly(countriesTag));
      out.output(entry.toString());
    }

  }));
}

Source File: Snippets.java From beam with Apache License 2.0

5 votes

/**
 * Using a CoGroupByKey transform.
 *
 * <p>Co-groups {@code emails} and {@code phones} by key and formats every group with
 * {@link Snippets#formatCoGbkResults}.
 *
 * @param emailsTag tag under which email values appear in each {@code CoGbkResult}
 * @param phonesTag tag under which phone values appear in each {@code CoGbkResult}
 * @param emails emails keyed by name
 * @param phones phone numbers keyed by name
 * @return one formatted line per key
 */
public static PCollection<String> coGroupByKeyTuple(
    TupleTag<String> emailsTag,
    TupleTag<String> phonesTag,
    PCollection<KV<String, String>> emails,
    PCollection<KV<String, String>> phones) {

  // The START/END markers below delimit a snippet region; keep them intact.
  // [START CoGroupByKeyTuple]
  PCollection<KV<String, CoGbkResult>> results =
      KeyedPCollectionTuple.of(emailsTag, emails)
          .and(phonesTag, phones)
          .apply(CoGroupByKey.create());

  PCollection<String> contactLines =
      results.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> e = c.element();
                  String name = e.getKey();
                  // Each side may hold zero or more values for this key.
                  Iterable<String> emailsIter = e.getValue().getAll(emailsTag);
                  Iterable<String> phonesIter = e.getValue().getAll(phonesTag);
                  String formattedResult =
                      Snippets.formatCoGbkResults(name, emailsIter, phonesIter);
                  c.output(formattedResult);
                }
              }));
  // [END CoGroupByKeyTuple]
  return contactLines;
}

Source File: CoGroupByKeyResultMappingTransform.java From component-runtime with Apache License 2.0

5 votes

/**
 * Converts the co-grouped element of the given context into a single {@code Record}.
 *
 * <p>For every tuple tag in the result's schema, the value obtained with
 * {@code getOnly(tag, null)} is added (when non-null) as a one-element array entry named after
 * the tag id. When {@code propagateKey} is set, an extra {@code "__talend_internal"} record
 * carrying the element key as a string is appended.
 *
 * @param context the process context whose current element is mapped
 * @return the assembled record
 */
private Record createMap(final ProcessContext context) {
    final KV<K, CoGbkResult> element = context.element();
    final CoGbkResult result = element.getValue();
    final RecordBuilderFactory builderFactory = builderFactory();
    final Record.Builder builder = result
            .getSchema()
            .getTupleTagList()
            .getAll()
            .stream()
            // Pair each tag id with its value, using null as the default when absent.
            // NOTE(review): getOnly presumably rejects tags holding several values - confirm.
            .map(key -> new Pair<>(key.getId(), Record.class.cast(result.getOnly(key, null))))
            .filter(p -> p.getSecond() != null)
            // Accumulate into a record builder: one ARRAY entry per tag, wrapping its record.
            .collect(builderFactory::newRecordBuilder, (b, p) -> {
                final Record record = p.getSecond();
                final Schema.Entry entry = builderFactory
                        .newEntryBuilder()
                        .withName(p.getFirst())
                        .withType(Schema.Type.ARRAY)
                        .withElementSchema(record.getSchema())
                        .build();
                b.withArray(entry, singletonList(record));
            }, RecordCollectors::merge);
    if (propagateKey) {
        // Carry the grouping key along in a dedicated nested record.
        final Record internalRecord =
                builderFactory.newRecordBuilder().withString("key", String.valueOf(element.getKey())).build();
        builder
                .withRecord(builderFactory
                        .newEntryBuilder()
                        .withName("__talend_internal")
                        .withType(Schema.Type.RECORD)
                        .withElementSchema(internalRecord.getSchema())
                        .build(), internalRecord);
    }
    return builder.build();
}

Source File: CompareDatabases.java From DataflowTemplates with Apache License 2.0

5 votes

/**
 * Counts the keys on which the two databases disagree.
 *
 * <p>Rows read from {@code one} and {@code two} are co-grouped by key. A key is reported as a
 * failure when either side does not hold exactly one row for it, or when the two rows are not
 * equal. The number of reported keys is returned.
 *
 * @param begin the pipeline start
 * @return a single-element collection with the global count of failing keys
 */
@Override
public PCollection<Long> expand(PBegin begin) {

  final TupleTag<Struct> oneTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsOne = begin.apply("Read one", new ReadAllRows(one));
  final TupleTag<Struct> twoTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsTwo = begin.apply("Read two", new ReadAllRows(two));

  PCollection<KV<String, CoGbkResult>> rowsByKey =
      KeyedPCollectionTuple
          .of(oneTag, rowsOne)
          .and(twoTag, rowsTwo)
          .apply(CoGroupByKey.create());

  PCollection<String> mismatchedKeys =
      rowsByKey.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> element = c.element();
                  CoGbkResult grouped = element.getValue();
                  ArrayList<Struct> fromOne = Lists.newArrayList(grouped.getAll(oneTag));
                  ArrayList<Struct> fromTwo = Lists.newArrayList(grouped.getAll(twoTag));

                  // Short-circuit keeps get(0) from running unless both lists have one row.
                  boolean exactlyOneEach = fromOne.size() == 1 && fromTwo.size() == 1;
                  if (!exactlyOneEach || !fromOne.get(0).equals(fromTwo.get(0))) {
                    c.output(element.getKey());
                  }
                }
              }));

  return mismatchedKeys.apply(Count.globally());
}

Source File: JoinTranslator.java From beam with Apache License 2.0

4 votes

/**
 * Returns the collector held in {@code resultsCollector}.
 *
 * @return the {@code resultsCollector} field
 */
AdaptableCollector<KV<KeyT, CoGbkResult>, KV<KeyT, OutputT>, OutputT> getCollector() {
  return resultsCollector;
}

Source File: Join.java From beam with Apache License 2.0

4 votes

/**
 * Left-outer-joins {@code leftCollection} with {@code rightCollection} by key.
 *
 * <p>Both collections are co-grouped by key. For every left value of a key, one pair is emitted
 * per right value of that key; when the key has no right values, a single pair with
 * {@code nullValue} on the right is emitted instead. Keys that only occur on the right produce
 * no output.
 *
 * @param leftCollection the left side of the join
 * @return the joined pairs, with a coder assembled from the key and value coders of the inputs
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);
  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  // Whether the right side is empty does not change while iterating the left
                  // side, so probe it once instead of opening a new iterator per left value.
                  final boolean hasRightValues = rightValuesIterable.iterator().hasNext();

                  for (V1 leftValue : leftValuesIterable) {
                    if (hasRightValues) {
                      for (V2 rightValue : rightValuesIterable) {
                        c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                      }
                    } else {
                      // No match on the right: pad with the configured placeholder.
                      c.output(KV.of(e.getKey(), KV.of(leftValue, nullValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}

Source File: Join.java From beam with Apache License 2.0

4 votes

/**
 * Right-outer-joins {@code leftCollection} with {@code rightCollection} by key.
 *
 * <p>Both collections are co-grouped by key. For every right value of a key, one pair is
 * emitted per left value of that key; when the key has no left values, a single pair with
 * {@code nullValue} on the left is emitted instead. Keys that only occur on the left produce no
 * output.
 *
 * @param leftCollection the left side of the join
 * @return the joined pairs, with a coder assembled from the key and value coders of the inputs
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  // Whether the left side is empty does not change while iterating the right
                  // side, so probe it once instead of opening a new iterator per right value.
                  final boolean hasLeftValues = leftValuesIterable.iterator().hasNext();

                  for (V2 rightValue : rightValuesIterable) {
                    if (hasLeftValues) {
                      for (V1 leftValue : leftValuesIterable) {
                        c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                      }
                    } else {
                      // No match on the left: pad with the configured placeholder.
                      c.output(KV.of(e.getKey(), KV.of(nullValue, rightValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}

Source File: Query8.java From beam with Apache License 2.0

4 votes

/**
 * Joins new persons with new auctions inside fixed windows of
 * {@code configuration.windowSizeSec} seconds and emits an {@code IdNameReserve} for every
 * auction whose key also has a person in the same window.
 *
 * @param events the input event stream
 * @return person id, person name and auction reserve for every matched auction
 */
@Override
public PCollection<IdNameReserve> expand(PCollection<Event> events) {
  // New persons, windowed and keyed via PERSON_BY_ID.
  PCollection<KV<Long, Person>> personsById =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_PERSONS)
          .apply(
              "Query8.WindowPersons",
              Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec))))
          .apply("PersonById", NexmarkQueryUtil.PERSON_BY_ID);

  // New auctions, windowed and keyed via AUCTION_BY_SELLER.
  PCollection<KV<Long, Auction>> auctionsBySeller =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply(
              "Query8.WindowAuctions",
              Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec))))
          .apply("AuctionBySeller", NexmarkQueryUtil.AUCTION_BY_SELLER);

  // Co-group persons and auctions on their shared key.
  PCollection<KV<Long, CoGbkResult>> personsWithAuctions =
      KeyedPCollectionTuple.of(NexmarkQueryUtil.PERSON_TAG, personsById)
          .and(NexmarkQueryUtil.AUCTION_TAG, auctionsBySeller)
          .apply(CoGroupByKey.create());

  // Project person id, name and auction reserve price.
  return personsWithAuctions.apply(
      name + ".Select",
      ParDo.of(
          new DoFn<KV<Long, CoGbkResult>, IdNameReserve>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
              CoGbkResult grouped = c.element().getValue();
              @Nullable Person person = grouped.getOnly(NexmarkQueryUtil.PERSON_TAG, null);
              // A missing person means none was created in this window period: emit nothing.
              if (person != null) {
                for (Auction auction : grouped.getAll(NexmarkQueryUtil.AUCTION_TAG)) {
                  c.output(new IdNameReserve(person.id, person.name, auction.reserve));
                }
              }
            }
          }));
}

Source File: WinningBids.java From beam with Apache License 2.0

4 votes

/**
 * Pairs every auction with its winning bid.
 *
 * <p>Events are windowed with {@code auctionOrBidWindowFn}, auctions and bids are keyed and
 * co-grouped, and for every group that contains an auction the best bid whose price is not
 * below the auction reserve is selected with {@code Bid.PRICE_THEN_DESCENDING_TIME}. Groups
 * without an auction or without any qualifying bid produce no output and increment a counter.
 *
 * @param events the input event stream
 * @return one {@code AuctionBid} per auction that has a qualifying bid
 */
@Override
public PCollection<AuctionBid> expand(PCollection<Event> events) {
  // Window auctions and bids into custom auction windows. New people events will be discarded.
  // This will allow us to bring bids and auctions together irrespective of how long
  // each auction is open for.
  events = events.apply("Window", Window.into(auctionOrBidWindowFn));

  // Key auctions by their id.
  PCollection<KV<Long, Auction>> auctionsById =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply("AuctionById:", NexmarkQueryUtil.AUCTION_BY_ID);

  // Key bids by their auction id.
  PCollection<KV<Long, Bid>> bidsByAuctionId =
      events
          .apply(NexmarkQueryUtil.JUST_BIDS)
          .apply("BidByAuction", NexmarkQueryUtil.BID_BY_AUCTION);

  // Find the highest price valid bid for each closed auction.
  return
  // Join auctions and bids.
  KeyedPCollectionTuple.of(NexmarkQueryUtil.AUCTION_TAG, auctionsById)
      .and(NexmarkQueryUtil.BID_TAG, bidsByAuctionId)
      .apply(CoGroupByKey.create())
      // Filter and select.
      .apply(
          name + ".Join",
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, AuctionBid>() {
                // Counters for the three ways a bid or a group can be dropped.
                private final Counter noAuctionCounter = Metrics.counter(name, "noAuction");
                private final Counter underReserveCounter = Metrics.counter(name, "underReserve");
                private final Counter noValidBidsCounter = Metrics.counter(name, "noValidBids");

                @ProcessElement
                public void processElement(ProcessContext c) {
                  @Nullable
                  Auction auction =
                      c.element().getValue().getOnly(NexmarkQueryUtil.AUCTION_TAG, null);
                  if (auction == null) {
                    // We have bids without a matching auction. Give up.
                    noAuctionCounter.inc();
                    return;
                  }
                  // Find the current winning bid for auction.
                  // The earliest bid with the maximum price above the reserve wins.
                  Bid bestBid = null;
                  for (Bid bid : c.element().getValue().getAll(NexmarkQueryUtil.BID_TAG)) {
                    // Bids too late for their auction will have been
                    // filtered out by the window merge function.
                    checkState(bid.dateTime.compareTo(auction.expires) < 0);
                    // Strict comparison: a bid priced exactly at the reserve is kept.
                    if (bid.price < auction.reserve) {
                      // Bid price is below auction reserve.
                      underReserveCounter.inc();
                      continue;
                    }

                    // Replace the candidate only when the comparator ranks this bid higher.
                    if (bestBid == null
                        || Bid.PRICE_THEN_DESCENDING_TIME.compare(bid, bestBid) > 0) {
                      bestBid = bid;
                    }
                  }
                  if (bestBid == null) {
                    // We don't have any valid bids for auction.
                    noValidBidsCounter.inc();
                    return;
                  }
                  c.output(new AuctionBid(auction, bestBid));
                }
              }));
}

Source File: SnippetsTest.java From beam with Apache License 2.0

4 votes

/**
 * Checks {@link Snippets#coGroupByKeyTuple} against hand-written formatted results, and checks
 * that the hand-built {@code CoGbkResult} values format to the same lines.
 *
 * <p>NOTE(review): the {@code "[email protected]"} literals look like e-mail addresses masked by
 * whatever published this listing - confirm against the upstream test before reusing the data.
 */
@Test
public void testCoGroupByKeyTuple() throws IOException {
  // The START/END markers in this method delimit snippet regions; keep them intact.
  // [START CoGroupByKeyTupleInputs]
  final List<KV<String, String>> emailsList =
      Arrays.asList(
          KV.of("amy", "[email protected]"),
          KV.of("carl", "[email protected]"),
          KV.of("julia", "[email protected]"),
          KV.of("carl", "[email protected]"));

  final List<KV<String, String>> phonesList =
      Arrays.asList(
          KV.of("amy", "111-222-3333"),
          KV.of("james", "222-333-4444"),
          KV.of("amy", "333-444-5555"),
          KV.of("carl", "444-555-6666"));

  PCollection<KV<String, String>> emails = p.apply("CreateEmails", Create.of(emailsList));
  PCollection<KV<String, String>> phones = p.apply("CreatePhones", Create.of(phonesList));
  // [END CoGroupByKeyTupleInputs]

  // [START CoGroupByKeyTupleOutputs]
  final TupleTag<String> emailsTag = new TupleTag<>();
  final TupleTag<String> phonesTag = new TupleTag<>();

  // Hand-built groups, one per key, mirroring what the co-grouping is expected to produce.
  final List<KV<String, CoGbkResult>> expectedResults =
      Arrays.asList(
          KV.of(
              "amy",
              CoGbkResult.of(emailsTag, Arrays.asList("[email protected]"))
                  .and(phonesTag, Arrays.asList("111-222-3333", "333-444-5555"))),
          KV.of(
              "carl",
              CoGbkResult.of(emailsTag, Arrays.asList("[email protected]", "[email protected]"))
                  .and(phonesTag, Arrays.asList("444-555-6666"))),
          KV.of(
              "james",
              CoGbkResult.of(emailsTag, Arrays.asList())
                  .and(phonesTag, Arrays.asList("222-333-4444"))),
          KV.of(
              "julia",
              CoGbkResult.of(emailsTag, Arrays.asList("[email protected]"))
                  .and(phonesTag, Arrays.asList())));
  // [END CoGroupByKeyTupleOutputs]

  PCollection<String> actualFormattedResults =
      Snippets.coGroupByKeyTuple(emailsTag, phonesTag, emails, phones);

  // [START CoGroupByKeyTupleFormattedOutputs]
  final List<String> formattedResults =
      Arrays.asList(
          "amy; ['[email protected]']; ['111-222-3333', '333-444-5555']",
          "carl; ['[email protected]', '[email protected]']; ['444-555-6666']",
          "james; []; ['222-333-4444']",
          "julia; ['[email protected]']; []");
  // [END CoGroupByKeyTupleFormattedOutputs]

  // Make sure that both 'expectedResults' and 'actualFormattedResults' match with the
  // 'formattedResults'. 'expectedResults' will have to be formatted before comparing
  List<String> expectedFormattedResultsList = new ArrayList<>(expectedResults.size());
  for (KV<String, CoGbkResult> e : expectedResults) {
    String name = e.getKey();
    Iterable<String> emailsIter = e.getValue().getAll(emailsTag);
    Iterable<String> phonesIter = e.getValue().getAll(phonesTag);
    String formattedResult = Snippets.formatCoGbkResults(name, emailsIter, phonesIter);
    expectedFormattedResultsList.add(formattedResult);
  }
  PCollection<String> expectedFormattedResultsPColl =
      p.apply(Create.of(expectedFormattedResultsList));
  PAssert.that(expectedFormattedResultsPColl).containsInAnyOrder(formattedResults);
  PAssert.that(actualFormattedResults).containsInAnyOrder(formattedResults);

  p.run();
}

Source File: CoGroupByKeyResultMappingTransform.java From component-runtime with Apache License 2.0

4 votes

/**
 * Maps every co-grouped element to a {@code Record} by running a {@code CoGBKMappingFn} over
 * the input.
 *
 * @param input co-grouped elements keyed by {@code K}
 * @return the records produced by the mapping function
 */
@Override
public PCollection<Record> expand(final PCollection<KV<K, CoGbkResult>> input) {
    // NOTE(review): the third constructor argument is passed as null; its meaning is not
    // visible here - confirm against CoGBKMappingFn.
    return input.apply(ParDo.of(new CoGBKMappingFn<>(plugin, propagateKey, null)));
}

org.apache.beam.sdk.transforms.join.CoGbkResult Java Examples