org.apache.beam.sdk.transforms.join.CoGbkResult Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.join.CoGbkResult. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: VerifyBamId.java    From dataflow-java with Apache License 2.0 6 votes vote down vote up
/**
 * Filters, splits and samples reads, then co-groups the sampled bases with the
 * reference data by {@code Position} and hands each group to {@code PileupAndJoinReads}.
 *
 * @param reads A PCollection of reads
 * @param samplingFraction Fraction of reads to keep
 * @param samplingPrefix A prefix used in generating hashes used in sampling
 * @param refFreq A PCollection mapping position to the {@code AlleleFreq} entry of a
 *   reference population.
 * @return A PCollection mapping Position to a ReadCounts proto
 */
static PCollection<KV<Position, ReadCounts>> combineReads(PCollection<Read> reads,
    double samplingFraction, String samplingPrefix,
    PCollection<KV<Position, AlleleFreq>> refFreq) {
  // Step 1: keep only the reads that pass all four read-level predicates.
  PCollection<Read> usableReads = reads
      .apply("IsOnChromosome", Filter.by(ReadFunctions.IS_ON_CHROMOSOME))
      .apply("IsNotQCFailure", Filter.by(ReadFunctions.IS_NOT_QC_FAILURE))
      .apply("IsNotDuplicate", Filter.by(ReadFunctions.IS_NOT_DUPLICATE))
      .apply("IsProperPlacement", Filter.by(ReadFunctions.IS_PROPER_PLACEMENT));

  // Step 2: split each read into individual aligned bases (base and quality), then
  // keep a sample of them chosen by a hash mod of Position.
  PCollection<KV<Position, ReadBaseQuality>> sampledBases = usableReads
      .apply(ParDo.of(new SplitReads()))
      .apply(Filter.by(new SampleReads(samplingFraction, samplingPrefix)));

  // Step 3: co-group the sampled bases with the reference stats on Position.
  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> groupedByPosition =
      KeyedPCollectionTuple
          .of(readCountsTag, sampledBases)
          .and(refFreqTag, refFreq)
          .apply(CoGroupByKey.<Position>create());

  // Step 4: pile up read counts per position and join them against the reference.
  return groupedByPosition.apply(
      ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));
}
 
Example #2
Source File: MusicBrainzTransforms.java    From bigquery-etl-dataflow-sample with Apache License 2.0 6 votes vote down vote up
/**
 * Co-groups two keyed collections of {@code MusicBrainzDataObject} on their {@code Long} key.
 *
 * @param name suffix appended to {@code "joinResult_"} to form the name of the applied transform
 * @param first first keyed collection, registered under {@code firstTag}
 * @param second second keyed collection, registered under {@code secondTag}
 * @param firstTag tag used to look up values from {@code first} in the result
 * @param secondTag tag used to look up values from {@code second} in the result
 * @return the co-grouped collection, or {@code null} if applying the transform threw
 *     (the exception is logged, not rethrown)
 */
private static PCollection<KV<Long, CoGbkResult>> group(String name,
                                                        PCollection<KV<Long, MusicBrainzDataObject>> first,
                                                        PCollection<KV<Long, MusicBrainzDataObject>> second,
                                                        TupleTag<MusicBrainzDataObject> firstTag,
                                                        TupleTag<MusicBrainzDataObject> secondTag
) {
  try {
    return KeyedPCollectionTuple
        .of(firstTag, first)
        .and(secondTag, second)
        .apply("joinResult_" + name, CoGroupByKey.<Long>create());
  } catch (Exception e) {
    // Existing contract: failures are logged and reported to the caller as null.
    // Kept as-is so callers that check for null keep working.
    logger.error("exception grouping.", e);
    return null;
  }
}
 
Example #3
Source File: MultinomialLogisticRegression.java    From nemo with Apache License 2.0 6 votes vote down vote up
/**
 * Runs one model-update step: computes per-key gradients from {@code readInput} with the
 * current model as a side input, co-groups gradients with the model by key, and applies
 * {@code ApplyGradient} to each group.
 *
 * @param model the current model, keyed by {@code Integer}
 * @return the collection produced by {@code ApplyGradient}
 */
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // The model is exposed to the gradient DoFn as a map-shaped side input.
  final PCollectionView<Map<Integer, List<Double>>> modelAsMap = model.apply(View.asMap());

  // Gradients from the input, combined per key.
  final PCollection<KV<Integer, List<Double>>> perKeyGradient = readInput
      .apply(ParDo.of(
          new CalculateGradient(modelAsMap, numClasses, numFeatures)).withSideInputs(modelAsMap))
      .apply(Combine.perKey(new CombineFunction()));

  // Co-group gradient and model on the shared key; the tags identify each side afterwards.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();
  final PCollection<KV<Integer, CoGbkResult>> gradientWithModel =
      KeyedPCollectionTuple
          .of(gradientTag, perKeyGradient)
          .and(modelTag, model)
          .apply(CoGroupByKey.create());

  // Produce the updated model from each (gradient, model) group.
  final ApplyGradient updateStep =
      new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag);
  return gradientWithModel.apply(ParDo.of(updateStep));
}
 
Example #4
Source File: TestExpansionService.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Co-groups the input by key and, for every key, emits one lazily evaluated iterable that
 * concatenates the values stored under the string tags {@code "col1"} and {@code "col2"}.
 *
 * @param input keyed collections to co-group
 * @return one {@code KV} per key holding the concatenated string values
 */
@Override
public PCollection<KV<Long, Iterable<String>>> expand(KeyedPCollectionTuple<Long> input) {
  Set<String> tagIds = ImmutableSet.of("col1", "col2");
  PCollection<KV<Long, CoGbkResult>> coGrouped = input.apply(CoGroupByKey.create());
  return coGrouped.apply(
      ParDo.of(
          new DoFn<KV<Long, CoGbkResult>, KV<Long, Iterable<String>>>() {
            @ProcessElement
            public void processElement(
                @Element KV<Long, CoGbkResult> kv,
                OutputReceiver<KV<Long, Iterable<String>>> out) {
              CoGbkResult perKey = kv.getValue();
              // Each call to iterator() walks the tags again and streams their values.
              Iterable<String> concatenated =
                  () ->
                      tagIds.stream()
                          .flatMap(
                              tagId ->
                                  StreamSupport.stream(
                                      perKey.<String>getAll(tagId).spliterator(), false))
                          .iterator();
              out.output(KV.of(kv.getKey(), concatenated));
            }
          }));
}
 
Example #5
Source File: MultinomialLogisticRegression.java    From incubator-nemo with Apache License 2.0 6 votes vote down vote up
/**
 * Performs a single update of the model: gradients are computed from {@code readInput}
 * using the model as a side input, combined per key, co-grouped with the model, and
 * passed to {@code ApplyGradient}.
 *
 * @param model the current model, keyed by {@code Integer}
 * @return the collection produced by {@code ApplyGradient}
 */
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // Side-input view of the model, as a map.
  final PCollectionView<Map<Integer, List<Double>>> modelAsMap = model.apply(View.asMap());

  // Gradient per key.
  final PCollection<KV<Integer, List<Double>>> perKeyGradient = readInput
    .apply(ParDo.of(
      new CalculateGradient(modelAsMap, numClasses, numFeatures)).withSideInputs(modelAsMap))
    .apply(Combine.perKey(new CombineFunction()));

  // Join gradient and model on their key via CoGroupByKey.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();
  final PCollection<KV<Integer, CoGbkResult>> gradientWithModel =
    KeyedPCollectionTuple
      .of(gradientTag, perKeyGradient)
      .and(modelTag, model)
      .apply(CoGroupByKey.create());

  // Apply the gradient to obtain the new model.
  final ApplyGradient updateStep =
    new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag);
  return gradientWithModel.apply(ParDo.of(updateStep));
}
 
Example #6
Source File: Join.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Inner join: co-groups {@code leftCollection} with {@code rightCollection} by key and emits
 * every (left, right) value pair that shares a key. Keys present on only one side produce
 * no output.
 *
 * @param leftCollection left side of the join
 * @return joined pairs, with a coder assembled from the key and value coders of both inputs
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  PCollection<KV<K, KV<V1, V2>>> pairs =
      grouped.apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> group = c.element();
                  CoGbkResult sides = group.getValue();

                  Iterable<V1> lefts = sides.getAll(v1Tuple);
                  Iterable<V2> rights = sides.getAll(v2Tuple);

                  // Cross product of the two sides for this key.
                  for (V1 left : lefts) {
                    for (V2 right : rights) {
                      c.output(KV.of(group.getKey(), KV.of(left, right)));
                    }
                  }
                }
              }));

  // The input coders are read only after the transforms above have been applied.
  KvCoder leftKvCoder = (KvCoder) leftCollection.getCoder();
  KvCoder rightKvCoder = (KvCoder) rightCollection.getCoder();
  return pairs.setCoder(
      KvCoder.of(
          leftKvCoder.getKeyCoder(),
          KvCoder.of(leftKvCoder.getValueCoder(), rightKvCoder.getValueCoder())));
}
 
Example #7
Source File: VerifyBamIdTest.java    From dataflow-java with Apache License 2.0 5 votes vote down vote up
/**
 * Feeds a single read base keyed by {@code position1chrPrefix} through the co-group and
 * {@code PileupAndJoinReads} steps and expects results keyed by {@code position1},
 * {@code position2} and {@code position3}.
 */
@Test
public void testPileupAndJoinReadsWithChrPrefix() throws Exception {
  ReadBaseQuality baseQuality = new ReadBaseQuality("A", 10);

  // Input side 1: one read base at the chr-prefixed position.
  PCollection<KV<Position, ReadBaseQuality>> readSide = p.apply(
      "createInput", Create.of(KV.of(position1chrPrefix, baseQuality)));
  PAssert.that(readSide).containsInAnyOrder(KV.of(position1chrPrefix, baseQuality));

  // Input side 2: the reference list.
  PCollection<KV<Position, AlleleFreq>> referenceSide = p.apply(Create.of(refCountList));
  PAssert.that(referenceSide).containsInAnyOrder(refCountList);

  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> grouped =
      KeyedPCollectionTuple
          .of(readCountsTag, readSide)
          .and(refFreqTag, referenceSide)
          .apply(CoGroupByKey.<Position>create());

  PCollection<KV<Position, ReadCounts>> actual =
      grouped.apply(ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));

  PAssert.that(actual)
      .containsInAnyOrder(
          KV.of(position1, rc1),
          KV.of(position2, rc2),
          KV.of(position3, rc3));
  p.run();
}
 
Example #8
Source File: VerifyBamIdTest.java    From dataflow-java with Apache License 2.0 5 votes vote down vote up
/**
 * Feeds a single read base keyed by {@code position1} through the co-group and
 * {@code PileupAndJoinReads} steps and expects results keyed by {@code position1},
 * {@code position2} and {@code position3}.
 */
@Test
public void testPileupAndJoinReads() throws Exception {
  ReadBaseQuality baseQuality = new ReadBaseQuality("A", 10);

  // Input side 1: one read base at position1.
  PCollection<KV<Position, ReadBaseQuality>> readSide = p.apply(
      "createInput", Create.of(KV.of(position1, baseQuality)));
  PAssert.that(readSide).containsInAnyOrder(KV.of(position1, baseQuality));

  // Input side 2: the reference list.
  PCollection<KV<Position, AlleleFreq>> referenceSide = p.apply(Create.of(refCountList));
  PAssert.that(referenceSide).containsInAnyOrder(refCountList);

  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> grouped =
      KeyedPCollectionTuple
          .of(readCountsTag, readSide)
          .and(refFreqTag, referenceSide)
          .apply(CoGroupByKey.<Position>create());

  PCollection<KV<Position, ReadCounts>> actual =
      grouped.apply(ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));

  PAssert.that(actual)
      .containsInAnyOrder(
          KV.of(position1, rc1),
          KV.of(position2, rc2),
          KV.of(position3, rc3));
  p.run();
}
 
Example #9
Source File: CoGroup.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code Result} for one co-grouped element and emits it either as a single
 * unexpanded row or as expanded rows, depending on {@code convertType}.
 */
@ProcessElement
public void process(
    @Element KV<Row, CoGbkResult> element, ProcessContext c, OutputReceiver<Row> o) {
  final Row key = element.getKey();
  final CoGbkResult grouped = element.getValue();
  final Result joinResult =
      Result.from(joinInformation, joinArgs, key, outputSchema, grouped, c);

  if (convertType != ConvertType.UNEXPANDED) {
    joinResult.outputExpandedRows(o);
    return;
  }
  joinResult.outputUnexpandedRow(outputSchema, o);
}
 
Example #10
Source File: CoGroup.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Convenience overload that accepts a {@code CoGbkResult} directly.
 *
 * <p>Delegates to another {@code from} overload, forwarding every argument unchanged except
 * that the grouped result is replaced by the method reference {@code coGbkResult::getAll}.
 *
 * @param joinInformation forwarded as-is
 * @param joinArgs forwarded as-is
 * @param key forwarded as-is
 * @param outputSchema forwarded as-is
 * @param coGbkResult grouped values; only its {@code getAll} method is handed on
 * @param processContext forwarded as-is
 * @return whatever the delegated {@code from} overload returns
 */
static Result from(
    JoinInformation joinInformation,
    JoinArguments joinArgs,
    Row key,
    Schema outputSchema,
    CoGbkResult coGbkResult,
    DoFn<?, Row>.ProcessContext processContext) {
  return from(
      joinInformation, joinArgs, key, outputSchema, coGbkResult::getAll, processContext);
}
 
Example #11
Source File: JoinTranslator.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Hands the current process context to the collector, then joins the left-tagged and
 * right-tagged values of the co-grouped element. Fails with a {@code NullPointerException}
 * if the element carries no value.
 */
@ProcessElement
@SuppressWarnings("unused")
public final void processElement(@Element KV<KeyT, CoGbkResult> element, ProcessContext ctx) {
  getCollector().setProcessContext(ctx);
  final CoGbkResult grouped = requireNonNull(element.getValue());
  doJoin(grouped.getAll(leftTag), grouped.getAll(rightTag));
}
 
Example #12
Source File: Task.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Keys both inputs by the first character of each word, co-groups them on that key, and
 * emits the string form of a {@code WordsAlphabet} built from the key, the single fruit and
 * the single country found for it.
 *
 * @param fruits fruit names
 * @param countries country names
 * @return one formatted string per key
 */
static PCollection<String> applyTransform(
    PCollection<String> fruits, PCollection<String> countries) {

  TupleTag<String> fruitsTag = new TupleTag<>();
  TupleTag<String> countriesTag = new TupleTag<>();

  // Shared mapper: word -> (first character, word).
  MapElements<String, KV<String, String>> keyByFirstLetter =
      MapElements.into(kvs(strings(), strings()))
          .via(word -> KV.of(word.substring(0, 1), word));

  PCollection<KV<String, String>> keyedFruits = fruits.apply("Fruit to KV", keyByFirstLetter);
  PCollection<KV<String, String>> keyedCountries =
      countries.apply("Country to KV", keyByFirstLetter);

  PCollection<KV<String, CoGbkResult>> groupedByLetter =
      KeyedPCollectionTuple
          .of(fruitsTag, keyedFruits)
          .and(countriesTag, keyedCountries)
          .apply(CoGroupByKey.create());

  return groupedByLetter.apply(ParDo.of(new DoFn<KV<String, CoGbkResult>, String>() {

    @ProcessElement
    public void processElement(
        @Element KV<String, CoGbkResult> element, OutputReceiver<String> out) {
      CoGbkResult grouped = element.getValue();

      // getOnly is called for the fruit first, then for the country.
      String fruit = grouped.getOnly(fruitsTag);
      String country = grouped.getOnly(countriesTag);

      WordsAlphabet entry = new WordsAlphabet(element.getKey(), fruit, country);
      out.output(entry.toString());
    }

  }));
}
 
Example #13
Source File: Snippets.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Co-groups emails and phones by name and formats each group into one line using
 * {@code Snippets.formatCoGbkResults}.
 */
public static PCollection<String> coGroupByKeyTuple(
    TupleTag<String> emailsTag,
    TupleTag<String> phonesTag,
    PCollection<KV<String, String>> emails,
    PCollection<KV<String, String>> phones) {

  // [START CoGroupByKeyTuple]
  PCollection<KV<String, CoGbkResult>> results =
      KeyedPCollectionTuple.of(emailsTag, emails)
          .and(phonesTag, phones)
          .apply(CoGroupByKey.create());

  PCollection<String> contactLines =
      results.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> entry = c.element();
                  CoGbkResult contact = entry.getValue();
                  c.output(
                      Snippets.formatCoGbkResults(
                          entry.getKey(),
                          contact.getAll(emailsTag),
                          contact.getAll(phonesTag)));
                }
              }));
  // [END CoGroupByKeyTuple]
  return contactLines;
}
 
Example #14
Source File: CoGroupByKeyResultMappingTransform.java    From component-runtime with Apache License 2.0 5 votes vote down vote up
/**
 * Turns the co-grouped element of {@code context} into a single {@code Record}.
 *
 * <p>For every tuple tag in the result's schema whose value is non-null, the record gets an
 * ARRAY entry named after the tag id that holds that one value. When {@code propagateKey}
 * is set, a {@code "__talend_internal"} RECORD entry carrying the element key as a string is
 * added as well.
 *
 * @param context process context holding the co-grouped element
 * @return the assembled record
 */
private Record createMap(final ProcessContext context) {
    final KV<K, CoGbkResult> element = context.element();
    final CoGbkResult grouped = element.getValue();
    final RecordBuilderFactory factory = builderFactory();
    final Record.Builder builder = grouped
            .getSchema()
            .getTupleTagList()
            .getAll()
            .stream()
            .map(tag -> new Pair<>(tag.getId(), Record.class.cast(grouped.getOnly(tag, null))))
            .filter(idAndRecord -> idAndRecord.getSecond() != null)
            .collect(factory::newRecordBuilder, (acc, idAndRecord) -> {
                final Record value = idAndRecord.getSecond();
                // One ARRAY entry per tag, wrapping the single record found for it.
                acc.withArray(
                        factory
                                .newEntryBuilder()
                                .withName(idAndRecord.getFirst())
                                .withType(Schema.Type.ARRAY)
                                .withElementSchema(value.getSchema())
                                .build(),
                        singletonList(value));
            }, RecordCollectors::merge);

    if (!propagateKey) {
        return builder.build();
    }

    final Record internalRecord =
            factory.newRecordBuilder().withString("key", String.valueOf(element.getKey())).build();
    final Schema.Entry internalEntry = factory
            .newEntryBuilder()
            .withName("__talend_internal")
            .withType(Schema.Type.RECORD)
            .withElementSchema(internalRecord.getSchema())
            .build();
    builder.withRecord(internalEntry, internalRecord);
    return builder.build();
}
 
Example #15
Source File: CompareDatabases.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Reads all rows from {@code one} and {@code two}, co-groups them by key, and counts the keys
 * that fail the comparison. A key fails when either side does not hold exactly one row for
 * it, or when the two rows are not equal.
 *
 * @param begin pipeline start
 * @return a collection holding the global count of failing keys
 */
@Override
public PCollection<Long> expand(PBegin begin) {

  final TupleTag<Struct> oneTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsOne = begin.apply("Read one", new ReadAllRows(one));
  final TupleTag<Struct> twoTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsTwo = begin.apply("Read two", new ReadAllRows(two));

  PCollection<KV<String, CoGbkResult>> groupedByKey =
      KeyedPCollectionTuple
          .of(oneTag, rowsOne)
          .and(twoTag, rowsTwo)
          .apply(CoGroupByKey.create());

  PCollection<String> mismatchedKeys =
      groupedByKey.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> element = c.element();
                  CoGbkResult sides = element.getValue();
                  ArrayList<Struct> fromOne = Lists.newArrayList(sides.getAll(oneTag));
                  ArrayList<Struct> fromTwo = Lists.newArrayList(sides.getAll(twoTag));

                  // Short-circuit keeps get(0) from running unless both sides hold one row.
                  boolean mismatch =
                      fromOne.size() != 1
                          || fromTwo.size() != 1
                          || !fromOne.get(0).equals(fromTwo.get(0));
                  if (mismatch) {
                    c.output(element.getKey());
                  }
                }
              }));

  return mismatchedKeys.apply(Count.globally());
}
 
Example #16
Source File: JoinTranslator.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the collector held in {@code resultsCollector}.
 *
 * @return the {@code resultsCollector} instance, unchanged
 */
AdaptableCollector<KV<KeyT, CoGbkResult>, KV<KeyT, OutputT>, OutputT> getCollector() {
  return resultsCollector;
}
 
Example #17
Source File: Join.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Left outer join: co-groups {@code leftCollection} with {@code rightCollection} by key.
 * Every left value is paired with each right value sharing its key; a left value with no
 * right-side match is paired with {@code nullValue} instead.
 *
 * @param leftCollection left side of the join
 * @return joined pairs, with a coder assembled from the key and value coders of both inputs
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);
  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  PCollection<KV<K, KV<V1, V2>>> pairs =
      grouped.apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> group = c.element();
                  CoGbkResult sides = group.getValue();

                  Iterable<V1> lefts = sides.getAll(v1Tuple);
                  Iterable<V2> rights = sides.getAll(v2Tuple);

                  for (V1 left : lefts) {
                    if (!rights.iterator().hasNext()) {
                      // No right-side match for this key: emit the placeholder.
                      c.output(KV.of(group.getKey(), KV.of(left, nullValue)));
                      continue;
                    }
                    for (V2 right : rights) {
                      c.output(KV.of(group.getKey(), KV.of(left, right)));
                    }
                  }
                }
              }));

  // The input coders are read only after the transforms above have been applied.
  KvCoder leftKvCoder = (KvCoder) leftCollection.getCoder();
  KvCoder rightKvCoder = (KvCoder) rightCollection.getCoder();
  return pairs.setCoder(
      KvCoder.of(
          leftKvCoder.getKeyCoder(),
          KvCoder.of(leftKvCoder.getValueCoder(), rightKvCoder.getValueCoder())));
}
 
Example #18
Source File: Join.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Right outer join: co-groups {@code leftCollection} with {@code rightCollection} by key.
 * Every right value is paired with each left value sharing its key; a right value with no
 * left-side match is paired with {@code nullValue} instead.
 *
 * @param leftCollection left side of the join
 * @return joined pairs, with a coder assembled from the key and value coders of both inputs
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  PCollection<KV<K, KV<V1, V2>>> pairs =
      grouped.apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> group = c.element();
                  CoGbkResult sides = group.getValue();

                  Iterable<V1> lefts = sides.getAll(v1Tuple);
                  Iterable<V2> rights = sides.getAll(v2Tuple);

                  for (V2 right : rights) {
                    if (!lefts.iterator().hasNext()) {
                      // No left-side match for this key: emit the placeholder.
                      c.output(KV.of(group.getKey(), KV.of(nullValue, right)));
                      continue;
                    }
                    for (V1 left : lefts) {
                      c.output(KV.of(group.getKey(), KV.of(left, right)));
                    }
                  }
                }
              }));

  // The input coders are read only after the transforms above have been applied.
  KvCoder leftKvCoder = (KvCoder) leftCollection.getCoder();
  KvCoder rightKvCoder = (KvCoder) rightCollection.getCoder();
  return pairs.setCoder(
      KvCoder.of(
          leftKvCoder.getKeyCoder(),
          KvCoder.of(leftKvCoder.getValueCoder(), rightKvCoder.getValueCoder())));
}
 
Example #19
Source File: Query8.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Windows new-person and new-auction events into fixed windows of
 * {@code configuration.windowSizeSec} seconds, co-groups them, and emits an
 * {@code IdNameReserve} for every auction in a group that also contains a person.
 *
 * @param events input event stream
 * @return person id, person name and auction reserve for each matched auction
 */
@Override
public PCollection<IdNameReserve> expand(PCollection<Event> events) {
  // New people, windowed and then keyed via PERSON_BY_ID.
  PCollection<KV<Long, Person>> keyedPersons =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_PERSONS)
          .apply(
              "Query8.WindowPersons",
              Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec))))
          .apply("PersonById", NexmarkQueryUtil.PERSON_BY_ID);

  // New auctions, windowed and then keyed via AUCTION_BY_SELLER.
  PCollection<KV<Long, Auction>> keyedAuctions =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply(
              "Query8.WindowAuctions",
              Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec))))
          .apply("AuctionBySeller", NexmarkQueryUtil.AUCTION_BY_SELLER);

  // Bring people and auctions with the same key together.
  PCollection<KV<Long, CoGbkResult>> personsWithAuctions =
      KeyedPCollectionTuple.of(NexmarkQueryUtil.PERSON_TAG, keyedPersons)
          .and(NexmarkQueryUtil.AUCTION_TAG, keyedAuctions)
          .apply(CoGroupByKey.create());

  // Project the person id, name and auction reserve price.
  return personsWithAuctions.apply(
      name + ".Select",
      ParDo.of(
          new DoFn<KV<Long, CoGbkResult>, IdNameReserve>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
              @Nullable
              Person person =
                  c.element().getValue().getOnly(NexmarkQueryUtil.PERSON_TAG, null);
              // A missing person means it was not created in the last window period;
              // such groups produce no output.
              if (person != null) {
                for (Auction auction :
                    c.element().getValue().getAll(NexmarkQueryUtil.AUCTION_TAG)) {
                  c.output(new IdNameReserve(person.id, person.name, auction.reserve));
                }
              }
            }
          }));
}
 
Example #20
Source File: WinningBids.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Joins auctions with their bids and emits, for each auction that has one, the winning bid.
 *
 * <p>Events are first windowed with {@code auctionOrBidWindowFn}. Auctions and bids are then
 * keyed, co-grouped, and reduced per auction. Three counters record the groups or bids that
 * are skipped: {@code noAuction}, {@code underReserve} and {@code noValidBids}.
 *
 * @param events input event stream
 * @return one {@code AuctionBid} per auction with at least one bid at or above the reserve
 */
@Override
public PCollection<AuctionBid> expand(PCollection<Event> events) {
  // Window auctions and bids into custom auction windows. New people events will be discarded.
  // This lets bids and auctions be brought together irrespective of how long each auction
  // is open for.
  PCollection<Event> windowed = events.apply("Window", Window.into(auctionOrBidWindowFn));

  // Auctions keyed by their id.
  PCollection<KV<Long, Auction>> keyedAuctions =
      windowed
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply("AuctionById:", NexmarkQueryUtil.AUCTION_BY_ID);

  // Bids keyed by their auction id.
  PCollection<KV<Long, Bid>> keyedBids =
      windowed
          .apply(NexmarkQueryUtil.JUST_BIDS)
          .apply("BidByAuction", NexmarkQueryUtil.BID_BY_AUCTION);

  // Join auctions and bids.
  PCollection<KV<Long, CoGbkResult>> auctionsWithBids =
      KeyedPCollectionTuple.of(NexmarkQueryUtil.AUCTION_TAG, keyedAuctions)
          .and(NexmarkQueryUtil.BID_TAG, keyedBids)
          .apply(CoGroupByKey.create());

  // Filter and select: find the highest price valid bid for each closed auction.
  return auctionsWithBids.apply(
      name + ".Join",
      ParDo.of(
          new DoFn<KV<Long, CoGbkResult>, AuctionBid>() {
            private final Counter noAuctionCounter = Metrics.counter(name, "noAuction");
            private final Counter underReserveCounter = Metrics.counter(name, "underReserve");
            private final Counter noValidBidsCounter = Metrics.counter(name, "noValidBids");

            @ProcessElement
            public void processElement(ProcessContext c) {
              @Nullable
              Auction auction =
                  c.element().getValue().getOnly(NexmarkQueryUtil.AUCTION_TAG, null);
              if (auction == null) {
                // Bids without a matching auction: nothing to emit.
                noAuctionCounter.inc();
                return;
              }

              // The earliest bid with the maximum price above the reserve wins.
              Bid winner = null;
              for (Bid candidate : c.element().getValue().getAll(NexmarkQueryUtil.BID_TAG)) {
                // Bids too late for their auction will have been
                // filtered out by the window merge function.
                checkState(candidate.dateTime.compareTo(auction.expires) < 0);
                if (candidate.price < auction.reserve) {
                  // Below the auction reserve: counted, never a winner.
                  underReserveCounter.inc();
                } else if (winner == null
                    || Bid.PRICE_THEN_DESCENDING_TIME.compare(candidate, winner) > 0) {
                  winner = candidate;
                }
              }

              if (winner != null) {
                c.output(new AuctionBid(auction, winner));
              } else {
                // No valid bids for this auction.
                noValidBidsCounter.inc();
              }
            }
          }));
}
 
Example #21
Source File: SnippetsTest.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that {@code Snippets.coGroupByKeyTuple} yields the documented formatted lines, and
 * that the documented {@code CoGbkResult} expectations format to those same lines.
 */
@Test
public void testCoGroupByKeyTuple() throws IOException {
  // [START CoGroupByKeyTupleInputs]
  final List<KV<String, String>> emailsList =
      Arrays.asList(
          KV.of("amy", "amy@example.com"),
          KV.of("carl", "carl@example.com"),
          KV.of("julia", "julia@example.com"),
          KV.of("carl", "carl@email.com"));

  final List<KV<String, String>> phonesList =
      Arrays.asList(
          KV.of("amy", "111-222-3333"),
          KV.of("james", "222-333-4444"),
          KV.of("amy", "333-444-5555"),
          KV.of("carl", "444-555-6666"));

  PCollection<KV<String, String>> emails = p.apply("CreateEmails", Create.of(emailsList));
  PCollection<KV<String, String>> phones = p.apply("CreatePhones", Create.of(phonesList));
  // [END CoGroupByKeyTupleInputs]

  // [START CoGroupByKeyTupleOutputs]
  final TupleTag<String> emailsTag = new TupleTag<>();
  final TupleTag<String> phonesTag = new TupleTag<>();

  final List<KV<String, CoGbkResult>> expectedResults =
      Arrays.asList(
          KV.of(
              "amy",
              CoGbkResult.of(emailsTag, Arrays.asList("amy@example.com"))
                  .and(phonesTag, Arrays.asList("111-222-3333", "333-444-5555"))),
          KV.of(
              "carl",
              CoGbkResult.of(emailsTag, Arrays.asList("carl@email.com", "carl@example.com"))
                  .and(phonesTag, Arrays.asList("444-555-6666"))),
          KV.of(
              "james",
              CoGbkResult.of(emailsTag, Arrays.asList())
                  .and(phonesTag, Arrays.asList("222-333-4444"))),
          KV.of(
              "julia",
              CoGbkResult.of(emailsTag, Arrays.asList("julia@example.com"))
                  .and(phonesTag, Arrays.asList())));
  // [END CoGroupByKeyTupleOutputs]

  PCollection<String> linesFromPipeline =
      Snippets.coGroupByKeyTuple(emailsTag, phonesTag, emails, phones);

  // [START CoGroupByKeyTupleFormattedOutputs]
  final List<String> formattedResults =
      Arrays.asList(
          "amy; ['amy@example.com']; ['111-222-3333', '333-444-5555']",
          "carl; ['carl@email.com', 'carl@example.com']; ['444-555-6666']",
          "james; []; ['222-333-4444']",
          "julia; ['julia@example.com']; []");
  // [END CoGroupByKeyTupleFormattedOutputs]

  // Both the hand-built 'expectedResults' (once formatted) and the pipeline output must
  // equal 'formattedResults'.
  List<String> linesFromExpectations = new ArrayList<>(expectedResults.size());
  for (KV<String, CoGbkResult> expected : expectedResults) {
    CoGbkResult contact = expected.getValue();
    linesFromExpectations.add(
        Snippets.formatCoGbkResults(
            expected.getKey(), contact.getAll(emailsTag), contact.getAll(phonesTag)));
  }
  PCollection<String> expectationsPColl = p.apply(Create.of(linesFromExpectations));
  PAssert.that(expectationsPColl).containsInAnyOrder(formattedResults);
  PAssert.that(linesFromPipeline).containsInAnyOrder(formattedResults);

  p.run();
}
 
Example #22
Source File: CoGroupByKeyResultMappingTransform.java    From component-runtime with Apache License 2.0 4 votes vote down vote up
/**
 * Applies a {@code ParDo} that runs a {@code CoGBKMappingFn} over the co-grouped input.
 *
 * <p>The function is constructed with this transform's {@code plugin} and
 * {@code propagateKey} values and {@code null} as its third argument.
 *
 * @param input co-grouped elements to map
 * @return the records produced by the mapping function
 */
@Override
public PCollection<Record> expand(final PCollection<KV<K, CoGbkResult>> input) {
    return input.apply(ParDo.of(new CoGBKMappingFn<>(plugin, propagateKey, null)));
}