org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple Java Examples

Source File: MusicBrainzTransforms.java From bigquery-etl-dataflow-sample with Apache License 2.0

6 votes

/**
 * Co-groups two keyed collections of {@link MusicBrainzDataObject} by their {@code Long} key.
 *
 * @param name suffix appended to {@code "joinResult_"} to form the transform name
 * @param first first keyed collection
 * @param second second keyed collection
 * @param firstTag tag under which values from {@code first} appear in each result
 * @param secondTag tag under which values from {@code second} appear in each result
 * @return the co-grouped collection, or {@code null} if applying the transform threw
 */
private static PCollection<KV<Long, CoGbkResult>> group(String name,
                                                        PCollection<KV<Long, MusicBrainzDataObject>> first,
                                                        PCollection<KV<Long, MusicBrainzDataObject>> second,
                                                        TupleTag<MusicBrainzDataObject> firstTag,
                                                        TupleTag<MusicBrainzDataObject> secondTag
) {
  // Single transform instance; previously one was created here and a second, identical one inline.
  final CoGroupByKey<Long> grouper = CoGroupByKey.create();

  try {
    return KeyedPCollectionTuple
        .of(firstTag, first)
        .and(secondTag, second)
        .apply("joinResult_" + name, grouper);
  } catch (Exception e) {
    // Existing contract: failures are logged and surfaced to the caller as null.
    logger.error("exception grouping.", e);
    return null;
  }
}

Source File: MultinomialLogisticRegression.java From nemo with Apache License 2.0

6 votes

/**
 * Produces the next model: computes per-key gradients from {@code readInput} using the current
 * model as a side input, co-groups them with the model, and applies {@code ApplyGradient}.
 *
 * @param model the current model, keyed by integer
 * @return the output of {@code ApplyGradient} over the co-grouped gradient and model
 */
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // The model is handed to the gradient step as a map-shaped side input.
  final PCollectionView<Map<Integer, List<Double>>> modelAsView = model.apply(View.asMap());

  // Gradient computation followed by a per-key combine.
  final CalculateGradient gradientFn = new CalculateGradient(modelAsView, numClasses, numFeatures);
  final PCollection<KV<Integer, List<Double>>> gradient =
      readInput
          .apply(ParDo.of(gradientFn).withSideInputs(modelAsView))
          .apply(Combine.perKey(new CombineFunction()));

  // Co-group gradient and model under their own tags.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();
  final PCollection<KV<Integer, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(gradientTag, gradient)
          .and(modelTag, model)
          .apply(CoGroupByKey.create());

  // Model update.
  final ApplyGradient updateFn =
      new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag);
  return grouped.apply(ParDo.of(updateFn));
}

Source File: VerifyBamId.java From dataflow-java with Apache License 2.0

6 votes

/**
 * Filters, splits, and samples reads, then co-groups them with reference allele frequencies.
 *
 * @param reads A PCollection of reads
 * @param samplingFraction Fraction passed to the {@code SampleReads} filter
 * @param samplingPrefix Prefix passed to the {@code SampleReads} filter
 * @param refFreq A PCollection mapping position to allele frequencies
 * @return A PCollection mapping Position to a ReadCounts proto
 */
static PCollection<KV<Position, ReadCounts>> combineReads(PCollection<Read> reads,
    double samplingFraction, String samplingPrefix,
    PCollection<KV<Position, AlleleFreq>> refFreq) {
  // Step 1: apply the four read-level filters, in this order.
  PCollection<Read> filteredReads = reads
      .apply("IsOnChromosome", Filter.by(ReadFunctions.IS_ON_CHROMOSOME))
      .apply("IsNotQCFailure", Filter.by(ReadFunctions.IS_NOT_QC_FAILURE))
      .apply("IsNotDuplicate", Filter.by(ReadFunctions.IS_NOT_DUPLICATE))
      .apply("IsProperPlacement", Filter.by(ReadFunctions.IS_PROPER_PLACEMENT));

  // Step 2: split each read into per-position base/quality records and sample them.
  PCollection<KV<Position, ReadBaseQuality>> sampledBases = filteredReads
      .apply(ParDo.of(new SplitReads()))
      .apply(Filter.by(new SampleReads(samplingFraction, samplingPrefix)));

  // Step 3: co-group the sampled bases with the reference stats and join them.
  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(readCountsTag, sampledBases)
          .and(refFreqTag, refFreq)
          .apply(CoGroupByKey.<Position>create());

  PileupAndJoinReads joinFn = new PileupAndJoinReads(readCountsTag, refFreqTag);
  return grouped.apply(ParDo.of(joinFn));
}

Source File: JoinTranslator.java From beam with Apache License 2.0

6 votes

/**
 * Translates a join by co-grouping the two keyed inputs and running the operator's join function
 * over each group. Only the keyed inputs are used here; {@code left} and {@code reight} are not
 * read by this method.
 */
@Override
PCollection<KV<KeyT, OutputT>> translate(
    Join<LeftT, RightT, KeyT, OutputT> operator,
    PCollection<LeftT> left,
    PCollection<KV<KeyT, LeftT>> leftKeyed,
    PCollection<RightT> reight,
    PCollection<KV<KeyT, RightT>> rightKeyed) {
  final TupleTag<LeftT> leftTag = new TupleTag<>();
  final TupleTag<RightT> rightTag = new TupleTag<>();

  // Accumulators are bound to the pipeline of the left keyed input.
  final AccumulatorProvider accumulators =
      new LazyAccumulatorProvider(AccumulatorProvider.of(leftKeyed.getPipeline()));
  final JoinFn<LeftT, RightT, KeyT, OutputT> joinFn =
      getJoinFn(operator, leftTag, rightTag, accumulators);

  final KeyedPCollectionTuple<KeyT> keyedInputs =
      KeyedPCollectionTuple.of(leftTag, leftKeyed).and(rightTag, rightKeyed);
  return keyedInputs
      .apply("co-group-by-key", CoGroupByKey.create())
      .apply(joinFn.getFnName(), ParDo.of(joinFn));
}

Source File: ValidateRunnerXlangTest.java From beam with Apache License 2.0

6 votes

/**
 * Feeds two keyed collections through the external transform identified by
 * {@code TEST_CGBK_URN} and asserts on the sorted, comma-joined values per key.
 */
@Test
@Category({ValidatesRunner.class, UsesCrossLanguageTransforms.class})
public void coGroupByKeyTest() {
  PCollection<KV<Long, String>> col1 =
      testPipeline.apply("createCol1", Create.of(KV.of(0L, "1"), KV.of(0L, "2"), KV.of(1L, "3")));
  PCollection<KV<Long, String>> col2 =
      testPipeline.apply("createCol2", Create.of(KV.of(0L, "4"), KV.of(1L, "5"), KV.of(1L, "6")));

  PCollection<KV<Long, Iterable<String>>> grouped =
      KeyedPCollectionTuple.of("col1", col1)
          .and("col2", col2)
          .apply(External.of(TEST_CGBK_URN, new byte[] {}, expansionAddr));

  // Renders each group as "<key>:<sorted values joined by commas>".
  MapElements<KV<Long, Iterable<String>>, String> formatGroup =
      MapElements.into(TypeDescriptors.strings())
          .via(
              (KV<Long, Iterable<String>> group) -> {
                String[] sortedValues = Iterables.toArray(group.getValue(), String.class);
                Arrays.sort(sortedValues);
                String joinedValues = String.join(",", sortedValues);
                return String.format("%s:%s", group.getKey(), joinedValues);
              });

  PCollection<String> formatted = grouped.apply(formatGroup);
  PAssert.that(formatted).containsInAnyOrder("0:1,2,4", "1:3,5,6");
}

Source File: CoGroup.java From beam with Apache License 2.0

6 votes

/** Stores every argument in the field of the same name; performs no validation or copying. */
private JoinInformation(
    KeyedPCollectionTuple<Row> keyedPCollectionTuple,
    Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs,
    Schema keySchema,
    Map<String, Schema> componentSchemas,
    Map<Integer, SerializableFunction<Object, Row>> toRows,
    List<String> sortedTags,
    Map<Integer, String> tagToKeyedTag) {
  // Join inputs.
  this.keyedPCollectionTuple = keyedPCollectionTuple;
  this.sideInputs = sideInputs;

  // Schemas.
  this.keySchema = keySchema;
  this.componentSchemas = componentSchemas;

  // Per-tag lookups.
  this.sortedTags = sortedTags;
  this.tagToKeyedTag = tagToKeyedTag;
  this.toRows = toRows;
}

Source File: TestExpansionService.java From beam with Apache License 2.0

6 votes

/**
 * Co-groups the input by key and, for each key, emits a single iterable that concatenates the
 * string values found under the tags {@code "col1"} and {@code "col2"}.
 */
@Override
public PCollection<KV<Long, Iterable<String>>> expand(KeyedPCollectionTuple<Long> input) {
  Set<String> tagSet = ImmutableSet.of("col1", "col2");

  PCollection<KV<Long, CoGbkResult>> grouped = input.apply(CoGroupByKey.create());

  return grouped.apply(
      ParDo.of(
          new DoFn<KV<Long, CoGbkResult>, KV<Long, Iterable<String>>>() {
            @ProcessElement
            public void processElement(
                @Element KV<Long, CoGbkResult> kv,
                OutputReceiver<KV<Long, Iterable<String>>> out) {
              // The iterable is lazy: each iteration re-streams the values tag by tag.
              Iterable<String> allValues =
                  () ->
                      tagSet.stream()
                          .flatMap(
                              tag ->
                                  StreamSupport.stream(
                                      kv.getValue().<String>getAll(tag).spliterator(), false))
                          .iterator();
              out.output(KV.of(kv.getKey(), allValues));
            }
          }));
}

Source File: MultinomialLogisticRegression.java From incubator-nemo with Apache License 2.0

6 votes

/**
 * Produces the next model: computes per-key gradients from {@code readInput} using the current
 * model as a side input, co-groups them with the model, and applies {@code ApplyGradient}.
 *
 * @param model the current model, keyed by integer
 * @return the output of {@code ApplyGradient} over the co-grouped gradient and model
 */
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // The model is handed to the gradient step as a map-shaped side input.
  final PCollectionView<Map<Integer, List<Double>>> modelAsView = model.apply(View.asMap());

  // Gradient computation followed by a per-key combine.
  final CalculateGradient gradientFn = new CalculateGradient(modelAsView, numClasses, numFeatures);
  final PCollection<KV<Integer, List<Double>>> gradient =
    readInput
      .apply(ParDo.of(gradientFn).withSideInputs(modelAsView))
      .apply(Combine.perKey(new CombineFunction()));

  // Co-group gradient and model under their own tags.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();
  final PCollection<KV<Integer, CoGbkResult>> grouped =
    KeyedPCollectionTuple.of(gradientTag, gradient)
      .and(modelTag, model)
      .apply(CoGroupByKey.create());

  // Model update.
  final ApplyGradient updateFn =
    new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag);
  return grouped.apply(ParDo.of(updateFn));
}

Source File: TestExpansionService.java From beam with Apache License 2.0

5 votes

/**
 * Builds a keyed tuple on pipeline {@code p} containing every entry of {@code inputs}, each
 * registered under a tag whose id is the entry's map key.
 */
@Override
public KeyedPCollectionTuple<Long> createInput(
    Pipeline p, Map<String, PCollection<?>> inputs) {
  // Raw types are used because the element types of the inputs are not known statically.
  KeyedPCollectionTuple tuple = KeyedPCollectionTuple.empty(p);
  for (Map.Entry<String, PCollection<?>> namedInput : inputs.entrySet()) {
    TupleTag tag = new TupleTag(namedInput.getKey());
    PCollection<?> collection = namedInput.getValue();
    tuple = tuple.and(tag, collection);
  }
  return tuple;
}

Source File: VerifyBamIdTest.java From dataflow-java with Apache License 2.0

5 votes

/**
 * Joins a single read at {@code position1chrPrefix} against {@code refCountList} and checks the
 * three expected position/read-count pairs.
 */
@Test
public void testPileupAndJoinReadsWithChrPrefix() throws Exception {
  // Input 1: a single base/quality record at the chr-prefixed position.
  ReadBaseQuality baseQuality = new ReadBaseQuality("A", 10);
  KV<Position, ReadBaseQuality> onlyRead = KV.of(position1chrPrefix, baseQuality);
  PCollection<KV<Position, ReadBaseQuality>> readCounts =
      p.apply("createInput", Create.of(onlyRead));
  PAssert.that(readCounts).containsInAnyOrder(KV.of(position1chrPrefix, baseQuality));

  // Input 2: the reference frequencies.
  PCollection<KV<Position, AlleleFreq>> refFreq = p.apply(Create.of(refCountList));
  PAssert.that(refFreq).containsInAnyOrder(refCountList);

  // Co-group both inputs, then run the join under test.
  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(readCountsTag, readCounts)
          .and(refFreqTag, refFreq)
          .apply(CoGroupByKey.<Position>create());
  PileupAndJoinReads joinFn = new PileupAndJoinReads(readCountsTag, refFreqTag);
  PCollection<KV<Position, ReadCounts>> result = grouped.apply(ParDo.of(joinFn));

  KV<Position, ReadCounts> expectedAtPosition1 = KV.of(position1, rc1);
  KV<Position, ReadCounts> expectedAtPosition2 = KV.of(position2, rc2);
  KV<Position, ReadCounts> expectedAtPosition3 = KV.of(position3, rc3);
  PAssert.that(result)
      .containsInAnyOrder(expectedAtPosition1, expectedAtPosition2, expectedAtPosition3);

  p.run();
}

Source File: VerifyBamIdTest.java From dataflow-java with Apache License 2.0

5 votes

/**
 * Joins a single read at {@code position1} against {@code refCountList} and checks the three
 * expected position/read-count pairs.
 */
@Test
public void testPileupAndJoinReads() throws Exception {
  // Input 1: a single base/quality record at position1.
  final ReadBaseQuality baseQuality = new ReadBaseQuality("A", 10);
  KV<Position, ReadBaseQuality> onlyRead = KV.of(position1, baseQuality);
  PCollection<KV<Position, ReadBaseQuality>> readCounts =
      p.apply("createInput", Create.of(onlyRead));
  PAssert.that(readCounts).containsInAnyOrder(KV.of(position1, baseQuality));

  // Input 2: the reference frequencies.
  PCollection<KV<Position, AlleleFreq>> refFreq = p.apply(Create.of(refCountList));
  PAssert.that(refFreq).containsInAnyOrder(refCountList);

  // Co-group both inputs, then run the join under test.
  final TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(readCountsTag, readCounts)
          .and(refFreqTag, refFreq)
          .apply(CoGroupByKey.<Position>create());
  PileupAndJoinReads joinFn = new PileupAndJoinReads(readCountsTag, refFreqTag);
  PCollection<KV<Position, ReadCounts>> result = grouped.apply(ParDo.of(joinFn));

  KV<Position, ReadCounts> expectedAtPosition1 = KV.of(position1, rc1);
  KV<Position, ReadCounts> expectedAtPosition2 = KV.of(position2, rc2);
  KV<Position, ReadCounts> expectedAtPosition3 = KV.of(position3, rc3);
  PAssert.that(result)
      .containsInAnyOrder(expectedAtPosition1, expectedAtPosition2, expectedAtPosition3);

  p.run();
}

Source File: GroupWithoutRepartition.java From beam with Apache License 2.0

5 votes

/**
 * Applies the wrapped transform to the input when the input is a {@code PCollection} or a
 * {@code KeyedPCollectionTuple}.
 *
 * @throws RuntimeException for any other input type
 */
@Override
@SuppressWarnings("unchecked")
public OutputT expand(InputT input) {
  if (input instanceof PCollection) {
    PCollection collection = (PCollection) input;
    return (OutputT) collection.apply(transform);
  }
  if (input instanceof KeyedPCollectionTuple) {
    KeyedPCollectionTuple keyedTuple = (KeyedPCollectionTuple) input;
    return (OutputT) keyedTuple.apply(transform);
  }
  // Any other input kind is rejected.
  String message =
      transform.getName()
          + " is not supported with "
          + GroupWithoutRepartition.class.getSimpleName();
  throw new RuntimeException(message);
}

Source File: CoGroupByKeyLoadTest.java From beam with Apache License 2.0

5 votes

/**
 * Builds the load-test pipeline: reads two synthetic sources, applies start-time monitoring,
 * windowing and the optional synthetic step to each, co-groups them, and then applies the
 * ungroup, byte-count and end-time steps.
 */
@Override
void loadTest() throws IOException {
  SyntheticSourceOptions coSourceOptions =
      fromJsonString(options.getCoSourceOptions(), SyntheticSourceOptions.class);
  Optional<SyntheticStep> syntheticStep = createStep(options.getStepOptions());

  // Main input branch.
  PCollection<KV<byte[], byte[]>> rawInput =
      pipeline.apply("Read input", readFromSource(sourceOptions));
  PCollection<KV<byte[], byte[]>> timedInput =
      rawInput.apply("Collect start time metrics (input)", ParDo.of(runtimeMonitor));
  PCollection<KV<byte[], byte[]>> windowedInput = applyWindowing(timedInput);
  PCollection<KV<byte[], byte[]>> input =
      applyStepIfPresent(windowedInput, "Synthetic step for input", syntheticStep);

  // Co-input branch; windowed with its own duration option.
  PCollection<KV<byte[], byte[]>> rawCoInput =
      pipeline.apply("Read co-input", readFromSource(coSourceOptions));
  PCollection<KV<byte[], byte[]>> timedCoInput =
      rawCoInput.apply("Collect start time metrics (co-input)", ParDo.of(runtimeMonitor));
  PCollection<KV<byte[], byte[]>> windowedCoInput =
      applyWindowing(timedCoInput, options.getCoInputWindowDurationSec());
  PCollection<KV<byte[], byte[]>> coInput =
      applyStepIfPresent(windowedCoInput, "Synthetic step for co-input", syntheticStep);

  // Co-group the two branches and attach the measurement steps.
  KeyedPCollectionTuple.of(INPUT_TAG, input)
      .and(CO_INPUT_TAG, coInput)
      .apply("CoGroupByKey", CoGroupByKey.create())
      .apply("Ungroup and reiterate", ParDo.of(new UngroupAndReiterate(options.getIterations())))
      .apply(
          "Collect total bytes", ParDo.of(new ByteMonitor(METRICS_NAMESPACE, "totalBytes.count")))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}

Source File: Join.java From beam with Apache License 2.0

5 votes

/**
 * Inner join: for every key, emits one output per (left value, right value) pair. Keys with no
 * value on either side produce no output.
 *
 * @param leftCollection the left side of the join
 * @return the joined collection, with a coder assembled from the two input coders
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  PCollection<KV<K, KV<V1, V2>>> joined =
      grouped.apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> group = c.element();
                  Iterable<V1> lefts = group.getValue().getAll(v1Tuple);
                  Iterable<V2> rights = group.getValue().getAll(v2Tuple);

                  // Cross product, left-major.
                  for (V1 left : lefts) {
                    for (V2 right : rights) {
                      c.output(KV.of(group.getKey(), KV.of(left, right)));
                    }
                  }
                }
              }));

  // Output coder: key and left-value coders come from the left input, right-value from the right.
  KvCoder leftCoder = (KvCoder) leftCollection.getCoder();
  KvCoder rightCoder = (KvCoder) rightCollection.getCoder();
  return joined.setCoder(
      KvCoder.of(
          leftCoder.getKeyCoder(),
          KvCoder.of(leftCoder.getValueCoder(), rightCoder.getValueCoder())));
}

Source File: Task.java From beam with Apache License 2.0

5 votes

/**
 * Keys fruits and countries by their first letter, co-groups the two collections, and emits the
 * string form of a {@code WordsAlphabet} for each letter.
 */
static PCollection<String> applyTransform(
    PCollection<String> fruits, PCollection<String> countries) {

  TupleTag<String> fruitsTag = new TupleTag<>();
  TupleTag<String> countriesTag = new TupleTag<>();

  // Shared mapping: word -> (first letter, word).
  MapElements<String, KV<String, String>> keyByFirstLetter =
      MapElements.into(kvs(strings(), strings()))
          .via(word -> KV.of(word.substring(0, 1), word));

  PCollection<KV<String, String>> keyedFruits = fruits.apply("Fruit to KV", keyByFirstLetter);
  PCollection<KV<String, String>> keyedCountries =
      countries.apply("Country to KV", keyByFirstLetter);

  PCollection<KV<String, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(fruitsTag, keyedFruits)
          .and(countriesTag, keyedCountries)
          .apply(CoGroupByKey.create());

  return grouped.apply(ParDo.of(new DoFn<KV<String, CoGbkResult>, String>() {

    @ProcessElement
    public void processElement(
        @Element KV<String, CoGbkResult> element, OutputReceiver<String> out) {
      CoGbkResult byTag = element.getValue();

      // getOnly is used for each tag, so a single value per letter and tag is expected.
      WordsAlphabet entry =
          new WordsAlphabet(
              element.getKey(), byTag.getOnly(fruitsTag), byTag.getOnly(countriesTag));

      out.output(entry.toString());
    }

  }));
}

Source File: Snippets.java From beam with Apache License 2.0

5 votes

/**
 * Using a CoGroupByKey transform: co-groups emails and phones by name and formats one contact
 * line per name.
 */
public static PCollection<String> coGroupByKeyTuple(
    TupleTag<String> emailsTag,
    TupleTag<String> phonesTag,
    PCollection<KV<String, String>> emails,
    PCollection<KV<String, String>> phones) {

  // [START CoGroupByKeyTuple]
  PCollection<KV<String, CoGbkResult>> results =
      KeyedPCollectionTuple.of(emailsTag, emails)
          .and(phonesTag, phones)
          .apply(CoGroupByKey.create());

  PCollection<String> contactLines =
      results.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> contact = c.element();
                  CoGbkResult details = contact.getValue();
                  Iterable<String> contactEmails = details.getAll(emailsTag);
                  Iterable<String> contactPhones = details.getAll(phonesTag);
                  c.output(
                      Snippets.formatCoGbkResults(
                          contact.getKey(), contactEmails, contactPhones));
                }
              }));
  // [END CoGroupByKeyTuple]
  return contactLines;
}

Source File: CompareDatabases.java From DataflowTemplates with Apache License 2.0

5 votes

/**
 * Reads all rows from both sources, co-groups them by key, and counts the keys that do not have
 * exactly one row on each side or whose two rows are not equal.
 *
 * @return a collection holding the global count of such keys
 */
@Override
public PCollection<Long> expand(PBegin begin) {

  final TupleTag<Struct> oneTag = new TupleTag<>();
  final TupleTag<Struct> twoTag = new TupleTag<>();

  PCollection<KV<String, Struct>> rowsOne = begin.apply("Read one", new ReadAllRows(one));
  PCollection<KV<String, Struct>> rowsTwo = begin.apply("Read two", new ReadAllRows(two));

  PCollection<KV<String, CoGbkResult>> cogroup =
      KeyedPCollectionTuple.of(oneTag, rowsOne)
          .and(twoTag, rowsTwo)
          .apply(CoGroupByKey.create());

  PCollection<String> mismatchedKeys =
      cogroup.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> element = c.element();
                  CoGbkResult grouped = element.getValue();
                  ArrayList<Struct> fromOne = Lists.newArrayList(grouped.getAll(oneTag));
                  ArrayList<Struct> fromTwo = Lists.newArrayList(grouped.getAll(twoTag));

                  // A key passes only with exactly one row per side and both rows equal.
                  boolean oneRowEach = fromOne.size() == 1 && fromTwo.size() == 1;
                  if (!oneRowEach || !fromOne.get(0).equals(fromTwo.get(0))) {
                    c.output(element.getKey());
                  }
                }
              }));

  return mismatchedKeys.apply(Count.globally());
}

Source File: Query8.java From beam with Apache License 2.0

4 votes

/**
 * Windows new persons and new auctions into fixed windows of {@code configuration.windowSizeSec}
 * seconds, joins them on person id / seller, and emits the person id, name and auction reserve.
 */
@Override
public PCollection<IdNameReserve> expand(PCollection<Event> events) {
  // Both sides use the same fixed-window size.
  FixedWindows fixedWindows =
      FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec));

  // New people, windowed and keyed by id.
  PCollection<KV<Long, Person>> personsById =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_PERSONS)
          .apply("Query8.WindowPersons", Window.into(fixedWindows))
          .apply("PersonById", NexmarkQueryUtil.PERSON_BY_ID);

  // New auctions, windowed and keyed by seller.
  PCollection<KV<Long, Auction>> auctionsBySeller =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply("Query8.WindowAuctions", Window.into(fixedWindows))
          .apply("AuctionBySeller", NexmarkQueryUtil.AUCTION_BY_SELLER);

  // Join the two sides and project the result.
  return KeyedPCollectionTuple.of(NexmarkQueryUtil.PERSON_TAG, personsById)
      .and(NexmarkQueryUtil.AUCTION_TAG, auctionsBySeller)
      .apply(CoGroupByKey.create())
      .apply(
          name + ".Select",
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, IdNameReserve>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  CoGbkResult joined = c.element().getValue();
                  @Nullable Person person = joined.getOnly(NexmarkQueryUtil.PERSON_TAG, null);
                  if (person == null) {
                    // No person on this side of the join for the key: nothing to emit.
                    return;
                  }
                  for (Auction auction : joined.getAll(NexmarkQueryUtil.AUCTION_TAG)) {
                    c.output(new IdNameReserve(person.id, person.name, auction.reserve));
                  }
                }
              }));
}

Source File: WinningBids.java From beam with Apache License 2.0

4 votes

/**
 * Windows events with {@code auctionOrBidWindowFn}, joins auctions with their bids by auction
 * id, and emits the best bid at or above the reserve for each auction that has one.
 */
@Override
public PCollection<AuctionBid> expand(PCollection<Event> events) {
  // Window auctions and bids into custom auction windows. New people events will be discarded.
  // This brings bids and auctions together irrespective of how long each auction is open for.
  events = events.apply("Window", Window.into(auctionOrBidWindowFn));

  // Auctions keyed by their own id.
  PCollection<KV<Long, Auction>> auctionsById =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply("AuctionById:", NexmarkQueryUtil.AUCTION_BY_ID);

  // Bids keyed by the id of the auction they target.
  PCollection<KV<Long, Bid>> bidsByAuctionId =
      events
          .apply(NexmarkQueryUtil.JUST_BIDS)
          .apply("BidByAuction", NexmarkQueryUtil.BID_BY_AUCTION);

  // Join auctions and bids, then filter and select the winning bid per auction.
  return KeyedPCollectionTuple.of(NexmarkQueryUtil.AUCTION_TAG, auctionsById)
      .and(NexmarkQueryUtil.BID_TAG, bidsByAuctionId)
      .apply(CoGroupByKey.create())
      .apply(
          name + ".Join",
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, AuctionBid>() {
                private final Counter noAuctionCounter = Metrics.counter(name, "noAuction");
                private final Counter underReserveCounter = Metrics.counter(name, "underReserve");
                private final Counter noValidBidsCounter = Metrics.counter(name, "noValidBids");

                @ProcessElement
                public void processElement(ProcessContext c) {
                  CoGbkResult joined = c.element().getValue();

                  @Nullable Auction auction = joined.getOnly(NexmarkQueryUtil.AUCTION_TAG, null);
                  if (auction == null) {
                    // Bids without a matching auction are counted and dropped.
                    noAuctionCounter.inc();
                    return;
                  }

                  // Select the bid that ranks highest under PRICE_THEN_DESCENDING_TIME among
                  // those priced at or above the reserve.
                  Bid winner = null;
                  for (Bid candidate : joined.getAll(NexmarkQueryUtil.BID_TAG)) {
                    // Bids too late for their auction will have been
                    // filtered out by the window merge function.
                    checkState(candidate.dateTime.compareTo(auction.expires) < 0);
                    if (candidate.price < auction.reserve) {
                      // Below the reserve: counted, never a winner.
                      underReserveCounter.inc();
                    } else if (winner == null
                        || Bid.PRICE_THEN_DESCENDING_TIME.compare(candidate, winner) > 0) {
                      winner = candidate;
                    }
                  }

                  if (winner == null) {
                    // No bid met the reserve for this auction.
                    noValidBidsCounter.inc();
                    return;
                  }
                  c.output(new AuctionBid(auction, winner));
                }
              }));
}

Source File: TestExpansionService.java From beam with Apache License 2.0

4 votes

/** Returns a new {@code TestCoGroupByKeyTransform}; the given {@code spec} is not consulted. */
@Override
public PTransform<KeyedPCollectionTuple<Long>, PCollection<KV<Long, Iterable<String>>>>
    getTransform(RunnerApi.FunctionSpec spec) {
  PTransform<KeyedPCollectionTuple<Long>, PCollection<KV<Long, Iterable<String>>>> transform =
      new TestCoGroupByKeyTransform();
  return transform;
}

Source File: Join.java From beam with Apache License 2.0

4 votes

/**
 * Right outer join: every right value is emitted once per matching left value, or once paired
 * with {@code nullValue} when the key has no left values.
 *
 * @param leftCollection the left side of the join
 * @return the joined collection, with a coder assembled from the two input coders
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  PCollection<KV<K, KV<V1, V2>>> joined =
      grouped.apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> group = c.element();
                  Iterable<V1> lefts = group.getValue().getAll(v1Tuple);
                  Iterable<V2> rights = group.getValue().getAll(v2Tuple);

                  for (V2 right : rights) {
                    if (!lefts.iterator().hasNext()) {
                      // No left match: pad the left slot with the configured null value.
                      c.output(KV.of(group.getKey(), KV.of(nullValue, right)));
                      continue;
                    }
                    for (V1 left : lefts) {
                      c.output(KV.of(group.getKey(), KV.of(left, right)));
                    }
                  }
                }
              }));

  // Output coder: key and left-value coders come from the left input, right-value from the right.
  KvCoder leftCoder = (KvCoder) leftCollection.getCoder();
  KvCoder rightCoder = (KvCoder) rightCollection.getCoder();
  return joined.setCoder(
      KvCoder.of(
          leftCoder.getKeyCoder(),
          KvCoder.of(leftCoder.getValueCoder(), rightCoder.getValueCoder())));
}

Source File: CoGroup.java From beam with Apache License 2.0

4 votes

/**
 * Builds a {@link JoinInformation} from a tuple of input collections.
 *
 * <p>For every input, the schema and to-row function are recorded, the key fields are resolved
 * against the input's schema, and the input is converted to a keyed collection via
 * {@code extractKey}. Inputs for which {@code getIsSideInput} returns true become multimap
 * views; all other inputs are added to the {@link KeyedPCollectionTuple}.
 *
 * @param input the tagged input collections
 * @param getFieldAccessDescriptor returns the key fields for a given tag id
 * @param getIsSideInput returns whether the input with the given tag id is a side input
 * @return the collected join information
 * @throws IllegalStateException if {@code getFieldAccessDescriptor} returns null for a tag
 */
private static JoinInformation from(
    PCollectionTuple input,
    Function<String, FieldAccessDescriptor> getFieldAccessDescriptor,
    Function<String, Boolean> getIsSideInput) {
  KeyedPCollectionTuple<Row> keyedPCollectionTuple =
      KeyedPCollectionTuple.empty(input.getPipeline());

  // Tag ids in sorted order; an input's position in this list is used as its integer index.
  List<String> sortedTags =
      input.getAll().keySet().stream()
          .map(TupleTag::getId)
          .sorted()
          .collect(Collectors.toList());

  // Keep this in a TreeMap so that it's sorted. This way we get a deterministic output
  // schema.
  TreeMap<String, Schema> componentSchemas = Maps.newTreeMap();
  Map<Integer, SerializableFunction<Object, Row>> toRows = Maps.newHashMap();

  Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs = Maps.newHashMap();
  Map<Integer, String> tagToKeyedTag = Maps.newHashMap();
  // Accumulated across iterations: starts as the first input's key schema and is merged with
  // each subsequent input's key schema.
  Schema keySchema = null;
  for (Map.Entry<TupleTag<?>, PCollection<?>> entry : input.getAll().entrySet()) {
    String tag = entry.getKey().getId();
    int tagIndex = sortedTags.indexOf(tag);
    PCollection<?> pc = entry.getValue();
    Schema schema = pc.getSchema();
    componentSchemas.put(tag, schema);
    toRows.put(tagIndex, (SerializableFunction<Object, Row>) pc.getToRowFunction());
    FieldAccessDescriptor fieldAccessDescriptor = getFieldAccessDescriptor.apply(tag);
    if (fieldAccessDescriptor == null) {
      throw new IllegalStateException("No fields were set for input " + tag);
    }
    // Resolve the key schema, keeping the fields in the order specified by the user.
    // Otherwise, if different field names are specified for different PCollections, they
    // might not match up.
    // The key schema contains the field names from the first PCollection specified.
    FieldAccessDescriptor resolved = fieldAccessDescriptor.resolve(schema);
    Schema currentKeySchema = SelectHelpers.getOutputSchema(schema, resolved);
    if (keySchema == null) {
      keySchema = currentKeySchema;
    } else {
      keySchema = SchemaUtils.mergeWideningNullable(keySchema, currentKeySchema);
    }

    // Create a new tag for the output.
    // The keyed tag is the original tag id, an underscore, and the string form of a fresh tag.
    TupleTag randomTag = new TupleTag<>();
    String keyedTag = tag + "_" + randomTag;
    tagToKeyedTag.put(tagIndex, keyedTag);
    // NOTE(review): extractKey receives the key schema merged so far in this loop, not the
    // final merged schema — confirm this is intended for inputs processed early.
    PCollection<KV<Row, Row>> keyedPCollection =
        extractKey(pc, schema, keySchema, resolved, tag);
    if (getIsSideInput.apply(tag)) {
      // Side inputs are materialized as multimap views and kept out of the keyed tuple.
      sideInputs.put(
          keyedTag, keyedPCollection.apply("computeSideInputView" + tag, View.asMultimap()));
    } else {
      keyedPCollectionTuple = keyedPCollectionTuple.and(keyedTag, keyedPCollection);
    }
  }
  return new JoinInformation(
      keyedPCollectionTuple,
      sideInputs,
      keySchema,
      componentSchemas,
      toRows,
      sortedTags,
      tagToKeyedTag);
}

Source File: Join.java From beam with Apache License 2.0

4 votes

/**
 * Left outer join: every left value is emitted once per matching right value, or once paired
 * with {@code nullValue} when the key has no right values.
 *
 * @param leftCollection the left side of the join
 * @return the joined collection, with a coder assembled from the two input coders
 */
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> grouped =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  PCollection<KV<K, KV<V1, V2>>> joined =
      grouped.apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> group = c.element();
                  Iterable<V1> lefts = group.getValue().getAll(v1Tuple);
                  Iterable<V2> rights = group.getValue().getAll(v2Tuple);

                  for (V1 left : lefts) {
                    if (!rights.iterator().hasNext()) {
                      // No right match: pad the right slot with the configured null value.
                      c.output(KV.of(group.getKey(), KV.of(left, nullValue)));
                      continue;
                    }
                    for (V2 right : rights) {
                      c.output(KV.of(group.getKey(), KV.of(left, right)));
                    }
                  }
                }
              }));

  // Output coder: key and left-value coders come from the left input, right-value from the right.
  KvCoder leftCoder = (KvCoder) leftCollection.getCoder();
  KvCoder rightCoder = (KvCoder) rightCollection.getCoder();
  return joined.setCoder(
      KvCoder.of(
          leftCoder.getKeyCoder(),
          KvCoder.of(leftCoder.getValueCoder(), rightCoder.getValueCoder())));
}

org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple Java Examples