org.apache.beam.sdk.transforms.Keys Java Examples

Source File: TfIdfTest.java From beam with Apache License 2.0

6 votes

/**
 * Runs {@code TfIdf.ComputeTfIdf} over three small documents and asserts on the distinct set of
 * words (the keys of the transform's output).
 */
@Test
public void testTfIdf() throws Exception {
  // URI has no default coder here, so register a string-delegating one before building the graph.
  pipeline.getCoderRegistry().registerCoderForClass(URI.class, StringDelegateCoder.of(URI.class));

  // Three documents keyed by URI; between them they contain the words a, b, c, d, m and n.
  PCollection<KV<URI, String>> documents =
      pipeline.apply(
          Create.of(
              KV.of(new URI("x"), "a b c d"),
              KV.of(new URI("y"), "a b c"),
              KV.of(new URI("z"), "a m n")));

  // Output is keyed by word, with a (document URI, score) pair as the value.
  PCollection<KV<String, KV<URI, Double>>> tfIdfByWord = documents.apply(new TfIdf.ComputeTfIdf());

  // Keep only the word keys, then collapse duplicates.
  PCollection<String> allWords = tfIdfByWord.apply(Keys.create());
  PCollection<String> distinctWords = allWords.apply(Distinct.create());

  PAssert.that(distinctWords)
      .containsInAnyOrder(Arrays.asList("a", "m", "n", "b", "c", "d"));

  pipeline.run().waitUntilFinish();
}

Source File: AvroTableFileAsMutations.java From DataflowTemplates with Apache License 2.0

5 votes

/**
 * Expands a collection of (file name, table name) pairs into {@link Mutation}s by matching the
 * files, splitting them into shards, reshuffling the shards and reading each one.
 *
 * @param filesToTables pairs whose key is used as a file pattern to match; the whole collection
 *     is also exposed as a map side input to the splitting step
 * @return the mutations produced by {@code ReadFileRangesFn}
 */
@Override
public PCollection<Mutation> expand(PCollection<KV<String, String>> filesToTables) {

  // Side input: Map<filename, tablename>.
  PCollectionView<Map<String, String>> filenamesToTableNamesMapView =
      filesToTables.apply("asView", View.asMap());

  // Only the keys (file names) drive the file matching below.
  PCollection<String> filenames = filesToTables.apply("Get Filenames", Keys.create());

  PCollection<FileShard> shards =
      filenames
          // An empty match is disallowed rather than silently skipped.
          .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW))
          // PCollection<Match.Metadata>
          .apply(FileIO.readMatches())
          // PCollection<FileIO.ReadableFile>
          .apply(
              "Split into ranges",
              ParDo.of(
                      new SplitIntoRangesFn(
                          SplitIntoRangesFn.DEFAULT_BUNDLE_SIZE, filenamesToTableNamesMapView))
                  .withSideInputs(filenamesToTableNamesMapView))
          .setCoder(FileShard.Coder.of());

  return shards
      // Redistribute the shards under random keys before reading them.
      .apply("Reshuffle", Reshuffle.viaRandomKey())
      // PCollection<FileShard> -> PCollection<Mutation>
      .apply("Read ranges", ParDo.of(new ReadFileRangesFn(ddlView)).withSideInputs(ddlView));
}

Source File: SyntheticStepTest.java From beam with Apache License 2.0

5 votes

/**
 * Runs {@code SyntheticStep} with {@code preservesInputKeyDistribution} enabled and two output
 * records per input record, and expects every input key to appear exactly twice in the output.
 */
@Test
public void testSyntheticStepWithPreservingInputKeyDistribution() throws Exception {
  String optionsJson =
      "{\"outputRecordsPerInputRecord\": 2,"
          + " \"preservesInputKeyDistribution\": true,"
          + "\"keySizeBytes\": 10,"
          + "\"valueSizeBytes\": 20,"
          + "\"numHotKeys\": 3,"
          + "\"hotKeyFraction\": 0.3,"
          + "\"seed\": 123456}";
  SyntheticStep.Options options =
      SyntheticTestUtils.optionsFromString(optionsJson, SyntheticStep.Options.class);
  // The delay distribution is assigned directly on the options object rather than via the JSON.
  options.delayDistribution =
      SyntheticOptions.fromRealDistribution(new ConstantRealDistribution(10));

  // Keys 1, 2 and 3, each expected twice (two output records per input record).
  List<byte[]> expectedKeys =
      ImmutableList.of(
          intToByteArray(1),
          intToByteArray(1),
          intToByteArray(2),
          intToByteArray(2),
          intToByteArray(3),
          intToByteArray(3));

  PCollection<KV<byte[], byte[]>> input =
      p.apply(
          Create.of(
              ImmutableList.of(
                  KV.of(intToByteArray(1), intToByteArray(11)),
                  KV.of(intToByteArray(2), intToByteArray(22)),
                  KV.of(intToByteArray(3), intToByteArray(33)))));

  PCollection<byte[]> outputKeys =
      input.apply(ParDo.of(new SyntheticStep(options))).apply(Keys.create());

  PAssert.that(outputKeys).containsInAnyOrder(expectedKeys);
  p.run().waitUntilFinish();
}

Source File: KeyedPValueTrackingVisitorTest.java From beam with Apache License 2.0

5 votes

/**
 * Traverses the pipeline twice with the same visitor and expects the second traversal to fail
 * with an {@link IllegalStateException} that names {@code KeyedPValueTrackingVisitor}.
 */
@Test
public void traverseMultipleTimesThrows() {
  // Integer keys with null (Void) values; the coder is given explicitly.
  Create.Values<KV<Integer, Void>> keyedVoids =
      Create.of(KV.of(1, (Void) null), KV.of(2, (Void) null), KV.of(3, (Void) null))
          .withCoder(KvCoder.of(VarIntCoder.of(), VoidCoder.of()));

  p.apply(keyedVoids).apply(GroupByKey.create()).apply(Keys.create());

  // First traversal is expected to succeed.
  p.traverseTopologically(visitor);

  // Expectations are registered only now, so they apply to the second traversal.
  thrown.expect(IllegalStateException.class);
  thrown.expectMessage("already been finalized");
  thrown.expectMessage(KeyedPValueTrackingVisitor.class.getSimpleName());
  p.traverseTopologically(visitor);
}

Source File: LazyAvroCoderTest.java From components with Apache License 2.0

5 votes

/**
 * Exercises the LazyAvroCoder with the default schema supplier on a small branching job, using a
 * separate coder instance for each of the two collections that are assigned one.
 */
@Test
public void testBasic() {
    // Source collection of simple records, encoded with its own LazyAvroCoder instance.
    PCollection<IndexedRecord> source =
            p.apply("a", RowGeneratorIO.read().withSchema(SampleSchemas.recordSimple()));
    source.setCoder(LazyAvroCoder.of());

    // The job looks like this (a and c are collections of IndexedRecords):
    //
    // a ----> b ----> c ----> d
    // |
    // \-> b2

    // Step b counts per element and step c keeps only the keys of those counts; the result gets
    // a second, independent LazyAvroCoder instance.
    PCollection<IndexedRecord> countedKeys =
            source.apply("b", Count.<IndexedRecord>perElement())
                    .apply("c", Keys.<IndexedRecord>create());
    countedKeys.setCoder(LazyAvroCoder.of());

    // Steps d and b2 are applied only to extend the graph; their outputs are not inspected.
    countedKeys.apply("d", Count.<IndexedRecord>perElement());
    source.apply("b2", Count.<IndexedRecord>perElement());

    // Running to completion without an exception is part of what is being checked.
    p.run().waitUntilFinish();

    // Two coder instances were used, so two (identical) schemas are expected.
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), hasSize(2));
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(),
            contains(SampleSchemas.recordSimple(), SampleSchemas.recordSimple()));

    // Resetting the supplier must leave no schemas behind.
    LazyAvroCoder.resetSchemaSupplier();
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), emptyIterable());
}

Source File: LazyAvroCoderTest.java From components with Apache License 2.0

5 votes

/**
 * Same job as {@link #testBasic()}, but a single LazyAvroCoder instance is shared by both
 * collections, so only one schema is expected to be registered.
 */
@Test
public void testBasicReuse() {
    LazyAvroCoder sharedCoder = LazyAvroCoder.of();

    // Source collection of simple records, encoded with the shared coder.
    PCollection<IndexedRecord> source =
            p.apply("a", RowGeneratorIO.read().withSchema(SampleSchemas.recordSimple()));
    source.setCoder(sharedCoder);

    // The job looks like this (a and c are collections of IndexedRecords):
    //
    // a ----> b ----> c ----> d
    // |
    // \-> b2

    // Step b counts per element and step c keeps only the keys of those counts; the result
    // reuses the same coder instance as the source.
    PCollection<IndexedRecord> countedKeys =
            source.apply("b", Count.<IndexedRecord>perElement())
                    .apply("c", Keys.<IndexedRecord>create());
    countedKeys.setCoder(sharedCoder);

    // Steps d and b2 are applied only to extend the graph; their outputs are not inspected.
    countedKeys.apply("d", Count.<IndexedRecord>perElement());
    source.apply("b2", Count.<IndexedRecord>perElement());

    // Running to completion without an exception is part of what is being checked.
    p.run().waitUntilFinish();

    // Only one schema was registered.
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), hasSize(1));
    assertThat(LazyAvroCoder.StaticSchemaHolderSupplier.getSchemas(), contains(SampleSchemas.recordSimple()));
}

Source File: BreakFusionTransform.java From dataflow-java with Apache License 2.0

5 votes

/**
 * Passes the input through a mapping step, a {@link GroupByKey} and a {@link Keys} extraction,
 * returning a collection with the same element type as the input.
 *
 * @param input the collection to route through the grouping step
 * @return the keys that come out of the grouping step
 */
@Override
public PCollection<T> expand(PCollection<T> input) {
  // Each element becomes a key with an Integer value (see DummyMapFn), so grouping is by element.
  GroupByKey<T, Integer> groupByElement = GroupByKey.create();
  // After grouping, only the keys are kept; the grouped values are discarded.
  Keys<T> keepKeys = Keys.create();

  return input
      .apply("Break fusion mapper", ParDo.of(new DummyMapFn<T>()))
      .apply(groupByElement)
      .apply(keepKeys);
}

Source File: TextImportTransform.java From DataflowTemplates with Apache License 2.0

4 votes

/**
 * Expands a collection of (file name, table name) pairs into {@link Mutation}s by matching the
 * files, splitting them into shards, reading each shard line by line and converting every line
 * with {@code TextRowToMutation}.
 *
 * @param filesToTables pairs whose key is used as a file pattern to match; the whole collection
 *     is also exposed as a map side input to the splitting step
 * @return the mutations produced from the lines of all matched files
 * @throws RuntimeException (at pipeline execution time, from the "Read lines" step) if a shard
 *     cannot be read; the underlying {@link IOException} is attached as the cause
 */
@Override
public PCollection<Mutation> expand(PCollection<KV<String, String>> filesToTables) {
  // Map<filename,tablename>
  PCollectionView<Map<String, String>> filesToTablesMapView =
      filesToTables.apply("asView", View.asMap());
  TextImportPipeline.Options options =
      filesToTables.getPipeline().getOptions().as(TextImportPipeline.Options.class);

  return filesToTables
      .apply("Get Filenames", Keys.create())
      // PCollection<String>
      .apply(FileIO.matchAll().withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW))
      // PCollection<Match.Metadata>
      .apply(FileIO.readMatches())
      // PCollection<FileIO.ReadableFile>
      .apply(
          "Split into ranges",
          ParDo.of(
                  new SplitIntoRangesFn(
                      SplitIntoRangesFn.DEFAULT_BUNDLE_SIZE, filesToTablesMapView))
              .withSideInputs(filesToTablesMapView))
      .setCoder(FileShard.Coder.of())
      // PCollection<FileShard>
      .apply("Reshuffle", Reshuffle.viaRandomKey())
      // PCollection<FileShard>
      .apply(
          "Read lines",
          ParDo.of(
              new DoFn<FileShard, KV<String, String>>() {

                /** Emits one (table name, line) pair for every line in the shard's range. */
                @ProcessElement
                public void processElement(ProcessContext c) {
                  FileShard shard = c.element();

                  // Create a TextSource, passing null as the delimiter to use the default
                  // delimiters ('\n', '\r', or '\r\n').
                  TextSource textSource =
                      new TextSource(
                          shard.getFile().getMetadata(),
                          shard.getRange().getFrom(),
                          shard.getRange().getTo(),
                          null);

                  // try-with-resources closes the reader on both the normal and the failure
                  // path; previously it was never closed.
                  try (BoundedSource.BoundedReader<String> reader =
                      textSource
                          .createForSubrangeOfFile(
                              shard.getFile().getMetadata(),
                              shard.getRange().getFrom(),
                              shard.getRange().getTo())
                          .createReader(c.getPipelineOptions())) {
                    for (boolean more = reader.start(); more; more = reader.advance()) {
                      c.output(KV.of(shard.getTableName(), reader.getCurrent()));
                    }
                  } catch (IOException e) {
                    // Keep the original exception as the cause so the real I/O failure is
                    // visible in the stack trace.
                    throw new RuntimeException(
                        "Unable to readFile: "
                            + shard.getFile().getMetadata().resourceId().toString(),
                        e);
                  }
                }
              }))
      // PCollection<KV<String, String>>: tableName, line
      .apply(
          ParDo.of(
                  new TextRowToMutation(
                      ddlView,
                      tableColumnsView,
                      options.getColumnDelimiter(),
                      options.getFieldQualifier(),
                      options.getTrailingDelimiter(),
                      options.getEscape(),
                      options.getNullString(),
                      options.getDateFormat(),
                      options.getTimestampFormat()))
              .withSideInputs(ddlView, tableColumnsView));
}

Source File: ResumeFromCheckpointStreamingTest.java From beam with Apache License 2.0

4 votes

/**
 * Builds and runs a streaming pipeline on the test Spark runner: it reads (String, Instant)
 * records from the embedded Kafka topic, filters the keys through {@code EOFShallNotPassFn},
 * windows and groups them, and checks the result against "k1".."k5".
 *
 * @param stopWatermarkOption if present, its millis are set as the watermark at which the
 *     pipeline stops
 * @param expectedAssertions value passed to {@code setExpectedAssertions} on the options
 * @return the result of running the pipeline, cast to {@link SparkPipelineResult}
 */
private SparkPipelineResult run(Optional<Instant> stopWatermarkOption, int expectedAssertions) {
  KafkaIO.Read<String, Instant> kafkaRead =
      KafkaIO.<String, Instant>read()
          .withBootstrapServers(EMBEDDED_KAFKA_CLUSTER.getBrokerList())
          .withTopics(Collections.singletonList(TOPIC))
          .withKeyDeserializer(StringDeserializer.class)
          .withValueDeserializer(InstantDeserializer.class)
          .withConsumerConfigUpdates(ImmutableMap.of("auto.offset.reset", "earliest"))
          .withTimestampFn(KV::getValue)
          // at EOF move WM to infinity.
          .withWatermarkFn(
              kv ->
                  "EOF".equals(kv.getKey())
                      ? BoundedWindow.TIMESTAMP_MAX_VALUE
                      : kv.getValue());

  TestSparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setSparkMaster("local[*]");
  // Checkpoint at the same interval as the batch interval.
  options.setCheckpointDurationMillis(options.getBatchIntervalMillis());
  options.setExpectedAssertions(expectedAssertions);
  options.setRunner(TestSparkRunner.class);
  options.setEnableSparkMetricSinks(false);
  options.setForceStreaming(true);
  options.setCheckpointDir(temporaryFolder.getRoot().getPath());
  // timeout is per execution so it can be injected by the caller.
  if (stopWatermarkOption.isPresent()) {
    options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
  }

  Pipeline pipeline = Pipeline.create(options);

  // Two fixed strings exposed as a list side input to EOFShallNotPassFn.
  PCollection<String> sideValues =
      pipeline.apply(
          Create.of(ImmutableList.of("side1", "side2")).withCoder(StringUtf8Coder.of()));
  PCollectionView<List<String>> sideView = sideValues.apply(View.asList());

  PCollection<KV<String, Instant>> kafkaRecords = pipeline.apply(kafkaRead.withoutMetadata());

  // Only the record keys continue downstream.
  PCollection<String> passedKeys =
      kafkaRecords
          .apply(Keys.create())
          .apply(
              "EOFShallNotPassFn",
              ParDo.of(new EOFShallNotPassFn(sideView)).withSideInputs(sideView));

  // 500 ms fixed windows, fired once the watermark passes the window end, no lateness allowed.
  PCollection<String> windowedKeys =
      passedKeys.apply(
          Window.<String>into(FixedWindows.of(Duration.millis(500)))
              .triggering(AfterWatermark.pastEndOfWindow())
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.ZERO));

  // Put everything under the single key 1 so each window yields one iterable of keys.
  PCollection<Iterable<String>> grouped =
      windowedKeys
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create());

  grouped.apply(new PAssertWithoutFlatten<>("k1", "k2", "k3", "k4", "k5"));

  return (SparkPipelineResult) pipeline.run();
}

org.apache.beam.sdk.transforms.Keys Java Examples