Java Code Examples for org.apache.beam.sdk.values.PCollection#apply()
The following examples show how to use org.apache.beam.sdk.values.PCollection#apply().
Each example links to its original project and source file.
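Before the project-specific examples, here is a minimal, self-contained sketch of the basic pattern: apply() attaches a PTransform to a Pipeline or PCollection (optionally with a step name) and returns the transform's output. The pipeline contents and step names in this sketch are illustrative only, not drawn from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class ApplySketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Each apply() returns a new, immutable PCollection; the input is not modified.
    PCollection<String> words = p.apply("CreateWords", Create.of("a", "b", "a"));
    PCollection<KV<String, Long>> counts = words.apply("CountWords", Count.perElement());

    // The optional first argument names the step for monitoring and graph display.
    counts.apply(
        "FormatResults",
        MapElements.into(TypeDescriptors.strings())
            .via(kv -> kv.getKey() + ": " + kv.getValue()));

    p.run().waitUntilFinish();
  }
}

Note that chaining works because apply() returns the output PCollection, which is the style most of the examples below rely on.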
Example 1
Source File: FileIndexerPipeline.java from dataflow-opinion-analysis with Apache License 2.0
/**
 * @param indexes
 * @return
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
    PCollection<ContentIndexSummary> indexes, Float ratio) {

  PCollectionTuple splitAB =
      indexes.apply(
          ParDo.of(new SplitAB(ratio))
              .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

  PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
  PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);

  PCollection<ContentIndexSummary> enrichedBCol =
      branchBCol.apply(ParDo.of(new EnrichWithCNLPEntities()));

  // Merge all collections with WebResource table records
  PCollectionList<ContentIndexSummary> contentIndexSummariesList =
      PCollectionList.of(branchACol).and(enrichedBCol);
  PCollection<ContentIndexSummary> allIndexSummaries =
      contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

  indexes = allIndexSummaries;
  return indexes;
}
Example 2
Source File: GroupByKeyTest.java from beam with Apache License 2.0
@Test
public void testGroupByKeyNonDeterministic() throws Exception {
  List<KV<Map<String, String>, Integer>> ungroupedPairs = Arrays.asList();

  PCollection<KV<Map<String, String>, Integer>> input =
      p.apply(
          Create.of(ungroupedPairs)
              .withCoder(
                  KvCoder.of(
                      MapCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()),
                      BigEndianIntegerCoder.of())));

  thrown.expect(IllegalStateException.class);
  thrown.expectMessage("must be deterministic");
  input.apply(GroupByKey.create());
}
Example 3
Source File: BeamSqlMultipleSchemasTest.java from beam with Apache License 2.0
@Test
public void testOverrideQualifiedMainSchema() {
  PCollection<Row> inputMain =
      pipeline.apply("mainInput", create(row(1, "pcollection_1"), row(2, "pcollection_2")));
  PCollection<Row> inputExtra =
      pipeline.apply("extraInput", create(row(1, "_extra_table_1"), row(2, "_extra_table_2")));

  TableProvider extraInputProvider = extraTableProvider("extraTable", inputExtra);

  PCollection<Row> result =
      inputMain.apply(
          SqlTransform.query("SELECT f_int, f_string FROM beam.extraTable")
              .withTableProvider("beam", extraInputProvider));

  PAssert.that(result).containsInAnyOrder(row(1, "_extra_table_1"), row(2, "_extra_table_2"));
  pipeline.run();
}
Example 4
Source File: SortValuesTest.java from beam with Apache License 2.0
@Test
public void testSecondaryKeySorting() {
  // Create a PCollection of <Key, <SecondaryKey, Value>> pairs.
  PCollection<KV<String, KV<String, Integer>>> input =
      p.apply(
          Create.of(
              Arrays.asList(
                  KV.of("key1", KV.of("secondaryKey2", 20)),
                  KV.of("key2", KV.of("secondaryKey2", 200)),
                  KV.of("key1", KV.of("secondaryKey3", 30)),
                  KV.of("key1", KV.of("secondaryKey1", 10)),
                  KV.of("key2", KV.of("secondaryKey1", 100)))));

  // Group by Key, bringing <SecondaryKey, Value> pairs for the same Key together.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> grouped =
      input.apply(GroupByKey.create());

  // For every Key, sort the iterable of <SecondaryKey, Value> pairs by SecondaryKey.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> groupedAndSorted =
      grouped.apply(SortValues.create(BufferedExternalSorter.options()));

  PAssert.that(groupedAndSorted)
      .satisfies(new AssertThatHasExpectedContentsForTestSecondaryKeySorting());
  p.run();
}
Example 5
Source File: Task.java from beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<BigInteger> numbers =
      pipeline.apply(
          Create.of(
              BigInteger.valueOf(10), BigInteger.valueOf(20), BigInteger.valueOf(30),
              BigInteger.valueOf(40), BigInteger.valueOf(50)));

  PCollection<BigInteger> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 6
Source File: BatchViewOverrides.java from beam with Apache License 2.0
@Override
public PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>> expand(
    PCollection<T> input) {
  @SuppressWarnings("unchecked")
  Coder<W> windowCoder = (Coder<W>) input.getWindowingStrategy().getWindowFn().windowCoder();

  PCollection<KV<Integer, KV<W, WindowedValue<T>>>> rval =
      input.apply(
          ParDo.of(new UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W>(ismCoderForHash)));
  rval.setCoder(
      KvCoder.of(
          VarIntCoder.of(),
          KvCoder.of(windowCoder, FullWindowedValueCoder.of(input.getCoder(), windowCoder))));
  return rval.apply(new GroupByKeyAndSortValuesOnly<>());
}
Example 7
Source File: TextTableProviderTest.java from beam with Apache License 2.0
/**
 * Tests that {@code CREATE EXTERNAL TABLE TYPE text} with a format other than "csv" or "lines"
 * results in a CSV read of that format.
 */
@Test
public void testLegacyTdfCsv() throws Exception {
  Files.write(
      tempFolder.newFile("test.csv").toPath(),
      "hello\t13\n\ngoodbye\t42\n".getBytes(Charsets.UTF_8));

  BeamSqlEnv env = BeamSqlEnv.inMemory(new TextTableProvider());
  env.executeDdl(
      String.format(
          "CREATE EXTERNAL TABLE test %s TYPE text LOCATION '%s/*' TBLPROPERTIES '{\"format\":\"TDF\"}'",
          SQL_CSV_SCHEMA, tempFolder.getRoot()));

  PCollection<Row> rows =
      BeamSqlRelUtils.toPCollection(pipeline, env.parseQuery("SELECT * FROM test"));

  rows.apply(
      MapElements.into(TypeDescriptors.voids())
          .via(
              r -> {
                System.out.println(r.toString());
                return null;
              }));

  PAssert.that(rows)
      .containsInAnyOrder(
          Row.withSchema(CSV_SCHEMA).addValues("hello", 13).build(),
          Row.withSchema(CSV_SCHEMA).addValues("goodbye", 42).build());
  pipeline.run();
}
Example 8
Source File: CassandraIO.java from beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  if (mutationType() == MutationType.DELETE) {
    input.apply(ParDo.of(new DeleteFn<>(this)));
  } else {
    input.apply(ParDo.of(new WriteFn<>(this)));
  }
  return PDone.in(input.getPipeline());
}
Example 9
Source File: FixedFlowInputRuntimeTest.java from components with Apache License 2.0
@Test
public void test_MultipleInput_OneOutputRow() throws Exception {
  String inputAsString =
      generateInputJSON(inputSchema, inputIndexedRecord1)
          + generateInputJSON(inputSchema, inputIndexedRecord2);

  FixedFlowInputProperties properties = new FixedFlowInputProperties("test");
  properties.init();
  properties.schemaFlow.schema.setValue(inputSchema);
  properties.values.setValue(inputAsString);
  properties.nbRows.setValue(1);

  FixedFlowInputRuntime runtime = new FixedFlowInputRuntime();
  runtime.initialize(null, properties);

  PCollection<IndexedRecord> indexRecords = pipeline.apply(runtime);

  try (DirectCollector<IndexedRecord> collector = DirectCollector.of()) {
    indexRecords.apply(collector);

    // Run the pipeline to fill the collectors.
    pipeline.run().waitUntilFinish();

    // Validate the contents of the collected outputs.
    List<IndexedRecord> outputs = collector.getRecords();
    assertEquals(2, outputs.size());
    assertEquals(inputIndexedRecord1.toString(), outputs.get(0).toString());
    assertEquals(inputIndexedRecord2.toString(), outputs.get(1).toString());
  }
}
Example 10
Source File: CombineTranslationTest.java from beam with Apache License 2.0
@Test
public void testToProtoWithoutSideInputs() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  CombineFnWithContext<Integer, int[], Integer> combineFn = new TestCombineFnWithContext();
  input.apply(Combine.globally(combineFn).withoutDefaults());

  final AtomicReference<AppliedPTransform<?, ?, Combine.Globally<?, ?>>> combine =
      new AtomicReference<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void leaveCompositeTransform(Node node) {
          if (node.getTransform() instanceof Combine.Globally) {
            checkState(combine.get() == null);
            combine.set((AppliedPTransform) node.toAppliedPTransform(getPipeline()));
          }
        }
      });
  checkState(combine.get() != null);
  assertEquals(combineFn, combine.get().getTransform().getFn());

  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  CombinePayload combineProto =
      CombineTranslation.CombineGloballyPayloadTranslator.payloadForCombineGlobally(
          (AppliedPTransform) combine.get(), sdkComponents);
  RunnerApi.Components componentsProto = sdkComponents.toComponents();

  assertEquals(
      combineFn.getAccumulatorCoder(pipeline.getCoderRegistry(), input.getCoder()),
      getAccumulatorCoder(combineProto, RehydratedComponents.forComponents(componentsProto)));
  assertEquals(
      combineFn,
      SerializableUtils.deserializeFromByteArray(
          combineProto.getCombineFn().getPayload().toByteArray(), "CombineFn"));
}
Example 11
Source File: ReifyTimestampsTest.java from beam with Apache License 2.0
@Test
@Category(ValidatesRunner.class)
public void extractFromValuesSucceeds() {
  PCollection<KV<String, TimestampedValue<Integer>>> reified =
      pipeline.apply(
          Create.of(
              KV.of("foo", TimestampedValue.of(0, new Instant(0))),
              KV.of("foo", TimestampedValue.of(1, new Instant(1))),
              KV.of("bar", TimestampedValue.of(2, new Instant(2))),
              KV.of("baz", TimestampedValue.of(3, new Instant(3)))));

  PCollection<KV<String, Integer>> timestamped =
      reified.apply(ReifyTimestamps.extractFromValues());

  PAssert.that(timestamped)
      .containsInAnyOrder(KV.of("foo", 0), KV.of("foo", 1), KV.of("bar", 2), KV.of("baz", 3));

  timestamped.apply(
      "AssertElementTimestamps",
      ParDo.of(
          new DoFn<KV<String, Integer>, Void>() {
            @ProcessElement
            public void verifyTimestampsEqualValue(ProcessContext context) {
              assertThat(
                  new Instant(context.element().getValue().longValue()),
                  equalTo(context.timestamp()));
            }
          }));

  pipeline.run();
}
Example 12
Source File: SparkPortableExecutionTest.java from beam with Apache License 2.0
@Test(timeout = 120_000)
public void testExecStageWithMultipleConsumers() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(CrashingRunner.class);
  options
      .as(PortablePipelineOptions.class)
      .setDefaultEnvironmentType(Environments.ENVIRONMENT_EMBEDDED);
  Pipeline pipeline = Pipeline.create(options);

  PCollection<KV<String, Iterable<String>>> f =
      pipeline
          .apply("impulse", Impulse.create())
          .apply("F", ParDo.of(new DoFnWithSideEffect<>("F")))
          // use GBK to prevent fusion of F, G, and H
          .apply(GroupByKey.create());
  f.apply("G", ParDo.of(new DoFnWithSideEffect<>("G")));
  f.apply("H", ParDo.of(new DoFnWithSideEffect<>("H")));

  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline);
  JobInvocation jobInvocation =
      SparkJobInvoker.createJobInvocation(
          "testExecStageWithMultipleConsumers",
          "testExecStageWithMultipleConsumersRetrievalToken",
          sparkJobExecutor,
          pipelineProto,
          options.as(SparkPipelineOptions.class));
  jobInvocation.start();
  Assert.assertEquals(Enum.DONE, jobInvocation.getState());
}
Example 13
Source File: SingleInputOutputOverrideFactoryTest.java from beam with Apache License 2.0
@Test
public void testMapOutputs() {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  PCollection<Integer> output = input.apply("Map", MapElements.via(fn));
  PCollection<Integer> reappliedOutput = input.apply("ReMap", MapElements.via(fn));
  Map<PValue, ReplacementOutput> replacementMap =
      factory.mapOutputs(output.expand(), reappliedOutput);
  assertThat(
      replacementMap,
      Matchers.hasEntry(
          reappliedOutput,
          ReplacementOutput.of(
              TaggedPValue.ofExpandedValue(output),
              TaggedPValue.ofExpandedValue(reappliedOutput))));
}
Example 14
Source File: CoGroupByKeyLoadTest.java from beam with Apache License 2.0
@Override
void loadTest() throws IOException {
  SyntheticSourceOptions coSourceOptions =
      fromJsonString(options.getCoSourceOptions(), SyntheticSourceOptions.class);

  Optional<SyntheticStep> syntheticStep = createStep(options.getStepOptions());

  PCollection<KV<byte[], byte[]>> input =
      pipeline.apply("Read input", readFromSource(sourceOptions));
  input = input.apply("Collect start time metrics (input)", ParDo.of(runtimeMonitor));
  input = applyWindowing(input);
  input = applyStepIfPresent(input, "Synthetic step for input", syntheticStep);

  PCollection<KV<byte[], byte[]>> coInput =
      pipeline.apply("Read co-input", readFromSource(coSourceOptions));
  coInput = coInput.apply("Collect start time metrics (co-input)", ParDo.of(runtimeMonitor));
  coInput = applyWindowing(coInput, options.getCoInputWindowDurationSec());
  coInput = applyStepIfPresent(coInput, "Synthetic step for co-input", syntheticStep);

  KeyedPCollectionTuple.of(INPUT_TAG, input)
      .and(CO_INPUT_TAG, coInput)
      .apply("CoGroupByKey", CoGroupByKey.create())
      .apply("Ungroup and reiterate", ParDo.of(new UngroupAndReiterate(options.getIterations())))
      .apply(
          "Collect total bytes", ParDo.of(new ByteMonitor(METRICS_NAMESPACE, "totalBytes.count")))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
Example 15
Source File: BeamSideInputJoinRelTest.java from beam with Apache License 2.0
@Test
public void testLeftOuterJoin() throws Exception {
  String sql =
      "SELECT o1.order_id, o1.sum_site_id, o2.buyer FROM "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + " GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 "
          + " LEFT OUTER JOIN "
          + " ORDER_DETAILS1 o2 "
          + " on "
          + " o1.order_id=o2.order_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);
  rows.apply(ParDo.of(new BeamSqlOutputToConsoleFn("helloworld")));
  PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(
                  Schema.builder()
                      .addField("order_id", Schema.FieldType.INT32)
                      .addField("sum_site_id", Schema.FieldType.INT32)
                      .addNullableField("buyer", Schema.FieldType.STRING)
                      .build())
              .addRows(1, 3, "james", 2, 5, "bond", 3, 3, null)
              .getStringRows());
  pipeline.run();
}
Example 16
Source File: ViewEvaluatorFactoryTest.java from beam with Apache License 2.0
@Test
public void testInMemoryEvaluator() throws Exception {
  PCollection<String> input = p.apply(Create.of("foo", "bar"));
  PCollectionView<Iterable<String>> pCollectionView = input.apply(View.asIterable());
  PCollection<Iterable<String>> concat =
      input
          .apply(WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
          .apply(GroupByKey.create())
          .apply(Values.create());
  PCollection<Iterable<String>> view =
      concat.apply(new ViewOverrideFactory.WriteView<>(pCollectionView));

  EvaluationContext context = mock(EvaluationContext.class);
  TestViewWriter<String, Iterable<String>> viewWriter = new TestViewWriter<>();
  when(context.createPCollectionViewWriter(concat, pCollectionView)).thenReturn(viewWriter);

  CommittedBundle<String> inputBundle = bundleFactory.createBundle(input).commit(Instant.now());
  AppliedPTransform<?, ?, ?> producer = DirectGraphs.getProducer(view);
  TransformEvaluator<Iterable<String>> evaluator =
      new ViewEvaluatorFactory(context).forApplication(producer, inputBundle);

  evaluator.processElement(WindowedValue.valueInGlobalWindow(ImmutableList.of("foo", "bar")));
  assertThat(viewWriter.latest, nullValue());

  evaluator.finishBundle();
  assertThat(
      viewWriter.latest,
      containsInAnyOrder(
          WindowedValue.valueInGlobalWindow("foo"), WindowedValue.valueInGlobalWindow("bar")));
}
Example 17
Source File: S3OutputRuntimeTestIT.java from components with Apache License 2.0
@Test
public void testParquet_merge() throws IOException {
  S3DatasetProperties datasetProps = s3.createS3DatasetProperties();
  datasetProps.format.setValue(SimpleFileIOFormat.PARQUET);

  S3OutputProperties outputProperties = new S3OutputProperties("out");
  outputProperties.init();
  outputProperties.setDatasetProperties(datasetProps);
  outputProperties.mergeOutput.setValue(true);

  // Create the runtime.
  S3OutputRuntime runtime = new S3OutputRuntime();
  runtime.initialize(null, outputProperties);

  // Use the runtime in a Spark pipeline to test.
  final Pipeline p = spark.createPipeline();
  PCollection<IndexedRecord> input =
      p.apply(
          Create.of(
              ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }),
              ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" })));
  input.apply(runtime);

  // And run the test.
  p.run().waitUntilFinish();

  FileSystem s3FileSystem = S3Connection.createFileSystem(datasetProps);
  MiniDfsResource.assertReadParquetFile(
      s3FileSystem,
      s3.getS3APath(datasetProps),
      new HashSet<IndexedRecord>(
          Arrays.asList(
              ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }),
              ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))),
      false);
  MiniDfsResource.assertFileNumber(s3FileSystem, s3.getS3APath(datasetProps), 1);
}
Example 18
Source File: Println.java from gcp-ingestion with Mozilla Public License 2.0
@Override
public PDone expand(PCollection<String> input) {
  input.apply(ParDo.of(fn));
  return PDone.in(input.getPipeline());
}
Example 19
Source File: TextTableProvider.java from beam with Apache License 2.0
@Override
public PCollection<String> expand(PCollection<Row> input) {
  return input.apply(
      "rowToCsv",
      MapElements.into(TypeDescriptors.strings()).via(row -> beamRow2CsvLine(row, csvFormat)));
}
Example 20
Source File: LimitRuntime.java from components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PCollection<IndexedRecord> inputPCollection) {
  LimitDoFn doFn = new LimitDoFn().withProperties(properties);
  return inputPCollection.apply(ParDo.of(doFn));
}