Java Code Examples for org.apache.beam.sdk.values.PCollection#apply()
The following examples show how to use org.apache.beam.sdk.values.PCollection#apply().
Each example links to its original project and source file.
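Before the project-specific examples, here is a minimal, self-contained sketch of the basic pattern: apply() attaches a PTransform to a Pipeline or PCollection (optionally with a step name) and returns the transform's output. The pipeline contents and step names in this sketch are illustrative only, not drawn from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class ApplySketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Each apply() returns a new, immutable PCollection; the input is not modified.
    PCollection<String> words = p.apply("CreateWords", Create.of("a", "b", "a"));
    PCollection<KV<String, Long>> counts = words.apply("CountWords", Count.perElement());

    // The optional first argument names the step for monitoring and graph display.
    counts.apply(
        "FormatResults",
        MapElements.into(TypeDescriptors.strings())
            .via(kv -> kv.getKey() + ": " + kv.getValue()));

    p.run().waitUntilFinish();
  }
}

Note that chaining works because apply() returns the output PCollection, which is the style most of the examples below rely on.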
Example 1
Source File: FileIndexerPipeline.java from dataflow-opinion-analysis with Apache License 2.0
/**
 * @param indexes
 * @return
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
    PCollection<ContentIndexSummary> indexes, Float ratio) {

  PCollectionTuple splitAB =
      indexes.apply(
          ParDo.of(new SplitAB(ratio))
              .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

  PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
  PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);

  PCollection<ContentIndexSummary> enrichedBCol =
      branchBCol.apply(ParDo.of(new EnrichWithCNLPEntities()));

  // Merge all collections with WebResource table records
  PCollectionList<ContentIndexSummary> contentIndexSummariesList =
      PCollectionList.of(branchACol).and(enrichedBCol);
  PCollection<ContentIndexSummary> allIndexSummaries =
      contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

  indexes = allIndexSummaries;
  return indexes;
}
Example 2
Source File: GroupByKeyTest.java from beam with Apache License 2.0
@Test
public void testGroupByKeyNonDeterministic() throws Exception {
  List<KV<Map<String, String>, Integer>> ungroupedPairs = Arrays.asList();

  PCollection<KV<Map<String, String>, Integer>> input =
      p.apply(
          Create.of(ungroupedPairs)
              .withCoder(
                  KvCoder.of(
                      MapCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()),
                      BigEndianIntegerCoder.of())));

  thrown.expect(IllegalStateException.class);
  thrown.expectMessage("must be deterministic");
  input.apply(GroupByKey.create());
}
Example 3
Source File: BeamSqlMultipleSchemasTest.java from beam with Apache License 2.0
@Test
public void testOverrideQualifiedMainSchema() {
  PCollection<Row> inputMain =
      pipeline.apply("mainInput", create(row(1, "pcollection_1"), row(2, "pcollection_2")));
  PCollection<Row> inputExtra =
      pipeline.apply("extraInput", create(row(1, "_extra_table_1"), row(2, "_extra_table_2")));

  TableProvider extraInputProvider = extraTableProvider("extraTable", inputExtra);

  PCollection<Row> result =
      inputMain.apply(
          SqlTransform.query("SELECT f_int, f_string FROM beam.extraTable")
              .withTableProvider("beam", extraInputProvider));

  PAssert.that(result).containsInAnyOrder(row(1, "_extra_table_1"), row(2, "_extra_table_2"));
  pipeline.run();
}
Example 4
Source File: SortValuesTest.java from beam with Apache License 2.0
@Test
public void testSecondaryKeySorting() {
  // Create a PCollection of <Key, <SecondaryKey, Value>> pairs.
  PCollection<KV<String, KV<String, Integer>>> input =
      p.apply(
          Create.of(
              Arrays.asList(
                  KV.of("key1", KV.of("secondaryKey2", 20)),
                  KV.of("key2", KV.of("secondaryKey2", 200)),
                  KV.of("key1", KV.of("secondaryKey3", 30)),
                  KV.of("key1", KV.of("secondaryKey1", 10)),
                  KV.of("key2", KV.of("secondaryKey1", 100)))));

  // Group by Key, bringing <SecondaryKey, Value> pairs for the same Key together.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> grouped =
      input.apply(GroupByKey.create());

  // For every Key, sort the iterable of <SecondaryKey, Value> pairs by SecondaryKey.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> groupedAndSorted =
      grouped.apply(SortValues.create(BufferedExternalSorter.options()));

  PAssert.that(groupedAndSorted)
      .satisfies(new AssertThatHasExpectedContentsForTestSecondaryKeySorting());
  p.run();
}
Example 5
Source File: Task.java from beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<BigInteger> numbers =
      pipeline.apply(
          Create.of(
              BigInteger.valueOf(10), BigInteger.valueOf(20), BigInteger.valueOf(30),
              BigInteger.valueOf(40), BigInteger.valueOf(50)));

  PCollection<BigInteger> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 6
Source File: BatchViewOverrides.java from beam with Apache License 2.0
@Override
public PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>> expand(
    PCollection<T> input) {
  @SuppressWarnings("unchecked")
  Coder<W> windowCoder = (Coder<W>) input.getWindowingStrategy().getWindowFn().windowCoder();

  PCollection<KV<Integer, KV<W, WindowedValue<T>>>> rval =
      input.apply(
          ParDo.of(new UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W>(ismCoderForHash)));
  rval.setCoder(
      KvCoder.of(
          VarIntCoder.of(),
          KvCoder.of(windowCoder, FullWindowedValueCoder.of(input.getCoder(), windowCoder))));
  return rval.apply(new GroupByKeyAndSortValuesOnly<>());
}
Example 7
Source File: TextTableProviderTest.java from beam with Apache License 2.0
/**
 * Tests that {@code CREATE EXTERNAL TABLE TYPE text} with a format other than "csv" or "lines"
 * results in a CSV read of that format.
 */
@Test
public void testLegacyTdfCsv() throws Exception {
  Files.write(
      tempFolder.newFile("test.csv").toPath(),
      "hello\t13\n\ngoodbye\t42\n".getBytes(Charsets.UTF_8));

  BeamSqlEnv env = BeamSqlEnv.inMemory(new TextTableProvider());
  env.executeDdl(
      String.format(
          "CREATE EXTERNAL TABLE test %s TYPE text LOCATION '%s/*' TBLPROPERTIES '{\"format\":\"TDF\"}'",
          SQL_CSV_SCHEMA, tempFolder.getRoot()));

  PCollection<Row> rows =
      BeamSqlRelUtils.toPCollection(pipeline, env.parseQuery("SELECT * FROM test"));

  rows.apply(
      MapElements.into(TypeDescriptors.voids())
          .via(
              r -> {
                System.out.println(r.toString());
                return null;
              }));

  PAssert.that(rows)
      .containsInAnyOrder(
          Row.withSchema(CSV_SCHEMA).addValues("hello", 13).build(),
          Row.withSchema(CSV_SCHEMA).addValues("goodbye", 42).build());
  pipeline.run();
}
Example 8
Source File: CassandraIO.java from beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  if (mutationType() == MutationType.DELETE) {
    input.apply(ParDo.of(new DeleteFn<>(this)));
  } else {
    input.apply(ParDo.of(new WriteFn<>(this)));
  }
  return PDone.in(input.getPipeline());
}
Example 9
Source File: FixedFlowInputRuntimeTest.java from components with Apache License 2.0
@Test
public void test_MultipleInput_OneOutputRow() throws Exception {
  String inputAsString =
      generateInputJSON(inputSchema, inputIndexedRecord1)
          + generateInputJSON(inputSchema, inputIndexedRecord2);

  FixedFlowInputProperties properties = new FixedFlowInputProperties("test");
  properties.init();
  properties.schemaFlow.schema.setValue(inputSchema);
  properties.values.setValue(inputAsString);
  properties.nbRows.setValue(1);

  FixedFlowInputRuntime runtime = new FixedFlowInputRuntime();
  runtime.initialize(null, properties);

  PCollection<IndexedRecord> indexRecords = pipeline.apply(runtime);

  try (DirectCollector<IndexedRecord> collector = DirectCollector.of()) {
    indexRecords.apply(collector);

    // Run the pipeline to fill the collectors.
    pipeline.run().waitUntilFinish();

    // Validate the contents of the collected outputs.
    List<IndexedRecord> outputs = collector.getRecords();
    assertEquals(2, outputs.size());
    assertEquals(inputIndexedRecord1.toString(), outputs.get(0).toString());
    assertEquals(inputIndexedRecord2.toString(), outputs.get(1).toString());
  }
}
Example 10
Source File: CombineTranslationTest.java from beam with Apache License 2.0
@Test
public void testToProtoWithoutSideInputs() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  CombineFnWithContext<Integer, int[], Integer> combineFn = new TestCombineFnWithContext();
  input.apply(Combine.globally(combineFn).withoutDefaults());

  final AtomicReference<AppliedPTransform<?, ?, Combine.Globally<?, ?>>> combine =
      new AtomicReference<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void leaveCompositeTransform(Node node) {
          if (node.getTransform() instanceof Combine.Globally) {
            checkState(combine.get() == null);
            combine.set((AppliedPTransform) node.toAppliedPTransform(getPipeline()));
          }
        }
      });
  checkState(combine.get() != null);
  assertEquals(combineFn, combine.get().getTransform().getFn());

  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  CombinePayload combineProto =
      CombineTranslation.CombineGloballyPayloadTranslator.payloadForCombineGlobally(
          (AppliedPTransform) combine.get(), sdkComponents);
  RunnerApi.Components componentsProto = sdkComponents.toComponents();

  assertEquals(
      combineFn.getAccumulatorCoder(pipeline.getCoderRegistry(), input.getCoder()),
      getAccumulatorCoder(combineProto, RehydratedComponents.forComponents(componentsProto)));
  assertEquals(
      combineFn,
      SerializableUtils.deserializeFromByteArray(
          combineProto.getCombineFn().getPayload().toByteArray(), "CombineFn"));
}
Example 11
Source File: ReifyTimestampsTest.java from beam with Apache License 2.0
@Test
@Category(ValidatesRunner.class)
public void extractFromValuesSucceeds() {
  PCollection<KV<String, TimestampedValue<Integer>>> reified =
      pipeline.apply(
          Create.of(
              KV.of("foo", TimestampedValue.of(0, new Instant(0))),
              KV.of("foo", TimestampedValue.of(1, new Instant(1))),
              KV.of("bar", TimestampedValue.of(2, new Instant(2))),
              KV.of("baz", TimestampedValue.of(3, new Instant(3)))));

  PCollection<KV<String, Integer>> timestamped =
      reified.apply(ReifyTimestamps.extractFromValues());

  PAssert.that(timestamped)
      .containsInAnyOrder(KV.of("foo", 0), KV.of("foo", 1), KV.of("bar", 2), KV.of("baz", 3));

  timestamped.apply(
      "AssertElementTimestamps",
      ParDo.of(
          new DoFn<KV<String, Integer>, Void>() {
            @ProcessElement
            public void verifyTimestampsEqualValue(ProcessContext context) {
              assertThat(
                  new Instant(context.element().getValue().longValue()),
                  equalTo(context.timestamp()));
            }
          }));

  pipeline.run();
}
Example 12
Source File: SparkPortableExecutionTest.java from beam with Apache License 2.0
@Test(timeout = 120_000)
public void testExecStageWithMultipleConsumers() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(CrashingRunner.class);
  options
      .as(PortablePipelineOptions.class)
      .setDefaultEnvironmentType(Environments.ENVIRONMENT_EMBEDDED);
  Pipeline pipeline = Pipeline.create(options);

  PCollection<KV<String, Iterable<String>>> f =
      pipeline
          .apply("impulse", Impulse.create())
          .apply("F", ParDo.of(new DoFnWithSideEffect<>("F")))
          // use GBK to prevent fusion of F, G, and H
          .apply(GroupByKey.create());
  f.apply("G", ParDo.of(new DoFnWithSideEffect<>("G")));
  f.apply("H", ParDo.of(new DoFnWithSideEffect<>("H")));

  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline);
  JobInvocation jobInvocation =
      SparkJobInvoker.createJobInvocation(
          "testExecStageWithMultipleConsumers",
          "testExecStageWithMultipleConsumersRetrievalToken",
          sparkJobExecutor,
          pipelineProto,
          options.as(SparkPipelineOptions.class));
  jobInvocation.start();
  Assert.assertEquals(Enum.DONE, jobInvocation.getState());
}
Example 13
Source File: SingleInputOutputOverrideFactoryTest.java from beam with Apache License 2.0
@Test
public void testMapOutputs() {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  PCollection<Integer> output = input.apply("Map", MapElements.via(fn));
  PCollection<Integer> reappliedOutput = input.apply("ReMap", MapElements.via(fn));
  Map<PValue, ReplacementOutput> replacementMap =
      factory.mapOutputs(output.expand(), reappliedOutput);
  assertThat(
      replacementMap,
      Matchers.hasEntry(
          reappliedOutput,
          ReplacementOutput.of(
              TaggedPValue.ofExpandedValue(output),
              TaggedPValue.ofExpandedValue(reappliedOutput))));
}
Example 14
Source File: CoGroupByKeyLoadTest.java from beam with Apache License 2.0
@Override
void loadTest() throws IOException {
  SyntheticSourceOptions coSourceOptions =
      fromJsonString(options.getCoSourceOptions(), SyntheticSourceOptions.class);

  Optional<SyntheticStep> syntheticStep = createStep(options.getStepOptions());

  PCollection<KV<byte[], byte[]>> input =
      pipeline.apply("Read input", readFromSource(sourceOptions));
  input = input.apply("Collect start time metrics (input)", ParDo.of(runtimeMonitor));
  input = applyWindowing(input);
  input = applyStepIfPresent(input, "Synthetic step for input", syntheticStep);

  PCollection<KV<byte[], byte[]>> coInput =
      pipeline.apply("Read co-input", readFromSource(coSourceOptions));
  coInput = coInput.apply("Collect start time metrics (co-input)", ParDo.of(runtimeMonitor));
  coInput = applyWindowing(coInput, options.getCoInputWindowDurationSec());
  coInput = applyStepIfPresent(coInput, "Synthetic step for co-input", syntheticStep);

  KeyedPCollectionTuple.of(INPUT_TAG, input)
      .and(CO_INPUT_TAG, coInput)
      .apply("CoGroupByKey", CoGroupByKey.create())
      .apply("Ungroup and reiterate", ParDo.of(new UngroupAndReiterate(options.getIterations())))
      .apply(
          "Collect total bytes", ParDo.of(new ByteMonitor(METRICS_NAMESPACE, "totalBytes.count")))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
Example 15
Source File: BeamSideInputJoinRelTest.java from beam with Apache License 2.0
@Test
public void testLeftOuterJoin() throws Exception {
  String sql =
      "SELECT o1.order_id, o1.sum_site_id, o2.buyer FROM "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + " GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 "
          + " LEFT OUTER JOIN "
          + " ORDER_DETAILS1 o2 "
          + " on "
          + " o1.order_id=o2.order_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);
  rows.apply(ParDo.of(new BeamSqlOutputToConsoleFn("helloworld")));
  PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(
                  Schema.builder()
                      .addField("order_id", Schema.FieldType.INT32)
                      .addField("sum_site_id", Schema.FieldType.INT32)
                      .addNullableField("buyer", Schema.FieldType.STRING)
                      .build())
              .addRows(1, 3, "james", 2, 5, "bond", 3, 3, null)
              .getStringRows());
  pipeline.run();
}
Example 16
Source File: ViewEvaluatorFactoryTest.java from beam with Apache License 2.0
@Test
public void testInMemoryEvaluator() throws Exception {
  PCollection<String> input = p.apply(Create.of("foo", "bar"));
  PCollectionView<Iterable<String>> pCollectionView = input.apply(View.asIterable());
  PCollection<Iterable<String>> concat =
      input
          .apply(WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
          .apply(GroupByKey.create())
          .apply(Values.create());
  PCollection<Iterable<String>> view =
      concat.apply(new ViewOverrideFactory.WriteView<>(pCollectionView));

  EvaluationContext context = mock(EvaluationContext.class);
  TestViewWriter<String, Iterable<String>> viewWriter = new TestViewWriter<>();
  when(context.createPCollectionViewWriter(concat, pCollectionView)).thenReturn(viewWriter);

  CommittedBundle<String> inputBundle = bundleFactory.createBundle(input).commit(Instant.now());
  AppliedPTransform<?, ?, ?> producer = DirectGraphs.getProducer(view);
  TransformEvaluator<Iterable<String>> evaluator =
      new ViewEvaluatorFactory(context).forApplication(producer, inputBundle);

  evaluator.processElement(WindowedValue.valueInGlobalWindow(ImmutableList.of("foo", "bar")));
  assertThat(viewWriter.latest, nullValue());

  evaluator.finishBundle();
  assertThat(
      viewWriter.latest,
      containsInAnyOrder(
          WindowedValue.valueInGlobalWindow("foo"), WindowedValue.valueInGlobalWindow("bar")));
}
Example 17
Source File: S3OutputRuntimeTestIT.java from components with Apache License 2.0
@Test
public void testParquet_merge() throws IOException {
  S3DatasetProperties datasetProps = s3.createS3DatasetProperties();
  datasetProps.format.setValue(SimpleFileIOFormat.PARQUET);

  S3OutputProperties outputProperties = new S3OutputProperties("out");
  outputProperties.init();
  outputProperties.setDatasetProperties(datasetProps);
  outputProperties.mergeOutput.setValue(true);

  // Create the runtime.
  S3OutputRuntime runtime = new S3OutputRuntime();
  runtime.initialize(null, outputProperties);

  // Use the runtime in a Spark pipeline to test.
  final Pipeline p = spark.createPipeline();
  PCollection<IndexedRecord> input =
      p.apply(
          Create.of(
              ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }),
              ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" })));
  input.apply(runtime);

  // And run the test.
  p.run().waitUntilFinish();

  FileSystem s3FileSystem = S3Connection.createFileSystem(datasetProps);
  MiniDfsResource.assertReadParquetFile(
      s3FileSystem,
      s3.getS3APath(datasetProps),
      new HashSet<IndexedRecord>(
          Arrays.asList(
              ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }),
              ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))),
      false);
  MiniDfsResource.assertFileNumber(s3FileSystem, s3.getS3APath(datasetProps), 1);
}
Example 18
Source File: Println.java from gcp-ingestion with Mozilla Public License 2.0
@Override
public PDone expand(PCollection<String> input) {
  input.apply(ParDo.of(fn));
  return PDone.in(input.getPipeline());
}
Example 19
Source File: TextTableProvider.java from beam with Apache License 2.0
@Override
public PCollection<String> expand(PCollection<Row> input) {
  return input.apply(
      "rowToCsv",
      MapElements.into(TypeDescriptors.strings()).via(row -> beamRow2CsvLine(row, csvFormat)));
}
Example 20
Source File: LimitRuntime.java from components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PCollection<IndexedRecord> inputPCollection) {
  LimitDoFn doFn = new LimitDoFn().withProperties(properties);
  return inputPCollection.apply(ParDo.of(doFn));
}