org.apache.beam.sdk.transforms.View Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.View.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: IndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0 | 6 votes |
/**
 * Passes {@code readContent} through {@code FilterProcessedUrls}, which receives the
 * already-processed URLs as a map-valued side input.
 *
 * <p>The side input is built from a BigQuery read using the query returned by
 * {@code IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options)}, mapped through
 * {@code GetUrlFn} and materialized with {@code View.asMap()}.
 *
 * @param readContent the content to filter
 * @param pipeline the pipeline the BigQuery read is applied to
 * @param options options from which the processed-URLs query is built
 * @return the output of the {@code FilterProcessedUrls} step
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
    PCollection<InputContent> readContent, Pipeline pipeline, IndexerPipelineOptions options) {
  final String processedUrlsQuery = IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);

  final PCollectionView<Map<String, Long>> processedUrlsView =
      pipeline
          .apply("Get processed URLs", BigQueryIO.read().fromQuery(processedUrlsQuery))
          .apply(ParDo.of(new GetUrlFn()))
          .apply(View.<String, Long>asMap());

  return readContent.apply(
      ParDo.of(new FilterProcessedUrls(processedUrlsView)).withSideInputs(processedUrlsView));
}
Example #2
Source File: Write.java From components with Apache License 2.0 | 6 votes |
/**
 * Builds a singleton view holding the value of {@code numShards}.
 *
 * <p>A one-element collection is created on the input's pipeline purely to drive a single
 * invocation of the DoFn, which emits {@code numShards.get()}.
 */
@Override
public PCollectionView<Integer> expand(PCollection<T> input) {
  final PCollection<Integer> seed = input.getPipeline().apply(Create.of(0));

  final DoFn<Integer, Integer> emitNumShards =
      new DoFn<Integer, Integer>() {
        @ProcessElement
        public void outputNumShards(ProcessContext context) {
          // The value is only read here, at execution time, so it must be accessible by now.
          checkArgument(
              numShards.isAccessible(),
              "NumShards must be accessible at runtime to use constant sharding");
          context.output(numShards.get());
        }
      };

  return seed.apply("FixedNumShards", ParDo.of(emitNumShards))
      .apply(View.<Integer>asSingleton());
}
Example #3
Source File: DLPReidentifyTextTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Expects {@code DLPReidentifyText.newBuilder()...build()} to throw an
 * {@link IllegalArgumentException} when header columns are set but no column delimiter is.
 */
@Test
public void throwsExceptionWhenDelimiterIsNullAndHeadersAreSet() {
  PCollectionView<List<String>> headerColumnsView =
      testPipeline
          .apply(Create.of("header"))
          .apply(View.<String>asList());

  assertThrows(
      "Column delimiter should be set if headers are present.",
      IllegalArgumentException.class,
      () ->
          DLPReidentifyText.newBuilder()
              .setProjectId(PROJECT_ID)
              .setBatchSizeBytes(BATCH_SIZE_SMALL)
              .setReidentifyTemplateName(TEMPLATE_NAME)
              .setHeaderColumns(headerColumnsView)
              .build());

  // The pipeline is still run after the builder assertion, as in the original test.
  testPipeline.run().waitUntilFinish();
}
Example #4
Source File: DataflowPTransformMatchersTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Builds a small pipeline in which a {@link Combine.GroupedValues} is applied with a side input.
 *
 * @return the pipeline, with abandoned-node enforcement disabled
 */
private static TestPipeline createCombineGroupedValuesWithSideInputsPipeline() {
  TestPipeline testPipeline = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  PCollection<KV<String, Integer>> keyedInput =
      testPipeline
          .apply(Create.of(KV.of("key", 1)))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));

  PCollectionView<String> sideInputView =
      testPipeline.apply(Create.of("side input")).apply(View.asSingleton());

  PCollection<KV<String, Iterable<Integer>>> grouped = keyedInput.apply(GroupByKey.create());
  grouped.apply(
      Combine.<String, Integer, Integer>groupedValues(new SumCombineFnWithContext())
          .withSideInputs(sideInputView));

  return testPipeline;
}
Example #5
Source File: DataflowPipelineTranslator.java From beam with Apache License 2.0 | 6 votes |
/**
 * Translates a {@link View.CreatePCollectionView} into a "CollectionToSingleton" step.
 *
 * <p>The step receives the transform's input as its parallel input, the serialized windowing
 * strategy of that input, a flag telling whether the window function is merging, and the
 * transform's view as its output.
 *
 * @param transform the view-creating transform being translated
 * @param context the translation context the step is added to
 */
private <ElemT, ViewT> void translateTyped(
    View.CreatePCollectionView<ElemT, ViewT> transform, TranslationContext context) {
  StepTranslationContext step = context.addStep(transform, "CollectionToSingleton");

  PCollection<ElemT> parallelInput = context.getInput(transform);
  step.addInput(PropertyNames.PARALLEL_INPUT, parallelInput);

  WindowingStrategy<?, ?> strategy = parallelInput.getWindowingStrategy();
  String strategyJson =
      byteArrayToJsonString(serializeWindowingStrategy(strategy, context.getPipelineOptions()));
  step.addInput(PropertyNames.WINDOWING_STRATEGY, strategyJson);

  boolean mergingWindowFn = !strategy.getWindowFn().isNonMerging();
  step.addInput(PropertyNames.IS_MERGING_WINDOW_FN, mergingWindowFn);

  step.addCollectionToSingletonOutput(parallelInput, PropertyNames.OUTPUT, transform.getView());
}
Example #6
Source File: HadoopFormatIO.java From beam with Apache License 2.0 | 6 votes |
/**
 * Produces the {@link PCollectionView} carrying the single {@link Configuration} to use.
 *
 * <p>If {@code configuration} is set, it is wrapped in a singleton view (and also used as that
 * view's default value); otherwise {@code configTransform} is applied to {@code input}.
 *
 * @param input input data
 * @return PCollectionView with single {@link Configuration}
 * @see Builder#withConfiguration(Configuration)
 * @see Builder#withConfigurationTransform(PTransform)
 */
private PCollectionView<Configuration> createConfigurationView(
    PCollection<KV<KeyT, ValueT>> input) {
  if (configuration == null) {
    return input.apply("TransformDataIntoConfig", configTransform);
  }

  return input
      .getPipeline()
      .apply("CreateOutputConfig", Create.<Configuration>of(configuration))
      .apply(View.<Configuration>asSingleton().withDefaultValue(configuration));
}
Example #7
Source File: BigQueryIOReadTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Runs a pipeline containing {@code PassThroughThenCleanup} over an empty input and expects the
 * exception thrown by the supplied cleanup operation to surface from {@code p.run()}.
 */
@Test
public void testPassThroughThenCleanupExecuted() throws Exception {
  // Cleanup that always fails, so its execution is observable as a RuntimeException.
  PassThroughThenCleanup.CleanupOperation failingCleanup =
      new PassThroughThenCleanup.CleanupOperation() {
        @Override
        void cleanup(PassThroughThenCleanup.ContextContainer c) throws Exception {
          throw new RuntimeException("cleanup executed");
        }
      };

  p.apply(Create.empty(VarIntCoder.of()))
      .apply(
          new PassThroughThenCleanup<>(
              failingCleanup,
              p.apply("Create1", Create.of("")).apply(View.asSingleton())));

  thrown.expect(RuntimeException.class);
  thrown.expectMessage("cleanup executed");

  p.run();
}
Example #8
Source File: DirectGraphVisitorTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Creates a list view and a singleton view, applies the direct runner's default overrides, and
 * checks that the visitor's graph reports exactly those two views.
 */
@Test
public void getViewsReturnsViews() {
  // Emits the length of each input string, rendered as a string.
  DoFn<String, String> lengthAsString =
      new DoFn<String, String>() {
        @ProcessElement
        public void processElement(DoFn<String, String>.ProcessContext context)
            throws Exception {
          context.output(Integer.toString(context.element().length()));
        }
      };

  PCollectionView<List<String>> listView =
      p.apply("listCreate", Create.of("foo", "bar"))
          .apply(ParDo.of(lengthAsString))
          .apply(View.asList());

  PCollectionView<Object> singletonView =
      p.apply("singletonCreate", Create.<Object>of(1, 2, 3)).apply(View.asSingleton());

  p.replaceAll(
      DirectRunner.fromOptions(TestPipeline.testingPipelineOptions())
          .defaultTransformOverrides());
  p.traverseTopologically(visitor);

  assertThat(
      visitor.getGraph().getViews(), Matchers.containsInAnyOrder(listView, singletonView));
}
Example #9
Source File: SideInputContainerTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Writes values that belong to both {@code FIRST_WINDOW} and {@code SECOND_WINDOW} in a single
 * call and checks the singleton view reads back the same value in each window.
 */
@Test
public void writeForElementInMultipleWindowsSucceeds() throws Exception {
  final double expected = 2.875;

  ImmutableList.Builder<WindowedValue<?>> windowedValues = ImmutableList.builder();
  for (Object materialized :
      materializeValuesFor(
          singletonView.getPipeline().getOptions(), View.asSingleton(), expected)) {
    // Each materialized value is assigned to both windows at once.
    windowedValues.add(
        WindowedValue.of(
            materialized,
            FIRST_WINDOW.maxTimestamp().minus(200L),
            ImmutableList.of(FIRST_WINDOW, SECOND_WINDOW),
            PaneInfo.ON_TIME_AND_ONLY_FIRING));
  }
  container.write(singletonView, windowedValues.build());

  assertThat(
      container
          .createReaderForViews(ImmutableList.of(singletonView))
          .get(singletonView, FIRST_WINDOW),
      equalTo(expected));
  assertThat(
      container
          .createReaderForViews(ImmutableList.of(singletonView))
          .get(singletonView, SECOND_WINDOW),
      equalTo(expected));
}
Example #10
Source File: IndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0 | 5 votes |
/** * @param contentToIndexNotSkipped * @param contentNotToIndexSkipped * @param pipeline * @param options * @return */ private static ContentToIndexOrNot filterAlreadyProcessedDocuments( PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped, Pipeline pipeline, IndexerPipelineOptions options) { PCollection<KV<String,Long>> alreadyProcessedDocs = null; if (!options.getWriteTruncate()) { String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options); alreadyProcessedDocs = pipeline .apply("Get already processed Documents",BigQueryIO.read().fromQuery(query)) .apply(ParDo.of(new GetDocumentHashFn())); } else { Map<String, Long> map = new HashMap<String,Long>(); alreadyProcessedDocs = pipeline .apply("Create empty side input of Docs", Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(),VarLongCoder.of()))); } final PCollectionView<Map<String,Long>> alreadyProcessedDocsSideInput = alreadyProcessedDocs.apply(View.<String,Long>asMap()); PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped .apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn())) .apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create()) .apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput)) .withSideInputs(alreadyProcessedDocsSideInput) .withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag); PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag); // Merge the sets of items that are dupes or skipped PCollectionList<InputContent> contentNotToIndexList = 
PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped); ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections())); return content; }
Example #11
Source File: SideInputHandlerTest.java From beam with Apache License 2.0 | 5 votes |
@Test public void testIsReady() { SideInputHandler sideInputHandler = new SideInputHandler( ImmutableList.of(view1, view2), InMemoryStateInternals.<Void>forKey(null)); IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(WINDOW_MSECS_1)); IntervalWindow secondWindow = new IntervalWindow(new Instant(0), new Instant(WINDOW_MSECS_2)); // side input should not yet be ready assertFalse(sideInputHandler.isReady(view1, firstWindow)); // add a value for view1 sideInputHandler.addSideInputValue( view1, valuesInWindow( materializeValuesFor(view1.getPipeline().getOptions(), View.asIterable(), "Hello"), new Instant(0), firstWindow)); // now side input should be ready assertTrue(sideInputHandler.isReady(view1, firstWindow)); // second window input should still not be ready assertFalse(sideInputHandler.isReady(view1, secondWindow)); }
Example #12
Source File: DataflowPipelineTranslatorTest.java From beam with Apache License 2.0 | 5 votes |
@Test public void testToSingletonTranslationWithIsmSideInput() throws Exception { // A "change detector" test that makes sure the translation // of getting a PCollectionView<T> does not change // in bad ways during refactor DataflowPipelineOptions options = buildPipelineOptions(); DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options); Pipeline pipeline = Pipeline.create(options); pipeline.apply(Create.of(1)).apply(View.asSingleton()); DataflowRunner runner = DataflowRunner.fromOptions(options); runner.replaceTransforms(pipeline); SdkComponents sdkComponents = createSdkComponents(options); RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true); Job job = translator .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList()) .getJob(); assertAllStepOutputsHaveUniqueIds(job); List<Step> steps = job.getSteps(); assertEquals(9, steps.size()); @SuppressWarnings("unchecked") List<Map<String, Object>> toIsmRecordOutputs = (List<Map<String, Object>>) steps.get(steps.size() - 2).getProperties().get(PropertyNames.OUTPUT_INFO); assertTrue( Structs.getBoolean(Iterables.getOnlyElement(toIsmRecordOutputs), "use_indexed_format")); Step collectionToSingletonStep = steps.get(steps.size() - 1); assertEquals("CollectionToSingleton", collectionToSingletonStep.getKind()); }
Example #13
Source File: Broadcast.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Entry point of the BEAM program.
 *
 * <p>Reads a collection of strings from {@code args[0]}, exposes the whole collection as an
 * iterable side input, and for every element outputs {@code "line: "} followed by the element,
 * a newline and all side-input elements joined by newlines (or {@code "error"} if the side input
 * is empty). The result is written to {@code args[1]}.
 *
 * @param args arguments: input file path, then output file path.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];

  final PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(NemoPipelineRunner.class);
  final Pipeline p = Pipeline.create(options);

  final PCollection<String> lines = GenericSourceSink.read(p, inputFilePath);
  final PCollectionView<Iterable<String>> allLines = lines.apply(View.<String>asIterable());

  final DoFn<String, String> appendAllLines =
      new DoFn<String, String>() {
        @ProcessElement
        public void processElement(final ProcessContext c) {
          final String line = c.element();
          final Iterable<String> everyLine = c.sideInput(allLines);
          final Optional<String> joined =
              StreamSupport.stream(everyLine.spliterator(), false)
                  .reduce((l, r) -> l + '\n' + r);
          c.output(joined.map(text -> "line: " + line + "\n" + text).orElse("error"));
        }
      };

  final PCollection<String> result =
      lines.apply(ParDo.of(appendAllLines).withSideInputs(allLines));

  GenericSourceSink.write(result, outputFilePath);
  p.run();
}
Example #14
Source File: DataflowPipelineTranslatorTest.java From beam with Apache License 2.0 | 5 votes |
@Test public void testToSingletonTranslationWithFnApiSideInput() throws Exception { // A "change detector" test that makes sure the translation // of getting a PCollectionView<T> does not change // in bad ways during refactor DataflowPipelineOptions options = buildPipelineOptions(); options.setExperiments(Arrays.asList("beam_fn_api")); DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options); Pipeline pipeline = Pipeline.create(options); pipeline.apply(Create.of(1)).apply(View.asSingleton()); DataflowRunner runner = DataflowRunner.fromOptions(options); runner.replaceTransforms(pipeline); SdkComponents sdkComponents = createSdkComponents(options); RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true); Job job = translator .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList()) .getJob(); assertAllStepOutputsHaveUniqueIds(job); List<Step> steps = job.getSteps(); assertEquals(9, steps.size()); Step collectionToSingletonStep = steps.get(steps.size() - 1); assertEquals("CollectionToSingleton", collectionToSingletonStep.getKind()); @SuppressWarnings("unchecked") List<Map<String, Object>> ctsOutputs = (List<Map<String, Object>>) steps.get(steps.size() - 1).getProperties().get(PropertyNames.OUTPUT_INFO); assertTrue(Structs.getBoolean(Iterables.getOnlyElement(ctsOutputs), "use_indexed_format")); }
Example #15
Source File: TextRowToMutationTest.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/**
 * Feeds a single delimited text row through {@code TextRowToMutation} and expects the pipeline
 * run to fail with a {@code PipelineExecutionException}.
 *
 * <p>NOTE(review): judging by the test name the row carries more columns than the test DDL
 * defines; confirm against {@code getTestDdl()}.
 */
@Test(expected = PipelineExecutionException.class)
public void parseRowToMutationTooManyColumns() throws Exception {
  PCollectionView<Ddl> ddlView =
      pipeline.apply("ddl", Create.of(getTestDdl())).apply(View.asSingleton());

  PCollectionView<Map<String, List<TableManifest.Column>>> tableColumnsMapView =
      pipeline
          .apply(
              "tableColumnsMap",
              Create.<Map<String, List<TableManifest.Column>>>of(getEmptyTableColumnsMap())
                  .withCoder(
                      MapCoder.of(
                          StringUtf8Coder.of(),
                          ListCoder.of(ProtoCoder.of(TableManifest.Column.class)))))
          .apply("Map as view", View.asSingleton());

  PCollection<KV<String, String>> rows =
      pipeline.apply(
          "input",
          Create.of(KV.of(testTableName, "123,a string,yet another string,1.23,True,,,,,,,")));

  TextRowToMutation rowToMutation =
      new TextRowToMutation(
          ddlView,
          tableColumnsMapView,
          columnDelimiter,
          StaticValueProvider.of('"'),
          trailingDelimiter,
          escape,
          nullString,
          dateFormat,
          timestampFormat);

  // The result is never inspected: the failure is expected to surface from pipeline.run().
  PCollection<Mutation> mutations =
      rows.apply(ParDo.of(rowToMutation).withSideInputs(ddlView, tableColumnsMapView));

  pipeline.run();
}
Example #16
Source File: SpannerIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Validates the Spanner configuration, then produces a singleton view of the
 * {@code Transaction} emitted by {@code CreateTransactionFn}.
 *
 * <p>A one-element collection is created only to drive the "Create transaction" step.
 */
@Override
public PCollectionView<Transaction> expand(PBegin input) {
  getSpannerConfig().validate();

  final CreateTransactionFn createTransaction = new CreateTransactionFn(this);
  return input
      .apply(Create.of(1))
      .apply("Create transaction", ParDo.of(createTransaction))
      .apply("As PCollectionView", View.asSingleton());
}
Example #17
Source File: PipelineTranslationTest.java From beam with Apache License 2.0 | 5 votes |
@Parameters(name = "{index}") public static Iterable<Pipeline> testPipelines() { Pipeline trivialPipeline = Pipeline.create(); trivialPipeline.apply(Create.of(1, 2, 3)); Pipeline sideInputPipeline = Pipeline.create(); final PCollectionView<String> singletonView = sideInputPipeline.apply(Create.of("foo")).apply(View.asSingleton()); sideInputPipeline .apply(Create.of("main input")) .apply( ParDo.of( new DoFn<String, String>() { @ProcessElement public void process(ProcessContext c) { // actually never executed and no effect on translation c.sideInput(singletonView); } }) .withSideInputs(singletonView)); Pipeline complexPipeline = Pipeline.create(); BigEndianLongCoder customCoder = BigEndianLongCoder.of(); PCollection<Long> elems = complexPipeline.apply(GenerateSequence.from(0L).to(207L)); PCollection<Long> counted = elems.apply(Count.globally()).setCoder(customCoder); PCollection<Long> windowed = counted.apply( Window.<Long>into(FixedWindows.of(Duration.standardMinutes(7))) .triggering( AfterWatermark.pastEndOfWindow() .withLateFirings(AfterPane.elementCountAtLeast(19))) .accumulatingFiredPanes() .withAllowedLateness(Duration.standardMinutes(3L))); final WindowingStrategy<?, ?> windowedStrategy = windowed.getWindowingStrategy(); PCollection<KV<String, Long>> keyed = windowed.apply(WithKeys.of("foo")); PCollection<KV<String, Iterable<Long>>> grouped = keyed.apply(GroupByKey.create()); return ImmutableList.of(trivialPipeline, sideInputPipeline, complexPipeline); }
Example #18
Source File: Broadcast.java From incubator-nemo with Apache License 2.0 | 5 votes |
/**
 * Entry point of the BEAM program.
 *
 * <p>Reads a collection of strings from {@code args[0]}, exposes the whole collection as an
 * iterable side input, and for every element outputs {@code "line: "} followed by the element,
 * a newline and all side-input elements joined by newlines (or {@code "error"} if the side input
 * is empty). The result is written to {@code args[1]} and the method waits for the pipeline to
 * finish.
 *
 * @param args arguments: input file path, then output file path.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];

  final PipelineOptions options = NemoPipelineOptionsFactory.create();
  final Pipeline p = Pipeline.create(options);

  final PCollection<String> lines = GenericSourceSink.read(p, inputFilePath);
  final PCollectionView<Iterable<String>> allLines = lines.apply(View.<String>asIterable());

  final DoFn<String, String> appendAllLines =
      new DoFn<String, String>() {
        @ProcessElement
        public void processElement(final ProcessContext c) {
          final String line = c.element();
          final Iterable<String> everyLine = c.sideInput(allLines);
          final Optional<String> joined =
              StreamSupport.stream(everyLine.spliterator(), false)
                  .reduce((l, r) -> l + '\n' + r);
          c.output(joined.map(text -> "line: " + line + "\n" + text).orElse("error"));
        }
      };

  final PCollection<String> result =
      lines.apply(ParDo.of(appendAllLines).withSideInputs(allLines));

  GenericSourceSink.write(result, outputFilePath);
  p.run().waitUntilFinish();
}
Example #19
Source File: TransformTranslator.java From beam with Apache License 2.0 | 5 votes |
/**
 * Returns the evaluator for {@link View.CreatePCollectionView}.
 *
 * <p>The evaluator takes the windowed values of the transform's input and registers them in the
 * context under the transform's view, together with an iterable coder built from the view's
 * internal coder and its window coder.
 */
private static <ReadT, WriteT>
    TransformEvaluator<View.CreatePCollectionView<ReadT, WriteT>> createPCollView() {
  return new TransformEvaluator<View.CreatePCollectionView<ReadT, WriteT>>() {
    @Override
    public void evaluate(
        View.CreatePCollectionView<ReadT, WriteT> transform, EvaluationContext context) {
      Iterable<? extends WindowedValue<?>> windowedValues =
          context.getWindowedValues(context.getInput(transform));
      PCollectionView<WriteT> view = transform.getView();

      // Raw cast: the element type of the full coder cannot be expressed with wildcards here.
      Coder<Iterable<WindowedValue<?>>> viewContentsCoder =
          (Coder)
              IterableCoder.of(
                  WindowedValue.getFullCoder(
                      view.getCoderInternal(),
                      view.getWindowingStrategyInternal().getWindowFn().windowCoder()));

      @SuppressWarnings("unchecked")
      Iterable<WindowedValue<?>> viewContents = (Iterable<WindowedValue<?>>) windowedValues;

      context.putPView(view, viewContents, viewContentsCoder);
    }

    @Override
    public String toNativeString() {
      return "<createPCollectionView>";
    }
  };
}
Example #20
Source File: EvaluationContextTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Builds the test pipeline (a bounded source, a keyed downstream collection, an iterable view
 * and an unbounded source), applies the direct runner's overrides, and creates the
 * {@code EvaluationContext} plus the producer handles used by the tests.
 */
@Before
public void setup() {
  DirectRunner directRunner = DirectRunner.fromOptions(PipelineOptionsFactory.create());

  created = p.apply(Create.of(1, 2, 3));
  downstream = created.apply(WithKeys.of("foo"));
  view = created.apply(View.asIterable());
  unbounded = p.apply(GenerateSequence.from(0));

  p.replaceAll(directRunner.defaultTransformOverrides());

  KeyedPValueTrackingVisitor keyedVisitor = KeyedPValueTrackingVisitor.create();
  p.traverseTopologically(keyedVisitor);

  BundleFactory bundles = ImmutableListBundleFactory.create();
  DirectGraphs.performDirectOverrides(p);
  graph = DirectGraphs.getGraph(p);

  context =
      EvaluationContext.create(
          NanosOffsetClock.create(),
          bundles,
          graph,
          keyedVisitor.getKeyedPValues(),
          Executors.newSingleThreadExecutor());

  // Producers of each value, looked up from the graph.
  createdProducer = graph.getProducer(created);
  downstreamProducer = graph.getProducer(downstream);
  viewProducer = graph.getProducer(view);
  unboundedProducer = graph.getProducer(unbounded);
}
Example #21
Source File: PTransformMatchersTest.java From beam with Apache License 2.0 | 5 votes |
@Test public void createViewWithViewFnDifferentViewFn() { PCollection<Integer> input = p.apply(Create.of(1)); PCollectionView<Iterable<Integer>> view = input.apply(View.asIterable()); // Purposely create a subclass to get a different class then what was expected. IterableViewFn<Integer> viewFn = new PCollectionViews.IterableViewFn<Integer>(() -> TypeDescriptors.integers()) {}; CreatePCollectionView<?, ?> createView = CreatePCollectionView.of(view); PTransformMatcher matcher = PTransformMatchers.createViewWithViewFn(viewFn.getClass()); assertThat(matcher.matches(getAppliedTransform(createView)), is(false)); }
Example #22
Source File: BoundedSideInputJoin.java From beam with Apache License 2.0 | 5 votes |
@Override public PCollection<Bid> expand(PCollection<Event> events) { checkState(getSideInput() != null, "Configuration error: side input is null"); final PCollectionView<Map<Long, String>> sideInputMap = getSideInput().apply(View.asMap()); return events // Only want the bid events; easier to fake some side input data .apply(NexmarkQueryUtil.JUST_BIDS) // Map the conversion function over all bids. .apply( name + ".JoinToFiles", ParDo.of( new DoFn<Bid, Bid>() { @ProcessElement public void processElement(ProcessContext c) { Bid bid = c.element(); c.output( new Bid( bid.auction, bid.bidder, bid.price, bid.dateTime, c.sideInput(sideInputMap) .get(bid.bidder % configuration.sideInputRowCount))); } }) .withSideInputs(sideInputMap)); }
Example #23
Source File: SideInputLoadTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Wires the load test that consumes the side input as a {@code List}.
 *
 * @param input the main input, also used (after optional windowing) as the side-input source
 * @param syntheticStep optional synthetic step handed to {@code applyStepIfPresent}
 */
private void performTestWithList(
    PCollection<KV<byte[], byte[]>> input, Optional<SyntheticStep> syntheticStep) {
  // NOTE(review): the value returned by applyStepIfPresent is not used; the rest of the method
  // keeps working on `input`. Confirm that this is intended.
  applyStepIfPresent(input, "Synthetic step", syntheticStep);

  PCollectionView<List<KV<byte[], byte[]>>> listView =
      applyWindowingIfPresent(input).apply(View.asList());

  SideInputTestWithList consumeSideInput = new SideInputTestWithList(listView);
  input
      .apply(ParDo.of(consumeSideInput).withSideInputs(listView))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
Example #24
Source File: BatchLoads.java From beam with Apache License 2.0 | 5 votes |
/**
 * Creates a singleton view holding the location BigQuery temporary files are written to.
 *
 * <p>The location is resolved from {@code customGcsTempLocation} when it is set, otherwise from
 * the pipeline options' temp location, combined with {@code "BigQueryWriteTemp"} and the job id
 * read from {@code jobIdView}.
 *
 * @param p the pipeline the steps are applied to
 * @param jobIdView side input providing the job id
 * @return a singleton view of the resolved temp location
 */
private PCollectionView<String> createTempFilePrefixView(
    Pipeline p, final PCollectionView<String> jobIdView) {
  final DoFn<String, String> resolveTempFilePrefix =
      new DoFn<String, String>() {
        @ProcessElement
        public void getTempFilePrefix(ProcessContext c) {
          // A custom location, when configured, takes precedence over the pipeline default.
          String root =
              customGcsTempLocation != null
                  ? customGcsTempLocation.get()
                  : c.getPipelineOptions().getTempLocation();
          String tempLocation =
              resolveTempLocation(root, "BigQueryWriteTemp", c.sideInput(jobIdView));
          LOG.info(
              "Writing BigQuery temporary files to {} before loading them.", tempLocation);
          c.output(tempLocation);
        }
      };

  return p.apply(Create.of(""))
      .apply(
          "GetTempFilePrefix", ParDo.of(resolveTempFilePrefix).withSideInputs(jobIdView))
      .apply("TempFilePrefixView", View.asSingleton());
}
Example #25
Source File: SideInputLoadTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Wires the load test that consumes the side input as an {@code Iterable}.
 *
 * @param input the main input, also used (after optional windowing) as the side-input source
 * @param syntheticStep optional synthetic step handed to {@code applyStepIfPresent}
 */
private void performTestWithIterable(
    PCollection<KV<byte[], byte[]>> input, Optional<SyntheticStep> syntheticStep) {
  // NOTE(review): the value returned by applyStepIfPresent is not used; the rest of the method
  // keeps working on `input`. Confirm that this is intended.
  applyStepIfPresent(input, "Synthetic step", syntheticStep);

  PCollectionView<Iterable<KV<byte[], byte[]>>> iterableView =
      applyWindowingIfPresent(input).apply(View.asIterable());

  SideInputTestWithIterable consumeSideInput = new SideInputTestWithIterable(iterableView);
  input
      .apply(ParDo.of(consumeSideInput).withSideInputs(iterableView))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
Example #26
Source File: PAssert.java From beam with Apache License 2.0 | 5 votes |
/**
 * Constructs a {@link SingletonAssert} over the multimap view of the provided
 * {@link PCollection}, with the specified reason.
 *
 * <p>Note that the coder of {@code actual} must be a {@link KvCoder}, not just any
 * {@code Coder<KV<K, V>>}; it is cast to {@link KvCoder} to obtain the key and value coders.
 *
 * @param reason the reason recorded with the assertion site
 * @param actual the key/value collection to assert on
 */
public static <K, V> SingletonAssert<Map<K, Iterable<V>>> thatMultimap(
    String reason, PCollection<KV<K, V>> actual) {
  @SuppressWarnings("unchecked")
  KvCoder<K, V> entryCoder = (KvCoder<K, V>) actual.getCoder();

  MapCoder<K, Iterable<V>> multimapCoder =
      MapCoder.of(entryCoder.getKeyCoder(), IterableCoder.of(entryCoder.getValueCoder()));

  return new PCollectionViewAssert<>(
      actual, View.asMultimap(), multimapCoder, PAssertionSite.capture(reason));
}
Example #27
Source File: PAssert.java From beam with Apache License 2.0 | 5 votes |
/**
 * Constructs a {@link SingletonAssert} over the map view of the provided {@link PCollection},
 * with the specified reason. The {@link PCollection} must have at most one value per key.
 *
 * <p>Note that the coder of {@code actual} must be a {@link KvCoder}, not just any
 * {@code Coder<KV<K, V>>}; it is cast to {@link KvCoder} to obtain the key and value coders.
 *
 * @param reason the reason recorded with the assertion site
 * @param actual the key/value collection to assert on
 */
public static <K, V> SingletonAssert<Map<K, V>> thatMap(
    String reason, PCollection<KV<K, V>> actual) {
  @SuppressWarnings("unchecked")
  KvCoder<K, V> entryCoder = (KvCoder<K, V>) actual.getCoder();

  MapCoder<K, V> mapCoder =
      MapCoder.of(entryCoder.getKeyCoder(), entryCoder.getValueCoder());

  return new PCollectionViewAssert<>(
      actual, View.asMap(), mapCoder, PAssertionSite.capture(reason));
}
Example #28
Source File: CacheTest.java From beam with Apache License 2.0 | 5 votes |
/** * Test checks how the cache candidates map is populated by the runner when evaluating the * pipeline. */ @Test public void cacheCandidatesUpdaterTest() { SparkPipelineOptions options = createOptions(); Pipeline pipeline = Pipeline.create(options); PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar")); // First use of pCollection. pCollection.apply(Count.globally()); // Second use of pCollection. PCollectionView<List<String>> view = pCollection.apply(View.asList()); // Internally View.asList() creates a PCollection that underlies the PCollectionView, that // PCollection should not be cached as the SparkRunner does not access that PCollection to // access the PCollectionView. pipeline .apply(Create.of("foo", "baz")) .apply( ParDo.of( new DoFn<String, String>() { @ProcessElement public void processElement(ProcessContext processContext) { if (processContext.sideInput(view).contains(processContext.element())) { processContext.output(processContext.element()); } } }) .withSideInputs(view)); JavaSparkContext jsc = SparkContextFactory.getSparkContext(options); EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options); SparkRunner.CacheVisitor cacheVisitor = new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt); pipeline.traverseTopologically(cacheVisitor); assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection)); assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count()); }
Example #29
Source File: DataflowPipelineTranslatorTest.java From beam with Apache License 2.0 | 5 votes |
@Test public void testToIterableTranslationWithIsmSideInput() throws Exception { // A "change detector" test that makes sure the translation // of getting a PCollectionView<Iterable<T>> does not change // in bad ways during refactor DataflowPipelineOptions options = buildPipelineOptions(); DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options); Pipeline pipeline = Pipeline.create(options); pipeline.apply(Create.of(1, 2, 3)).apply(View.asIterable()); DataflowRunner runner = DataflowRunner.fromOptions(options); runner.replaceTransforms(pipeline); SdkComponents sdkComponents = createSdkComponents(options); RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true); Job job = translator .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList()) .getJob(); assertAllStepOutputsHaveUniqueIds(job); List<Step> steps = job.getSteps(); assertEquals(3, steps.size()); @SuppressWarnings("unchecked") List<Map<String, Object>> toIsmRecordOutputs = (List<Map<String, Object>>) steps.get(steps.size() - 2).getProperties().get(PropertyNames.OUTPUT_INFO); assertTrue( Structs.getBoolean(Iterables.getOnlyElement(toIsmRecordOutputs), "use_indexed_format")); Step collectionToSingletonStep = steps.get(steps.size() - 1); assertEquals("CollectionToSingleton", collectionToSingletonStep.getKind()); }
Example #30
Source File: SamzaPublishViewTransformOverride.java From beam with Apache License 2.0 | 5 votes |
/**
 * Replaces a {@link View.CreatePCollectionView} with a
 * {@code SamzaCreatePCollectionViewTransform} built for the same view, applied to the original
 * transform's only input.
 *
 * @param transform the applied view-creating transform to replace
 * @return the replacement, paired with the original input
 */
@Override
public PTransformReplacement<PCollection<ElemT>, PCollection<ElemT>> getReplacementTransform(
    AppliedPTransform<
            PCollection<ElemT>, PCollection<ElemT>, View.CreatePCollectionView<ElemT, ViewT>>
        transform) {
  // The applied transform exposes its inputs untyped; it is expected to have exactly one.
  @SuppressWarnings("unchecked")
  PCollection<ElemT> onlyInput =
      (PCollection<ElemT>) Iterables.getOnlyElement(transform.getInputs().values());

  return PTransformReplacement.of(
      onlyInput,
      new SamzaCreatePCollectionViewTransform<>(transform.getTransform().getView()));
}