org.apache.beam.sdk.Pipeline Java Examples
The following examples show how to use org.apache.beam.sdk.Pipeline. Each example is taken from an open-source project; the source file, the project it comes from, and its license are noted above the code, so you can go back to the original source for the full context.
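As a quick orientation before the project-specific examples, here is a minimal sketch of the basic Pipeline lifecycle: create the pipeline from PipelineOptions, apply transforms, then run it and wait for completion. The class name, the element values, and the reliance on the default (direct) runner are illustrative assumptions, not taken from any project below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class MinimalPipelineSketch {
  public static void main(String[] args) {
    // Parse command-line flags into PipelineOptions; with no flags this falls back to
    // the defaults, which in the Java SDK means the direct (in-process) runner.
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();

    // The Pipeline object is the container for the whole graph of transforms.
    Pipeline pipeline = Pipeline.create(options);

    // Build a trivial graph: an in-memory source followed by a per-element count.
    PCollection<String> words = pipeline.apply("Words", Create.of("a", "b", "a"));
    PCollection<KV<String, Long>> counts = words.apply(Count.perElement());

    // Nothing executes until run(); waitUntilFinish() blocks until the job completes.
    pipeline.run().waitUntilFinish();
  }
}

When no flags are needed, PipelineOptions can also be created directly with PipelineOptionsFactory.create(), as several of the examples below do.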
Example #1
Source File: ElasticsearchDatasetRuntime.java From components with Apache License 2.0

@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties: ensure to read only the first batch of documents
    // from the index since we're computing a sample
    ElasticsearchInputRuntime inputRuntime = new ElasticsearchInputRuntime(true);
    ElasticsearchInputProperties inputProperties = new ElasticsearchInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(null, inputProperties);

    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
Example #2
Source File: HadoopFormatIOWriteTest.java From beam with Apache License 2.0

@Test
public void testWritingDataFailInvalidKeyType() {

  conf.set(HadoopFormatIO.OUTPUT_DIR, tmpFolder.getRoot().getAbsolutePath());

  List<KV<String, Employee>> data = new ArrayList<>();
  data.add(KV.of("key", new Employee("name", "address")));
  PCollection<KV<String, Employee>> input =
      p.apply("CreateData", Create.of(data))
          .setTypeDescriptor(
              TypeDescriptors.kvs(
                  new TypeDescriptor<String>() {}, new TypeDescriptor<Employee>() {}));

  thrown.expect(Pipeline.PipelineExecutionException.class);
  thrown.expectMessage(String.class.getName());

  input.apply(
      "Write",
      HadoopFormatIO.<String, Employee>write()
          .withConfiguration(conf)
          .withPartitioning()
          .withExternalSynchronization(new HDFSSynchronization(getLocksDirPath())));
  p.run().waitUntilFinish();
}
Example #3
Source File: TestDataflowRunnerTest.java From beam with Apache License 2.0

/**
 * Tests that when a streaming pipeline terminates and doesn't fail due to {@link PAssert} that
 * the {@link TestPipelineOptions#setOnSuccessMatcher(SerializableMatcher) on success matcher} is
 * invoked.
 */
@Test
public void testStreamingOnSuccessMatcherWhenPipelineSucceeds() throws Exception {
  options.setStreaming(true);
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);

  final DataflowPipelineJob mockJob = Mockito.mock(DataflowPipelineJob.class);
  when(mockJob.getState()).thenReturn(State.DONE);
  when(mockJob.getProjectId()).thenReturn("test-project");
  when(mockJob.getJobId()).thenReturn("test-job");

  DataflowRunner mockRunner = Mockito.mock(DataflowRunner.class);
  when(mockRunner.run(any(Pipeline.class))).thenReturn(mockJob);

  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  options.as(TestPipelineOptions.class).setOnSuccessMatcher(new TestSuccessMatcher(mockJob, 1));

  when(mockJob.waitUntilFinish(any(Duration.class), any(JobMessagesHandler.class)))
      .thenReturn(State.DONE);

  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(generateMockMetricResponse(true /* success */, true /* tentative */));
  runner.run(p, mockRunner);
}
Example #4
Source File: Task.java From beam with Apache License 2.0

public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> fruits =
      pipeline.apply("Fruits",
          Create.of("apple", "banana", "cherry")
      );

  PCollection<String> countries =
      pipeline.apply("Countries",
          Create.of("australia", "brazil", "canada")
      );

  PCollection<String> output = applyTransform(fruits, countries);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example #5
Source File: DataflowPTransformMatchersTest.java From beam with Apache License 2.0

/** Traverse the pipeline and return the first {@link Combine.GroupedValues} found. */
private static AppliedPTransform<?, ?, ?> getCombineGroupedValuesFrom(TestPipeline pipeline) {
  final AppliedPTransform<?, ?, ?>[] transform = new AppliedPTransform<?, ?, ?>[1];
  pipeline.traverseTopologically(
      new Pipeline.PipelineVisitor.Defaults() {
        @Override
        public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
          if (!node.isRootNode()
              && node.toAppliedPTransform(getPipeline())
                  .getTransform()
                  .getClass()
                  .equals(Combine.GroupedValues.class)) {
            transform[0] = node.toAppliedPTransform(getPipeline());
            return CompositeBehavior.DO_NOT_ENTER_TRANSFORM;
          }
          return CompositeBehavior.ENTER_TRANSFORM;
        }
      });
  return transform[0];
}
Example #6
Source File: ImpulseEvaluatorFactoryTest.java From beam with Apache License 2.0

@Test
public void testRootProvider() {
  Pipeline p = Pipeline.create();
  PCollection<byte[]> impulseOut = p.apply(Impulse.create());
  // Add a second impulse to demonstrate no crosstalk between applications
  @SuppressWarnings("unused")
  PCollection<byte[]> impulseOutTwo = p.apply(Impulse.create());
  AppliedPTransform<?, ?, ?> impulseApplication = DirectGraphs.getProducer(impulseOut);

  ImpulseRootProvider rootProvider = new ImpulseRootProvider(context);
  when(context.createRootBundle()).thenReturn(bundleFactory.createRootBundle());

  Collection<CommittedBundle<?>> inputs =
      rootProvider.getInitialInputs((AppliedPTransform) impulseApplication, 100);

  assertThat("Only one impulse bundle per application", inputs, hasSize(1));
  assertThat(
      "Only one impulse shard per bundle",
      Iterables.size(inputs.iterator().next().getElements()),
      equalTo(1));
}
Example #7
Source File: UnionTest.java From beam with Apache License 2.0

@Test
public void testUnion_threeDataSets() {
  execute(
      new TestCase<Integer>() {

        @Override
        public PCollection<Integer> getOutput(Pipeline pipeline) {
          final PCollection<Integer> first = createDataset(pipeline, 1, 2, 3, 4, 5, 6);
          final PCollection<Integer> second = createDataset(pipeline, 7, 8, 9, 10, 11, 12);
          final PCollection<Integer> third = createDataset(pipeline, 13, 14, 15, 16, 17, 18);

          return Union.of(first, second, third).output();
        }

        @Override
        public List<Integer> getUnorderedOutput() {
          return Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18);
        }
      });
}
Example #8
Source File: BeamEnumerableConverter.java From beam with Apache License 2.0

private static boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
Example #9
Source File: CacheTest.java From beam with Apache License 2.0

@Test
public void shouldCacheTest() {
  SparkPipelineOptions options = createOptions();
  options.setCacheDisabled(true);
  Pipeline pipeline = Pipeline.create(options);

  Values<String> valuesTransform = Create.of("foo", "bar");
  PCollection pCollection = mock(PCollection.class);

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  ctxt.getCacheCandidates().put(pCollection, 2L);

  assertFalse(ctxt.shouldCache(valuesTransform, pCollection));

  options.setCacheDisabled(false);
  assertTrue(ctxt.shouldCache(valuesTransform, pCollection));

  GroupByKey<String, String> gbkTransform = GroupByKey.create();
  assertFalse(ctxt.shouldCache(gbkTransform, pCollection));
}
Example #10
Source File: DirectRunnerTest.java From beam with Apache License 2.0

/**
 * Tests that a {@link DoFn} that mutates its input with a good equals() fails in the {@link
 * DirectRunner}.
 */
@Test
public void testMutatingInputDoFnError() throws Exception {
  Pipeline pipeline = getPipeline();
  pipeline
      .apply(
          Create.of(Arrays.asList(1, 2, 3), Arrays.asList(4, 5, 6))
              .withCoder(ListCoder.of(VarIntCoder.of())))
      .apply(
          ParDo.of(
              new DoFn<List<Integer>, Integer>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  List<Integer> inputList = c.element();
                  inputList.set(0, 37);
                  c.output(12);
                }
              }));

  thrown.expect(IllegalMutationException.class);
  thrown.expectMessage("Input");
  thrown.expectMessage("must not be mutated");
  pipeline.run();
}
Example #11
Source File: DataflowPipelineTranslatorTest.java From beam with Apache License 2.0

@Test
public void testNetworkConfigMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertNull(job.getEnvironment().getWorkerPools().get(0).getNetwork());
}
Example #12
Source File: DataflowRunner.java From beam with Apache License 2.0

private boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
Example #13
Source File: FlinkPipelineExecutionEnvironmentTest.java From beam with Apache License 2.0

@Test
public void shouldUseTransformOverrides() {
  boolean[] testParameters = {true, false};
  for (boolean streaming : testParameters) {
    FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
    options.setStreaming(streaming);
    options.setRunner(FlinkRunner.class);
    FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
    Pipeline p = Mockito.spy(Pipeline.create(options));

    flinkEnv.translate(p);

    ArgumentCaptor<ImmutableList> captor = ArgumentCaptor.forClass(ImmutableList.class);
    Mockito.verify(p).replaceAll(captor.capture());
    ImmutableList<PTransformOverride> overridesList = captor.getValue();

    assertThat(overridesList.isEmpty(), is(false));
    assertThat(
        overridesList.size(), is(FlinkTransformOverrides.getDefaultOverrides(options).size()));
  }
}
Example #14
Source File: BigQueryDatasetRuntime.java From components with Apache License 2.0

public void getSampleDeprecated(int limit, Consumer<IndexedRecord> consumer) {
    // Create a pipeline using the input component to get records.
    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    // Create an input runtime based on the properties.
    BigQueryInputRuntime inputRuntime = new BigQueryInputRuntime();
    BigQueryInputProperties inputProperties = new BigQueryInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(new BeamJobRuntimeContainer(options), inputProperties);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p
                .apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit))
                .apply(collector);
        PipelineResult pr = p.run();
        pr.waitUntilFinish();
    }
}
Example #15
Source File: BigQueryTimePartitioningClusteringIT.java From beam with Apache License 2.0

@Test
public void testE2EBigQueryTimePartitioning() throws Exception {
  String tableName = "weather_stations_time_partitioned_" + System.currentTimeMillis();

  Pipeline p = Pipeline.create(options);

  p.apply(BigQueryIO.readTableRows().from(options.getBqcInput()))
      .apply(ParDo.of(new KeepStationNumberAndConvertDate()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(String.format("%s.%s", DATASET_NAME, tableName))
              .withTimePartitioning(TIME_PARTITIONING)
              .withSchema(SCHEMA)
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

  p.run().waitUntilFinish();

  bqClient = BigqueryClient.getNewBigquerryClient(options.getAppName());
  Table table = bqClient.tables().get(options.getProject(), DATASET_NAME, tableName).execute();

  Assert.assertEquals(table.getTimePartitioning(), TIME_PARTITIONING);
}
Example #16
Source File: QueryablePipelineTest.java From beam with Apache License 2.0

@Test
public void getEnvironmentWithEnvironment() {
  Pipeline p = Pipeline.create();
  PCollection<Long> longs = p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  longs.apply(WithKeys.of("a")).apply("groupByKey", GroupByKey.create());

  Components components = PipelineTranslation.toProto(p).getComponents();
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);

  PTransformNode environmentalRead =
      PipelineNode.pTransform("BoundedRead", components.getTransformsOrThrow("BoundedRead"));
  PTransformNode nonEnvironmentalTransform =
      PipelineNode.pTransform("groupByKey", components.getTransformsOrThrow("groupByKey"));

  assertThat(qp.getEnvironment(environmentalRead).isPresent(), is(true));
  assertThat(
      qp.getEnvironment(environmentalRead).get().getUrn(),
      equalTo(Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getUrn()));
  assertThat(
      qp.getEnvironment(environmentalRead).get().getPayload(),
      equalTo(Environments.JAVA_SDK_HARNESS_ENVIRONMENT.getPayload()));
  assertThat(qp.getEnvironment(nonEnvironmentalTransform).isPresent(), is(false));
}
Example #17
Source File: DirectRunnerTest.java From beam with Apache License 2.0

/**
 * Tests that a {@link DoFn} that mutates an output with a good equals() fails in the {@link
 * DirectRunner}.
 */
@Test
public void testMutatingOutputThenTerminateDoFnError() throws Exception {
  Pipeline pipeline = getPipeline();

  pipeline
      .apply(Create.of(42))
      .apply(
          ParDo.of(
              new DoFn<Integer, List<Integer>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  List<Integer> outputList = Arrays.asList(1, 2, 3, 4);
                  c.output(outputList);
                  outputList.set(0, 37);
                }
              }));

  thrown.expect(IllegalMutationException.class);
  thrown.expectMessage("output");
  thrown.expectMessage("must not be mutated");
  pipeline.run();
}
Example #18
Source File: Task.java From beam with Apache License 2.0

public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> wordsStartingWithA =
      pipeline.apply("Words starting with A",
          Create.of("apple", "ant", "arrow")
      );

  PCollection<String> wordsStartingWithB =
      pipeline.apply("Words starting with B",
          Create.of("ball", "book", "bow")
      );

  PCollection<String> output = applyTransform(wordsStartingWithA, wordsStartingWithB);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example #19
Source File: DatastoreToText.java From DataflowTemplates with Apache License 2.0

/**
 * Runs a pipeline which reads in Entities from Datastore, passes in the JSON encoded Entities
 * to a Javascript UDF, and writes the JSON to TextIO sink.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToTextOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(DatastoreToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(ReadJsonEntities.newBuilder()
          .setGqlQuery(options.getDatastoreReadGqlQuery())
          .setProjectId(options.getDatastoreReadProjectId())
          .setNamespace(options.getDatastoreReadNamespace())
          .build())
      .apply(TransformTextViaJavascript.newBuilder()
          .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
          .setFunctionName(options.getJavascriptTextTransformFunctionName())
          .build())
      .apply(TextIO.write()
          .to(options.getTextWritePrefix())
          .withSuffix(".json"));

  pipeline.run();
}
Example #20
Source File: BatchLoads.java From beam with Apache License 2.0

private PCollectionView<String> createLoadJobIdPrefixView(Pipeline p) {
  // Create a singleton job ID token at execution time. This will be used as the base for all
  // load jobs issued from this instance of the transform.
  return p.apply("JobIdCreationRoot", Create.of((Void) null))
      .apply(
          "CreateJobId",
          ParDo.of(
              new DoFn<Void, String>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  c.output(
                      String.format(
                          "beam_load_%s_%s",
                          c.getPipelineOptions().getJobName().replaceAll("-", ""),
                          BigQueryHelpers.randomUUIDString()));
                }
              }))
      .apply(View.asSingleton());
}
Example #21
Source File: DirectRunnerTest.java From beam with Apache License 2.0

@Test
public void splitsInputs() {
  Pipeline p = getPipeline();
  PCollection<Long> longs = p.apply(Read.from(MustSplitSource.of(CountingSource.upTo(3))));

  PAssert.that(longs).containsInAnyOrder(0L, 1L, 2L);
  p.run();
}
Example #22
Source File: KafkaDatasetRuntime.java From components with Apache License 2.0

/**
 * @param limit the maximum number of records to return.
 * @param consumer a callback that will be applied to each sampled record. This callback should throw a
 *        {@link org.talend.daikon.exception.TalendRuntimeException} if there was an error processing the record.
 *        Kafka is an unbounded source, so a timeout has to be set to stop reading; 1 second is used as the
 *        timeout for getting the sample, whether or not a sample is found.
 */
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties.
    KafkaInputPTransformRuntime inputRuntime = new KafkaInputPTransformRuntime();
    KafkaInputProperties inputProperties = new KafkaInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(dataset);
    inputProperties.useMaxReadTime.setValue(true);
    inputProperties.maxReadTime.setValue(1000l);
    inputProperties.autoOffsetReset.setValue(KafkaInputProperties.OffsetType.EARLIEST);
    // TODO: BEAM-1847: Enable both stopping conditions when they can be set, and remove Sample transform from job.
    // inputProperties.useMaxNumRecords.setValue(true);
    // inputProperties.maxNumRecords.setValue(Long.valueOf(limit));
    inputRuntime.initialize(null, inputProperties);

    // Create a pipeline using the input component to get records.
    PipelineOptions options = PipelineOptionsFactory.create();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
Example #23
Source File: FlinkSavepointTest.java From beam with Apache License 2.0

private void restoreFromSavepointLegacy(Pipeline pipeline, String savepointDir)
    throws ExecutionException, InterruptedException {
  JobGraph jobGraph = getJobGraph(pipeline);
  SavepointRestoreSettings savepointSettings = SavepointRestoreSettings.forPath(savepointDir);
  jobGraph.setSavepointRestoreSettings(savepointSettings);
  flinkCluster.submitJob(jobGraph).get();
}
Example #24
Source File: JdbcAvroJob.java From dbeam with Apache License 2.0

public JdbcAvroJob(
    final PipelineOptions pipelineOptions,
    final Pipeline pipeline,
    final JdbcExportArgs jdbcExportArgs,
    final String output) {
  this.pipelineOptions = pipelineOptions;
  this.pipeline = pipeline;
  this.jdbcExportArgs = jdbcExportArgs;
  this.output = output;
  Preconditions.checkArgument(
      this.output != null && this.output.length() > 0, "'output' must be defined");
}
Example #25
Source File: LoadTest.java From beam with Apache License 2.0

LoadTest(String[] args, Class<OptionsT> testOptions, String metricsNamespace) throws IOException {
  this.metricsNamespace = metricsNamespace;
  this.runtimeMonitor = new TimeMonitor<>(metricsNamespace, "runtime");
  this.options = LoadTestOptions.readFromArgs(args, testOptions);
  this.sourceOptions = fromJsonString(options.getSourceOptions(), SyntheticSourceOptions.class);
  this.pipeline = Pipeline.create(options);
  this.runner = getRunnerName(options.getRunner().getName());
  settings =
      InfluxDBSettings.builder()
          .withHost(options.getInfluxHost())
          .withDatabase(options.getInfluxDatabase())
          .withMeasurement(options.getInfluxMeasurement())
          .get();
}
Example #26
Source File: SparkPortableExecutionTest.java From beam with Apache License 2.0

@Test(timeout = 120_000)
public void testExecStageWithMultipleOutputs() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(CrashingRunner.class);
  options
      .as(PortablePipelineOptions.class)
      .setDefaultEnvironmentType(Environments.ENVIRONMENT_EMBEDDED);
  Pipeline pipeline = Pipeline.create(options);

  PCollection<KV<String, String>> a =
      pipeline
          .apply("impulse", Impulse.create())
          .apply("A", ParDo.of(new DoFnWithSideEffect<>("A")));
  PCollection<KV<String, String>> b = a.apply("B", ParDo.of(new DoFnWithSideEffect<>("B")));
  PCollection<KV<String, String>> c = a.apply("C", ParDo.of(new DoFnWithSideEffect<>("C")));
  // Use GBKs to force re-computation of executable stage unless cached.
  b.apply(GroupByKey.create());
  c.apply(GroupByKey.create());

  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline);

  JobInvocation jobInvocation =
      SparkJobInvoker.createJobInvocation(
          "testExecStageWithMultipleOutputs",
          "testExecStageWithMultipleOutputsRetrievalToken",
          sparkJobExecutor,
          pipelineProto,
          options.as(SparkPipelineOptions.class));
  jobInvocation.start();
  Assert.assertEquals(Enum.DONE, jobInvocation.getState());
}
Example #27
Source File: PCollection.java From beam with Apache License 2.0

/** <b><i>For internal use only; no backwards-compatibility guarantees.</i></b> */
@Internal
public static <T> PCollection<T> createPrimitiveOutputInternal(
    Pipeline pipeline,
    WindowingStrategy<?, ?> windowingStrategy,
    IsBounded isBounded,
    @Nullable Coder<T> coder) {
  PCollection<T> res = new PCollection<>(pipeline, windowingStrategy, isBounded);
  if (coder != null) {
    res.setCoder(coder);
  }
  return res;
}
Example #28
Source File: IsmSideInputReaderTest.java From beam with Apache License 2.0

@Test
public void testIsmReaderReferenceCaching() throws Exception {
  Coder<WindowedValue<Long>> valueCoder =
      WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
  final WindowedValue<Long> element = valueInGlobalWindow(42L);
  final PCollectionView<Long> view =
      Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(View.asSingleton());

  final Source source =
      initInputFile(
          fromValues(Arrays.asList(element)),
          IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), valueCoder));
  final Source emptySource =
      initInputFile(
          fromValues(Arrays.asList()),
          IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), valueCoder));

  final IsmSideInputReader reader =
      sideInputReader(view.getTagInternal().getId(), source, emptySource);

  assertTrue(reader.tagToIsmReaderMap.containsKey(view.getTagInternal()));
  assertEquals(1, reader.tagToIsmReaderMap.get(view.getTagInternal()).size());
  assertEquals(
      FileSystems.matchSingleFileSpec(getString(source.getSpec(), WorkerPropertyNames.FILENAME))
          .resourceId(),
      reader.tagToIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());

  assertTrue(reader.tagToEmptyIsmReaderMap.containsKey(view.getTagInternal()));
  assertEquals(1, reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).size());
  assertEquals(
      FileSystems.matchSingleFileSpec(
              getString(emptySource.getSpec(), WorkerPropertyNames.FILENAME))
          .resourceId(),
      reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());
}
Example #29
Source File: DocumentationExamplesTest.java From beam with Apache License 2.0

@Test
public void metricsAndAccumulatorsSection() {
  final PipelineOptions options = PipelineOptionsFactory.create();

  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> dataset = pipeline.apply(Create.of("a", "x"));

  PCollection<String> flatMapped =
      FlatMap.named("FlatMap1")
          .of(dataset)
          .using(
              (String value, Collector<String> context) -> {
                context.getCounter("my-counter").increment();
                context.collect(value);
              })
          .output();

  PCollection<String> mapped =
      MapElements.named("MapThem")
          .of(dataset)
          .using(
              (value, context) -> {
                // use simple counter
                context.getCounter("my-counter").increment();
                return value.toLowerCase();
              })
          .output();
}
Example #30
Source File: Broadcast.java From nemo with Apache License 2.0

/**
 * Main function for the BEAM program.
 * @param args arguments.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];

  final PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(NemoPipelineRunner.class);

  final Pipeline p = Pipeline.create(options);
  final PCollection<String> elemCollection = GenericSourceSink.read(p, inputFilePath);
  final PCollectionView<Iterable<String>> allCollection = elemCollection.apply(View.<String>asIterable());

  final PCollection<String> result = elemCollection.apply(ParDo.of(new DoFn<String, String>() {
        @ProcessElement
        public void processElement(final ProcessContext c) {
          final String line = c.element();
          final Iterable<String> all = c.sideInput(allCollection);
          final Optional<String> appended = StreamSupport.stream(all.spliterator(), false)
              .reduce((l, r) -> l + '\n' + r);
          if (appended.isPresent()) {
            c.output("line: " + line + "\n" + appended.get());
          } else {
            c.output("error");
          }
        }
      }).withSideInputs(allCollection)
  );

  GenericSourceSink.write(result, outputFilePath);
  p.run();
}