Java Code Examples for org.apache.beam.sdk.Pipeline#traverseTopologically()
The following examples show how to use org.apache.beam.sdk.Pipeline#traverseTopologically().
Each example notes its original project and license.
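All of the examples below follow the same basic pattern: implement Pipeline.PipelineVisitor (usually by extending PipelineVisitor.Defaults) and pass it to traverseTopologically(), which walks the pipeline's transform hierarchy in topological order and invokes the visitor's callbacks for each transform and value. Before the project examples, here is a minimal, self-contained sketch of that pattern. It is not taken from any of the projects below; the class name TransformNamePrinter and the toy pipeline are made up for illustration.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.Pipeline.PipelineVisitor;
import org.apache.beam.sdk.runners.TransformHierarchy.Node;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.TypeDescriptors;

public class TransformNamePrinter {
  public static void main(String[] args) {
    // Build a small pipeline; it is never run, only inspected.
    Pipeline pipeline = Pipeline.create();
    pipeline
        .apply(Create.of("a", "b", "c"))
        .apply(MapElements.into(TypeDescriptors.strings()).via((String s) -> s.toUpperCase()));

    // Walk the transform hierarchy in topological order and print each
    // primitive transform's full name.
    pipeline.traverseTopologically(
        new PipelineVisitor.Defaults() {
          @Override
          public void visitPrimitiveTransform(Node node) {
            System.out.println("Primitive transform: " + node.getFullName());
          }
        });
  }
}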
Example 1
Source File: TrackStreamingSourcesTest.java From beam with Apache License 2.0
@Test
public void testTrackSingle() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> emptyStream =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  p.apply(emptyStream).apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
Example 2
Source File: BeamEnumerableConverter.java From beam with Apache License 2.0
private static boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
Example 3
Source File: DataflowRunner.java From beam with Apache License 2.0
private boolean containsUnboundedPCollection(Pipeline p) {
  class BoundednessVisitor extends PipelineVisitor.Defaults {
    IsBounded boundedness = IsBounded.BOUNDED;

    @Override
    public void visitValue(PValue value, Node producer) {
      if (value instanceof PCollection) {
        boundedness = boundedness.and(((PCollection) value).isBounded());
      }
    }
  }

  BoundednessVisitor visitor = new BoundednessVisitor();
  p.traverseTopologically(visitor);
  return visitor.boundedness == IsBounded.UNBOUNDED;
}
Example 4
Source File: DataflowRunnerTest.java From beam with Apache License 2.0
/** Tests that all reads are consumed by at least one {@link PTransform}. */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);

  Pipeline p = buildDataflowPipeline(dataflowOptions);
  p.apply(TextIO.read().from(options.getInput()));

  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);

  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumedSeenAsInput.set(true);
        }
      });

  assertThat(unconsumedSeenAsInput.get(), is(true));
}
Example 5
Source File: DataflowPipelineTranslatorTest.java From beam with Apache License 2.0
@Test
public void testDiskSizeGbConfig() throws IOException {
  final Integer diskSizeGb = 1234;

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setDiskSizeGb(diskSizeGb);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertEquals(diskSizeGb, job.getEnvironment().getWorkerPools().get(0).getDiskSizeGb());
}
Example 6
Source File: UnconsumedReads.java From beam with Apache License 2.0
public static void ensureAllReadsConsumed(Pipeline pipeline) {
  final Set<PCollection<?>> unconsumed = new HashSet<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumed.removeAll(node.getInputs().values());
        }

        @Override
        public void visitValue(PValue value, Node producer) {
          String urn = PTransformTranslation.urnForTransformOrNull(producer.getTransform());
          if (PTransformTranslation.READ_TRANSFORM_URN.equals(urn)) {
            unconsumed.add((PCollection<?>) value);
          }
        }
      });
  int i = 0;
  for (PCollection<?> unconsumedPCollection : unconsumed) {
    consume(unconsumedPCollection, i);
    i++;
  }
}
Example 7
Source File: DataflowPipelineTranslatorTest.java From beam with Apache License 2.0
@Test
public void testSubnetworkConfigMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertNull(job.getEnvironment().getWorkerPools().get(0).getSubnetwork());
}
Example 8
Source File: TrackStreamingSourcesTest.java From beam with Apache License 2.0
@Test
public void testTrackFlattened() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> queueStream1 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();
  CreateStream<Integer> queueStream2 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  PCollection<Integer> pcol1 = p.apply(queueStream1);
  PCollection<Integer> pcol2 = p.apply(queueStream2);
  PCollection<Integer> flattened =
      PCollectionList.of(pcol1).and(pcol2).apply(Flatten.pCollections());
  flattened.apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0, 1));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
Example 9
Source File: PipelineInit.java From component-runtime with Apache License 2.0
public static void lazyStart(final JobStateAware.State jobState,
    final Supplier<DIPipeline> pipelineSupplier) {
  final AtomicBoolean pipelineStarted = jobState.getPipelineStarted();
  if (!pipelineStarted.get() && pipelineStarted.compareAndSet(false, true)) {
    final Pipeline pipeline = pipelineSupplier.get();
    final TransformCounter counter = new TransformCounter();
    pipeline.traverseTopologically(counter);
    if (counter.transforms.get() > 0) {
      final PipelineResult result = pipeline.run();
      new Thread("talend-component-kit-di-pipeline-awaiter") {

        @Override
        public void run() {
          log.debug("Starting to watch beam pipeline");
          try {
            result.waitUntilFinish();
          } finally {
            final PipelineResult.State state = result.getState();
            log.debug("Exited pipeline with state {}", state.name());
            if (state.isTerminal()) {
              log.info("Beam pipeline ended");
            } else {
              log.debug("Beam pipeline ended by interruption");
            }
            jobState.getPipelineDone().complete(true);
          }
        }
      }.start();
    } else {
      jobState.getPipelineDone().complete(true);
      log.warn("A pipeline was created but not transform were found, is your job correctly configured?");
    }
  }
}
Example 10
Source File: NemoPipelineRunner.java From nemo with Apache License 2.0
/**
 * Method to run the Pipeline.
 *
 * @param pipeline the Pipeline to run.
 * @return The result of the pipeline.
 */
public NemoPipelineResult run(final Pipeline pipeline) {
  final DAGBuilder builder = new DAGBuilder<>();
  final NemoPipelineVisitor nemoPipelineVisitor =
      new NemoPipelineVisitor(builder, nemoPipelineOptions);
  pipeline.traverseTopologically(nemoPipelineVisitor);
  final DAG dag = builder.build();
  final NemoPipelineResult nemoPipelineResult = new NemoPipelineResult();
  JobLauncher.launchDAG(dag);
  return nemoPipelineResult;
}
Example 11
Source File: CacheTest.java From beam with Apache License 2.0
/**
 * Test checks how the cache candidates map is populated by the runner when evaluating the
 * pipeline.
 */
@Test
public void cacheCandidatesUpdaterTest() {
  SparkPipelineOptions options = createOptions();
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));

  // First use of pCollection.
  pCollection.apply(Count.globally());
  // Second use of pCollection.
  PCollectionView<List<String>> view = pCollection.apply(View.asList());

  // Internally View.asList() creates a PCollection that underlies the PCollectionView, that
  // PCollection should not be cached as the SparkRunner does not access that PCollection to
  // access the PCollectionView.
  pipeline
      .apply(Create.of("foo", "baz"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext processContext) {
                      if (processContext.sideInput(view).contains(processContext.element())) {
                        processContext.output(processContext.element());
                      }
                    }
                  })
              .withSideInputs(view));

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  SparkRunner.CacheVisitor cacheVisitor =
      new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
  pipeline.traverseTopologically(cacheVisitor);

  assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
  assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count());
}
Example 12
Source File: NemoRunner.java From incubator-nemo with Apache License 2.0
/**
 * Method to run the Pipeline.
 *
 * @param pipeline the Pipeline to run.
 * @return The result of the pipeline.
 */
public NemoPipelineResult run(final Pipeline pipeline) {
  final PipelineVisitor pipelineVisitor = new PipelineVisitor(pipeline, nemoPipelineOptions);
  pipeline.traverseTopologically(pipelineVisitor);
  final NemoPipelineResult nemoPipelineResult = new NemoPipelineResult();
  CompletableFuture.runAsync(() ->
          JobLauncher.launchDAG(
              pipelineVisitor.getConvertedPipeline(), nemoPipelineOptions.getJobName()))
      .thenRun(nemoPipelineResult::setJobDone);
  return nemoPipelineResult;
}
Example 13
Source File: DataflowPipelineTranslatorTest.java From beam with Apache License 2.0
@Test
public void testScalingAlgorithmMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(
              p,
              pipelineProto,
              sdkComponents,
              DataflowRunner.fromOptions(options),
              Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  // Autoscaling settings are always set.
  assertNull(
      job.getEnvironment().getWorkerPools().get(0).getAutoscalingSettings().getAlgorithm());
  assertEquals(
      0,
      job.getEnvironment()
          .getWorkerPools()
          .get(0)
          .getAutoscalingSettings()
          .getMaxNumWorkers()
          .intValue());
}
Example 14
Source File: SparkRunner.java From beam with Apache License 2.0
/** Visit the pipeline to determine the translation mode (batch/streaming). */
private void detectTranslationMode(Pipeline pipeline) {
  TranslationModeDetector detector = new TranslationModeDetector();
  pipeline.traverseTopologically(detector);
  if (detector.getTranslationMode().equals(TranslationMode.STREAMING)) {
    // set streaming mode if it's a streaming pipeline
    this.mOptions.setStreaming(true);
  }
}
Example 15
Source File: DirectRunner.java From beam with Apache License 2.0
@Override
public DirectPipelineResult run(Pipeline pipeline) {
  try {
    options =
        MAPPER
            .readValue(MAPPER.writeValueAsBytes(options), PipelineOptions.class)
            .as(DirectOptions.class);
  } catch (IOException e) {
    throw new IllegalArgumentException(
        "PipelineOptions specified failed to serialize to JSON.", e);
  }

  pipeline.replaceAll(defaultTransformOverrides());
  MetricsEnvironment.setMetricsSupported(true);
  try {
    DirectGraphVisitor graphVisitor = new DirectGraphVisitor();
    pipeline.traverseTopologically(graphVisitor);

    @SuppressWarnings("rawtypes")
    KeyedPValueTrackingVisitor keyedPValueVisitor = KeyedPValueTrackingVisitor.create();
    pipeline.traverseTopologically(keyedPValueVisitor);

    DisplayDataValidator.validatePipeline(pipeline);
    DisplayDataValidator.validateOptions(options);

    ExecutorService metricsPool =
        Executors.newCachedThreadPool(
            new ThreadFactoryBuilder()
                .setThreadFactory(MoreExecutors.platformThreadFactory())
                .setDaemon(false) // otherwise you say you want to leak, please don't!
                .setNameFormat("direct-metrics-counter-committer")
                .build());

    DirectGraph graph = graphVisitor.getGraph();
    EvaluationContext context =
        EvaluationContext.create(
            clockSupplier.get(),
            Enforcement.bundleFactoryFor(enabledEnforcements, graph),
            graph,
            keyedPValueVisitor.getKeyedPValues(),
            metricsPool);

    TransformEvaluatorRegistry registry =
        TransformEvaluatorRegistry.javaSdkNativeRegistry(context, options);
    PipelineExecutor executor =
        ExecutorServiceParallelExecutor.create(
            options.getTargetParallelism(),
            registry,
            Enforcement.defaultModelEnforcements(enabledEnforcements),
            context,
            metricsPool);
    executor.start(graph, RootProviderRegistry.javaNativeRegistry(context, options));

    DirectPipelineResult result = new DirectPipelineResult(executor, context);
    if (options.isBlockOnRun()) {
      try {
        result.waitUntilFinish();
      } catch (UserCodeException userException) {
        throw new PipelineExecutionException(userException.getCause());
      } catch (Throwable t) {
        if (t instanceof RuntimeException) {
          throw (RuntimeException) t;
        }
        throw new RuntimeException(t);
      }
    }
    return result;
  } finally {
    MetricsEnvironment.setMetricsSupported(false);
  }
}
Example 16
Source File: SparkRunner.java From beam with Apache License 2.0
/** Evaluator that update/populate the cache candidates. */
public static void updateCacheCandidates(
    Pipeline pipeline, SparkPipelineTranslator translator, EvaluationContext evaluationContext) {
  CacheVisitor cacheVisitor = new CacheVisitor(translator, evaluationContext);
  pipeline.traverseTopologically(cacheVisitor);
}
Example 17
Source File: PViewToIdMapper.java From beam with Apache License 2.0
public static Map<PValue, String> buildIdMap(Pipeline pipeline) {
  final PViewToIdMapper mapper = new PViewToIdMapper();
  pipeline.traverseTopologically(mapper);
  return mapper.getIdMap();
}
Example 18
Source File: DisplayDataEvaluator.java From beam with Apache License 2.0
private static Set<DisplayData> displayDataForPipeline(Pipeline pipeline, PTransform<?, ?> root) {
  PrimitiveDisplayDataPTransformVisitor visitor = new PrimitiveDisplayDataPTransformVisitor(root);
  pipeline.traverseTopologically(visitor);
  return visitor.getPrimitivesDisplayData();
}
Example 19
Source File: FlinkPipelineTranslator.java From beam with Apache License 2.0
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}
Example 20
Source File: Twister2PipelineTranslator.java From twister2 with Apache License 2.0
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}