com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions Java Examples
The following examples show how to use
com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions.
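All of the examples share the same basic pattern: obtain a DataflowPipelineOptions instance from PipelineOptionsFactory, set the project, staging location, and runner, then hand the options to Pipeline.create. Here is a minimal, self-contained sketch of that pattern; the project id and bucket are placeholders, not values taken from any of the projects below.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner;

public class DataflowPipelineOptionsSketch {
  public static void main(String[] args) {
    // View the options interface; unset fields fall back to their defaults.
    DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
    options.setProject("my-project");                     // placeholder project id
    options.setStagingLocation("gs://my-bucket/staging"); // placeholder GCS path
    options.setRunner(DataflowPipelineRunner.class);

    Pipeline p = Pipeline.create(options);
    // ...apply transforms here...
    p.run();
  }
}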
Example #1
Source File: DataflowFactory.java, from dockerflow (Apache License 2.0)
/**
 * Create Dataflow pipeline options from the standard command-line options, "--project=",
 * "--runner=" and "--stagingLocation=".
 *
 * @param args the command-line arguments
 * @return the configured Dataflow pipeline options
 * @throws IOException
 */
public static DataflowPipelineOptions pipelineOptions(String[] args) throws IOException {
  LOG.info("Set up Dataflow options");
  DataflowPipelineOptions o = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  Map<String, String> m = StringUtils.parseArgs(args);

  o.setProject(m.get(PROJECT));

  // Accept several aliases for the staging location
  if (m.containsKey(STAGING)) {
    o.setStagingLocation(m.get(STAGING));
  } else if (m.containsKey(STAGING_LOCATION)) {
    o.setStagingLocation(m.get(STAGING_LOCATION));
  } else if (m.containsKey(WORKSPACE)) {
    o.setStagingLocation(m.get(WORKSPACE) + "/staging");
  }

  o.setRunner(runner(m.get(RUNNER)));
  o.setMaxNumWorkers(m.get(MAX_WORKERS) == null ? 1 : Integer.parseInt(m.get(MAX_WORKERS)));

  if (m.containsKey(MACHINE_TYPE)) {
    o.setWorkerMachineType(m.get(MACHINE_TYPE));
  } else {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }
  return o;
}
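A hypothetical invocation of this factory method, using only the flags named in its javadoc. The values are placeholders, and the exact set of recognized keys is defined by dockerflow's StringUtils.parseArgs:

DataflowPipelineOptions options = DataflowFactory.pipelineOptions(new String[] {
    "--project=my-project",                     // placeholder project id
    "--stagingLocation=gs://my-bucket/staging", // placeholder staging bucket
    "--runner=DataflowPipelineRunner"           // accepted runner names are an assumption
});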
Example #2
Source File: TaskRunner.java, from dockerflow (Apache License 2.0)
/** Run a Docker workflow on Dataflow. */
public static void run(Workflow w, Map<String, WorkflowArgs> a, DataflowPipelineOptions o)
    throws IOException {
  LOG.info("Running workflow graph");
  if (w.getArgs().getProjectId() == null) {
    throw new IllegalArgumentException("Project id is required");
  }

  Pipeline p = DataflowFactory.dataflow(w, a, o);
  LOG.info("Created Dataflow pipeline");
  LOG.debug(w.toString());

  PipelineResult r = p.run();
  LOG.info("Dataflow pipeline completed");
  LOG.info("Result state: " + r.getState());
}
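A hedged sketch of how these pieces fit together; the Workflow instance comes from a project-specific definition (see Example #7) and is assumed to exist here:

// Assumed: 'workflow' was defined elsewhere, e.g. by a WorkflowDefn implementation.
DataflowPipelineOptions options = DataflowFactory.pipelineOptions(args);
Map<String, WorkflowArgs> argsTable = ArgsTableBuilder.fromArgs(args).build();
TaskRunner.run(workflow, argsTable, options);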
Example #3
Source File: LiveStateCheckerApp.java, from policyscanner (Apache License 2.0)
private PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(Constants.PROJECT_ID);
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
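BlockingDataflowPipelineRunner makes run() wait until the Dataflow job reaches a terminal state, which is why these helpers suit batch-style checks. A sketch of a typical call site, with a placeholder bucket; the surrounding code is an assumption:

PipelineOptions options = getCloudExecutionOptions("gs://my-bucket/staging");
Pipeline pipeline = Pipeline.create(options);
// ...apply the scanner's transforms...
pipeline.run(); // blocks until the job completes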
Example #4
Source File: UserManagedKeysApp.java, from policyscanner (Apache License 2.0)
private PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(SystemProperty.applicationId.get());
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
Example #5
Source File: LiveStateCheckerRunner.java, from policyscanner (Apache License 2.0)
private static PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(SystemProperty.applicationId.get());
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
Example #6
Source File: DesiredStateEnforcerApp.java, from policyscanner (Apache License 2.0)
private PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(SystemProperty.applicationId.get());
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
Example #7
Source File: WorkflowDefn.java, from dockerflow (Apache License 2.0)
/**
 * The WorkflowDefn implementation is responsible for defining the workflow steps and default
 * args, and for creating a Dataflow pipeline.
 *
 * @throws IOException
 */
default Pipeline createDataflow(
    Map<String, WorkflowArgs> argsTable, DataflowPipelineOptions pipelineOptions, String[] args)
    throws IOException {
  return DataflowBuilder.of(createWorkflow(args))
      .createFrom(argsTable)
      .pipelineOptions(pipelineOptions)
      .build();
}
Example #8
Source File: MultiLinearGraph.java, from dockerflow (Apache License 2.0)
/**
 * For simple linear graphs, it's not too hard to generate the Dataflow pipeline yourself. Here's
 * the equivalent Dataflow code for this simple example.
 */
public static void manualDataflow(String[] args) throws IOException {
  LOG.info("Parsing Dataflow options");
  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(MultiLinearGraph.class.getSimpleName());

  Pipeline p = Pipeline.create(o);
  p.apply(Create.of(ArgsTableBuilder.fromArgs(args).build()))
      .apply(DockerDo.of(taskOne()))
      .apply(DockerDo.of(taskTwo()));
  p.run();
}
Example #9
Source File: GatkPairedSingleSampleAlt.java, from dockerflow (Apache License 2.0)
/** Only this one method is different from GatkPairedSingleSample.java. */
@Override
public Pipeline createDataflow(
    Map<String, WorkflowArgs> argsTable, DataflowPipelineOptions pipelineOptions, String[] args)
    throws IOException {
  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(GatkPairedSingleSampleAlt.class.getSimpleName());
  Pipeline p = Pipeline.create(o);

  // Merge sample-specific args with default workflow args
  for (String key : argsTable.keySet()) {
    WorkflowArgs instanceArgs = argsTable.get(key);
    instanceArgs.mergeDefaultArgs(workflowArgs);
  }

  // Declarations
  PCollection<KV<String, WorkflowArgs>> mainBranch, branchOne, branchTwo;
  PCollectionList<KV<String, WorkflowArgs>> mergeList;

  // Construct the workflow graph
  mainBranch = p.apply(Create.of(argsTable));
  branchOne = mainBranch.apply(DockerDo.of(CreateSequenceGroupingTSV));
  branchTwo = mainBranch.apply(DockerDo.of(BwaVersion))
      .apply(DockerDo.of(SamToFastqAndBwaMem))
      .apply(DockerDo.of(MergeBamAlignment))
      .apply(DockerDo.of(SortAndFixReadGroupBam))
      .apply(DockerDo.of(MarkDuplicates))
      .apply(DockerDo.of(SortAndFixSampleBam));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches())
      .apply(DockerDo.of(BaseRecalibrator))
      .apply(DockerDo.of(ApplyBQSR))
      .apply(DockerDo.of(GatherBqsrReports))
      .apply(DockerDo.of(ApplyBQSRToUnmappedReads))
      .apply(DockerDo.of(GatherBamFiles));
  branchOne = mainBranch.apply(DockerDo.of(ConvertToCram));
  branchTwo = mainBranch.apply(DockerDo.of(HaplotypeCaller))
      .apply(DockerDo.of(GatherVCFs));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches());
  return p;
}
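The fan-out/fan-in mechanics above can be reduced to a small sketch. MergeBranches and DockerDo are dockerflow's own transforms from this example; stepA and stepB are hypothetical tasks:

PCollection<KV<String, WorkflowArgs>> a = mainBranch.apply(DockerDo.of(stepA));
PCollection<KV<String, WorkflowArgs>> b = mainBranch.apply(DockerDo.of(stepB));
PCollection<KV<String, WorkflowArgs>> merged =
    PCollectionList.of(a).and(b).apply(new MergeBranches());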
Example #10
Source File: DataflowFactory.java, from dockerflow (Apache License 2.0)
/**
 * Dynamically construct a Dataflow pipeline from the workflow definition. The root PCollection
 * has one element, the root task's name.
 *
 * @param workflow the workflow definition
 * @param workflowArgs instance-specific workflow args, keyed by workflow instance name
 * @param o the Dataflow pipeline options
 * @return the constructed pipeline
 * @throws IOException
 */
public static Pipeline dataflow(
    Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o)
    throws IOException {
  assert (workflow != null);
  assert (o != null);
  assert (workflow.getDefn() != null);

  // Set defaults
  if (o.getAppName() == null) {
    o.setAppName(workflow.getDefn().getName());
  }
  if (o.getProject() == null && workflow.getArgs() != null) {
    o.setProject(workflow.getArgs().getProjectId());
  }
  if (o.getMaxNumWorkers() == 0) {
    o.setMaxNumWorkers(1);
  }
  if (o.getWorkerMachineType() == null) {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }

  LOG.info("Initializing dataflow pipeline");
  Pipeline p = Pipeline.create(o);

  LOG.info("Creating input collection of workflow args");
  if (workflowArgs == null) {
    workflowArgs = new HashMap<String, WorkflowArgs>();
  }
  if (workflowArgs.isEmpty()) {
    LOG.info("No workflow args were provided. Using default values.");
    workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs());
  } else if (workflow.getArgs() != null) {
    LOG.info("Merging default workflow args with instance-specific args");
    for (String key : workflowArgs.keySet()) {
      WorkflowArgs instanceArgs = workflowArgs.get(key);
      instanceArgs.mergeDefaultArgs(workflow.getArgs());
      LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs));
    }
  }

  LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName());
  PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs));
  input = dataflow(Workflow.Steps.graph(workflow), input);

  if (workflowArgs.values().iterator().next().getDeleteFiles()) {
    LOG.info("Intermediate files will be deleted");
    input = input.apply(
        ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow)));
  }
  return p;
}
Example #11
Source File: DataflowBuilder.java, from dockerflow (Apache License 2.0)
public DataflowBuilder pipelineOptions(DataflowPipelineOptions options) {
  pipelineOptions = options;
  pipelineOptions.setAppName(workflow.getDefn().getName());
  return this;
}
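The setter is designed for use mid-chain; a condensed sketch of the fluent usage from Example #7, where workflow, argsTable, and options are assumed to exist:

Pipeline p = DataflowBuilder.of(workflow) // wrap the workflow definition
    .createFrom(argsTable)                // per-instance workflow args
    .pipelineOptions(options)             // also sets appName from the workflow name
    .build();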
Example #12
Source File: FlinkPipelineOptions.java, from flink-dataflow (Apache License 2.0)
/** The job name is used to identify jobs running on a Flink cluster. */
@Description("Dataflow job name, to uniquely identify active jobs. "
    + "Defaults to using the ApplicationName-UserName-Date.")
@Default.InstanceFactory(DataflowPipelineOptions.JobNameFactory.class)
String getJobName();
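Because the default comes from an @Default.InstanceFactory, the job name is generated on first read when no value was set explicitly. A sketch of that behavior; the exact generated format is an assumption based on the description above:

FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
String jobName = options.getJobName(); // e.g. roughly "appname-username-date"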