org.apache.beam.sdk.values.PDone Java Examples

The following examples show how to use org.apache.beam.sdk.values.PDone, drawn from open-source projects such as Apache Beam, gcp-ingestion, feast, and DataflowTemplates.
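PDone is the trivial result of a PTransform that writes to an external sink (or otherwise terminates a pipeline branch) and produces no output PCollection; such transforms conventionally end with return PDone.in(pipeline). Before the real-world examples, here is a minimal sketch of that recurring pattern; the LogSink transform name is hypothetical:

import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;

// Hypothetical sink transform: applies a side-effecting DoFn and terminates
// the branch by returning PDone instead of a PCollection.
public class LogSink extends PTransform<PCollection<String>, PDone> {
  @Override
  public PDone expand(PCollection<String> input) {
    // Apply the side-effecting write; the DoFn emits no elements.
    input.apply(
        "LogElements",
        ParDo.of(
            new DoFn<String, Void>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                System.out.println(c.element());
              }
            }));
    // PDone.in(...) signals that this branch of the pipeline ends here.
    return PDone.in(input.getPipeline());
  }
}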
Example #1
Source File: RepublishPerDocType.java    From gcp-ingestion with Mozilla Public License 2.0
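// Partitions incoming messages by document type, one partition per configured
// destination plus a final partition for unmatched messages (which is not
// republished), then writes each destination partition with its own options.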
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  List<Destination> destinations = baseOptions.getPerDocTypeDestinations().entrySet().stream()
      .flatMap(
          entry -> entry.getValue().stream().map(value -> new Destination(entry.getKey(), value)))
      .collect(Collectors.toList());

  int numDestinations = destinations.size();
  int numPartitions = numDestinations + 1;
  PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByDocType",
      Partition.of(numPartitions, new PartitionFn(destinations)));

  for (int i = 0; i < numDestinations; i++) {
    Destination destination = destinations.get(i);
    RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class);
    opts.setOutput(StaticValueProvider.of(destination.dest));
    String name = String.join("_", "republish", destination.namespace, destination.docType);
    partitioned.get(i).apply(name, opts.getOutputType().write(opts));
  }

  return PDone.in(input.getPipeline());
}
 
Example #2
Source File: KafkaIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<KV<K, V>> input) {
  checkArgument(getTopic() != null, "withTopic() is required");

  KvCoder<K, V> kvCoder = (KvCoder<K, V>) input.getCoder();
  return input
      .apply(
          "Kafka ProducerRecord",
          MapElements.via(
              new SimpleFunction<KV<K, V>, ProducerRecord<K, V>>() {
                @Override
                public ProducerRecord<K, V> apply(KV<K, V> element) {
                  return new ProducerRecord<>(getTopic(), element.getKey(), element.getValue());
                }
              }))
      .setCoder(ProducerRecordCoder.of(kvCoder.getKeyCoder(), kvCoder.getValueCoder()))
      .apply(getWriteRecordsTransform());
}
 
Example #3
Source File: TfIdf.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
  return wordToUriAndTfIdf
      .apply(
          "Format",
          ParDo.of(
              new DoFn<KV<String, KV<URI, Double>>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(
                      String.format(
                          "%s,\t%s,\t%f",
                          c.element().getKey(),
                          c.element().getValue().getKey(),
                          c.element().getValue().getValue()));
                }
              }))
      .apply(TextIO.write().to(output).withSuffix(".csv"));
}
 
Example #4
Source File: ExpansionService.java    From beam with Apache License 2.0
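// Normalizes a transform's output into a map of name -> PCollection for the
// expansion service; a PDone output contributes no entries, since it carries
// no data.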
default Map<String, PCollection<?>> extractOutputs(OutputT output) {
  if (output instanceof PDone) {
    return Collections.emptyMap();
  } else if (output instanceof PCollection) {
    return ImmutableMap.of("output", (PCollection<?>) output);
  } else if (output instanceof PCollectionTuple) {
    return ((PCollectionTuple) output)
        .getAll().entrySet().stream()
            .collect(Collectors.toMap(entry -> entry.getKey().getId(), Map.Entry::getValue));
  } else if (output instanceof PCollectionList<?>) {
    PCollectionList<?> listOutput = (PCollectionList<?>) output;
    return IntStream.range(0, listOutput.size())
        .boxed()
        .collect(Collectors.toMap(Object::toString, listOutput::get));
  } else {
    throw new UnsupportedOperationException("Unknown output type: " + output.getClass());
  }
}
 
Example #5
Source File: DirectRunnerTest.java    From beam with Apache License 2.0
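// Test helper: builds a root transform that pushes a single element into the
// given queue as a side effect and returns PDone, producing no output
// PCollection for downstream consumption.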
private PTransform<PBegin, PDone> outputStartTo(StaticQueue<Integer> queue) {
  return new PTransform<PBegin, PDone>() {
    @Override
    public PDone expand(PBegin input) {
      input
          .apply(Create.of(1))
          .apply(
              MapElements.into(TypeDescriptors.voids())
                  .via(
                      in -> {
                        queue.add(in);
                        return null;
                      }));
      return PDone.in(input.getPipeline());
    }
  };
}
 
Example #6
Source File: BeamSqlBuiltinFunctionsIntegrationTestBase.java    From beam with Apache License 2.0
@Override
public PDone expand(PBegin begin) {
  PCollection<Boolean> result =
      begin
          .apply(Create.of(DUMMY_ROW).withRowSchema(DUMMY_SCHEMA))
          .apply(SqlTransform.query("SELECT " + expr))
          .apply(MapElements.into(TypeDescriptors.booleans()).via(row -> row.getBoolean(0)));

  PAssert.that(result)
      .satisfies(
          input -> {
            assertTrue("Test expression is false: " + expr, Iterables.getOnlyElement(input));
            return null;
          });
  return PDone.in(begin.getPipeline());
}
 
Example #7
Source File: ClickHouseIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  TableSchema tableSchema = getTableSchema(jdbcUrl(), table());
  Properties properties = properties();

  set(properties, ClickHouseQueryParam.MAX_INSERT_BLOCK_SIZE, maxInsertBlockSize());
  set(properties, ClickHouseQueryParam.INSERT_QUORUM, insertQuorum());
  set(properties, "insert_distributed_sync", insertDistributedSync());
  set(properties, "insert_deduplication", insertDeduplicate());

  WriteFn<T> fn =
      new AutoValue_ClickHouseIO_WriteFn.Builder<T>()
          .jdbcUrl(jdbcUrl())
          .table(table())
          .maxInsertBlockSize(maxInsertBlockSize())
          .schema(tableSchema)
          .properties(properties)
          .initialBackoff(initialBackoff())
          .maxCumulativeBackoff(maxCumulativeBackoff())
          .maxRetries(maxRetries())
          .build();

  input.apply(ParDo.of(fn));

  return PDone.in(input.getPipeline());
}
 
Example #8
Source File: RepublishPerNamespace.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  List<Destination> destinations = baseOptions.getPerNamespaceDestinations().entrySet().stream()
      .map(entry -> new Destination(entry.getKey(), entry.getValue()))
      .collect(Collectors.toList());
  int numDestinations = destinations.size();
  int numPartitions = numDestinations + 1;
  PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByNamespace",
      Partition.of(numPartitions, new PartitionFn(destinations)));

  for (int i = 0; i < numDestinations; i++) {
    Destination destination = destinations.get(i);
    RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class);
    opts.setOutput(StaticValueProvider.of(destination.dest));
    String name = String.join("_", "republish", destination.namespace);
    partitioned.get(i).apply(name, opts.getOutputType().write(opts));
  }

  return PDone.in(input.getPipeline());
}
 
Example #9
Source File: WriteFailureMetricsTransform.java    From feast with Apache License 2.0
@Override
public PDone expand(PCollection<FailedElement> input) {
  ImportOptions options = input.getPipeline().getOptions().as(ImportOptions.class);
  if ("statsd".equals(options.getMetricsExporterType())) {
    input.apply(
        "WriteDeadletterMetrics",
        ParDo.of(
            WriteDeadletterRowMetricsDoFn.newBuilder()
                .setStatsdHost(options.getStatsdHost())
                .setStatsdPort(options.getStatsdPort())
                .setStoreName(getStoreName())
                .build()));
  } else {
    input.apply(
        "Noop",
        ParDo.of(
            new DoFn<FailedElement, Void>() {
              @ProcessElement
              public void processElement(ProcessContext c) {}
            }));
  }
  return PDone.in(input.getPipeline());
}
 
Example #10
Source File: XmlIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  checkArgument(getRecordClass() != null, "withRecordClass() is required");
  checkArgument(getRootElement() != null, "withRootElement() is required");
  checkArgument(getFilenamePrefix() != null, "to() is required");
  checkArgument(getCharset() != null, "withCharset() is required");
  try {
    JAXBContext.newInstance(getRecordClass());
  } catch (JAXBException e) {
    throw new RuntimeException("Error binding classes to a JAXB Context.", e);
  }

  ResourceId prefix =
      FileSystems.matchNewResource(getFilenamePrefix(), false /* isDirectory */);
  input.apply(
      FileIO.<T>write()
          .via(
              sink(getRecordClass())
                  .withCharset(Charset.forName(getCharset()))
                  .withRootElement(getRootElement()))
          .to(prefix.getCurrentDirectory().toString())
          .withPrefix(prefix.getFilename())
          .withSuffix(".xml")
          .withIgnoreWindowing());
  return PDone.in(input.getPipeline());
}
 
Example #11
Source File: JdbcIO.java    From beam with Apache License 2.0
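// If the input has a Beam schema and no explicit statement/setter was
// supplied, derives the SQL statement and a PreparedStatementSetter from the
// schema fields before delegating to the inner write.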
@Override
public PDone expand(PCollection<T> input) {
  // fixme: validate invalid table input
  if (input.hasSchema() && !hasStatementAndSetter()) {
    checkArgument(
        inner.getTable() != null, "table cannot be null if statement is not provided");
    Schema schema = input.getSchema();
    List<SchemaUtil.FieldWithIndex> fields = getFilteredFields(schema);
    inner =
        inner.withStatement(
            JdbcUtil.generateStatement(
                inner.getTable(),
                fields.stream()
                    .map(SchemaUtil.FieldWithIndex::getField)
                    .collect(Collectors.toList())));
    inner =
        inner.withPreparedStatementSetter(
            new AutoGeneratedPreparedStatementSetter(fields, input.getToRowFunction()));
  }

  inner.expand(input);
  return PDone.in(input.getPipeline());
}
 
Example #12
Source File: NameUtilsTest.java    From beam with Apache License 2.0
@Test
public void testPTransformNameWithAnonOuterClass() throws Exception {
  AnonymousClass anonymousClassObj =
      new AnonymousClass() {
        class NamedInnerClass extends PTransform<PBegin, PDone> {
          @Override
          public PDone expand(PBegin begin) {
            throw new IllegalArgumentException("Should never be applied");
          }
        }

        @Override
        public Object getInnerClassInstance() {
          return new NamedInnerClass();
        }
      };

  assertEquals(
      "NamedInnerClass",
      NameUtils.approximateSimpleName(anonymousClassObj.getInnerClassInstance()));
  assertEquals(
      "NameUtilsTest.NamedInnerClass",
      NameUtils.approximatePTransformName(anonymousClassObj.getInnerClassInstance().getClass()));
}
 
Example #13
Source File: BigQueryDeadletterSink.java    From feast with Apache License 2.0
@Override
public PDone expand(PCollection<FailedElement> input) {
  TimePartitioning partition = new TimePartitioning().setType("DAY");
  partition.setField(TIMESTAMP_COLUMN);
  input
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withTimePartitioning(partition)
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(input.getPipeline());
}
 
Example #14
Source File: PTransformTranslationTest.java    From beam with Apache License 2.0
private static AppliedPTransform<?, ?, ?> rawPTransformWithNullSpec(Pipeline pipeline) {
  PTransformTranslation.RawPTransform<PBegin, PDone> rawPTransform =
      new PTransformTranslation.RawPTransform<PBegin, PDone>() {
        @Override
        public String getUrn() {
          return "fake/urn";
        }

        @Nullable
        @Override
        public RunnerApi.FunctionSpec getSpec() {
          return null;
        }
      };
  return AppliedPTransform.<PBegin, PDone, PTransform<PBegin, PDone>>of(
      "RawPTransformWithNoSpec",
      pipeline.begin().expand(),
      PDone.in(pipeline).expand(),
      rawPTransform,
      pipeline);
}
 
Example #15
Source File: WriteToGCSAvro.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to GenericRecord using DoFn and {@link
       * KeyValueToGenericRecordFn} class.
       */
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      /*
       * Writing as avro file using {@link AvroIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       * The {@link withTempDirectory} option sets the base directory used to generate temporary files.
       */
      .apply(
          "Writing as Avro",
          AvroIO.writeGenericRecords(KeyValueToGenericRecordFn.SCHEMA)
              .to(
                  new WindowedFilenamePolicy(
                      outputDirectory(),
                      outputFilenamePrefix(),
                      WriteToGCSUtility.SHARD_TEMPLATE,
                      WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.AVRO)))
              .withTempDirectory(
                  FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                      .getCurrentDirectory())
              .withWindowedWrites()
              .withNumShards(numShards()));
}
 
Example #16
Source File: TestBoundedTable.java    From beam with Apache License 2.0
@Override
public POutput buildIOWriter(PCollection<Row> input) {
  input.apply(
      ParDo.of(
          new DoFn<Row, Void>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
              CONTENT.add(c.element());
            }

            @Teardown
            public void close() {
              CONTENT.clear();
            }
          }));
  return PDone.in(input.getPipeline());
}
 
Example #17
Source File: KafkaIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<V> input) {
  return input
      .apply(
          "Kafka values with default key",
          MapElements.via(
              new SimpleFunction<V, KV<K, V>>() {
                @Override
                public KV<K, V> apply(V element) {
                  return KV.of(null, element);
                }
              }))
      .setCoder(KvCoder.of(new NullOnlyCoder<>(), input.getCoder()))
      .apply(kvWriteTransform);
}
 
Example #18
Source File: KafkaIO.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<V> input) {
  return input
      .apply(
          "Kafka values with default key",
          MapElements.via(
              new SimpleFunction<V, KV<K, V>>() {
                @Override
                public KV<K, V> apply(V element) {
                  return KV.of(null, element);
                }
              }))
      .setCoder(KvCoder.of(new NullOnlyCoder<>(), input.getCoder()))
      .apply(kvWriteTransform);
}
 
Example #19
Source File: PAssert.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  input
      .apply("GroupGlobally", new GroupGlobally<>(rewindowingStrategy))
      .apply("GetPane", MapElements.via(paneExtractor))
      .setCoder(IterableCoder.of(input.getCoder()))
      .apply("RunChecks", ParDo.of(new SingletonCheckerDoFn<>(checkerFn, site)))
      .apply("VerifyAssertions", new DefaultConcludeTransform());

  return PDone.in(input.getPipeline());
}
 
Example #20
Source File: WriteToText.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  if (windowed) {
    teamAndScore
        .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
        .apply(new WriteToText.WriteOneFilePerWindow(filenamePrefix));
  } else {
    teamAndScore
        .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
        .apply(TextIO.write().to(filenamePrefix));
  }
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #21
Source File: PAssert.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  input
      .apply("GroupGlobally", new GroupGlobally<>(rewindowingStrategy))
      .apply("GetPane", MapElements.via(paneExtractor))
      .setCoder(IterableCoder.of(input.getCoder()))
      .apply("RunChecks", ParDo.of(new GroupedValuesCheckerDoFn<>(checkerFn, site)))
      .apply("VerifyAssertions", new DefaultConcludeTransform());

  return PDone.in(input.getPipeline());
}
 
Example #22
Source File: WriteToBigQuery.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
 
Example #23
Source File: PAssert.java    From beam with Apache License 2.0
@Override
public PDone expand(PBegin input) {
  final PCollectionView<ActualT> actual = input.apply("CreateActual", createActual);

  input
      .apply(Create.of(0).withCoder(VarIntCoder.of()))
      .apply("WindowToken", windowToken)
      .apply(
          "RunChecks",
          ParDo.of(new SideInputCheckerDoFn<>(checkerFn, actual, site)).withSideInputs(actual))
      .apply("VerifyAssertions", new DefaultConcludeTransform());
  return PDone.in(input.getPipeline());
}
 
Example #24
Source File: MongoDbIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<Document> input) {
  checkArgument(uri() != null, "withUri() is required");
  checkArgument(database() != null, "withDatabase() is required");
  checkArgument(collection() != null, "withCollection() is required");

  input.apply(ParDo.of(new WriteFn(this)));
  return PDone.in(input.getPipeline());
}
 
Example #25
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<FailsafeElement<String, String>> failedRecords) {

  return failedRecords
      .apply("FailedRecordToPubSubMessage", ParDo.of(new FailedStringToPubsubMessageFn()))
      .apply("WriteFailedRecordsToPubSub", PubsubIO.writeMessages().to(errorRecordsTopic()));
}
 
Example #26
Source File: SqsIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<SendMessageRequest> input) {
  input.apply(
      ParDo.of(
          new SqsWriteFn(
              new SqsConfiguration(input.getPipeline().getOptions().as(AwsOptions.class)))));
  return PDone.in(input.getPipeline());
}
 
Example #27
Source File: HBaseIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<Mutation> input) {
  checkArgument(configuration != null, "withConfiguration() is required");
  checkArgument(tableId != null && !tableId.isEmpty(), "withTableId() is required");
  try (Connection connection = ConnectionFactory.createConnection(configuration)) {
    Admin admin = connection.getAdmin();
    checkArgument(
        admin.tableExists(TableName.valueOf(tableId)), "Table %s does not exist", tableId);
  } catch (IOException e) {
    LOG.warn("Error checking whether table {} exists; proceeding.", tableId, e);
  }
  input.apply(ParDo.of(new HBaseWriterFn(this)));
  return PDone.in(input.getPipeline());
}
 
Example #28
Source File: WriteToGCSText.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to String using DoFn.
       */
      .apply(
          "Converting to String",
          ParDo.of(
              new DoFn<KV<String, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(c.element().getValue());
                }
              }))
      /*
       * Writing as text file using {@link TextIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       * The {@link withTempDirectory} option sets the base directory used to generate temporary files.
       */
      .apply(
          "Writing as Text",
          TextIO.write()
              .to(
                  new WindowedFilenamePolicy(
                      outputDirectory(),
                      outputFilenamePrefix(),
                      WriteToGCSUtility.SHARD_TEMPLATE,
                      WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.TEXT)))
              .withTempDirectory(
                  FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                      .getCurrentDirectory())
              .withWindowedWrites()
              .withNumShards(numShards()));
}
 
Example #29
Source File: KinesisIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<byte[]> input) {
  checkArgument(getStreamName() != null, "withStreamName() is required");
  checkArgument(
      (getPartitionKey() != null) || (getPartitioner() != null),
      "withPartitionKey() or withPartitioner() is required");
  checkArgument(
      getPartitionKey() == null || (getPartitioner() == null),
      "only one of either withPartitionKey() or withPartitioner() is possible");
  checkArgument(getAWSClientsProvider() != null, "withAWSClientsProvider() is required");

  input.apply(ParDo.of(new KinesisWriterFn(this)));
  return PDone.in(input.getPipeline());
}
 
Example #30
Source File: WriteOneFilePerWindow.java    From deployment-examples with MIT License
@Override
public PDone expand(PCollection<String> input) {
  ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix);
  TextIO.Write write =
      TextIO.write()
          .to(new PerWindowFiles(resource))
          .withTempDirectory(resource.getCurrentDirectory())
          .withWindowedWrites();
  if (numShards != null) {
    write = write.withNumShards(numShards);
  }
  return input.apply(write);
}