Java Code Examples for org.apache.beam.sdk.values.PCollectionList#and()

The following examples show how to use org.apache.beam.sdk.values.PCollectionList#and(). Each example is extracted from an open source project; the source file and license are noted above it.
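Before the project examples, here is a minimal, self-contained sketch of the method (the class name, step labels, and data are illustrative rather than taken from any project below). PCollectionList is immutable, so and() returns a new list with the given PCollection appended; the result is typically flattened back into a single PCollection:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

public class PCollectionListAndExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create();

    PCollection<String> first = p.apply("First", Create.of("a", "b"));
    PCollection<String> second = p.apply("Second", Create.of("c", "d"));

    // and() does not mutate the receiver; it returns a new PCollectionList
    // containing the elements of the old list plus the appended PCollection.
    PCollectionList<String> list = PCollectionList.of(first).and(second);

    // A PCollectionList is the expected input of Flatten.pCollections().
    PCollection<String> merged = list.apply(Flatten.pCollections());

    p.run().waitUntilFinish();
  }
}

Because every and() call returns a new list, the examples below reassign the result inside a loop (for instance, pcs = pcs.and(...)) to accumulate collections.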
Example 1
Source File: Partition.java    From beam with Apache License 2.0
@Override
public PCollectionList<T> expand(PCollection<T> in) {
  final TupleTagList outputTags = partitionDoFn.getOutputTags();

  PCollectionTuple outputs =
      in.apply(
          ParDo.of(partitionDoFn)
              .withOutputTags(new TupleTag<Void>() {}, outputTags)
              .withSideInputs(partitionDoFn.getSideInputs()));

  PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline());
  Coder<T> coder = in.getCoder();

  for (TupleTag<?> outputTag : outputTags.getAll()) {
    // All the tuple tags are actually TupleTag<T>
    // And all the collections are actually PCollection<T>
    @SuppressWarnings("unchecked")
    TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag;
    pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder));
  }
  return pcs;
}
 
Example 2
Source File: DeduplicatedFlattenFactory.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollectionList<T> input) {
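  // Count how many times each distinct PCollection occurs in the input list.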
  Map<PCollection<T>, Integer> instances = new HashMap<>();
  for (PCollection<T> pCollection : input.getAll()) {
    int existing = instances.getOrDefault(pCollection, 0);
    instances.put(pCollection, existing + 1);
  }
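  // Flattening the same PCollection more than once must multiply its elements,
  // so duplicate occurrences are routed through a ParDo that emits each element
  // once per occurrence; unique collections are appended to the output as-is.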
  PCollectionList<T> output = PCollectionList.empty(input.getPipeline());
  for (Map.Entry<PCollection<T>, Integer> instanceEntry : instances.entrySet()) {
    if (instanceEntry.getValue().equals(1)) {
      output = output.and(instanceEntry.getKey());
    } else {
      String duplicationName = String.format("Multiply %s", instanceEntry.getKey().getName());
      PCollection<T> duplicated =
          instanceEntry
              .getKey()
              .apply(duplicationName, ParDo.of(new DuplicateFn<>(instanceEntry.getValue())));
      output = output.and(duplicated);
    }
  }
  return output.apply(Flatten.pCollections());
}
 
Example 3
Source File: TfIdf.java    From beam with Apache License 2.0
@Override
public PCollection<KV<URI, String>> expand(PBegin input) {
  Pipeline pipeline = input.getPipeline();

  // Create one TextIO.Read transform for each document
  // and add its output to a PCollectionList
  PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

  // TextIO.Read supports:
  //  - file: URIs and paths locally
  //  - gs: URIs on the service
  for (final URI uri : uris) {
    String uriString;
    if ("file".equals(uri.getScheme())) {
      uriString = new File(uri).getPath();
    } else {
      uriString = uri.toString();
    }

    PCollection<KV<URI, String>> oneUriToLines =
        pipeline
            .apply("TextIO.Read(" + uriString + ")", TextIO.read().from(uriString))
            .apply("WithKeys(" + uriString + ")", WithKeys.of(uri))
            .setCoder(KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of()));

    urisToLines = urisToLines.and(oneUriToLines);
  }

  return urisToLines.apply(Flatten.pCollections());
}
 
Example 4
Source File: TestExpansionService.java    From beam with Apache License 2.0
@Override
public PCollectionList<Long> createInput(Pipeline p, Map<String, PCollection<?>> inputs) {
  PCollectionList<Long> inputList = PCollectionList.empty(p);
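  // Every input supplied by this test expansion service is expected to carry
  // Long elements, so the unchecked cast below should be safe.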
  for (PCollection<?> collection : inputs.values()) {
    inputList = inputList.and((PCollection<Long>) collection);
  }
  return inputList;
}
 
Example 5
Source File: TransformHierarchyTest.java    From beam with Apache License 2.0
@Test
public void producingOwnAndOthersOutputsFails() {
  PCollection<Long> created =
      PCollection.createPrimitiveOutputInternal(
          pipeline, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarLongCoder.of());
  hierarchy.pushNode("Create", PBegin.in(pipeline), Create.of(1));
  hierarchy.setOutput(created);
  hierarchy.popNode();
  PCollectionList<Long> pcList = PCollectionList.of(created);

  final PCollectionList<Long> appended =
      pcList.and(
          PCollection.createPrimitiveOutputInternal(
                  pipeline,
                  WindowingStrategy.globalDefault(),
                  IsBounded.BOUNDED,
                  VarLongCoder.of())
              .setName("prim"));
  hierarchy.pushNode(
      "AddPc",
      pcList,
      new PTransform<PCollectionList<Long>, PCollectionList<Long>>() {
        @Override
        public PCollectionList<Long> expand(PCollectionList<Long> input) {
          return appended;
        }
      });
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("contains a primitive POutput produced by it");
  thrown.expectMessage("AddPc");
  thrown.expectMessage("Create");
  thrown.expectMessage(appended.expand().toString());
  hierarchy.setOutput(appended);
}
 
Example 6
Source File: CountingSourceTest.java    From beam with Apache License 2.0
@Test
@Category({
  ValidatesRunner.class,
  UsesStatefulParDo.class, // This test fails if State is unsupported despite no direct usage.
  DataflowPortabilityApiUnsupported.class
})
public void testUnboundedSourceSplits() throws Exception {
  long numElements = 1000;
  int numSplits = 10;

  UnboundedSource<Long, ?> initial = CountingSource.unbounded();
  List<? extends UnboundedSource<Long, ?>> splits = initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
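  // Read each split as a bounded source capped at its share of the elements,
  // chaining and() to reassemble the splits into a single PCollectionList.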
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit)));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
 
Example 7
Source File: CountingSourceTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testUnboundedSourceRateSplits() throws Exception {
  int elementsPerPeriod = 10;
  Duration period = Duration.millis(5);

  long numElements = 1000;
  int numSplits = 10;

  UnboundedCountingSource initial =
      CountingSource.createUnboundedFrom(0).withRate(elementsPerPeriod, period);
  List<? extends UnboundedSource<Long, ?>> splits = initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit)));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  Instant startTime = Instant.now();
  p.run();
  Instant endTime = Instant.now();
  // 500 ms if the readers are all initialized in parallel; 5000 ms if they are evaluated serially
  long expectedMinimumMillis = (numElements * period.getMillis()) / elementsPerPeriod;
  assertThat(expectedMinimumMillis, lessThan(endTime.getMillis() - startTime.getMillis()));
}
 
Example 8
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceSplits() throws Exception {

  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
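  // Read each split separately, strip the Kafka metadata, and keep only the
  // record values before the splits are merged back together.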
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
 
Example 9
Source File: DeduplicatedFlattenFactory.java    From beam with Apache License 2.0
/**
 * {@inheritDoc}.
 *
 * <p>The input {@link PCollectionList} that is constructed will have the same values in the same
 * order as the values of the input map.
 */
private PCollectionList<T> getInput(Map<TupleTag<?>, PValue> inputs, Pipeline p) {
  PCollectionList<T> pCollections = PCollectionList.empty(p);
  for (PValue input : inputs.values()) {
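    // Each value in the inputs map is a PCollection of the element type, so
    // this unchecked cast is expected to be safe.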
    PCollection<T> pcollection = (PCollection<T>) input;
    pCollections = pCollections.and(pcollection);
  }
  return pCollections;
}
 
Example 10
Source File: FixedInputRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin begin) {
    FixedDatasetRuntime runtime = new FixedDatasetRuntime();
    runtime.initialize(null, properties.getDatasetProperties());

    // The values to include in the PCollection
    List<IndexedRecord> values = new LinkedList<>();

    if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.NONE
            || properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.APPEND) {
        if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
            values.addAll(runtime.getValues(Integer.MAX_VALUE));
        }
    }

    if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.APPEND
            || properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.REPLACE) {
        properties.getDatasetProperties().values.setValue(properties.overrideValues.getValue());
        if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
            values.addAll(runtime.getValues(Integer.MAX_VALUE));
        }
    }

    if (!values.isEmpty()) {
        PCollection<IndexedRecord> out = (PCollection<IndexedRecord>) begin
                .apply(Create.of(values).withCoder((AvroCoder) AvroCoder.of(runtime.getSchema())));
        if (properties.repeat.getValue() > 1) {
            // Build a list with `repeat` total copies of the collection, then
            // flatten them into one PCollection. The list starts with one copy,
            // so the loop must run through i == repeat to reach the full count.
            PCollectionList<IndexedRecord> merged = PCollectionList.of(out);
            for (int i = 2; i <= properties.repeat.getValue(); i++) {
                merged = merged.and(out);
            }
            out = merged.apply(Flatten.<IndexedRecord> pCollections());
        }
        return out;
    } else {
        return begin.apply(RowGeneratorIO.read().withSchema(runtime.getSchema()) //
                .withSeed(0L) //
                .withPartitions(1) //
                .withRows(properties.repeat.getValue()));
    }
}