org.apache.beam.sdk.transforms.Flatten Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.Flatten.
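Before the project-sourced examples, here is a minimal, self-contained sketch of the two Flatten entry points that recur throughout this page: Flatten.pCollections(), which merges the PCollections in a PCollectionList into a single PCollection, and Flatten.iterables(), which explodes a PCollection<Iterable<T>> into a PCollection<T>. The class name FlattenSketch and the sample data are hypothetical; the Beam calls themselves are the ones used in the examples below.

import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.IterableCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

public class FlattenSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Flatten.pCollections(): merge several PCollections of the same type into one.
    PCollection<String> first = pipeline.apply("First", Create.of("a", "b"));
    PCollection<String> second = pipeline.apply("Second", Create.of("c", "d"));
    PCollection<String> merged =
        PCollectionList.of(first).and(second).apply(Flatten.pCollections());

    // Flatten.iterables(): explode each Iterable element into individual elements.
    // The coder is set explicitly, mirroring the AtomicInsertTest example below.
    PCollection<Iterable<String>> nested =
        pipeline.apply(
            "Nested",
            Create.<Iterable<String>>of(Arrays.asList("e", "f"))
                .withCoder(IterableCoder.of(StringUtf8Coder.of())));
    PCollection<String> flattened = nested.apply(Flatten.iterables());

    pipeline.run().waitUntilFinish();
  }
}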
Example #1
Source File: DirectGraphVisitorTest.java From beam with Apache License 2.0
@Test
public void getValueToConsumersWithDuplicateInputSucceeds() {
  PCollection<String> created = p.apply(Create.of("1", "2", "3"));
  PCollection<String> flattened =
      PCollectionList.of(created).and(created).apply(Flatten.pCollections());

  p.traverseTopologically(visitor);

  DirectGraph graph = visitor.getGraph();
  AppliedPTransform<?, ?, ?> flattenedProducer = graph.getProducer(flattened);
  assertThat(
      graph.getPerElementConsumers(created),
      Matchers.containsInAnyOrder(new Object[] {flattenedProducer, flattenedProducer}));
  assertThat(graph.getPerElementConsumers(flattened), emptyIterable());
}
Example #2
Source File: FlattenTranslatorBatch.java From twister2 with Apache License 2.0
@Override
public void translateNode(
    Flatten.PCollections<T> transform, Twister2BatchTranslationContext context) {
  Collection<PValue> pcs = context.getInputs().values();
  List<BatchTSetImpl<WindowedValue<T>>> tSets = new ArrayList<>();
  BatchTSetImpl<WindowedValue<T>> unionTSet = null;

  if (pcs.isEmpty()) {
    // TODO: create empty TSet
    throw new UnsupportedOperationException("Operation not implemented yet");
  } else {
    for (PValue pc : pcs) {
      BatchTSetImpl<WindowedValue<T>> curr = context.getInputDataSet(pc);
      tSets.add(curr);
    }

    BatchTSetImpl<WindowedValue<T>> first = tSets.remove(0);
    Collection<TSet<WindowedValue<T>>> others = new ArrayList<>();
    others.addAll(tSets);
    if (tSets.size() > 0) {
      unionTSet = first.union(others);
    } else {
      unionTSet = first;
    }
  }
  context.setOutputDataSet(context.getOutput(transform), unionTSet);
}
Example #3
Source File: FileIndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0
/**
 * @param indexes
 * @return
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
    PCollection<ContentIndexSummary> indexes, Float ratio) {

  PCollectionTuple splitAB = indexes
      .apply(ParDo.of(new SplitAB(ratio))
          .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

  PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
  PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);

  PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
      ParDo.of(new EnrichWithCNLPEntities()));

  // Merge all collections with WebResource table records
  PCollectionList<ContentIndexSummary> contentIndexSummariesList =
      PCollectionList.of(branchACol).and(enrichedBCol);
  PCollection<ContentIndexSummary> allIndexSummaries =
      contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

  indexes = allIndexSummaries;
  return indexes;
}
Example #4
Source File: IndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0
/**
 * @param filteredIndexes
 * @return
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
    PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {

  PCollectionTuple splitAB = filteredIndexes
      .apply(ParDo.of(new SplitAB(ratio))
          .withOutputTags(PipelineTags.BranchA, TupleTagList.of(PipelineTags.BranchB)));

  PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
  PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);

  PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
      ParDo.of(new EnrichWithCNLPEntities()));

  // Merge all collections with WebResource table records
  PCollectionList<ContentIndexSummary> contentIndexSummariesList =
      PCollectionList.of(branchACol).and(enrichedBCol);
  PCollection<ContentIndexSummary> allIndexSummaries =
      contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

  filteredIndexes = allIndexSummaries;
  return filteredIndexes;
}
Example #5
Source File: SideInputLoadTest.java From beam with Apache License 2.0
private void performTestWithMap(
    PCollection<KV<byte[], byte[]>> input, Optional<SyntheticStep> syntheticStep) {
  applyStepIfPresent(input, "Synthetic step", syntheticStep);
  PCollectionView<Map<byte[], byte[]>> sideInput =
      applyWindowingIfPresent(input).apply(View.asMap());
  PCollectionView<List<byte[]>> randomKeys =
      pipeline
          .apply(Create.of(0))
          .apply(
              ParDo.of(new GetRandomKeyList(sideInput, options.getAccessPercentage()))
                  .withSideInputs(sideInput))
          .apply(Flatten.iterables())
          .apply(View.asList());
  input
      .apply(
          ParDo.of(new SideInputTestWithMap(sideInput, randomKeys))
              .withSideInputs(sideInput, randomKeys))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
Example #6
Source File: Window.java From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  applicableTo(input);

  WindowingStrategy<?, ?> outputStrategy =
      getOutputStrategyInternal(input.getWindowingStrategy());

  if (getWindowFn() == null) {
    // A new PCollection must be created in case input is reused in a different location as the
    // two PCollections will, in general, have a different windowing strategy.
    return PCollectionList.of(input)
        .apply(Flatten.pCollections())
        .setWindowingStrategyInternal(outputStrategy);
  } else {
    // This is the AssignWindows primitive
    return input.apply(new Assign<>(this, outputStrategy));
  }
}
Example #7
Source File: PAssert.java From beam with Apache License 2.0
@Override
public PCollectionView<ActualT> expand(PBegin input) {
  final Coder<T> coder = actual.getCoder();
  return actual
      .apply("FilterActuals", rewindowActuals.prepareActuals())
      .apply("GatherPanes", GatherAllPanes.globally())
      .apply("ExtractPane", MapElements.via(extractPane))
      .setCoder(IterableCoder.of(actual.getCoder()))
      .apply(Flatten.iterables())
      .apply("RewindowActuals", rewindowActuals.windowActuals())
      .apply(
          ParDo.of(
              new DoFn<T, T>() {
                @ProcessElement
                public void processElement(ProcessContext context) throws CoderException {
                  context.output(CoderUtils.clone(coder, context.element()));
                }
              }))
      .apply(actualView);
}
Example #8
Source File: TestStreamTest.java From beam with Apache License 2.0
@Test
@Category({NeedsRunner.class, UsesTestStream.class})
public void testElementsAtAlmostPositiveInfinity() {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  TestStream<String> stream =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(stream)
          .apply(into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(endOfGlobalWindow))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
Example #9
Source File: PTransformMatchers.java From beam with Apache License 2.0
/**
 * A {@link PTransformMatcher} which matches a {@link Flatten.PCollections} which consumes a
 * single input {@link PCollection} multiple times.
 */
public static PTransformMatcher flattenWithDuplicateInputs() {
  return new PTransformMatcher() {
    @Override
    public boolean matches(AppliedPTransform<?, ?, ?> application) {
      if (application.getTransform() instanceof Flatten.PCollections) {
        Set<PValue> observed = new HashSet<>();
        for (PValue pvalue : application.getInputs().values()) {
          boolean firstInstance = observed.add(pvalue);
          if (!firstInstance) {
            return true;
          }
        }
      }
      return false;
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper("FlattenWithDuplicateInputsMatcher").toString();
    }
  };
}
Example #10
Source File: DeduplicatedFlattenFactoryTest.java From beam with Apache License 2.0
@Test
public void duplicatesInsertsMultipliers() {
  PTransform<PCollectionList<String>, PCollection<String>> replacement =
      new DeduplicatedFlattenFactory.FlattenWithoutDuplicateInputs<>();
  final PCollectionList<String> inputList =
      PCollectionList.of(first).and(second).and(first).and(first);
  inputList.apply(replacement);
  pipeline.traverseTopologically(
      new Defaults() {
        @Override
        public void visitPrimitiveTransform(TransformHierarchy.Node node) {
          if (node.getTransform() instanceof Flatten.PCollections) {
            assertThat(node.getInputs(), not(equalTo(inputList.expand())));
          }
        }
      });
}
Example #11
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void emptyFlattenWithNonEmptyFlatten() {
  AppliedPTransform application =
      AppliedPTransform.of(
          "Flatten",
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Flatten.pCollections(),
          p);
  assertThat(PTransformMatchers.emptyFlatten().matches(application), is(false));
}
Example #12
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void flattenWithDuplicateInputsWithoutDuplicates() {
  AppliedPTransform application =
      AppliedPTransform.of(
          "Flatten",
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Flatten.pCollections(),
          p);
  assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(false));
}
Example #13
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void flattenWithDuplicateInputsNonFlatten() {
  AppliedPTransform application =
      AppliedPTransform
          .<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
              "EmptyFlatten",
              Collections.emptyMap(),
              Collections.singletonMap(
                  new TupleTag<Integer>(),
                  PCollection.createPrimitiveOutputInternal(
                      p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
              /* This isn't actually possible to construct, but for the sake of example */
              Flatten.iterables(),
              p);
  assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(false));
}
Example #14
Source File: UnconsumedReadsTest.java From beam with Apache License 2.0
@Test
public void doesNotConsumeAlreadyConsumedRead() {
  Unbounded<Long> transform = Read.from(CountingSource.unbounded());
  final PCollection<Long> output = pipeline.apply(transform);
  final Flatten.PCollections<Long> consumer = Flatten.pCollections();
  PCollectionList.of(output).apply(consumer);
  UnconsumedReads.ensureAllReadsConsumed(pipeline);
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          // The output should only be consumed by a single consumer
          if (node.getInputs().values().contains(output)) {
            assertThat(node.getTransform(), Matchers.is(consumer));
          }
        }
      });
}
Example #15
Source File: EmptyFlattenAsCreateFactoryTest.java From beam with Apache License 2.0
@Test
public void getInputNonEmptyThrows() {
  PCollectionList<Long> nonEmpty =
      PCollectionList.of(pipeline.apply("unbounded", GenerateSequence.from(0)))
          .and(pipeline.apply("bounded", GenerateSequence.from(0).to(100)));
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(nonEmpty.expand().toString());
  thrown.expectMessage(EmptyFlattenAsCreateFactory.class.getSimpleName());
  factory.getReplacementTransform(
      AppliedPTransform.of(
          "nonEmptyInput",
          nonEmpty.expand(),
          Collections.emptyMap(),
          Flatten.pCollections(),
          pipeline));
}
Example #16
Source File: EmptyFlattenAsCreateFactoryTest.java From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testOverride() {
  PCollectionList<Long> empty = PCollectionList.empty(pipeline);
  PCollection<Long> emptyFlattened =
      empty.apply(
          factory
              .getReplacementTransform(
                  AppliedPTransform.of(
                      "nonEmptyInput",
                      Collections.emptyMap(),
                      Collections.emptyMap(),
                      Flatten.pCollections(),
                      pipeline))
              .getTransform());
  PAssert.that(emptyFlattened).empty();
  pipeline.run();
}
Example #17
Source File: QueryablePipelineTest.java From beam with Apache License 2.0
/**
 * Tests that {@link QueryablePipeline#getPerElementConsumers(PCollectionNode)} returns a
 * transform that consumes the node more than once.
 */
@Test
public void perElementConsumersWithConsumingMultipleTimes() {
  Pipeline p = Pipeline.create();
  PCollection<Long> longs = p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  PCollectionList.of(longs).and(longs).and(longs).apply("flatten", Flatten.pCollections());

  Components components = PipelineTranslation.toProto(p).getComponents();
  // This breaks if the way that IDs are assigned to PTransforms changes in PipelineTranslation
  String readOutput =
      getOnlyElement(components.getTransformsOrThrow("BoundedRead").getOutputsMap().values());
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);
  Set<PTransformNode> consumers =
      qp.getPerElementConsumers(
          PipelineNode.pCollection(readOutput, components.getPcollectionsOrThrow(readOutput)));

  assertThat(consumers.size(), equalTo(1));
  assertThat(
      getOnlyElement(consumers).getTransform().getSpec().getUrn(),
      equalTo(PTransformTranslation.FLATTEN_TRANSFORM_URN));
}
Example #18
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void emptyFlattenWithNonFlatten() {
  AppliedPTransform application =
      AppliedPTransform
          .<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
              "EmptyFlatten",
              Collections.emptyMap(),
              Collections.singletonMap(
                  new TupleTag<Integer>(),
                  PCollection.createPrimitiveOutputInternal(
                      p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
              /* This isn't actually possible to construct, but for the sake of example */
              Flatten.iterables(),
              p);
  assertThat(PTransformMatchers.emptyFlatten().matches(application), is(false));
}
Example #19
Source File: CreateStreamTest.java From beam with Apache License 2.0
@Test
public void testElementsAtAlmostPositiveInfinity() throws IOException {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  CreateStream<String> source =
      CreateStream.of(StringUtf8Coder.of(), batchDuration())
          .nextBatch(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceNextBatchWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(source)
          .apply(Window.into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(GlobalWindow.INSTANCE.maxTimestamp()))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
Example #20
Source File: TrackStreamingSourcesTest.java From beam with Apache License 2.0
@Test
public void testTrackFlattened() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> queueStream1 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();
  CreateStream<Integer> queueStream2 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  PCollection<Integer> pcol1 = p.apply(queueStream1);
  PCollection<Integer> pcol2 = p.apply(queueStream2);
  PCollection<Integer> flattened =
      PCollectionList.of(pcol1).and(pcol2).apply(Flatten.pCollections());
  flattened.apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0, 1));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
Example #21
Source File: Sink.java From gcp-ingestion with Mozilla Public License 2.0
/** Execute an Apache Beam pipeline and return the {@code PipelineResult}. */
public static PipelineResult run(SinkOptions.Parsed options) {
  final Pipeline pipeline = Pipeline.create(options);
  final List<PCollection<PubsubMessage>> failureCollections = new ArrayList<>();

  pipeline //
      .apply(options.getInputType().read(options)) //
      .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) //
      .apply(options.getOutputType().write(options)) //
      .failuresTo(failureCollections);

  PCollectionList.of(failureCollections) //
      .apply("FlattenFailureCollections", Flatten.pCollections()) //
      .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) //
      .output();

  return pipeline.run();
}
Example #22
Source File: WatermarkManagerTest.java From beam with Apache License 2.0
@Before
public void setup() {
  createdInts = p.apply("createdInts", Create.of(1, 2, 3));

  filtered = createdInts.apply("filtered", Filter.greaterThan(1));
  filteredTimesTwo =
      filtered.apply(
          "timesTwo",
          ParDo.of(
              new DoFn<Integer, Integer>() {
                @ProcessElement
                public void processElement(ProcessContext c) throws Exception {
                  c.output(c.element() * 2);
                }
              }));

  keyed = createdInts.apply("keyed", WithKeys.of("MyKey"));

  intsToFlatten = p.apply("intsToFlatten", Create.of(-1, 256, 65535));
  PCollectionList<Integer> preFlatten = PCollectionList.of(createdInts).and(intsToFlatten);
  flattened = preFlatten.apply("flattened", Flatten.pCollections());

  clock = MockClock.fromInstant(new Instant(1000));
  DirectGraphs.performDirectOverrides(p);
  graph = DirectGraphs.getGraph(p);

  manager = WatermarkManager.create(clock, graph, AppliedPTransform::getFullName);
  bundleFactory = ImmutableListBundleFactory.create();
}
Example #23
Source File: FlattenTranslatorBatch.java From beam with Apache License 2.0
@Override
public void translateNode(
    Flatten.PCollections<T> transform, Twister2BatchTranslationContext context) {
  Collection<PValue> pcs = context.getInputs().values();
  List<BatchTSetImpl<WindowedValue<T>>> tSets = new ArrayList<>();
  BatchTSetImpl<WindowedValue<T>> unionTSet;

  if (pcs.isEmpty()) {
    final TSetEnvironment tsetEnv = context.getEnvironment();
    unionTSet =
        ((BatchTSetEnvironment) tsetEnv)
            .createSource(new Twister2EmptySource(), context.getOptions().getParallelism());
  } else {
    for (PValue pc : pcs) {
      BatchTSetImpl<WindowedValue<T>> curr = context.getInputDataSet(pc);
      tSets.add(curr);
    }

    BatchTSetImpl<WindowedValue<T>> first = tSets.remove(0);
    Collection<TSet<WindowedValue<T>>> others = new ArrayList<>();
    others.addAll(tSets);
    if (tSets.size() > 0) {
      unionTSet = first.union(others);
    } else {
      unionTSet = first;
    }
  }
  context.setOutputDataSet(context.getOutput(transform), unionTSet);
}
Example #24
Source File: PTransformMatchersTest.java From beam with Apache License 2.0
@Test
public void emptyFlattenWithEmptyFlatten() {
  AppliedPTransform application =
      AppliedPTransform.of(
          "EmptyFlatten",
          Collections.emptyMap(),
          Collections.singletonMap(
              new TupleTag<Integer>(),
              PCollection.createPrimitiveOutputInternal(
                  p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
          Flatten.pCollections(),
          p);
  assertThat(PTransformMatchers.emptyFlatten().matches(application), is(true));
}
Example #25
Source File: AtomicInsertTest.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PBegin input) {
  Schema schema = Schema.of(Schema.Field.of("f0", Schema.FieldType.INT64));
  Iterable<Row> bundle =
      IntStream.range(0, size)
          .mapToObj(x -> Row.withSchema(schema).addValue((long) x).build())
          .collect(Collectors.toList());

  // make sure we get one big bundle
  return input
      .getPipeline()
      .apply(Create.<Iterable<Row>>of(bundle).withCoder(IterableCoder.of(RowCoder.of(schema))))
      .apply(Flatten.iterables())
      .setRowSchema(schema);
}
Example #26
Source File: BatchLoads.java From beam with Apache License 2.0
PCollection<WriteBundlesToFiles.Result<DestinationT>> writeDynamicallyShardedFiles(
    PCollection<KV<DestinationT, ElementT>> input, PCollectionView<String> tempFilePrefix) {
  TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag =
      new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {};
  TupleTag<KV<ShardedKey<DestinationT>, ElementT>> unwrittedRecordsTag =
      new TupleTag<KV<ShardedKey<DestinationT>, ElementT>>("unwrittenRecords") {};
  PCollectionTuple writeBundlesTuple =
      input.apply(
          "WriteBundlesToFiles",
          ParDo.of(
                  new WriteBundlesToFiles<>(
                      tempFilePrefix,
                      unwrittedRecordsTag,
                      maxNumWritersPerBundle,
                      maxFileSize,
                      rowWriterFactory))
              .withSideInputs(tempFilePrefix)
              .withOutputTags(writtenFilesTag, TupleTagList.of(unwrittedRecordsTag)));
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles =
      writeBundlesTuple
          .get(writtenFilesTag)
          .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
  PCollection<KV<ShardedKey<DestinationT>, ElementT>> unwrittenRecords =
      writeBundlesTuple
          .get(unwrittedRecordsTag)
          .setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), elementCoder));

  // If the bundles contain too many output tables to be written inline to files (due to memory
  // limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
  // Group these records by key, and write the files after grouping. Since the record is grouped
  // by key, we can ensure that only one file is open at a time in each bundle.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped =
      writeShardedRecords(unwrittenRecords, tempFilePrefix);

  // PCollection of filename, file byte size, and table destination.
  return PCollectionList.of(writtenFiles)
      .and(writtenFilesGrouped)
      .apply("FlattenFiles", Flatten.pCollections())
      .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
Example #27
Source File: KafkaIOTest.java From beam with Apache License 2.0
@Test
public void testUnboundedSourceSplits() throws Exception {
  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
Example #28
Source File: UnionTranslator.java From beam with Apache License 2.0
@Override
public PCollection<InputT> translate(Union<InputT> operator, PCollectionList<InputT> inputs) {
  final TypeDescriptor<InputT> outputType = operator.getOutputType().orElse(null);
  return operator
      .getName()
      .map(name -> inputs.apply(name, Flatten.pCollections()).setTypeDescriptor(outputType))
      .orElseGet(() -> inputs.apply(Flatten.pCollections()).setTypeDescriptor(outputType));
}
Example #29
Source File: FlattenTest.java From beam with Apache License 2.0
@Test
public void testFlatten() {
  PCollection<Integer> input1 = pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
  PCollection<Integer> input2 =
      pipeline.apply(Create.of(11, 12, 13, 14, 15, 16, 17, 18, 19, 20));
  PCollectionList<Integer> pcs = PCollectionList.of(input1).and(input2);
  PCollection<Integer> input = pcs.apply(Flatten.pCollections());
  PAssert.that(input)
      .containsInAnyOrder(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20);
  pipeline.run();
}
Example #30
Source File: PTransformMatchers.java From beam with Apache License 2.0
/**
 * A {@link PTransformMatcher} which matches a {@link Flatten.PCollections} which consumes no
 * input {@link PCollection PCollections}.
 */
public static PTransformMatcher emptyFlatten() {
  return new PTransformMatcher() {
    @Override
    public boolean matches(AppliedPTransform<?, ?, ?> application) {
      return (application.getTransform() instanceof Flatten.PCollections)
          && application.getInputs().isEmpty();
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper("EmptyFlattenMatcher").toString();
    }
  };
}