Java Code Examples for org.apache.beam.sdk.values.PCollectionList#and()
The following examples show how to use org.apache.beam.sdk.values.PCollectionList#and(), which returns a new PCollectionList with the given PCollection (or collections) appended.
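Before the project examples, here is a minimal self-contained sketch of the most common pattern: start from PCollectionList.empty(pipeline), append sources in a loop (reassigning, since and() does not mutate the receiver), and flatten the result. This is a sketch only; the class name, step labels, and values are hypothetical placeholders, not drawn from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

public class PCollectionListAndSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // and() is non-mutating: each call returns a new list, so reassign.
    PCollectionList<String> shards = PCollectionList.empty(p);
    for (int i = 0; i < 3; i++) {
      PCollection<String> shard = p.apply("CreateShard" + i, Create.of("value-" + i));
      shards = shards.and(shard);
    }

    // A PCollectionList is exactly the input Flatten.pCollections() expects.
    PCollection<String> merged = shards.apply(Flatten.pCollections());

    p.run().waitUntilFinish();
  }
}

The same shape recurs throughout the examples that follow, whether the list members come from Partition output tags, per-file reads, or unbounded source splits.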
Example 1
Source File: Partition.java From beam with Apache License 2.0
@Override
public PCollectionList<T> expand(PCollection<T> in) {
  final TupleTagList outputTags = partitionDoFn.getOutputTags();

  PCollectionTuple outputs =
      in.apply(
          ParDo.of(partitionDoFn)
              .withOutputTags(new TupleTag<Void>() {}, outputTags)
              .withSideInputs(partitionDoFn.getSideInputs()));

  PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline());
  Coder<T> coder = in.getCoder();
  for (TupleTag<?> outputTag : outputTags.getAll()) {
    // All the tuple tags are actually TupleTag<T>
    // And all the collections are actually PCollection<T>
    @SuppressWarnings("unchecked")
    TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag;
    pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder));
  }
  return pcs;
}
Example 2
Source File: DeduplicatedFlattenFactory.java From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollectionList<T> input) {
  Map<PCollection<T>, Integer> instances = new HashMap<>();
  for (PCollection<T> pCollection : input.getAll()) {
    int existing = instances.get(pCollection) == null ? 0 : instances.get(pCollection);
    instances.put(pCollection, existing + 1);
  }
  PCollectionList<T> output = PCollectionList.empty(input.getPipeline());
  for (Map.Entry<PCollection<T>, Integer> instanceEntry : instances.entrySet()) {
    if (instanceEntry.getValue().equals(1)) {
      output = output.and(instanceEntry.getKey());
    } else {
      String duplicationName = String.format("Multiply %s", instanceEntry.getKey().getName());
      PCollection<T> duplicated =
          instanceEntry
              .getKey()
              .apply(duplicationName, ParDo.of(new DuplicateFn<>(instanceEntry.getValue())));
      output = output.and(duplicated);
    }
  }
  return output.apply(Flatten.pCollections());
}
Example 3
Source File: TfIdf.java From beam with Apache License 2.0
@Override
public PCollection<KV<URI, String>> expand(PBegin input) {
  Pipeline pipeline = input.getPipeline();

  // Create one TextIO.Read transform for each document
  // and add its output to a PCollectionList
  PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline);

  // TextIO.Read supports:
  //  - file: URIs and paths locally
  //  - gs: URIs on the service
  for (final URI uri : uris) {
    String uriString;
    if ("file".equals(uri.getScheme())) {
      uriString = new File(uri).getPath();
    } else {
      uriString = uri.toString();
    }

    PCollection<KV<URI, String>> oneUriToLines =
        pipeline
            .apply("TextIO.Read(" + uriString + ")", TextIO.read().from(uriString))
            .apply("WithKeys(" + uriString + ")", WithKeys.of(uri))
            .setCoder(KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of()));

    urisToLines = urisToLines.and(oneUriToLines);
  }

  return urisToLines.apply(Flatten.pCollections());
}
Example 4
Source File: TestExpansionService.java From beam with Apache License 2.0
@Override
public PCollectionList<Long> createInput(Pipeline p, Map<String, PCollection<?>> inputs) {
  PCollectionList<Long> inputList = PCollectionList.empty(p);
  for (PCollection<?> collection : inputs.values()) {
    inputList = inputList.and((PCollection<Long>) collection);
  }
  return inputList;
}
Example 5
Source File: TransformHierarchyTest.java From beam with Apache License 2.0
@Test
public void producingOwnAndOthersOutputsFails() {
  PCollection<Long> created =
      PCollection.createPrimitiveOutputInternal(
          pipeline, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarLongCoder.of());
  hierarchy.pushNode("Create", PBegin.in(pipeline), Create.of(1));
  hierarchy.setOutput(created);
  hierarchy.popNode();
  PCollectionList<Long> pcList = PCollectionList.of(created);

  final PCollectionList<Long> appended =
      pcList.and(
          PCollection.createPrimitiveOutputInternal(
                  pipeline, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarLongCoder.of())
              .setName("prim"));
  hierarchy.pushNode(
      "AddPc",
      pcList,
      new PTransform<PCollectionList<Long>, PCollectionList<Long>>() {
        @Override
        public PCollectionList<Long> expand(PCollectionList<Long> input) {
          return appended;
        }
      });
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("contains a primitive POutput produced by it");
  thrown.expectMessage("AddPc");
  thrown.expectMessage("Create");
  thrown.expectMessage(appended.expand().toString());
  hierarchy.setOutput(appended);
}
Example 6
Source File: CountingSourceTest.java From beam with Apache License 2.0
@Test
@Category({
  ValidatesRunner.class,
  UsesStatefulParDo.class, // This test fails if State is unsupported despite no direct usage.
  DataflowPortabilityApiUnsupported.class
})
public void testUnboundedSourceSplits() throws Exception {
  long numElements = 1000;
  int numSplits = 10;

  UnboundedSource<Long, ?> initial = CountingSource.unbounded();
  List<? extends UnboundedSource<Long, ?>> splits = initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit)));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
Example 7
Source File: CountingSourceTest.java From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testUnboundedSourceRateSplits() throws Exception {
  int elementsPerPeriod = 10;
  Duration period = Duration.millis(5);
  long numElements = 1000;
  int numSplits = 10;

  UnboundedCountingSource initial =
      CountingSource.createUnboundedFrom(0).withRate(elementsPerPeriod, period);
  List<? extends UnboundedSource<Long, ?>> splits = initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit)));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  Instant startTime = Instant.now();
  p.run();
  Instant endTime = Instant.now();
  // 500 ms if the readers are all initialized in parallel; 5000 ms if they are evaluated serially
  long expectedMinimumMillis = (numElements * period.getMillis()) / elementsPerPeriod;
  assertThat(expectedMinimumMillis, lessThan(endTime.getMillis() - startTime.getMillis()));
}
Example 8
Source File: KafkaIOTest.java From beam with Apache License 2.0
@Test
public void testUnboundedSourceSplits() throws Exception {
  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
Example 9
Source File: DeduplicatedFlattenFactory.java From beam with Apache License 2.0
/**
 * {@inheritDoc}.
 *
 * <p>The input {@link PCollectionList} that is constructed will have the same values in the same
 * order as the original input.
 */
private PCollectionList<T> getInput(Map<TupleTag<?>, PValue> inputs, Pipeline p) {
  PCollectionList<T> pCollections = PCollectionList.empty(p);
  for (PValue input : inputs.values()) {
    PCollection<T> pcollection = (PCollection<T>) input;
    pCollections = pCollections.and(pcollection);
  }
  return pCollections;
}
Example 10
Source File: FixedInputRuntime.java From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin begin) {
  FixedDatasetRuntime runtime = new FixedDatasetRuntime();
  runtime.initialize(null, properties.getDatasetProperties());

  // The values to include in the PCollection
  List<IndexedRecord> values = new LinkedList<>();

  if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.NONE
      || properties.overrideValuesAction.getValue()
          == FixedInputProperties.OverrideValuesAction.APPEND) {
    if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
      values.addAll(runtime.getValues(Integer.MAX_VALUE));
    }
  }

  if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.APPEND
      || properties.overrideValuesAction.getValue()
          == FixedInputProperties.OverrideValuesAction.REPLACE) {
    properties.getDatasetProperties().values.setValue(properties.overrideValues.getValue());
    if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
      values.addAll(runtime.getValues(Integer.MAX_VALUE));
    }
  }

  if (values.size() != 0) {
    PCollection<IndexedRecord> out =
        (PCollection<IndexedRecord>)
            begin.apply(
                Create.of(values).withCoder((AvroCoder) AvroCoder.of(runtime.getSchema())));
    if (properties.repeat.getValue() > 1) {
      PCollectionList<IndexedRecord> merged = PCollectionList.of(out);
      for (int i = 2; i <= properties.repeat.getValue(); i++) {
        merged = merged.and(out);
      }
      out = merged.apply(Flatten.<IndexedRecord>pCollections());
    }
    return out;
  } else {
    return begin.apply(
        RowGeneratorIO.read()
            .withSchema(runtime.getSchema())
            .withSeed(0L)
            .withPartitions(1)
            .withRows(properties.repeat.getValue()));
  }
}