org.apache.flink.api.java.operators.MapPartitionOperator Java Examples
The following examples show how to use
org.apache.flink.api.java.operators.MapPartitionOperator.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BaseComQueue.java From Alink with Apache License 2.0 | 6 votes |
private DataSet<byte[]> loopStartDataSet(ExecutionEnvironment env) { MapPartitionOperator<Integer, byte[]> initial = env .fromElements(1) .rebalance() .mapPartition(new MapPartitionFunction<Integer, byte[]>() { @Override public void mapPartition(Iterable<Integer> values, Collector<byte[]> out) { //pass } }).name("iterInitialize"); if (cacheDataRel != null) { initial = initial.withBroadcastSet(cacheDataRel, "rel"); } return initial; }
Example #2
Source File: BootstrapTransformation.java From flink with Apache License 2.0 | 5 votes |
/**
 * Resolves the effective parallelism of the given operator.
 *
 * @param subtaskStates the operator to inspect.
 * @return the operator's own parallelism, or the parallelism of its execution environment when the
 *     operator reports {@link ExecutionConfig#PARALLELISM_DEFAULT}.
 */
private static <T> int getParallelism(MapPartitionOperator<T, TaggedOperatorSubtaskState> subtaskStates) {
    final int configured = subtaskStates.getParallelism();
    return configured == ExecutionConfig.PARALLELISM_DEFAULT
        ? subtaskStates.getExecutionEnvironment().getParallelism()
        : configured;
}
Example #3
Source File: BootstrapTransformation.java From flink with Apache License 2.0 | 5 votes |
/**
 * Wires the bootstrap operator onto the data set and returns the operator producing the tagged
 * subtask states.
 *
 * <p>Steps, in order: optionally custom-partition the input (only when {@code originalKeySelector} is
 * set), create and clean the stream operator, build its {@link StreamConfig}, and run it through a
 * {@link BoundedOneInputStreamTaskRunner} inside {@code mapPartition}. The resulting operator is named
 * after the hex form of {@code operatorID}.
 *
 * @param operatorID id of the operator; its hex string becomes the operator name.
 * @param stateBackend state backend passed to {@code getConfig}.
 * @param savepointPath path handed to the operator factory.
 * @param localMaxParallelism max parallelism used for partitioning and as the parallelism cap.
 * @return the map-partition operator, with parallelism 1 for broadcast-state bootstrap operators and
 *     otherwise capped at {@code localMaxParallelism}.
 */
@VisibleForTesting
MapPartitionOperator<T, TaggedOperatorSubtaskState> writeOperatorSubtaskStates(
        OperatorID operatorID,
        StateBackend stateBackend,
        Path savepointPath,
        int localMaxParallelism) {

    final DataSet<T> source = originalKeySelector == null
        ? dataSet
        : dataSet.partitionCustom(new KeyGroupRangePartitioner(localMaxParallelism), hashKeySelector);

    final StreamOperator<TaggedOperatorSubtaskState> created =
        factory.createOperator(System.currentTimeMillis(), savepointPath);
    final StreamOperator<TaggedOperatorSubtaskState> operator = dataSet.clean(created);

    final StreamConfig streamConfig = getConfig(operatorID, stateBackend, operator);
    final BoundedOneInputStreamTaskRunner<T> runner =
        new BoundedOneInputStreamTaskRunner<>(streamConfig, localMaxParallelism);

    final MapPartitionOperator<T, TaggedOperatorSubtaskState> states = source
        .mapPartition(runner)
        .name(operatorID.toHexString());

    if (operator instanceof BroadcastStateBootstrapOperator) {
        return states.setParallelism(1);
    }

    // Only lower the parallelism when it exceeds the allowed maximum.
    if (getParallelism(states) > localMaxParallelism) {
        states.setParallelism(localMaxParallelism);
    }
    return states;
}
Example #4
Source File: DataSetUtils.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/** * Generate a sample of DataSet which contains fixed size elements. * * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with * fraction unless you need exact precision. * * @param withReplacement Whether element can be selected more than once. * @param numSamples The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ public static <T> DataSet<T> sampleWithSize( DataSet <T> input, final boolean withReplacement, final int numSamples, final long seed) { SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed); MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition); // There is no previous group, so the parallelism of GroupReduceOperator is always 1. String callLocation = Utils.getCallLocationName(); SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed); return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation); }
Example #5
Source File: SampleITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Samples the source data set by fraction, collects the result and checks it against the source strings
 * via {@code containsResultAsText}.
 *
 * @param withReplacement whether sampling is done with replacement.
 * @param fraction sampling fraction passed to {@code DataSetUtils.sample}.
 * @param seed random seed passed to {@code DataSetUtils.sample}.
 * @throws Exception if collecting the sampled data set fails.
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
    final FlatMapOperator<Tuple3<Integer, Long, String>, String> source =
        getSourceDataSet(ExecutionEnvironment.getExecutionEnvironment());

    final List<String> sampledRecords =
        DataSetUtils.sample(source, withReplacement, fraction, seed).collect();

    containsResultAsText(sampledRecords, getSourceStrings());
}
Example #6
Source File: DataSetUtils.java From flink with Apache License 2.0 | 5 votes |
/** * Generate a sample of DataSet which contains fixed size elements. * * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with * fraction unless you need exact precision. * * @param withReplacement Whether element can be selected more than once. * @param numSamples The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ public static <T> DataSet<T> sampleWithSize( DataSet <T> input, final boolean withReplacement, final int numSamples, final long seed) { SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed); MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition); // There is no previous group, so the parallelism of GroupReduceOperator is always 1. String callLocation = Utils.getCallLocationName(); SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed); return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation); }
Example #7
Source File: SampleITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Samples the source data set by fraction, collects the result and checks it against the source strings
 * via {@code containsResultAsText}.
 *
 * @param withReplacement whether sampling is done with replacement.
 * @param fraction sampling fraction passed to {@code DataSetUtils.sample}.
 * @param seed random seed passed to {@code DataSetUtils.sample}.
 * @throws Exception if collecting the sampled data set fails.
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
    final FlatMapOperator<Tuple3<Integer, Long, String>, String> source =
        getSourceDataSet(ExecutionEnvironment.getExecutionEnvironment());

    final List<String> sampledRecords =
        DataSetUtils.sample(source, withReplacement, fraction, seed).collect();

    containsResultAsText(sampledRecords, getSourceStrings());
}
Example #8
Source File: DataSetUtils.java From flink with Apache License 2.0 | 5 votes |
/** * Generate a sample of DataSet which contains fixed size elements. * * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with * fraction unless you need exact precision. * * @param withReplacement Whether element can be selected more than once. * @param numSamples The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ public static <T> DataSet<T> sampleWithSize( DataSet <T> input, final boolean withReplacement, final int numSamples, final long seed) { SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed); MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition); // There is no previous group, so the parallelism of GroupReduceOperator is always 1. String callLocation = Utils.getCallLocationName(); SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed); return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation); }
Example #9
Source File: BootstrapTransformation.java From flink with Apache License 2.0 | 5 votes |
/**
 * Wires the bootstrap operator onto the data set and returns the operator producing the tagged
 * subtask states.
 *
 * <p>Steps, in order: optionally custom-partition the input (only when {@code originalKeySelector} is
 * set), create and clean the stream operator, build its {@link StreamConfig}, and run it through a
 * {@link BoundedOneInputStreamTaskRunner} inside {@code mapPartition}. The resulting operator is named
 * after the hex form of {@code operatorID}.
 *
 * @param operatorID id of the operator; its hex string becomes the operator name.
 * @param stateBackend state backend passed to {@code getConfig}.
 * @param savepointPath path handed to the operator factory.
 * @param localMaxParallelism max parallelism used for partitioning and as the parallelism cap.
 * @return the map-partition operator, with parallelism 1 for broadcast-state bootstrap operators and
 *     otherwise capped at {@code localMaxParallelism}.
 */
@VisibleForTesting
MapPartitionOperator<T, TaggedOperatorSubtaskState> writeOperatorSubtaskStates(
        OperatorID operatorID,
        StateBackend stateBackend,
        Path savepointPath,
        int localMaxParallelism) {

    final DataSet<T> source = originalKeySelector == null
        ? dataSet
        : dataSet.partitionCustom(new KeyGroupRangePartitioner(localMaxParallelism), hashKeySelector);

    final StreamOperator<TaggedOperatorSubtaskState> created =
        factory.createOperator(System.currentTimeMillis(), savepointPath);
    final StreamOperator<TaggedOperatorSubtaskState> operator = dataSet.clean(created);

    final StreamConfig streamConfig = getConfig(operatorID, stateBackend, operator);
    final BoundedOneInputStreamTaskRunner<T> runner =
        new BoundedOneInputStreamTaskRunner<>(streamConfig, localMaxParallelism);

    final MapPartitionOperator<T, TaggedOperatorSubtaskState> states = source
        .mapPartition(runner)
        .name(operatorID.toHexString());

    if (operator instanceof BroadcastStateBootstrapOperator) {
        return states.setParallelism(1);
    }

    // Only lower the parallelism when it exceeds the allowed maximum.
    if (getParallelism(states) > localMaxParallelism) {
        states.setParallelism(localMaxParallelism);
    }
    return states;
}
Example #10
Source File: BootstrapTransformation.java From flink with Apache License 2.0 | 5 votes |
/**
 * Resolves the effective parallelism of the given operator.
 *
 * @param subtaskStates the operator to inspect.
 * @return the operator's own parallelism, or the parallelism of its execution environment when the
 *     operator reports {@link ExecutionConfig#PARALLELISM_DEFAULT}.
 */
private static <T> int getParallelism(MapPartitionOperator<T, TaggedOperatorSubtaskState> subtaskStates) {
    final int configured = subtaskStates.getParallelism();
    return configured == ExecutionConfig.PARALLELISM_DEFAULT
        ? subtaskStates.getExecutionEnvironment().getParallelism()
        : configured;
}
Example #11
Source File: SampleITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Samples the source data set by fraction, collects the result and checks it against the source strings
 * via {@code containsResultAsText}.
 *
 * @param withReplacement whether sampling is done with replacement.
 * @param fraction sampling fraction passed to {@code DataSetUtils.sample}.
 * @param seed random seed passed to {@code DataSetUtils.sample}.
 * @throws Exception if collecting the sampled data set fails.
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
    final FlatMapOperator<Tuple3<Integer, Long, String>, String> source =
        getSourceDataSet(ExecutionEnvironment.getExecutionEnvironment());

    final List<String> sampledRecords =
        DataSetUtils.sample(source, withReplacement, fraction, seed).collect();

    containsResultAsText(sampledRecords, getSourceStrings());
}
Example #12
Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0 | 5 votes |
private static void transformSideInputs(List<PCollectionView<?>> sideInputs, MapPartitionOperator<?, ?> outputDataSet, FlinkBatchTranslationContext context) { // get corresponding Flink broadcast DataSets for(PCollectionView<?> input : sideInputs) { DataSet<?> broadcastSet = context.getSideInputDataSet(input); outputDataSet.withBroadcastSet(broadcastSet, input.getTagInternal().getId()); } }
Example #13
Source File: BaseRandomForestTrainBatchOp.java From Alink with Apache License 2.0 | 4 votes |
/**
 * Assembles the training pipeline on the given operator's data set.
 *
 * <p>The rows are mapped per partition by {@code SampleData}. When the subsampling ratio is greater
 * than 1.0, the summed per-partition element count is broadcast to that step as {@code "totalCnt"}.
 * The sampled tuples are then grouped on field 0, partitioned with {@code AvgPartition}, reduced by
 * {@code SeriesTrainFunction}, and finally reduced again by {@code SerializeModel}.
 *
 * @param in operator providing the training rows.
 * @return the data set produced by the {@code SerializeModel} group reduce.
 */
private DataSet<Row> seriesTrain(BatchOperator<?> in) {
    final DataSet<Row> trainRows = in.getDataSet();

    final SampleData sampler = new SampleData(
        get(HasSeed.SEED),
        get(HasSubsamplingRatio.SUBSAMPLING_RATIO),
        get(HasNumTreesDefaltAs10.NUM_TREES)
    );
    MapPartitionOperator<Row, Tuple2<Integer, Row>> sampled = trainRows.mapPartition(sampler);

    if (getParams().get(HasSubsamplingRatio.SUBSAMPLING_RATIO) > 1.0) {
        // Keeps only the count field (f1) of the summed tuple.
        final MapFunction<Tuple2<Integer, Long>, Long> takeCount =
            new MapFunction<Tuple2<Integer, Long>, Long>() {
                @Override
                public Long map(Tuple2<Integer, Long> value) throws Exception {
                    return value.f1;
                }
            };

        final DataSet<Long> totalCount = DataSetUtils
            .countElementsPerPartition(trainRows)
            .sum(1)
            .map(takeCount);

        sampled = sampled.withBroadcastSet(totalCount, "totalCnt");
    }

    // Maps each label array to its length.
    final MapFunction<Object[], Integer> arrayLength = new MapFunction<Object[], Integer>() {
        @Override
        public Integer map(Object[] objects) throws Exception {
            return objects.length;
        }
    };
    final DataSet<Integer> labelSize = labels.map(arrayLength);

    final DataSet<Tuple2<Integer, String>> partialModel = sampled
        .groupBy(0)
        .withPartitioner(new AvgPartition())
        .reduceGroup(new SeriesTrainFunction(getParams()))
        .withBroadcastSet(stringIndexerModel.getDataSet(), "stringIndexerModel")
        .withBroadcastSet(labelSize, "labelSize");

    return partialModel
        .reduceGroup(new SerializeModel(getParams()))
        .withBroadcastSet(stringIndexerModel.getDataSet(), "stringIndexerModel")
        .withBroadcastSet(labels, "labels");
}
Example #14
Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0 | 4 votes |
@Override public void translateNode(ParDo.BoundMulti<IN, OUT> transform, FlinkBatchTranslationContext context) { DataSet<IN> inputDataSet = context.getInputDataSet(context.getInput(transform)); final DoFn<IN, OUT> doFn = transform.getFn(); Map<TupleTag<?>, PCollection<?>> outputs = context.getOutput(transform).getAll(); Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap(); // put the main output at index 0, FlinkMultiOutputDoFnFunction also expects this outputMap.put(transform.getMainOutputTag(), 0); int count = 1; for (TupleTag<?> tag: outputs.keySet()) { if (!outputMap.containsKey(tag)) { outputMap.put(tag, count++); } } // collect all output Coders and create a UnionCoder for our tagged outputs List<Coder<?>> outputCoders = Lists.newArrayList(); for (PCollection<?> coll: outputs.values()) { outputCoders.add(coll.getCoder()); } UnionCoder unionCoder = UnionCoder.of(outputCoders); @SuppressWarnings("unchecked") TypeInformation<RawUnionValue> typeInformation = new CoderTypeInformation<>(unionCoder); @SuppressWarnings("unchecked") FlinkMultiOutputDoFnFunction<IN, OUT> doFnWrapper = new FlinkMultiOutputDoFnFunction(doFn, context.getPipelineOptions(), outputMap); MapPartitionOperator<IN, RawUnionValue> outputDataSet = new MapPartitionOperator<>(inputDataSet, typeInformation, doFnWrapper, transform.getName()); transformSideInputs(transform.getSideInputs(), outputDataSet, context); for (Map.Entry<TupleTag<?>, PCollection<?>> output: outputs.entrySet()) { TypeInformation<Object> outputType = context.getTypeInfo(output.getValue()); int outputTag = outputMap.get(output.getKey()); FlinkMultiOutputPruningFunction<Object> pruningFunction = new FlinkMultiOutputPruningFunction<>(outputTag); FlatMapOperator<RawUnionValue, Object> pruningOperator = new FlatMapOperator<>(outputDataSet, outputType, pruningFunction, output.getValue().getName()); context.setOutputDataSet(output.getValue(), pruningOperator); } }
Example #15
Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0 | 3 votes |
/**
 * Translates a single-output {@code ParDo} into a map-partition operator.
 *
 * <p>The transform's {@code DoFn} is wrapped in a {@code FlinkDoFnFunction}, side inputs are attached
 * through {@code transformSideInputs}, and the resulting operator is registered with the context as
 * the transform's output data set.
 *
 * @param transform the transform to translate.
 * @param context translation context supplying input, output, type information and pipeline options.
 */
@Override
public void translateNode(ParDo.Bound<IN, OUT> transform, FlinkBatchTranslationContext context) {
    DataSet<IN> source = context.getInputDataSet(context.getInput(transform));
    final DoFn<IN, OUT> userFn = transform.getFn();
    TypeInformation<OUT> resultType = context.getTypeInfo(context.getOutput(transform));

    FlinkDoFnFunction<IN, OUT> wrappedFn =
        new FlinkDoFnFunction<IN, OUT>(userFn, context.getPipelineOptions());
    MapPartitionOperator<IN, OUT> result =
        new MapPartitionOperator<IN, OUT>(source, resultType, wrappedFn, transform.getName());

    transformSideInputs(transform.getSideInputs(), result, context);
    context.setOutputDataSet(context.getOutput(transform), result);
}
Example #16
Source File: DataSetUtils.java From flink with Apache License 2.0 | 3 votes |
/**
 * Samples a DataSet, choosing each element with the given probability fraction.
 *
 * @param input The DataSet to sample from.
 * @param withReplacement Whether an element can be selected more than once.
 * @param fraction Probability that each element is chosen; should be in [0,1] without replacement
 *                 and in [0, ∞) with replacement. When fraction is larger than 1, elements are
 *                 expected to be selected multiple times into the sample on average.
 * @param seed Random number generator seed.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
        DataSet<T> input,
        final boolean withReplacement,
        final double fraction,
        final long seed) {

    final SampleWithFraction<T> sampler = new SampleWithFraction<T>(withReplacement, fraction, seed);
    return input.mapPartition(sampler);
}
Example #17
Source File: DataSetUtils.java From flink with Apache License 2.0 | 3 votes |
/**
 * Samples a DataSet, choosing each element with the given probability fraction. The seed is drawn
 * from {@code Utils.RNG}.
 *
 * @param input The DataSet to sample from.
 * @param withReplacement Whether an element can be selected more than once.
 * @param fraction Probability that each element is chosen; should be in [0,1] without replacement
 *                 and in [0, ∞) with replacement. When fraction is larger than 1, elements are
 *                 expected to be selected multiple times into the sample on average.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
        DataSet<T> input,
        final boolean withReplacement,
        final double fraction) {

    final long randomSeed = Utils.RNG.nextLong();
    return sample(input, withReplacement, fraction, randomSeed);
}
Example #18
Source File: DataSetUtils.java From flink with Apache License 2.0 | 3 votes |
/**
 * Samples a DataSet, choosing each element with the given probability fraction.
 *
 * @param input The DataSet to sample from.
 * @param withReplacement Whether an element can be selected more than once.
 * @param fraction Probability that each element is chosen; should be in [0,1] without replacement
 *                 and in [0, ∞) with replacement. When fraction is larger than 1, elements are
 *                 expected to be selected multiple times into the sample on average.
 * @param seed Random number generator seed.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
        DataSet<T> input,
        final boolean withReplacement,
        final double fraction,
        final long seed) {

    final SampleWithFraction<T> sampler = new SampleWithFraction<T>(withReplacement, fraction, seed);
    return input.mapPartition(sampler);
}
Example #19
Source File: DataSetUtils.java From flink with Apache License 2.0 | 3 votes |
/**
 * Samples a DataSet, choosing each element with the given probability fraction. The seed is drawn
 * from {@code Utils.RNG}.
 *
 * @param input The DataSet to sample from.
 * @param withReplacement Whether an element can be selected more than once.
 * @param fraction Probability that each element is chosen; should be in [0,1] without replacement
 *                 and in [0, ∞) with replacement. When fraction is larger than 1, elements are
 *                 expected to be selected multiple times into the sample on average.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
        DataSet<T> input,
        final boolean withReplacement,
        final double fraction) {

    final long randomSeed = Utils.RNG.nextLong();
    return sample(input, withReplacement, fraction, randomSeed);
}
Example #20
Source File: DataSetUtils.java From Flink-CEPplus with Apache License 2.0 | 3 votes |
/**
 * Samples a DataSet, choosing each element with the given probability fraction.
 *
 * @param input The DataSet to sample from.
 * @param withReplacement Whether an element can be selected more than once.
 * @param fraction Probability that each element is chosen; should be in [0,1] without replacement
 *                 and in [0, ∞) with replacement. When fraction is larger than 1, elements are
 *                 expected to be selected multiple times into the sample on average.
 * @param seed Random number generator seed.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
        DataSet<T> input,
        final boolean withReplacement,
        final double fraction,
        final long seed) {

    final SampleWithFraction<T> sampler = new SampleWithFraction<T>(withReplacement, fraction, seed);
    return input.mapPartition(sampler);
}
Example #21
Source File: DataSetUtils.java From Flink-CEPplus with Apache License 2.0 | 3 votes |
/**
 * Samples a DataSet, choosing each element with the given probability fraction. The seed is drawn
 * from {@code Utils.RNG}.
 *
 * @param input The DataSet to sample from.
 * @param withReplacement Whether an element can be selected more than once.
 * @param fraction Probability that each element is chosen; should be in [0,1] without replacement
 *                 and in [0, ∞) with replacement. When fraction is larger than 1, elements are
 *                 expected to be selected multiple times into the sample on average.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
        DataSet<T> input,
        final boolean withReplacement,
        final double fraction) {

    final long randomSeed = Utils.RNG.nextLong();
    return sample(input, withReplacement, fraction, randomSeed);
}