org.apache.flink.api.java.operators.MapPartitionOperator Java Examples

Source File: BaseComQueue.java From Alink with Apache License 2.0

6 votes

/**
 * Builds the data set the loop starts from: a single-element source that is rebalanced and
 * fed through a map-partition step which emits no records.
 *
 * <p>When {@code cacheDataRel} is set, it is attached to that step as the broadcast set "rel".
 *
 * @param env execution environment used to create the single-element source.
 * @return the "iterInitialize" operator, with the broadcast set attached when available.
 */
private DataSet<byte[]> loopStartDataSet(ExecutionEnvironment env) {
	final MapPartitionFunction<Integer, byte[]> emitNothing = new MapPartitionFunction<Integer, byte[]>() {
		@Override
		public void mapPartition(Iterable<Integer> values, Collector<byte[]> out) {
			// intentionally empty: this step produces no records
		}
	};

	final MapPartitionOperator<Integer, byte[]> start = env
		.fromElements(1)
		.rebalance()
		.mapPartition(emitNothing)
		.name("iterInitialize");

	if (cacheDataRel == null) {
		return start;
	}
	return start.withBroadcastSet(cacheDataRel, "rel");
}

Source File: BootstrapTransformation.java From flink with Apache License 2.0

5 votes

/**
 * Resolves the effective parallelism of the given operator.
 *
 * @param subtaskStates operator whose parallelism is queried.
 * @return the operator's own parallelism, or the parallelism of its execution environment
 *         when the operator still carries {@code ExecutionConfig.PARALLELISM_DEFAULT}.
 */
private static <T> int getParallelism(MapPartitionOperator<T, TaggedOperatorSubtaskState> subtaskStates) {
	final int configured = subtaskStates.getParallelism();
	if (configured != ExecutionConfig.PARALLELISM_DEFAULT) {
		return configured;
	}

	return subtaskStates.getExecutionEnvironment().getParallelism();
}

Source File: BootstrapTransformation.java From flink with Apache License 2.0

5 votes

/**
 * Builds the map-partition operator that runs the bootstrap stream operator over the input
 * and produces {@link TaggedOperatorSubtaskState} records.
 *
 * <p>If {@code originalKeySelector} is set, the input is first custom-partitioned with a
 * {@link KeyGroupRangePartitioner}. The resulting operator is named after the hex form of
 * {@code operatorID}. Its parallelism is forced to 1 for a
 * {@link BroadcastStateBootstrapOperator}; otherwise it is lowered to
 * {@code localMaxParallelism} when the current parallelism exceeds it.
 *
 * @param operatorID          id used for the stream config and the operator name.
 * @param stateBackend        state backend passed to the stream config.
 * @param savepointPath       path handed to the operator factory.
 * @param localMaxParallelism max parallelism used for partitioning, the task runner and the cap.
 * @return the configured map-partition operator.
 */
@VisibleForTesting
MapPartitionOperator<T, TaggedOperatorSubtaskState> writeOperatorSubtaskStates(
	OperatorID operatorID,
	StateBackend stateBackend,
	Path savepointPath,
	int localMaxParallelism) {

	final DataSet<T> partitioned = originalKeySelector == null
		? dataSet
		: dataSet.partitionCustom(new KeyGroupRangePartitioner(localMaxParallelism), hashKeySelector);

	final StreamOperator<TaggedOperatorSubtaskState> operator = dataSet.clean(
		factory.createOperator(System.currentTimeMillis(), savepointPath));

	final StreamConfig config = getConfig(operatorID, stateBackend, operator);

	final MapPartitionOperator<T, TaggedOperatorSubtaskState> subtaskStates = partitioned
		.mapPartition(new BoundedOneInputStreamTaskRunner<T>(config, localMaxParallelism))
		.name(operatorID.toHexString());

	if (operator instanceof BroadcastStateBootstrapOperator) {
		return subtaskStates.setParallelism(1);
	}

	if (getParallelism(subtaskStates) > localMaxParallelism) {
		subtaskStates.setParallelism(localMaxParallelism);
	}
	return subtaskStates;
}

Source File: DataSetUtils.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Generate a sample of DataSet which contains fixed size elements.
 *
 * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with
 * fraction unless you need exact precision.
 *
 * <p>Each partition is sampled with a {@code SampleInPartition} function and the partial results are
 * combined by a {@code SampleInCoordinator} inside a {@code GroupReduceOperator}.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param numSamples      The expected sample size.
 * @param seed            Random number generator seed.
 * @param <T>             Element type of the DataSet.
 * @return The sampled DataSet
 */
public static <T> DataSet<T> sampleWithSize(
	DataSet <T> input,
	final boolean withReplacement,
	final int numSamples,
	final long seed) {

	SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed);

	// There is no previous group, so the parallelism of GroupReduceOperator is always 1.
	String callLocation = Utils.getCallLocationName();
	SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed);

	// The per-partition operator is passed straight into the constructor so that its element type
	// is inferred by the compiler rather than erased through a raw MapPartitionOperator local.
	return new GroupReduceOperator<>(
		input.mapPartition(sampleInPartition),
		input.getType(),
		sampleInCoordinator,
		callLocation);
}

Source File: SampleITCase.java From flink with Apache License 2.0

5 votes

/**
 * Samples the source data set by fraction, collects the result and checks it with
 * {@code containsResultAsText} against the source strings.
 *
 * @param withReplacement whether sampling is done with replacement.
 * @param fraction        sampling fraction passed to {@code DataSetUtils.sample}.
 * @param seed            random seed passed to {@code DataSetUtils.sample}.
 * @throws Exception if building or executing the job fails.
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
	final FlatMapOperator<Tuple3<Integer, Long, String>, String> source =
		getSourceDataSet(ExecutionEnvironment.getExecutionEnvironment());

	final List<String> sampledValues = DataSetUtils
		.sample(source, withReplacement, fraction, seed)
		.collect();

	containsResultAsText(sampledValues, getSourceStrings());
}

Source File: DataSetUtils.java From flink with Apache License 2.0

5 votes

/**
 * Generate a sample of DataSet which contains fixed size elements.
 *
 * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with
 * fraction unless you need exact precision.
 *
 * <p>Each partition is sampled with a {@code SampleInPartition} function and the partial results are
 * combined by a {@code SampleInCoordinator} inside a {@code GroupReduceOperator}.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param numSamples      The expected sample size.
 * @param seed            Random number generator seed.
 * @param <T>             Element type of the DataSet.
 * @return The sampled DataSet
 */
public static <T> DataSet<T> sampleWithSize(
	DataSet <T> input,
	final boolean withReplacement,
	final int numSamples,
	final long seed) {

	SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed);

	// There is no previous group, so the parallelism of GroupReduceOperator is always 1.
	String callLocation = Utils.getCallLocationName();
	SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed);

	// The per-partition operator is passed straight into the constructor so that its element type
	// is inferred by the compiler rather than erased through a raw MapPartitionOperator local.
	return new GroupReduceOperator<>(
		input.mapPartition(sampleInPartition),
		input.getType(),
		sampleInCoordinator,
		callLocation);
}

Source File: SampleITCase.java From flink with Apache License 2.0

5 votes

/**
 * Runs a fraction-based sample over the source data set and verifies the collected output
 * with {@code containsResultAsText} against the source strings.
 *
 * @param withReplacement whether sampling is done with replacement.
 * @param fraction        sampling fraction passed to {@code DataSetUtils.sample}.
 * @param seed            random seed passed to {@code DataSetUtils.sample}.
 * @throws Exception if building or executing the job fails.
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
	final ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
	final FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(environment);

	final MapPartitionOperator<String, String> sampleOperator =
		DataSetUtils.sample(source, withReplacement, fraction, seed);

	containsResultAsText(sampleOperator.collect(), getSourceStrings());
}

Source File: DataSetUtils.java From flink with Apache License 2.0

5 votes

/**
 * Generate a sample of DataSet which contains fixed size elements.
 *
 * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with
 * fraction unless you need exact precision.
 *
 * <p>Each partition is sampled with a {@code SampleInPartition} function and the partial results are
 * combined by a {@code SampleInCoordinator} inside a {@code GroupReduceOperator}.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param numSamples      The expected sample size.
 * @param seed            Random number generator seed.
 * @param <T>             Element type of the DataSet.
 * @return The sampled DataSet
 */
public static <T> DataSet<T> sampleWithSize(
	DataSet <T> input,
	final boolean withReplacement,
	final int numSamples,
	final long seed) {

	SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed);

	// There is no previous group, so the parallelism of GroupReduceOperator is always 1.
	String callLocation = Utils.getCallLocationName();
	SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed);

	// The per-partition operator is passed straight into the constructor so that its element type
	// is inferred by the compiler rather than erased through a raw MapPartitionOperator local.
	return new GroupReduceOperator<>(
		input.mapPartition(sampleInPartition),
		input.getType(),
		sampleInCoordinator,
		callLocation);
}

Source File: BootstrapTransformation.java From flink with Apache License 2.0

5 votes

/**
 * Creates the map-partition operator that executes the bootstrap stream operator on the input
 * and emits {@link TaggedOperatorSubtaskState} records.
 *
 * <p>The input is custom-partitioned with a {@link KeyGroupRangePartitioner} when
 * {@code originalKeySelector} is set. The operator is named with the hex string of
 * {@code operatorID}. A {@link BroadcastStateBootstrapOperator} always runs with parallelism 1;
 * any other operator is capped at {@code localMaxParallelism}.
 *
 * @param operatorID          id used for the stream config and the operator name.
 * @param stateBackend        state backend passed to the stream config.
 * @param savepointPath       path handed to the operator factory.
 * @param localMaxParallelism max parallelism used for partitioning, the task runner and the cap.
 * @return the configured map-partition operator.
 */
@VisibleForTesting
MapPartitionOperator<T, TaggedOperatorSubtaskState> writeOperatorSubtaskStates(
	OperatorID operatorID,
	StateBackend stateBackend,
	Path savepointPath,
	int localMaxParallelism) {

	DataSet<T> source = dataSet;
	if (originalKeySelector != null) {
		final KeyGroupRangePartitioner partitioner = new KeyGroupRangePartitioner(localMaxParallelism);
		source = dataSet.partitionCustom(partitioner, hashKeySelector);
	}

	final long timestamp = System.currentTimeMillis();
	StreamOperator<TaggedOperatorSubtaskState> bootstrapOperator = factory.createOperator(timestamp, savepointPath);
	bootstrapOperator = dataSet.clean(bootstrapOperator);

	final StreamConfig streamConfig = getConfig(operatorID, stateBackend, bootstrapOperator);
	final BoundedOneInputStreamTaskRunner<T> runner =
		new BoundedOneInputStreamTaskRunner<>(streamConfig, localMaxParallelism);

	MapPartitionOperator<T, TaggedOperatorSubtaskState> result = source.mapPartition(runner);
	result = result.name(operatorID.toHexString());

	if (bootstrapOperator instanceof BroadcastStateBootstrapOperator) {
		result = result.setParallelism(1);
	} else if (getParallelism(result) > localMaxParallelism) {
		result.setParallelism(localMaxParallelism);
	}
	return result;
}

Source File: BootstrapTransformation.java From flink with Apache License 2.0

5 votes

/**
 * Returns the parallelism to assume for the given operator.
 *
 * @param subtaskStates operator whose parallelism is queried.
 * @return the execution environment's parallelism if the operator is still set to
 *         {@code ExecutionConfig.PARALLELISM_DEFAULT}, otherwise the operator's own value.
 */
private static <T> int getParallelism(MapPartitionOperator<T, TaggedOperatorSubtaskState> subtaskStates) {
	final int own = subtaskStates.getParallelism();

	return own == ExecutionConfig.PARALLELISM_DEFAULT
		? subtaskStates.getExecutionEnvironment().getParallelism()
		: own;
}

Source File: SampleITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Draws a fraction-based sample from the source data set and checks the collected elements
 * with {@code containsResultAsText} against the source strings.
 *
 * @param withReplacement whether sampling is done with replacement.
 * @param fraction        sampling fraction passed to {@code DataSetUtils.sample}.
 * @param seed            random seed passed to {@code DataSetUtils.sample}.
 * @throws Exception if building or executing the job fails.
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
	final FlatMapOperator<Tuple3<Integer, Long, String>, String> input =
		getSourceDataSet(ExecutionEnvironment.getExecutionEnvironment());
	final MapPartitionOperator<String, String> sampleOp = DataSetUtils.sample(input, withReplacement, fraction, seed);

	final List<String> collected = sampleOp.collect();
	containsResultAsText(collected, getSourceStrings());
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Registers every side input as a broadcast set on the given operator.
 *
 * <p>For each view, the matching Flink DataSet is looked up in the translation context and
 * attached under the id of the view's internal tag.
 *
 * @param sideInputs    side-input views to register.
 * @param outputDataSet operator that receives the broadcast sets.
 * @param context       translation context providing the side-input DataSets.
 */
private static void transformSideInputs(List<PCollectionView<?>> sideInputs,
                                        MapPartitionOperator<?, ?> outputDataSet,
                                        FlinkBatchTranslationContext context) {
	for (PCollectionView<?> sideInput : sideInputs) {
		outputDataSet.withBroadcastSet(
				context.getSideInputDataSet(sideInput),
				sideInput.getTagInternal().getId());
	}
}

Source File: BaseRandomForestTrainBatchOp.java From Alink with Apache License 2.0

4 votes

/**
 * Builds the training pipeline: samples the input rows, trains per group with
 * {@code SeriesTrainFunction} and serializes the result with {@code SerializeModel}.
 *
 * <p>When the subsampling ratio is greater than 1.0, the total element count of the input is
 * broadcast to the sampling step as "totalCnt".
 *
 * @param in operator providing the training rows.
 * @return the data set produced by the final {@code SerializeModel} group reduce.
 */
private DataSet<Row> seriesTrain(BatchOperator<?> in) {
	final DataSet<Row> rows = in.getDataSet();

	final SampleData sampler = new SampleData(
		get(HasSeed.SEED),
		get(HasSubsamplingRatio.SUBSAMPLING_RATIO),
		get(HasNumTreesDefaltAs10.NUM_TREES)
	);
	MapPartitionOperator<Row, Tuple2<Integer, Row>> sampledRows = rows.mapPartition(sampler);

	final boolean ratioAboveOne = getParams().get(HasSubsamplingRatio.SUBSAMPLING_RATIO) > 1.0;
	if (ratioAboveOne) {
		// sum the per-partition counts and keep only the count field
		final DataSet<Long> totalCount = DataSetUtils
			.countElementsPerPartition(rows)
			.sum(1)
			.map(new MapFunction<Tuple2<Integer, Long>, Long>() {
				@Override
				public Long map(Tuple2<Integer, Long> partitionCount) throws Exception {
					return partitionCount.f1;
				}
			});

		sampledRows = sampledRows.withBroadcastSet(totalCount, "totalCnt");
	}

	final MapFunction<Object[], Integer> arrayLength = new MapFunction<Object[], Integer>() {
		@Override
		public Integer map(Object[] labelValues) throws Exception {
			return labelValues.length;
		}
	};
	final DataSet<Integer> labelCount = labels.map(arrayLength);

	final DataSet<Tuple2<Integer, String>> trained = sampledRows
		.groupBy(0)
		.withPartitioner(new AvgPartition())
		.reduceGroup(new SeriesTrainFunction(getParams()))
		.withBroadcastSet(stringIndexerModel.getDataSet(), "stringIndexerModel")
		.withBroadcastSet(labelCount, "labelSize");

	return trained
		.reduceGroup(new SerializeModel(getParams()))
		.withBroadcastSet(stringIndexerModel.getDataSet(), "stringIndexerModel")
		.withBroadcastSet(labels, "labels");
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Translates a multi-output ParDo into a Flink {@link MapPartitionOperator} that emits
 * {@link RawUnionValue}s, followed by one pruning {@link FlatMapOperator} per output.
 *
 * <p>Every output tag is assigned a union index (main output at 0). The coders handed to the
 * {@link UnionCoder} are collected in that same index order, so the coder at position i is the
 * coder of the output tagged i.
 *
 * @param transform the multi-output ParDo to translate.
 * @param context   translation context providing input/output DataSets and type information.
 */
@Override
public void translateNode(ParDo.BoundMulti<IN, OUT> transform, FlinkBatchTranslationContext context) {
	DataSet<IN> inputDataSet = context.getInputDataSet(context.getInput(transform));

	final DoFn<IN, OUT> doFn = transform.getFn();

	Map<TupleTag<?>, PCollection<?>> outputs = context.getOutput(transform).getAll();

	Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap();
	// tags in union-index order: position i holds the tag that outputMap maps to i
	List<TupleTag<?>> tagsByIndex = Lists.newArrayList();
	// put the main output at index 0, FlinkMultiOutputDoFnFunction also expects this
	outputMap.put(transform.getMainOutputTag(), 0);
	tagsByIndex.add(transform.getMainOutputTag());
	int count = 1;
	for (TupleTag<?> tag: outputs.keySet()) {
		if (!outputMap.containsKey(tag)) {
			outputMap.put(tag, count++);
			tagsByIndex.add(tag);
		}
	}

	// collect all output Coders and create a UnionCoder for our tagged outputs;
	// iterate in index order (not outputs.values() order) so coder i matches output index i
	List<Coder<?>> outputCoders = Lists.newArrayList();
	for (TupleTag<?> tag: tagsByIndex) {
		outputCoders.add(outputs.get(tag).getCoder());
	}

	UnionCoder unionCoder = UnionCoder.of(outputCoders);

	@SuppressWarnings("unchecked")
	TypeInformation<RawUnionValue> typeInformation = new CoderTypeInformation<>(unionCoder);

	@SuppressWarnings("unchecked")
	FlinkMultiOutputDoFnFunction<IN, OUT> doFnWrapper = new FlinkMultiOutputDoFnFunction(doFn, context.getPipelineOptions(), outputMap);
	MapPartitionOperator<IN, RawUnionValue> outputDataSet = new MapPartitionOperator<>(inputDataSet, typeInformation, doFnWrapper, transform.getName());

	transformSideInputs(transform.getSideInputs(), outputDataSet, context);

	// split the union stream back into one DataSet per output by pruning on the union index
	for (Map.Entry<TupleTag<?>, PCollection<?>> output: outputs.entrySet()) {
		TypeInformation<Object> outputType = context.getTypeInfo(output.getValue());
		int outputTag = outputMap.get(output.getKey());
		FlinkMultiOutputPruningFunction<Object> pruningFunction = new FlinkMultiOutputPruningFunction<>(outputTag);
		FlatMapOperator<RawUnionValue, Object> pruningOperator = new
				FlatMapOperator<>(outputDataSet, outputType,
				pruningFunction, output.getValue().getName());
		context.setOutputDataSet(output.getValue(), pruningOperator);

	}
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

3 votes

/**
 * Translates a single-output ParDo into a Flink {@link MapPartitionOperator} wrapping the
 * transform's {@link DoFn}, registers its side inputs as broadcast sets and publishes the
 * operator as the transform's output DataSet.
 *
 * @param transform the ParDo to translate.
 * @param context   translation context providing input/output DataSets and type information.
 */
@Override
public void translateNode(ParDo.Bound<IN, OUT> transform, FlinkBatchTranslationContext context) {
	final DataSet<IN> source = context.getInputDataSet(context.getInput(transform));
	final DoFn<IN, OUT> fn = transform.getFn();
	final TypeInformation<OUT> resultType = context.getTypeInfo(context.getOutput(transform));

	final FlinkDoFnFunction<IN, OUT> wrappedFn = new FlinkDoFnFunction<>(fn, context.getPipelineOptions());
	final MapPartitionOperator<IN, OUT> result =
			new MapPartitionOperator<>(source, resultType, wrappedFn, transform.getName());

	transformSideInputs(transform.getSideInputs(), result, context);
	context.setOutputDataSet(context.getOutput(transform), result);
}

Source File: DataSetUtils.java From flink with Apache License 2.0

3 votes

/**
 * Samples a DataSet by choosing each element with the given probability fraction.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param fraction        Probability that each element is chosen, should be [0,1] without replacement,
 *                        and [0, ∞) with replacement. While fraction is larger than 1, the elements are
 *                        expected to be selected multi times into sample on average.
 * @param seed            Random number generator seed.
 * @param <T>             Element type of the DataSet.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
	DataSet <T> input,
	final boolean withReplacement,
	final double fraction,
	final long seed) {

	final SampleWithFraction<T> sampler = new SampleWithFraction<>(withReplacement, fraction, seed);
	return input.mapPartition(sampler);
}

Source File: DataSetUtils.java From flink with Apache License 2.0

3 votes

/**
 * Samples a DataSet by choosing each element with the given probability fraction, using a
 * seed drawn from {@code Utils.RNG}.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param fraction        Probability that each element is chosen, should be [0,1] without replacement,
 *                        and [0, ∞) with replacement. While fraction is larger than 1, the elements are
 *                        expected to be selected multi times into sample on average.
 * @param <T>             Element type of the DataSet.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
	DataSet <T> input,
	final boolean withReplacement,
	final double fraction) {

	final long randomSeed = Utils.RNG.nextLong();
	return sample(input, withReplacement, fraction, randomSeed);
}

Source File: DataSetUtils.java From flink with Apache License 2.0

3 votes

/**
 * Produces a sample of the DataSet in which every element is picked with probability
 * {@code fraction}.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param fraction        Probability that each element is chosen, should be [0,1] without replacement,
 *                        and [0, ∞) with replacement. While fraction is larger than 1, the elements are
 *                        expected to be selected multi times into sample on average.
 * @param seed            Random number generator seed.
 * @param <T>             Element type of the DataSet.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
	DataSet <T> input,
	final boolean withReplacement,
	final double fraction,
	final long seed) {

	final SampleWithFraction<T> fractionSampler =
		new SampleWithFraction<>(withReplacement, fraction, seed);

	return input.mapPartition(fractionSampler);
}

Source File: DataSetUtils.java From flink with Apache License 2.0

3 votes

/**
 * Produces a sample of the DataSet in which every element is picked with probability
 * {@code fraction}. The seed is taken from {@code Utils.RNG}.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param fraction        Probability that each element is chosen, should be [0,1] without replacement,
 *                        and [0, ∞) with replacement. While fraction is larger than 1, the elements are
 *                        expected to be selected multi times into sample on average.
 * @param <T>             Element type of the DataSet.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
	DataSet <T> input,
	final boolean withReplacement,
	final double fraction) {

	final long generatedSeed = Utils.RNG.nextLong();

	return sample(input, withReplacement, fraction, generatedSeed);
}

Source File: DataSetUtils.java From Flink-CEPplus with Apache License 2.0

3 votes

/**
 * Draws a sample from the DataSet, selecting each element according to the probability
 * {@code fraction}.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param fraction        Probability that each element is chosen, should be [0,1] without replacement,
 *                        and [0, ∞) with replacement. While fraction is larger than 1, the elements are
 *                        expected to be selected multi times into sample on average.
 * @param seed            Random number generator seed.
 * @param <T>             Element type of the DataSet.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
	DataSet <T> input,
	final boolean withReplacement,
	final double fraction,
	final long seed) {

	final SampleWithFraction<T> samplingFunction = new SampleWithFraction<>(withReplacement, fraction, seed);
	final MapPartitionOperator<T, T> sampled = input.mapPartition(samplingFunction);
	return sampled;
}

Source File: DataSetUtils.java From Flink-CEPplus with Apache License 2.0

3 votes

/**
 * Draws a sample from the DataSet, selecting each element according to the probability
 * {@code fraction}. A seed obtained from {@code Utils.RNG} is used.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param fraction        Probability that each element is chosen, should be [0,1] without replacement,
 *                        and [0, ∞) with replacement. While fraction is larger than 1, the elements are
 *                        expected to be selected multi times into sample on average.
 * @param <T>             Element type of the DataSet.
 * @return The sampled DataSet
 */
public static <T> MapPartitionOperator<T, T> sample(
	DataSet <T> input,
	final boolean withReplacement,
	final double fraction) {

	final long seed = Utils.RNG.nextLong();
	final MapPartitionOperator<T, T> sampled = sample(input, withReplacement, fraction, seed);
	return sampled;
}

org.apache.flink.api.java.operators.MapPartitionOperator Java Examples