Java Code Examples for org.apache.flink.api.java.DataSet#getType()
The following examples show how to use
org.apache.flink.api.java.DataSet#getType() .
You can go to the original project or source file by following the links above each example.
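DataSet#getType() returns the TypeInformation that describes the elements of a DataSet; as the examples below show, operators typically call it to validate their input (for instance, checking isTupleType()) and to propagate the result type to downstream operators. As a primer, here is a minimal, self-contained sketch of calling it directly (the class name and sample data are made up for illustration; several snippets after the examples below reuse the words DataSet declared here):

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;

public class GetTypeExample {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Tuple2<String, Integer>> words = env.fromElements(
            Tuple2.of("flink", 1), Tuple2.of("dataset", 2));

        // TypeInformation captures arity, field types, and serializers.
        TypeInformation<Tuple2<String, Integer>> type = words.getType();
        System.out.println(type.isTupleType()); // true
        System.out.println(type.getArity());    // 2
    }
}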
Example 1
Source File: AggregateOperator.java From flink with Apache License 2.0
/**
 * Non grouped aggregation.
 */
public AggregateOperator(DataSet<IN> input, Aggregations function, int field, String aggregateLocationName) {
    super(Preconditions.checkNotNull(input), input.getType());
    Preconditions.checkNotNull(function);

    this.aggregateLocationName = aggregateLocationName;

    if (!input.getType().isTupleType()) {
        throw new InvalidProgramException("Aggregating on field positions is only possible on tuple data types.");
    }

    TupleTypeInfoBase<?> inType = (TupleTypeInfoBase<?>) input.getType();

    if (field < 0 || field >= inType.getArity()) {
        throw new IllegalArgumentException("Aggregation field position is out of range.");
    }

    AggregationFunctionFactory factory = function.getFactory();
    AggregationFunction<?> aggFunct = factory.createAggregationFunction(inType.getTypeAt(field).getTypeClass());

    // this is the first aggregation operator after a regular data set (non grouped aggregation)
    this.aggregationFunctions.add(aggFunct);
    this.fields.add(field);
    this.grouping = null;
}
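User code does not call this constructor directly; it is reached through DataSet#aggregate (or the sum/min/max shortcuts), which hand over the input together with its type. A hedged usage sketch, reusing the words DataSet from the intro and assuming an import of org.apache.flink.api.java.aggregation.Aggregations:

// aggregate() builds an AggregateOperator, which calls input.getType()
// to verify that the input is a tuple type before aggregating field 1.
AggregateOperator<Tuple2<String, Integer>> summed = words.aggregate(Aggregations.SUM, 1);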
Example 2
Source File: DataSetUtils.java From flink with Apache License 2.0
/**
 * Summarize a DataSet of Tuples by collecting single pass statistics for all columns.
 *
 * <p>Example usage:
 * <pre>
 * {@code
 * DataSet<Tuple3<Double, String, Boolean>> input = // [...]
 * Tuple3<NumericColumnSummary, StringColumnSummary, BooleanColumnSummary> summary = DataSetUtils.summarize(input);
 *
 * summary.f0.getStandardDeviation();
 * summary.f1.getMaxLength();
 * }
 * </pre>
 *
 * @return the summary as a Tuple the same width as input rows
 */
public static <R extends Tuple, T extends Tuple> R summarize(DataSet<T> input) throws Exception {
    if (!input.getType().isTupleType()) {
        throw new IllegalArgumentException("summarize() is only implemented for DataSets of Tuples");
    }
    final TupleTypeInfoBase<?> inType = (TupleTypeInfoBase<?>) input.getType();
    DataSet<TupleSummaryAggregator<R>> result = input.mapPartition(new MapPartitionFunction<T, TupleSummaryAggregator<R>>() {
        @Override
        public void mapPartition(Iterable<T> values, Collector<TupleSummaryAggregator<R>> out) throws Exception {
            TupleSummaryAggregator<R> aggregator = SummaryAggregatorFactory.create(inType);
            for (Tuple value : values) {
                aggregator.aggregate(value);
            }
            out.collect(aggregator);
        }
    }).reduce(new ReduceFunction<TupleSummaryAggregator<R>>() {
        @Override
        public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> agg1, TupleSummaryAggregator<R> agg2) throws Exception {
            agg1.combine(agg2);
            return agg1;
        }
    });
    return result.collect().get(0).result();
}
Example 3
Source File: ProjectOperator.java From flink with Apache License 2.0
public Projection(DataSet<T> ds, int[] fieldIndexes) {
    if (!(ds.getType() instanceof TupleTypeInfo)) {
        throw new UnsupportedOperationException("project() can only be applied to DataSets of Tuples.");
    }

    if (fieldIndexes.length == 0) {
        throw new IllegalArgumentException("project() needs to select at least one (1) field.");
    } else if (fieldIndexes.length > Tuple.MAX_ARITY - 1) {
        throw new IllegalArgumentException(
            "project() may select only up to (" + (Tuple.MAX_ARITY - 1) + ") fields.");
    }

    int maxFieldIndex = ds.getType().getArity();
    for (int fieldIndex : fieldIndexes) {
        Preconditions.checkElementIndex(fieldIndex, maxFieldIndex);
    }

    this.ds = ds;
    this.fieldIndexes = fieldIndexes;
}
Example 4
Source File: PartitionOperator.java From flink with Apache License 2.0
private <P> PartitionOperator(DataSet<T> input, PartitionMethod pMethod, Keys<T> pKeys, Partitioner<P> customPartitioner,
        TypeInformation<P> partitionerTypeInfo, DataDistribution distribution, String partitionLocationName) {
    super(input, input.getType());

    Preconditions.checkNotNull(pMethod);
    Preconditions.checkArgument(pKeys != null || pMethod == PartitionMethod.REBALANCE, "Partitioning requires keys");
    Preconditions.checkArgument(pMethod != PartitionMethod.CUSTOM || customPartitioner != null, "Custom partitioning requires a partitioner.");
    Preconditions.checkArgument(distribution == null || pMethod == PartitionMethod.RANGE, "Customized data distribution is only necessary for range partitioning.");

    if (distribution != null) {
        Preconditions.checkArgument(pKeys.getNumberOfKeyFields() <= distribution.getNumberOfFields(),
            "The distribution must provide at least as many fields as flat key fields are specified.");
        Preconditions.checkArgument(Arrays.equals(pKeys.getKeyFieldTypes(), Arrays.copyOfRange(distribution.getKeyTypes(), 0, pKeys.getNumberOfKeyFields())),
            "The types of the flat key fields must be equal to the types of the fields of the distribution.");
    }

    if (customPartitioner != null) {
        pKeys.validateCustomPartitioner(customPartitioner, partitionerTypeInfo);
    }

    this.pMethod = pMethod;
    this.pKeys = pKeys;
    this.partitionLocationName = partitionLocationName;
    this.customPartitioner = customPartitioner;
    this.distribution = distribution;
}
Example 5
Source File: PythonPlanBinder.java From Flink-CEPplus with Apache License 2.0
private <IN1, IN2, OUT> void createCoGroupOperation(PythonOperationInfo info, TypeInformation<OUT> type) {
    DataSet<IN1> op1 = sets.getDataSet(info.parentID);
    DataSet<IN2> op2 = sets.getDataSet(info.otherID);
    Keys.ExpressionKeys<IN1> key1 = new Keys.ExpressionKeys<>(info.keys1.toArray(new String[info.keys1.size()]), op1.getType());
    Keys.ExpressionKeys<IN2> key2 = new Keys.ExpressionKeys<>(info.keys2.toArray(new String[info.keys2.size()]), op2.getType());
    PythonCoGroup<IN1, IN2, OUT> pcg = new PythonCoGroup<>(operatorConfig, info.envID, info.setID, type);
    sets.add(info.setID, new CoGroupRawOperator<>(op1, op2, key1, key2, pcg, type, info.name).setParallelism(info.parallelism));
}
Example 6
Source File: SortPartitionOperator.java From flink with Apache License 2.0
private SortPartitionOperator(DataSet<T> dataSet, String sortLocationName) {
    super(dataSet, dataSet.getType());

    keys = new ArrayList<>();
    orders = new ArrayList<>();
    this.sortLocationName = sortLocationName;
}
Example 7
Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0
@Override
public void translateNode(GroupByKey.GroupByKeyOnly<K, V> transform, FlinkBatchTranslationContext context) {
    DataSet<KV<K, V>> inputDataSet = context.getInputDataSet(context.getInput(transform));
    GroupReduceFunction<KV<K, V>, KV<K, Iterable<V>>> groupReduceFunction = new FlinkKeyedListAggregationFunction<>();

    TypeInformation<KV<K, Iterable<V>>> typeInformation = context.getTypeInfo(context.getOutput(transform));

    Grouping<KV<K, V>> grouping = new UnsortedGrouping<>(inputDataSet,
        new Keys.ExpressionKeys<>(new String[]{"key"}, inputDataSet.getType()));

    GroupReduceOperator<KV<K, V>, KV<K, Iterable<V>>> outputDataSet =
        new GroupReduceOperator<>(grouping, typeInformation, groupReduceFunction, transform.getName());
    context.setOutputDataSet(context.getOutput(transform), outputDataSet);
}
Example 8
Source File: DeltaIteration.java From flink with Apache License 2.0
public DeltaIteration(ExecutionEnvironment context, TypeInformation<ST> type, DataSet<ST> solutionSet, DataSet<WT> workset, Keys<ST> keys, int maxIterations) {
    initialSolutionSet = solutionSet;
    initialWorkset = workset;
    solutionSetPlaceholder = new SolutionSetPlaceHolder<>(context, solutionSet.getType(), this);
    worksetPlaceholder = new WorksetPlaceHolder<>(context, workset.getType());
    this.keys = keys;
    this.maxIterations = maxIterations;
}
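DeltaIteration instances are created through DataSet#iterateDelta, which forwards the solution set and workset (and thus their getType() results) to this constructor. A sketch under stated assumptions: initialState is a hypothetical Tuple2<Long, Double> DataSet, and delta and nextWorkset stand in for data sets that would be derived inside the iteration body:

// iterateDelta() calls getType() on both inputs to type the
// solution-set and workset placeholders created above.
DeltaIteration<Tuple2<Long, Double>, Tuple2<Long, Double>> iteration =
    initialState.iterateDelta(initialState, 100, 0); // max 100 supersteps, keyed on field 0

// delta and nextWorkset are assumed to be built from
// iteration.getSolutionSet() and iteration.getWorkset().
DataSet<Tuple2<Long, Double>> result = iteration.closeWith(delta, nextWorkset);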
Example 9
Source File: DistinctOperator.java From Flink-CEPplus with Apache License 2.0
public DistinctOperator(DataSet<T> input, Keys<T> keys, String distinctLocationName) {
    super(input, input.getType());

    this.distinctLocationName = distinctLocationName;

    // if keys is null, distinction is done on all fields
    if (keys == null) {
        keys = new Keys.ExpressionKeys<>(input.getType());
    }

    this.keys = keys;
}
Example 10
Source File: SortPartitionOperator.java From Flink-CEPplus with Apache License 2.0
private SortPartitionOperator(DataSet<T> dataSet, String sortLocationName) {
    super(dataSet, dataSet.getType());

    keys = new ArrayList<>();
    orders = new ArrayList<>();
    this.sortLocationName = sortLocationName;
}
Example 11
Source File: DataSetUtils.java From flink with Apache License 2.0
/**
 * Generate a sample of the DataSet with a fixed number of elements.
 *
 * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction; use sample with
 * fraction unless you need exact precision.
 *
 * @param withReplacement Whether an element can be selected more than once.
 * @param numSamples The expected sample size.
 * @param seed Random number generator seed.
 * @return The sampled DataSet
 */
public static <T> DataSet<T> sampleWithSize(
        DataSet<T> input,
        final boolean withReplacement,
        final int numSamples,
        final long seed) {
    SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed);
    MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition);

    // There is no previous group, so the parallelism of GroupReduceOperator is always 1.
    String callLocation = Utils.getCallLocationName();
    SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed);
    return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation);
}
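Note how input.getType() supplies the element type of the final GroupReduceOperator, so the sample keeps the input's exact type. A usage sketch, reusing the words DataSet from the intro and assuming an import of org.apache.flink.api.java.utils.DataSetUtils (sample size and seed are arbitrary):

// Draw up to 10 elements without replacement; the result carries the
// same TypeInformation as the input, courtesy of input.getType().
DataSet<Tuple2<String, Integer>> sample =
    DataSetUtils.sampleWithSize(words, false, 10, 4242L);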
Example 12
Source File: ReduceOperator.java From flink with Apache License 2.0
/**
 * This constructor is used for the reduce-all case (in contrast to the reduce-per-group case).
 *
 * @param input the input data set
 * @param function the reduce function to apply
 */
public ReduceOperator(DataSet<IN> input, ReduceFunction<IN> function, String defaultName) {
    super(input, input.getType());

    this.function = function;
    this.grouper = null;
    this.defaultName = defaultName;
    this.hint = null;
}
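As with the aggregation example above, this constructor is reached through the public API, here DataSet#reduce on an ungrouped DataSet. A short sketch reusing the words DataSet from the intro:

// reduce() on an ungrouped DataSet builds a ReduceOperator with
// grouper == null; the result type is taken from input.getType().
DataSet<Tuple2<String, Integer>> total =
    words.reduce((a, b) -> Tuple2.of(a.f0, a.f1 + b.f1));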
Example 13
Source File: DataSetUtils.java From flink with Apache License 2.0
/**
 * Range-partitions a DataSet on the specified fields.
 */
public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, String... fields) {
    return new PartitionOperator<>(input, PartitionOperatorBase.PartitionMethod.RANGE,
        new Keys.ExpressionKeys<>(fields, input.getType()), distribution, Utils.getCallLocationName());
}
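The expression keys are resolved against input.getType(), so the field names must exist in the input's type. A hedged sketch reusing the words DataSet from the intro; myDistribution stands in for a hypothetical DataDistribution implementation:

// Range-partition on the first tuple field; "f0" is resolved
// against words.getType() via Keys.ExpressionKeys.
PartitionOperator<Tuple2<String, Integer>> partitioned =
    DataSetUtils.partitionByRange(words, myDistribution, "f0");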
Example 14
Source File: FilterOperator.java From flink with Apache License 2.0
public FilterOperator(DataSet<T> input, FilterFunction<T> function, String defaultName) {
    super(input, input.getType());

    this.function = function;
    this.defaultName = defaultName;
}
Example 15
Source File: CrossOperator.java From flink with Apache License 2.0
public CrossProjection(DataSet<I1> ds1, DataSet<I2> ds2, int[] firstFieldIndexes, int[] secondFieldIndexes, CrossHint hint) {
    this.ds1 = ds1;
    this.ds2 = ds2;
    this.hint = hint;

    boolean isFirstTuple;
    boolean isSecondTuple;

    if (ds1.getType() instanceof TupleTypeInfo) {
        numFieldsDs1 = ((TupleTypeInfo<?>) ds1.getType()).getArity();
        isFirstTuple = true;
    } else {
        numFieldsDs1 = 1;
        isFirstTuple = false;
    }
    if (ds2.getType() instanceof TupleTypeInfo) {
        numFieldsDs2 = ((TupleTypeInfo<?>) ds2.getType()).getArity();
        isSecondTuple = true;
    } else {
        numFieldsDs2 = 1;
        isSecondTuple = false;
    }

    boolean isTuple;
    boolean firstInput;

    if (firstFieldIndexes != null && secondFieldIndexes == null) {
        // index array for first input is provided
        firstInput = true;
        isTuple = isFirstTuple;
        this.fieldIndexes = firstFieldIndexes;

        if (this.fieldIndexes.length == 0) {
            // no indexes provided, treat tuple as regular object
            isTuple = false;
        }
    } else if (firstFieldIndexes == null && secondFieldIndexes != null) {
        // index array for second input is provided
        firstInput = false;
        isTuple = isSecondTuple;
        this.fieldIndexes = secondFieldIndexes;

        if (this.fieldIndexes.length == 0) {
            // no indexes provided, treat tuple as regular object
            isTuple = false;
        }
    } else if (firstFieldIndexes == null && secondFieldIndexes == null) {
        throw new IllegalArgumentException("You must provide at least one field index array.");
    } else {
        throw new IllegalArgumentException("You must provide at most one field index array.");
    }

    if (!isTuple && this.fieldIndexes.length != 0) {
        // field index provided for non-Tuple input
        throw new IllegalArgumentException("Input is not a Tuple. Call projectFirst() (or projectSecond()) without arguments to include it.");
    } else if (this.fieldIndexes.length > 22) {
        throw new IllegalArgumentException("You may select only up to twenty-two (22) fields.");
    }

    if (isTuple) {
        this.isFieldInFirst = new boolean[this.fieldIndexes.length];

        // check field indexes and adapt to position in tuple
        int maxFieldIndex = firstInput ? numFieldsDs1 : numFieldsDs2;
        for (int i = 0; i < this.fieldIndexes.length; i++) {
            Preconditions.checkElementIndex(this.fieldIndexes[i], maxFieldIndex);
            if (firstInput) {
                this.isFieldInFirst[i] = true;
            } else {
                this.isFieldInFirst[i] = false;
            }
        }
    } else {
        this.isFieldInFirst = new boolean[]{firstInput};
        this.fieldIndexes = new int[]{-1};
    }
}
Example 16
Source File: DataSetUtils.java From flink with Apache License 2.0
/**
 * Range-partitions a DataSet using the specified key selector function.
 */
public static <T, K extends Comparable<K>> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, KeySelector<T, K> keyExtractor) {
    final TypeInformation<K> keyType = TypeExtractor.getKeySelectorTypes(keyExtractor, input.getType());
    return new PartitionOperator<>(input, PartitionOperatorBase.PartitionMethod.RANGE,
        new Keys.SelectorFunctionKeys<>(input.clean(keyExtractor), input.getType(), keyType),
        distribution, Utils.getCallLocationName());
}
Example 17
Source File: EdgeList.java From flink with Apache License 2.0
/**
 * Check whether the edge type of the {@link DataSet} is {@link NullValue}.
 *
 * @param edges data set for introspection
 * @param <T> graph ID type
 * @param <ET> edge value type
 * @return whether the edge type of the {@link DataSet} is {@link NullValue}
 */
private static <T, ET> boolean hasNullValueEdges(DataSet<Edge<T, ET>> edges) {
    TypeInformation<?> genericTypeInfo = edges.getType();
    @SuppressWarnings("unchecked")
    TupleTypeInfo<Tuple3<T, T, ET>> tupleTypeInfo = (TupleTypeInfo<Tuple3<T, T, ET>>) genericTypeInfo;

    return tupleTypeInfo.getTypeAt(2).equals(ValueTypeInfo.NULL_VALUE_TYPE_INFO);
}