org.apache.flink.api.java.utils.DataSetUtils Java Examples

Source File: DataSetUtilsITCase.java From flink with Apache License 2.0

6 votes

/**
 * Runs {@code DataSetUtils.zipWithUniqueId} over a sequence of 100 numbers and
 * checks that the first tuple fields form 100 distinct values.
 */
@Test
public void testZipWithUniqueId() throws Exception {
	final long expectedSize = 100L;
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Long> numbers = env.generateSequence(1L, expectedSize);

	// Projects every zipped pair onto its first field.
	MapFunction<Tuple2<Long, Long>, Long> firstField = new MapFunction<Tuple2<Long, Long>, Long>() {
		@Override
		public Long map(Tuple2<Long, Long> pair) throws Exception {
			return pair.f0;
		}
	};
	DataSet<Long> ids = DataSetUtils.zipWithUniqueId(numbers).map(firstField);

	// A set drops duplicates, so its size only matches when every id is distinct.
	Set<Long> distinctIds = new HashSet<>();
	distinctIds.addAll(ids.collect());

	Assert.assertEquals(expectedSize, distinctIds.size());
}

Source File: DataSetUtilsITCase.java From flink with Apache License 2.0

6 votes

/**
 * Runs {@code DataSetUtils.zipWithUniqueId} over a sequence of 100 numbers and
 * checks that the first tuple fields form 100 distinct values.
 */
@Test
public void testZipWithUniqueId() throws Exception {
	final long expectedSize = 100L;
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Long> numbers = env.generateSequence(1L, expectedSize);

	// Projects every zipped pair onto its first field.
	MapFunction<Tuple2<Long, Long>, Long> firstField = new MapFunction<Tuple2<Long, Long>, Long>() {
		@Override
		public Long map(Tuple2<Long, Long> pair) throws Exception {
			return pair.f0;
		}
	};
	DataSet<Long> ids = DataSetUtils.zipWithUniqueId(numbers).map(firstField);

	// A set drops duplicates, so its size only matches when every id is distinct.
	Set<Long> distinctIds = new HashSet<>();
	distinctIds.addAll(ids.collect());

	Assert.assertEquals(expectedSize, distinctIds.size());
}

Source File: DataSetUtilsITCase.java From flink with Apache License 2.0

6 votes

/**
 * Runs {@code DataSetUtils.zipWithIndex} over the sequence 0..99 and checks that
 * one tuple per element comes back and that the first tuple fields, once
 * sorted, are exactly 0, 1, ..., 99.
 */
@Test
public void testZipWithIndex() throws Exception {
	final long expectedSize = 100L;
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1);

	List<Tuple2<Long, Long>> indexed = new ArrayList<>(DataSetUtils.zipWithIndex(numbers).collect());
	Assert.assertEquals(expectedSize, indexed.size());

	// Order by the first tuple field so list position and index can be compared.
	Comparator<Tuple2<Long, Long>> byFirstField = new Comparator<Tuple2<Long, Long>>() {
		@Override
		public int compare(Tuple2<Long, Long> left, Tuple2<Long, Long> right) {
			return left.f0.compareTo(right.f0);
		}
	};
	Collections.sort(indexed, byFirstField);

	// After sorting, the i-th entry must carry index i, i.e. there are no gaps.
	int position = 0;
	for (Tuple2<Long, Long> entry : indexed) {
		Assert.assertEquals(position, entry.f0.longValue());
		position++;
	}
}

Source File: DataSetUtilsITCase.java From flink with Apache License 2.0

6 votes

/**
 * Runs {@code DataSetUtils.zipWithIndex} over the sequence 0..99 and checks that
 * one tuple per element comes back and that the first tuple fields, once
 * sorted, are exactly 0, 1, ..., 99.
 */
@Test
public void testZipWithIndex() throws Exception {
	final long expectedSize = 100L;
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1);

	List<Tuple2<Long, Long>> indexed = new ArrayList<>(DataSetUtils.zipWithIndex(numbers).collect());
	Assert.assertEquals(expectedSize, indexed.size());

	// Order by the first tuple field so list position and index can be compared.
	Comparator<Tuple2<Long, Long>> byFirstField = new Comparator<Tuple2<Long, Long>>() {
		@Override
		public int compare(Tuple2<Long, Long> left, Tuple2<Long, Long> right) {
			return left.f0.compareTo(right.f0);
		}
	};
	Collections.sort(indexed, byFirstField);

	// After sorting, the i-th entry must carry index i, i.e. there are no gaps.
	int position = 0;
	for (Tuple2<Long, Long> entry : indexed) {
		Assert.assertEquals(position, entry.f0.longValue());
		position++;
	}
}

Source File: DataSetUtilsITCase.java From Flink-CEPplus with Apache License 2.0

6 votes

/**
 * Runs {@code DataSetUtils.zipWithIndex} over the sequence 0..99 and checks that
 * one tuple per element comes back and that the first tuple fields, once
 * sorted, are exactly 0, 1, ..., 99.
 */
@Test
public void testZipWithIndex() throws Exception {
	final long expectedSize = 100L;
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1);

	List<Tuple2<Long, Long>> indexed = new ArrayList<>(DataSetUtils.zipWithIndex(numbers).collect());
	Assert.assertEquals(expectedSize, indexed.size());

	// Order by the first tuple field so list position and index can be compared.
	Comparator<Tuple2<Long, Long>> byFirstField = new Comparator<Tuple2<Long, Long>>() {
		@Override
		public int compare(Tuple2<Long, Long> left, Tuple2<Long, Long> right) {
			return left.f0.compareTo(right.f0);
		}
	};
	Collections.sort(indexed, byFirstField);

	// After sorting, the i-th entry must carry index i, i.e. there are no gaps.
	int position = 0;
	for (Tuple2<Long, Long> entry : indexed) {
		Assert.assertEquals(position, entry.f0.longValue());
		position++;
	}
}

Source File: DataSetUtilsITCase.java From Flink-CEPplus with Apache License 2.0

6 votes

/**
 * Runs {@code DataSetUtils.zipWithUniqueId} over a sequence of 100 numbers and
 * checks that the first tuple fields form 100 distinct values.
 */
@Test
public void testZipWithUniqueId() throws Exception {
	final long expectedSize = 100L;
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Long> numbers = env.generateSequence(1L, expectedSize);

	// Projects every zipped pair onto its first field.
	MapFunction<Tuple2<Long, Long>, Long> firstField = new MapFunction<Tuple2<Long, Long>, Long>() {
		@Override
		public Long map(Tuple2<Long, Long> pair) throws Exception {
			return pair.f0;
		}
	};
	DataSet<Long> ids = DataSetUtils.zipWithUniqueId(numbers).map(firstField);

	// A set drops duplicates, so its size only matches when every id is distinct.
	Set<Long> distinctIds = new HashSet<>();
	distinctIds.addAll(ids.collect());

	Assert.assertEquals(expectedSize, distinctIds.size());
}

Source File: SplitBatchOp.java From Alink with Apache License 2.0

6 votes

/**
 * Links this operator to the first of {@code inputs} and splits its rows.
 *
 * <p>The per-partition element counts of the input are reduced (with parallelism 1)
 * by {@code CountInPartition(fraction)} into an array that is broadcast as
 * {@code "counts"} to {@code PickInPartition}, which produces the main output.
 * The side output is the input table {@code minusAll} the main output table.
 *
 * @param inputs upstream operators; only the first one is used
 * @return this operator
 * @throws RuntimeException if the configured fraction is NaN or outside {@code [0, 1]}
 */
@Override
public SplitBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> in = checkAndGetFirst(inputs);
    final double fraction = getFraction();
    // Negated in-range check: every comparison with NaN is false, so the
    // "< 0 || > 1" form would silently accept a NaN fraction.
    if (!(fraction >= 0. && fraction <= 1.0)) {
        throw new RuntimeException("invalid fraction " + fraction);
    }

    DataSet<Row> rows = in.getDataSet();

    DataSet<Tuple2<Integer, Long>> countsPerPartition = DataSetUtils.countElementsPerPartition(rows);
    DataSet<long[]> numPickedPerPartition = countsPerPartition
        .mapPartition(new CountInPartition(fraction))
        .setParallelism(1)
        .name("decide_count_of_each_partition");

    DataSet<Row> out = rows
        .mapPartition(new PickInPartition())
        .withBroadcastSet(numPickedPerPartition, "counts")
        .name("pick_in_each_partition");

    this.setOutput(out, in.getSchema());
    // Whatever was not picked becomes the side output.
    this.setSideOutputTables(new Table[]{in.getOutputTable().minusAll(this.getOutputTable())});
    return this;
}

Source File: BaseComQueue.java From Alink with Apache License 2.0

6 votes

/**
 * Registers {@code data} under {@code key} in the cached-data chain.
 *
 * <p>If {@code cacheDataRel} has not been created yet, it is seeded with a data set
 * built from a single empty byte array whose map-partition step emits nothing.
 * {@code data} is then passed through {@code PutCachedData}, receiving the previous
 * {@code cacheDataRel} as broadcast set {@code "rel"} and the per-partition element
 * counts of {@code data} as broadcast set {@code "rowCount"}; the result becomes
 * the new {@code cacheDataRel}. Finally {@code key} is added to
 * {@code cacheDataObjNames}.
 *
 * @param data data set to cache
 * @param key  name under which the data is cached
 */
private <T> void createRelationshipAndCachedData(DataSet<T> data, final String key) {
	// Snapshot of the session id field, taken before any operator is built.
	final int capturedSessionId = sessionId;

	if (null == cacheDataRel) {
		// Deliberately emits no records.
		MapPartitionFunction<byte[], byte[]> emitNothing = new MapPartitionFunction<byte[], byte[]>() {
			@Override
			public void mapPartition(Iterable<byte[]> values, Collector<byte[]> out) throws Exception {
				//pass
			}
		};
		cacheDataRel = clearObjs(
			BatchOperator
				.getExecutionEnvironmentFromDataSets(data)
				.fromElements(new byte[0])
				.mapPartition(emitNothing)
		);
	}

	DataSet<Tuple2<Integer, Long>> countsPerPartition = DataSetUtils.countElementsPerPartition(data);

	// The previous relationship data set is broadcast into the new one, chaining them.
	cacheDataRel = data
		.mapPartition(new PutCachedData<T>(key, capturedSessionId))
		.withBroadcastSet(cacheDataRel, "rel")
		.withBroadcastSet(countsPerPartition, "rowCount")
		.name("cachedDataRel@" + key);

	cacheDataObjNames.add(key);
}

Source File: SampleITCase.java From flink with Apache License 2.0

5 votes

/**
 * Samples the source data set with {@code DataSetUtils.sample} and hands the
 * collected sample together with the source strings to
 * {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sample}
 * @param fraction        forwarded to {@code DataSetUtils.sample}
 * @param seed            forwarded to {@code DataSetUtils.sample}
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

	List<String> sampledElements = DataSetUtils.sample(source, withReplacement, fraction, seed).collect();

	containsResultAsText(sampledElements, getSourceStrings());
}

Source File: SampleWithSizeBatchOp.java From Alink with Apache License 2.0

5 votes

/**
 * Links this operator to the first of {@code inputs} and sets its output to a
 * fixed-size sample of that input, drawn with {@code DataSetUtils.sampleWithSize}.
 * The output keeps the input schema.
 *
 * @param inputs upstream operators; only the first one is used
 * @return this operator
 */
@Override
public SampleWithSizeBatchOp linkFrom(BatchOperator<?>... inputs) {
    final BatchOperator<?> source = checkAndGetFirst(inputs);
    final boolean replace = getWithReplacement();
    final int sampleCount = getSize();

    DataSet<Row> sampled = DataSetUtils.sampleWithSize(source.getDataSet(), replace, sampleCount);
    setOutput(sampled, source.getSchema());
    return this;
}

Source File: AppendIdBatchOp.java From Alink with Apache License 2.0

5 votes

/**
 * Builds a table from {@code dataSet} whose schema is {@code schema} extended by
 * one extra column named {@code appendIdColName} of type {@code appendIdColType}.
 *
 * <p>For {@code DENSE} the rows are produced by {@code DataSetUtils.zipWithIndex}
 * followed by {@code TransTupleToRowMapper}; for {@code UNIQUE} they are produced
 * by {@code AppendIdMapper}.
 *
 * @param dataSet         rows to extend
 * @param schema          schema of {@code dataSet}; supplies the existing column names and types
 * @param appendIdColName name of the appended column
 * @param appendType      how the appended values are generated
 * @param sessionId       passed through to {@code DataSetConversionUtil.toTable}
 * @return the table created from the extended rows
 * @throws IllegalArgumentException if {@code appendType} is neither {@code DENSE} nor {@code UNIQUE}
 */
public static Table appendId(
	DataSet <Row> dataSet,
	TableSchema schema,
	String appendIdColName,
	AppendType appendType,
	Long sessionId) {
	String[] rawColNames = schema.getFieldNames();
	TypeInformation[] rawColTypes = schema.getFieldTypes();

	String[] colNames = ArrayUtils.add(rawColNames, appendIdColName);
	TypeInformation[] colTypes = ArrayUtils.add(rawColTypes, appendIdColType);

	DataSet <Row> ret = null;

	switch (appendType) {
		case DENSE:
			ret = DataSetUtils.zipWithIndex(dataSet)
				.map(new TransTupleToRowMapper());
			break;
		case UNIQUE:
			// Previously a zipWithUniqueId(...) pipeline was assigned here and then
			// immediately overwritten; only the assignment that took effect is kept.
			ret = dataSet.map(new AppendIdMapper());
			break;
		default:
			throw new IllegalArgumentException("Error append type.");
	}

	return DataSetConversionUtil.toTable(sessionId, ret, colNames, colTypes);
}

Source File: MultilayerPerceptronTrainBatchOp.java From Alink with Apache License 2.0

5 votes

/**
 * Collects the distinct values of the label column and pairs each one with the
 * index assigned by {@code DataSetUtils.zipWithIndex}.
 *
 * @param data         operator holding the label column
 * @param labelColName name of the label column; it is back-quoted in the select clause
 * @return data set of (index, label value) tuples
 */
private static DataSet<Tuple2<Long, Object>> getDistinctLabels(BatchOperator data, final String labelColName) {
    final String quotedLabelCol = "`" + labelColName + "`";
    BatchOperator distinctLabels = data.select(quotedLabelCol).distinct();
    DataSet<Row> labelRows = distinctLabels.getDataSet();

    // Keeps the index and unwraps the single selected field of each row.
    MapFunction<Tuple2<Long, Row>, Tuple2<Long, Object>> toIndexedLabel =
        new MapFunction<Tuple2<Long, Row>, Tuple2<Long, Object>>() {
            @Override
            public Tuple2<Long, Object> map(Tuple2<Long, Row> indexedRow) throws Exception {
                return Tuple2.of(indexedRow.f0, indexedRow.f1.getField(0));
            }
        };

    return DataSetUtils.zipWithIndex(labelRows)
        .map(toIndexedLabel)
        .name("get_labels");
}

Source File: JoinITCase.java From flink with Apache License 2.0

5 votes

/**
 * Tests a join on tuples with multiple key field positions where both inputs are
 * range-partitioned with the same customized distribution.
 */
@Test
public void testJoinWithRangePartitioning() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple3<Integer, Long, String>> ds1 = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds2 = CollectionDataSets.get5TupleDataSet(env);

	env.setParallelism(4);
	TestDistribution testDis = new TestDistribution();

	// Both sides share one distribution; each is partitioned on its own key fields.
	DataSet<Tuple3<Integer, Long, String>> rangedLeft = DataSetUtils.partitionByRange(ds1, testDis, 0, 1);
	DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedRight =
			DataSetUtils.partitionByRange(ds2, testDis, 0, 4);

	DataSet<Tuple2<String, String>> joinDs = rangedLeft
			.join(rangedRight)
			.where(0, 1)
			.equalTo(0, 4)
			.with(new T3T5FlatJoin());

	List<Tuple2<String, String>> joined = joinDs.collect();

	String expected = "Hi,Hallo\n"
			+ "Hello,Hallo Welt\n"
			+ "Hello world,Hallo Welt wie gehts?\n"
			+ "Hello world,ABC\n"
			+ "I am fine.,HIJ\n"
			+ "I am fine.,IJK\n";

	compareResultAsTuples(joined, expected);
}

Source File: CoGroupITCase.java From flink with Apache License 2.0

5 votes

/**
 * Tests a coGroup on tuples with multiple key field positions where both inputs
 * are range-partitioned with the same customized distribution.
 */
@Test
public void testCoGroupWithRangePartitioning() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds1 = CollectionDataSets.get5TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> ds2 = CollectionDataSets.get3TupleDataSet(env);

	env.setParallelism(4);
	TestDistribution testDis = new TestDistribution();

	// Both sides share one distribution; each is partitioned on its own key fields.
	DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedFirst =
			DataSetUtils.partitionByRange(ds1, testDis, 0, 4);
	DataSet<Tuple3<Integer, Long, String>> rangedSecond = DataSetUtils.partitionByRange(ds2, testDis, 0, 1);

	DataSet<Tuple3<Integer, Long, String>> coGrouped = rangedFirst
			.coGroup(rangedSecond)
			.where(0, 4)
			.equalTo(0, 1)
			.with(new Tuple5Tuple3CoGroup());

	List<Tuple3<Integer, Long, String>> grouped = coGrouped.collect();

	String expected = "1,1,Hallo\n"
			+ "2,2,Hallo Welt\n"
			+ "3,2,Hallo Welt wie gehts?\n"
			+ "3,2,ABC\n"
			+ "5,3,HIJ\n"
			+ "5,3,IJK\n";

	compareResultAsTuples(grouped, expected);
}

Source File: CoGroupITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Tests a coGroup on tuples with multiple key field positions where both inputs
 * are range-partitioned with the same customized distribution.
 */
@Test
public void testCoGroupWithRangePartitioning() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds1 = CollectionDataSets.get5TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> ds2 = CollectionDataSets.get3TupleDataSet(env);

	env.setParallelism(4);
	TestDistribution testDis = new TestDistribution();

	// Both sides share one distribution; each is partitioned on its own key fields.
	DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedFirst =
			DataSetUtils.partitionByRange(ds1, testDis, 0, 4);
	DataSet<Tuple3<Integer, Long, String>> rangedSecond = DataSetUtils.partitionByRange(ds2, testDis, 0, 1);

	DataSet<Tuple3<Integer, Long, String>> coGrouped = rangedFirst
			.coGroup(rangedSecond)
			.where(0, 4)
			.equalTo(0, 1)
			.with(new Tuple5Tuple3CoGroup());

	List<Tuple3<Integer, Long, String>> grouped = coGrouped.collect();

	String expected = "1,1,Hallo\n"
			+ "2,2,Hallo Welt\n"
			+ "3,2,Hallo Welt wie gehts?\n"
			+ "3,2,ABC\n"
			+ "5,3,HIJ\n"
			+ "5,3,IJK\n";

	compareResultAsTuples(grouped, expected);
}

Source File: SampleITCase.java From flink with Apache License 2.0

5 votes

/**
 * Draws a fixed-size sample from the source data set with
 * {@code DataSetUtils.sampleWithSize}, asserts that exactly {@code numSamples}
 * elements were returned, and hands the sample together with the source strings
 * to {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sampleWithSize}
 * @param numSamples      requested and expected number of sampled elements
 * @param seed            forwarded to {@code DataSetUtils.sampleWithSize}
 */
private void verifySamplerWithFixedSize(boolean withReplacement, int numSamples, long seed) throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

	List<String> sampledElements =
		DataSetUtils.sampleWithSize(source, withReplacement, numSamples, seed).collect();

	assertEquals(numSamples, sampledElements.size());
	containsResultAsText(sampledElements, getSourceStrings());
}

Source File: CustomDistributionITCase.java From flink with Apache License 2.0

5 votes

/**
 * Expects {@code DataSetUtils.partitionByRange} to throw an
 * {@link IllegalArgumentException} when asked to partition a 3-tuple data set
 * on fields 0, 1 and 2 with a {@code TestDataDist2} distribution.
 */
@Test(expected = IllegalArgumentException.class)
public void testPartitionMoreThanDistribution() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

	final TestDataDist2 distribution = new TestDataDist2();

	// The call itself is expected to fail; no job is executed.
	DataSetUtils.partitionByRange(tuples, distribution, 0, 1, 2);
}

Source File: DataSetUtilsITCase.java From flink with Apache License 2.0

5 votes

/**
 * Checks that {@code DataSetUtils.countElementsPerPartition} emits as many
 * records as the environment's parallelism and that the second tuple fields
 * add up to the number of input elements.
 */
@Test
public void testCountElementsPerPartition() throws Exception {
	final long expectedSize = 100L;
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1);

	DataSet<Tuple2<Integer, Long>> counts = DataSetUtils.countElementsPerPartition(numbers);

	// One record per parallel partition.
	int parallelism = env.getParallelism();
	long emittedRecords = counts.count();
	Assert.assertEquals(parallelism, emittedRecords);

	// Summing field 1 across all records must give back the input size.
	Tuple2<Integer, Long> summed = counts.sum(1).collect().get(0);
	Assert.assertEquals(expectedSize, summed.f1.longValue());
}

Source File: DataSetUtilsITCase.java From flink with Apache License 2.0

5 votes

/**
 * Checks the count and checksum that {@code DataSetUtils.checksumHashCode}
 * reports for the integer data set from {@code CollectionDataSets}.
 */
@Test
public void testIntegerDataSetChecksumHashCode() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Integer> ds = CollectionDataSets.getIntegerDataSet(env);

	Utils.ChecksumHashCode checksum = DataSetUtils.checksumHashCode(ds);
	// JUnit expects (expected, actual); in the other order a failure message
	// would label the observed value as the expected one.
	Assert.assertEquals(15, checksum.getCount());
	Assert.assertEquals(55, checksum.getChecksum());
}

Source File: SampleITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Samples the source data set with {@code DataSetUtils.sample} and hands the
 * collected sample together with the source strings to
 * {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sample}
 * @param fraction        forwarded to {@code DataSetUtils.sample}
 * @param seed            forwarded to {@code DataSetUtils.sample}
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

	List<String> sampledElements = DataSetUtils.sample(source, withReplacement, fraction, seed).collect();

	containsResultAsText(sampledElements, getSourceStrings());
}

Source File: JoinITCase.java From flink with Apache License 2.0

5 votes

/**
 * Tests a join on tuples with multiple key field positions where both inputs are
 * range-partitioned with the same customized distribution.
 */
@Test
public void testJoinWithRangePartitioning() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple3<Integer, Long, String>> ds1 = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds2 = CollectionDataSets.get5TupleDataSet(env);

	env.setParallelism(4);
	TestDistribution testDis = new TestDistribution();

	// Both sides share one distribution; each is partitioned on its own key fields.
	DataSet<Tuple3<Integer, Long, String>> rangedLeft = DataSetUtils.partitionByRange(ds1, testDis, 0, 1);
	DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedRight =
			DataSetUtils.partitionByRange(ds2, testDis, 0, 4);

	DataSet<Tuple2<String, String>> joinDs = rangedLeft
			.join(rangedRight)
			.where(0, 1)
			.equalTo(0, 4)
			.with(new T3T5FlatJoin());

	List<Tuple2<String, String>> joined = joinDs.collect();

	String expected = "Hi,Hallo\n"
			+ "Hello,Hallo Welt\n"
			+ "Hello world,Hallo Welt wie gehts?\n"
			+ "Hello world,ABC\n"
			+ "I am fine.,HIJ\n"
			+ "I am fine.,IJK\n";

	compareResultAsTuples(joined, expected);
}

Source File: CoGroupITCase.java From flink with Apache License 2.0

5 votes

/**
 * Tests a coGroup on tuples with multiple key field positions where both inputs
 * are range-partitioned with the same customized distribution.
 */
@Test
public void testCoGroupWithRangePartitioning() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds1 = CollectionDataSets.get5TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> ds2 = CollectionDataSets.get3TupleDataSet(env);

	env.setParallelism(4);
	TestDistribution testDis = new TestDistribution();

	// Both sides share one distribution; each is partitioned on its own key fields.
	DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedFirst =
			DataSetUtils.partitionByRange(ds1, testDis, 0, 4);
	DataSet<Tuple3<Integer, Long, String>> rangedSecond = DataSetUtils.partitionByRange(ds2, testDis, 0, 1);

	DataSet<Tuple3<Integer, Long, String>> coGrouped = rangedFirst
			.coGroup(rangedSecond)
			.where(0, 4)
			.equalTo(0, 1)
			.with(new Tuple5Tuple3CoGroup());

	List<Tuple3<Integer, Long, String>> grouped = coGrouped.collect();

	String expected = "1,1,Hallo\n"
			+ "2,2,Hallo Welt\n"
			+ "3,2,Hallo Welt wie gehts?\n"
			+ "3,2,ABC\n"
			+ "5,3,HIJ\n"
			+ "5,3,IJK\n";

	compareResultAsTuples(grouped, expected);
}

Source File: DataSetUtilsITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Checks the count and checksum that {@code DataSetUtils.checksumHashCode}
 * reports for the integer data set from {@code CollectionDataSets}.
 */
@Test
public void testIntegerDataSetChecksumHashCode() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Integer> ds = CollectionDataSets.getIntegerDataSet(env);

	Utils.ChecksumHashCode checksum = DataSetUtils.checksumHashCode(ds);
	// JUnit expects (expected, actual); in the other order a failure message
	// would label the observed value as the expected one.
	Assert.assertEquals(15, checksum.getCount());
	Assert.assertEquals(55, checksum.getChecksum());
}

Source File: DataSetUtilsITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Checks that {@code DataSetUtils.countElementsPerPartition} emits as many
 * records as the environment's parallelism and that the second tuple fields
 * add up to the number of input elements.
 */
@Test
public void testCountElementsPerPartition() throws Exception {
	final long expectedSize = 100L;
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1);

	DataSet<Tuple2<Integer, Long>> counts = DataSetUtils.countElementsPerPartition(numbers);

	// One record per parallel partition.
	int parallelism = env.getParallelism();
	long emittedRecords = counts.count();
	Assert.assertEquals(parallelism, emittedRecords);

	// Summing field 1 across all records must give back the input size.
	Tuple2<Integer, Long> summed = counts.sum(1).collect().get(0);
	Assert.assertEquals(expectedSize, summed.f1.longValue());
}

Source File: CustomDistributionITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Expects {@code DataSetUtils.partitionByRange} to throw an
 * {@link IllegalArgumentException} when asked to partition a 3-tuple data set
 * on fields 0, 1 and 2 with a {@code TestDataDist2} distribution.
 */
@Test(expected = IllegalArgumentException.class)
public void testPartitionMoreThanDistribution() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

	final TestDataDist2 distribution = new TestDataDist2();

	// The call itself is expected to fail; no job is executed.
	DataSetUtils.partitionByRange(tuples, distribution, 0, 1, 2);
}

Source File: SampleITCase.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Draws a fixed-size sample from the source data set with
 * {@code DataSetUtils.sampleWithSize}, asserts that exactly {@code numSamples}
 * elements were returned, and hands the sample together with the source strings
 * to {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sampleWithSize}
 * @param numSamples      requested and expected number of sampled elements
 * @param seed            forwarded to {@code DataSetUtils.sampleWithSize}
 */
private void verifySamplerWithFixedSize(boolean withReplacement, int numSamples, long seed) throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

	List<String> sampledElements =
		DataSetUtils.sampleWithSize(source, withReplacement, numSamples, seed).collect();

	assertEquals(numSamples, sampledElements.size());
	containsResultAsText(sampledElements, getSourceStrings());
}

Source File: SampleITCase.java From flink with Apache License 2.0

5 votes

/**
 * Samples the source data set with {@code DataSetUtils.sample} and hands the
 * collected sample together with the source strings to
 * {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sample}
 * @param fraction        forwarded to {@code DataSetUtils.sample}
 * @param seed            forwarded to {@code DataSetUtils.sample}
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

	List<String> sampledElements = DataSetUtils.sample(source, withReplacement, fraction, seed).collect();

	containsResultAsText(sampledElements, getSourceStrings());
}

Source File: SampleITCase.java From flink with Apache License 2.0

5 votes

/**
 * Draws a fixed-size sample from the source data set with
 * {@code DataSetUtils.sampleWithSize}, asserts that exactly {@code numSamples}
 * elements were returned, and hands the sample together with the source strings
 * to {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sampleWithSize}
 * @param numSamples      requested and expected number of sampled elements
 * @param seed            forwarded to {@code DataSetUtils.sampleWithSize}
 */
private void verifySamplerWithFixedSize(boolean withReplacement, int numSamples, long seed) throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

	List<String> sampledElements =
		DataSetUtils.sampleWithSize(source, withReplacement, numSamples, seed).collect();

	assertEquals(numSamples, sampledElements.size());
	containsResultAsText(sampledElements, getSourceStrings());
}

Source File: CustomDistributionITCase.java From flink with Apache License 2.0

5 votes

/**
 * Expects {@code DataSetUtils.partitionByRange} to throw an
 * {@link IllegalArgumentException} when asked to partition a 3-tuple data set
 * on fields 0, 1 and 2 with a {@code TestDataDist2} distribution.
 */
@Test(expected = IllegalArgumentException.class)
public void testPartitionMoreThanDistribution() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

	final TestDataDist2 distribution = new TestDataDist2();

	// The call itself is expected to fail; no job is executed.
	DataSetUtils.partitionByRange(tuples, distribution, 0, 1, 2);
}

Source File: DataSetUtilsITCase.java From flink with Apache License 2.0

5 votes

/**
 * Checks that {@code DataSetUtils.countElementsPerPartition} emits as many
 * records as the environment's parallelism and that the second tuple fields
 * add up to the number of input elements.
 */
@Test
public void testCountElementsPerPartition() throws Exception {
	final long expectedSize = 100L;
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1);

	DataSet<Tuple2<Integer, Long>> counts = DataSetUtils.countElementsPerPartition(numbers);

	// One record per parallel partition.
	int parallelism = env.getParallelism();
	long emittedRecords = counts.count();
	Assert.assertEquals(parallelism, emittedRecords);

	// Summing field 1 across all records must give back the input size.
	Tuple2<Integer, Long> summed = counts.sum(1).collect().get(0);
	Assert.assertEquals(expectedSize, summed.f1.longValue());
}

org.apache.flink.api.java.utils.DataSetUtils Java Examples