org.apache.flink.api.java.utils.DataSetUtils Java Examples
The following examples show how to use
org.apache.flink.api.java.utils.DataSetUtils.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DataSetUtilsITCase.java From flink with Apache License 2.0 | 6 votes |
/**
 * Checks that {@code DataSetUtils.zipWithUniqueId} yields as many distinct ids
 * as there are elements in a 100-element sequence.
 */
@Test
public void testZipWithUniqueId() throws Exception {
    final long expectedSize = 100L;
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Long> numbers = env.generateSequence(1L, expectedSize);

    // Keep only field f0 of every zipped tuple; that is the value the test treats as the id.
    MapFunction<Tuple2<Long, Long>, Long> idExtractor = new MapFunction<Tuple2<Long, Long>, Long>() {
        @Override
        public Long map(Tuple2<Long, Long> zipped) throws Exception {
            return zipped.f0;
        }
    };
    DataSet<Long> ids = DataSetUtils.zipWithUniqueId(numbers).map(idExtractor);

    // A set drops duplicates, so an unchanged size means no id was handed out twice.
    Set<Long> uniqueIds = new HashSet<>(ids.collect());
    Assert.assertEquals(expectedSize, uniqueIds.size());
}
Example #2
Source File: DataSetUtilsITCase.java From flink with Apache License 2.0 | 6 votes |
/**
 * Checks that {@code DataSetUtils.zipWithUniqueId} yields as many distinct ids
 * as there are elements in a 100-element sequence.
 */
@Test
public void testZipWithUniqueId() throws Exception {
    final long expectedSize = 100L;
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Long> numbers = env.generateSequence(1L, expectedSize);

    // Keep only field f0 of every zipped tuple; that is the value the test treats as the id.
    MapFunction<Tuple2<Long, Long>, Long> idExtractor = new MapFunction<Tuple2<Long, Long>, Long>() {
        @Override
        public Long map(Tuple2<Long, Long> zipped) throws Exception {
            return zipped.f0;
        }
    };
    DataSet<Long> ids = DataSetUtils.zipWithUniqueId(numbers).map(idExtractor);

    // A set drops duplicates, so an unchanged size means no id was handed out twice.
    Set<Long> uniqueIds = new HashSet<>(ids.collect());
    Assert.assertEquals(expectedSize, uniqueIds.size());
}
Example #3
Source File: DataSetUtilsITCase.java From flink with Apache License 2.0 | 6 votes |
@Test public void testZipWithIndex() throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); long expectedSize = 100L; DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1); List<Tuple2<Long, Long>> result = new ArrayList<>(DataSetUtils.zipWithIndex(numbers).collect()); Assert.assertEquals(expectedSize, result.size()); // sort result by created index Collections.sort(result, new Comparator<Tuple2<Long, Long>>() { @Override public int compare(Tuple2<Long, Long> o1, Tuple2<Long, Long> o2) { return o1.f0.compareTo(o2.f0); } }); // test if index is consecutive for (int i = 0; i < expectedSize; i++) { Assert.assertEquals(i, result.get(i).f0.longValue()); } }
Example #4
Source File: DataSetUtilsITCase.java From flink with Apache License 2.0 | 6 votes |
@Test public void testZipWithIndex() throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); long expectedSize = 100L; DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1); List<Tuple2<Long, Long>> result = new ArrayList<>(DataSetUtils.zipWithIndex(numbers).collect()); Assert.assertEquals(expectedSize, result.size()); // sort result by created index Collections.sort(result, new Comparator<Tuple2<Long, Long>>() { @Override public int compare(Tuple2<Long, Long> o1, Tuple2<Long, Long> o2) { return o1.f0.compareTo(o2.f0); } }); // test if index is consecutive for (int i = 0; i < expectedSize; i++) { Assert.assertEquals(i, result.get(i).f0.longValue()); } }
Example #5
Source File: DataSetUtilsITCase.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
@Test public void testZipWithIndex() throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); long expectedSize = 100L; DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1); List<Tuple2<Long, Long>> result = new ArrayList<>(DataSetUtils.zipWithIndex(numbers).collect()); Assert.assertEquals(expectedSize, result.size()); // sort result by created index Collections.sort(result, new Comparator<Tuple2<Long, Long>>() { @Override public int compare(Tuple2<Long, Long> o1, Tuple2<Long, Long> o2) { return o1.f0.compareTo(o2.f0); } }); // test if index is consecutive for (int i = 0; i < expectedSize; i++) { Assert.assertEquals(i, result.get(i).f0.longValue()); } }
Example #6
Source File: DataSetUtilsITCase.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
/**
 * Checks that {@code DataSetUtils.zipWithUniqueId} yields as many distinct ids
 * as there are elements in a 100-element sequence.
 */
@Test
public void testZipWithUniqueId() throws Exception {
    final long expectedSize = 100L;
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Long> numbers = env.generateSequence(1L, expectedSize);

    // Keep only field f0 of every zipped tuple; that is the value the test treats as the id.
    MapFunction<Tuple2<Long, Long>, Long> idExtractor = new MapFunction<Tuple2<Long, Long>, Long>() {
        @Override
        public Long map(Tuple2<Long, Long> zipped) throws Exception {
            return zipped.f0;
        }
    };
    DataSet<Long> ids = DataSetUtils.zipWithUniqueId(numbers).map(idExtractor);

    // A set drops duplicates, so an unchanged size means no id was handed out twice.
    Set<Long> uniqueIds = new HashSet<>(ids.collect());
    Assert.assertEquals(expectedSize, uniqueIds.size());
}
Example #7
Source File: SplitBatchOp.java From Alink with Apache License 2.0 | 6 votes |
/**
 * Links this operator to the first of {@code inputs} and splits its rows into a
 * main output and a side output.
 *
 * <p>The main output is produced by {@code PickInPartition}, which receives the
 * per-partition pick counts as the broadcast set {@code "counts"}. The single side
 * output table is the input table {@code minusAll} the main output table.
 *
 * @param inputs upstream operators; only the one returned by {@code checkAndGetFirst} is used
 * @return this operator, with its output and side output set
 * @throws RuntimeException if the configured fraction is NaN or lies outside [0, 1]
 */
@Override
public SplitBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> in = checkAndGetFirst(inputs);
    final double fraction = getFraction();
    // NaN compares false against both bounds, so it has to be rejected explicitly.
    if (Double.isNaN(fraction) || fraction < 0. || fraction > 1.0) {
        throw new RuntimeException("invalid fraction " + fraction);
    }

    DataSet<Row> rows = in.getDataSet();

    DataSet<Tuple2<Integer, Long>> countsPerPartition = DataSetUtils.countElementsPerPartition(rows);

    // Runs with parallelism 1, so one task sees the counts of all partitions.
    DataSet<long[]> numPickedPerPartition = countsPerPartition
        .mapPartition(new CountInPartition(fraction))
        .setParallelism(1)
        .name("decide_count_of_each_partition");

    DataSet<Row> out = rows
        .mapPartition(new PickInPartition())
        .withBroadcastSet(numPickedPerPartition, "counts")
        .name("pick_in_each_partition");

    this.setOutput(out, in.getSchema());
    // Side output: the input rows minus the rows of the main output.
    this.setSideOutputTables(new Table[]{in.getOutputTable().minusAll(this.getOutputTable())});
    return this;
}
Example #8
Source File: BaseComQueue.java From Alink with Apache License 2.0 | 6 votes |
private <T> void createRelationshipAndCachedData(DataSet<T> data, final String key) { final int localSessionId = sessionId; if (cacheDataRel == null) { cacheDataRel = clearObjs( BatchOperator .getExecutionEnvironmentFromDataSets(data) .fromElements(new byte[0]) .mapPartition(new MapPartitionFunction<byte[], byte[]>() { @Override public void mapPartition(Iterable<byte[]> values, Collector<byte[]> out) throws Exception { //pass } }) ); } DataSet<Tuple2<Integer, Long>> rowCount = DataSetUtils.countElementsPerPartition(data); cacheDataRel = data.mapPartition(new PutCachedData<T>(key, localSessionId)) .withBroadcastSet(cacheDataRel, "rel") .withBroadcastSet(rowCount, "rowCount") .name("cachedDataRel@" + key); cacheDataObjNames.add(key); }
Example #9
Source File: SampleITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Samples the source data set with {@code DataSetUtils.sample} and passes the collected
 * elements, together with the source strings, to {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sample}
 * @param fraction        forwarded to {@code DataSetUtils.sample}
 * @param seed            forwarded to {@code DataSetUtils.sample}
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

    MapPartitionOperator<String, String> sampledSet =
        DataSetUtils.sample(source, withReplacement, fraction, seed);
    List<String> sampledElements = sampledSet.collect();

    containsResultAsText(sampledElements, getSourceStrings());
}
Example #10
Source File: SampleWithSizeBatchOp.java From Alink with Apache License 2.0 | 5 votes |
/**
 * Links this operator to the first of {@code inputs} and sets as output the result of
 * {@code DataSetUtils.sampleWithSize} on its rows, keeping the input schema.
 *
 * @param inputs upstream operators; only the one returned by {@code checkAndGetFirst} is used
 * @return this operator, with its output set
 */
@Override
public SampleWithSizeBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> in = checkAndGetFirst(inputs);

    // Read the configured parameters before touching the input data set.
    final boolean replace = getWithReplacement();
    final int sampleCount = getSize();

    DataSet<Row> sampledRows = DataSetUtils.sampleWithSize(in.getDataSet(), replace, sampleCount);
    this.setOutput(sampledRows, in.getSchema());
    return this;
}
Example #11
Source File: AppendIdBatchOp.java From Alink with Apache License 2.0 | 5 votes |
/**
 * Appends an id column to {@code dataSet} and returns the result as a table.
 *
 * @param dataSet         rows to extend
 * @param schema          schema of {@code dataSet}; its field names and types are kept and
 *                        the id column name and type are added to them
 * @param appendIdColName name of the id column
 * @param appendType      {@code DENSE} builds the rows from {@code DataSetUtils.zipWithIndex};
 *                        {@code UNIQUE} builds them with {@code AppendIdMapper}
 * @param sessionId       session id forwarded to {@code DataSetConversionUtil.toTable}
 * @return the table built from the extended rows
 * @throws IllegalArgumentException if {@code appendType} is neither {@code DENSE} nor {@code UNIQUE}
 */
public static Table appendId(
    DataSet <Row> dataSet,
    TableSchema schema,
    String appendIdColName,
    AppendType appendType,
    Long sessionId) {
    String[] rawColNames = schema.getFieldNames();
    TypeInformation[] rawColTypes = schema.getFieldTypes();

    String[] colNames = ArrayUtils.add(rawColNames, appendIdColName);
    TypeInformation[] colTypes = ArrayUtils.add(rawColTypes, appendIdColType);

    // Every switch branch either assigns ret exactly once or throws.
    final DataSet <Row> ret;

    switch (appendType) {
        case DENSE:
            ret = DataSetUtils.zipWithIndex(dataSet)
                .map(new TransTupleToRowMapper());
            break;
        case UNIQUE:
            // Only the AppendIdMapper result is used for UNIQUE; no other operator is built.
            ret = dataSet.map(new AppendIdMapper());
            break;
        default:
            throw new IllegalArgumentException("Error append type.");
    }

    return DataSetConversionUtil.toTable(sessionId, ret, colNames, colTypes);
}
Example #12
Source File: MultilayerPerceptronTrainBatchOp.java From Alink with Apache License 2.0 | 5 votes |
/**
 * Selects the distinct values of the label column and pairs each one with an index.
 *
 * @param data         operator holding the label column
 * @param labelColName name of the label column; it is back-quoted in the select clause
 * @return tuples of (index from {@code DataSetUtils.zipWithIndex}, label value)
 */
private static DataSet<Tuple2<Long, Object>> getDistinctLabels(BatchOperator data, final String labelColName) {
    BatchOperator distinctLabels = data.select("`" + labelColName + "`").distinct();
    DataSet<Row> labelRows = distinctLabels.getDataSet();

    // Each selected row has a single field; unwrap it and keep the assigned index.
    MapFunction<Tuple2<Long, Row>, Tuple2<Long, Object>> unwrapLabel =
        new MapFunction<Tuple2<Long, Row>, Tuple2<Long, Object>>() {
            @Override
            public Tuple2<Long, Object> map(Tuple2<Long, Row> indexedRow) throws Exception {
                return Tuple2.of(indexedRow.f0, indexedRow.f1.getField(0));
            }
        };

    return DataSetUtils.zipWithIndex(labelRows)
        .map(unwrapLabel)
        .name("get_labels");
}
Example #13
Source File: JoinITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Tests a join on tuples with multiple key field positions, where both inputs are
 * range-partitioned with the same custom distribution.
 */
@Test
public void testJoinWithRangePartitioning() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Long, String>> ds1 = CollectionDataSets.get3TupleDataSet(env);
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds2 = CollectionDataSets.get5TupleDataSet(env);

    env.setParallelism(4);
    TestDistribution testDis = new TestDistribution();

    // Partition each side on the same fields that are later used as its join keys.
    DataSet<Tuple3<Integer, Long, String>> rangedLeft =
        DataSetUtils.partitionByRange(ds1, testDis, 0, 1);
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedRight =
        DataSetUtils.partitionByRange(ds2, testDis, 0, 4);

    DataSet<Tuple2<String, String>> joinDs = rangedLeft
        .join(rangedRight)
        .where(0, 1)
        .equalTo(0, 4)
        .with(new T3T5FlatJoin());

    List<Tuple2<String, String>> joined = joinDs.collect();

    String expected = "Hi,Hallo\n" +
        "Hello,Hallo Welt\n" +
        "Hello world,Hallo Welt wie gehts?\n" +
        "Hello world,ABC\n" +
        "I am fine.,HIJ\n" +
        "I am fine.,IJK\n";

    compareResultAsTuples(joined, expected);
}
Example #14
Source File: CoGroupITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Tests coGroup on tuples with multiple key field positions, where both inputs are
 * range-partitioned with the same custom distribution.
 */
@Test
public void testCoGroupWithRangePartitioning() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds1 = CollectionDataSets.get5TupleDataSet(env);
    DataSet<Tuple3<Integer, Long, String>> ds2 = CollectionDataSets.get3TupleDataSet(env);

    env.setParallelism(4);
    TestDistribution testDis = new TestDistribution();

    // Partition each side on the same fields that are later used as its coGroup keys.
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedFirst =
        DataSetUtils.partitionByRange(ds1, testDis, 0, 4);
    DataSet<Tuple3<Integer, Long, String>> rangedSecond =
        DataSetUtils.partitionByRange(ds2, testDis, 0, 1);

    DataSet<Tuple3<Integer, Long, String>> coGrouped = rangedFirst
        .coGroup(rangedSecond)
        .where(0, 4)
        .equalTo(0, 1)
        .with(new Tuple5Tuple3CoGroup());

    List<Tuple3<Integer, Long, String>> grouped = coGrouped.collect();

    String expected = "1,1,Hallo\n" +
        "2,2,Hallo Welt\n" +
        "3,2,Hallo Welt wie gehts?\n" +
        "3,2,ABC\n" +
        "5,3,HIJ\n" +
        "5,3,IJK\n";

    compareResultAsTuples(grouped, expected);
}
Example #15
Source File: CoGroupITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Tests coGroup on tuples with multiple key field positions, where both inputs are
 * range-partitioned with the same custom distribution.
 */
@Test
public void testCoGroupWithRangePartitioning() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds1 = CollectionDataSets.get5TupleDataSet(env);
    DataSet<Tuple3<Integer, Long, String>> ds2 = CollectionDataSets.get3TupleDataSet(env);

    env.setParallelism(4);
    TestDistribution testDis = new TestDistribution();

    // Partition each side on the same fields that are later used as its coGroup keys.
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedFirst =
        DataSetUtils.partitionByRange(ds1, testDis, 0, 4);
    DataSet<Tuple3<Integer, Long, String>> rangedSecond =
        DataSetUtils.partitionByRange(ds2, testDis, 0, 1);

    DataSet<Tuple3<Integer, Long, String>> coGrouped = rangedFirst
        .coGroup(rangedSecond)
        .where(0, 4)
        .equalTo(0, 1)
        .with(new Tuple5Tuple3CoGroup());

    List<Tuple3<Integer, Long, String>> grouped = coGrouped.collect();

    String expected = "1,1,Hallo\n" +
        "2,2,Hallo Welt\n" +
        "3,2,Hallo Welt wie gehts?\n" +
        "3,2,ABC\n" +
        "5,3,HIJ\n" +
        "5,3,IJK\n";

    compareResultAsTuples(grouped, expected);
}
Example #16
Source File: SampleITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Samples the source data set with {@code DataSetUtils.sampleWithSize}, asserts that exactly
 * {@code numSamples} elements come back, and passes them with the source strings to
 * {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sampleWithSize}
 * @param numSamples      requested sample size and expected result size
 * @param seed            forwarded to {@code DataSetUtils.sampleWithSize}
 */
private void verifySamplerWithFixedSize(boolean withReplacement, int numSamples, long seed) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

    DataSet<String> sampledSet = DataSetUtils.sampleWithSize(source, withReplacement, numSamples, seed);
    List<String> sampledElements = sampledSet.collect();

    assertEquals(numSamples, sampledElements.size());
    containsResultAsText(sampledElements, getSourceStrings());
}
Example #17
Source File: CustomDistributionITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Expects an {@link IllegalArgumentException} when {@code DataSetUtils.partitionByRange}
 * is called with three key fields and a {@code TestDataDist2} distribution.
 */
@Test(expected = IllegalArgumentException.class)
public void testPartitionMoreThanDistribution() throws Exception {
    final TestDataDist2 distribution = new TestDataDist2();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

    // This call is expected to throw; its result is deliberately unused.
    DataSetUtils.partitionByRange(tuples, distribution, 0, 1, 2);
}
Example #18
Source File: DataSetUtilsITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Checks that {@code DataSetUtils.countElementsPerPartition} emits one record per
 * parallel partition and that the per-partition counts add up to the input size.
 */
@Test
public void testCountElementsPerPartition() throws Exception {
    final long expectedSize = 100L;
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1);

    DataSet<Tuple2<Integer, Long>> countsPerPartition = DataSetUtils.countElementsPerPartition(numbers);

    // One record per partition.
    int parallelism = env.getParallelism();
    long recordCount = countsPerPartition.count();
    Assert.assertEquals(parallelism, recordCount);

    // Field 1 holds the element count; summed over all records it must cover the whole input.
    Tuple2<Integer, Long> summed = countsPerPartition.sum(1).collect().get(0);
    Assert.assertEquals(expectedSize, summed.f1.longValue());
}
Example #19
Source File: DataSetUtilsITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Checks that {@code DataSetUtils.checksumHashCode} reports a count of 15 and a
 * checksum of 55 for {@code CollectionDataSets.getIntegerDataSet}.
 */
@Test
public void testIntegerDataSetChecksumHashCode() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Integer> ds = CollectionDataSets.getIntegerDataSet(env);

    Utils.ChecksumHashCode checksum = DataSetUtils.checksumHashCode(ds);
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.assertEquals(15, checksum.getCount());
    Assert.assertEquals(55, checksum.getChecksum());
}
Example #20
Source File: SampleITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Samples the source data set with {@code DataSetUtils.sample} and passes the collected
 * elements, together with the source strings, to {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sample}
 * @param fraction        forwarded to {@code DataSetUtils.sample}
 * @param seed            forwarded to {@code DataSetUtils.sample}
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

    MapPartitionOperator<String, String> sampledSet =
        DataSetUtils.sample(source, withReplacement, fraction, seed);
    List<String> sampledElements = sampledSet.collect();

    containsResultAsText(sampledElements, getSourceStrings());
}
Example #21
Source File: JoinITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Tests a join on tuples with multiple key field positions, where both inputs are
 * range-partitioned with the same custom distribution.
 */
@Test
public void testJoinWithRangePartitioning() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Long, String>> ds1 = CollectionDataSets.get3TupleDataSet(env);
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds2 = CollectionDataSets.get5TupleDataSet(env);

    env.setParallelism(4);
    TestDistribution testDis = new TestDistribution();

    // Partition each side on the same fields that are later used as its join keys.
    DataSet<Tuple3<Integer, Long, String>> rangedLeft =
        DataSetUtils.partitionByRange(ds1, testDis, 0, 1);
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedRight =
        DataSetUtils.partitionByRange(ds2, testDis, 0, 4);

    DataSet<Tuple2<String, String>> joinDs = rangedLeft
        .join(rangedRight)
        .where(0, 1)
        .equalTo(0, 4)
        .with(new T3T5FlatJoin());

    List<Tuple2<String, String>> joined = joinDs.collect();

    String expected = "Hi,Hallo\n" +
        "Hello,Hallo Welt\n" +
        "Hello world,Hallo Welt wie gehts?\n" +
        "Hello world,ABC\n" +
        "I am fine.,HIJ\n" +
        "I am fine.,IJK\n";

    compareResultAsTuples(joined, expected);
}
Example #22
Source File: CoGroupITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Tests coGroup on tuples with multiple key field positions, where both inputs are
 * range-partitioned with the same custom distribution.
 */
@Test
public void testCoGroupWithRangePartitioning() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple5<Integer, Long, Integer, String, Long>> ds1 = CollectionDataSets.get5TupleDataSet(env);
    DataSet<Tuple3<Integer, Long, String>> ds2 = CollectionDataSets.get3TupleDataSet(env);

    env.setParallelism(4);
    TestDistribution testDis = new TestDistribution();

    // Partition each side on the same fields that are later used as its coGroup keys.
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> rangedFirst =
        DataSetUtils.partitionByRange(ds1, testDis, 0, 4);
    DataSet<Tuple3<Integer, Long, String>> rangedSecond =
        DataSetUtils.partitionByRange(ds2, testDis, 0, 1);

    DataSet<Tuple3<Integer, Long, String>> coGrouped = rangedFirst
        .coGroup(rangedSecond)
        .where(0, 4)
        .equalTo(0, 1)
        .with(new Tuple5Tuple3CoGroup());

    List<Tuple3<Integer, Long, String>> grouped = coGrouped.collect();

    String expected = "1,1,Hallo\n" +
        "2,2,Hallo Welt\n" +
        "3,2,Hallo Welt wie gehts?\n" +
        "3,2,ABC\n" +
        "5,3,HIJ\n" +
        "5,3,IJK\n";

    compareResultAsTuples(grouped, expected);
}
Example #23
Source File: DataSetUtilsITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Checks that {@code DataSetUtils.checksumHashCode} reports a count of 15 and a
 * checksum of 55 for {@code CollectionDataSets.getIntegerDataSet}.
 */
@Test
public void testIntegerDataSetChecksumHashCode() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Integer> ds = CollectionDataSets.getIntegerDataSet(env);

    Utils.ChecksumHashCode checksum = DataSetUtils.checksumHashCode(ds);
    // Expected value goes first so a failure message labels the two values correctly.
    Assert.assertEquals(15, checksum.getCount());
    Assert.assertEquals(55, checksum.getChecksum());
}
Example #24
Source File: DataSetUtilsITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Checks that {@code DataSetUtils.countElementsPerPartition} emits one record per
 * parallel partition and that the per-partition counts add up to the input size.
 */
@Test
public void testCountElementsPerPartition() throws Exception {
    final long expectedSize = 100L;
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1);

    DataSet<Tuple2<Integer, Long>> countsPerPartition = DataSetUtils.countElementsPerPartition(numbers);

    // One record per partition.
    int parallelism = env.getParallelism();
    long recordCount = countsPerPartition.count();
    Assert.assertEquals(parallelism, recordCount);

    // Field 1 holds the element count; summed over all records it must cover the whole input.
    Tuple2<Integer, Long> summed = countsPerPartition.sum(1).collect().get(0);
    Assert.assertEquals(expectedSize, summed.f1.longValue());
}
Example #25
Source File: CustomDistributionITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Expects an {@link IllegalArgumentException} when {@code DataSetUtils.partitionByRange}
 * is called with three key fields and a {@code TestDataDist2} distribution.
 */
@Test(expected = IllegalArgumentException.class)
public void testPartitionMoreThanDistribution() throws Exception {
    final TestDataDist2 distribution = new TestDataDist2();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

    // This call is expected to throw; its result is deliberately unused.
    DataSetUtils.partitionByRange(tuples, distribution, 0, 1, 2);
}
Example #26
Source File: SampleITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Samples the source data set with {@code DataSetUtils.sampleWithSize}, asserts that exactly
 * {@code numSamples} elements come back, and passes them with the source strings to
 * {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sampleWithSize}
 * @param numSamples      requested sample size and expected result size
 * @param seed            forwarded to {@code DataSetUtils.sampleWithSize}
 */
private void verifySamplerWithFixedSize(boolean withReplacement, int numSamples, long seed) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

    DataSet<String> sampledSet = DataSetUtils.sampleWithSize(source, withReplacement, numSamples, seed);
    List<String> sampledElements = sampledSet.collect();

    assertEquals(numSamples, sampledElements.size());
    containsResultAsText(sampledElements, getSourceStrings());
}
Example #27
Source File: SampleITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Samples the source data set with {@code DataSetUtils.sample} and passes the collected
 * elements, together with the source strings, to {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sample}
 * @param fraction        forwarded to {@code DataSetUtils.sample}
 * @param seed            forwarded to {@code DataSetUtils.sample}
 */
private void verifySamplerWithFraction(boolean withReplacement, double fraction, long seed) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

    MapPartitionOperator<String, String> sampledSet =
        DataSetUtils.sample(source, withReplacement, fraction, seed);
    List<String> sampledElements = sampledSet.collect();

    containsResultAsText(sampledElements, getSourceStrings());
}
Example #28
Source File: SampleITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Samples the source data set with {@code DataSetUtils.sampleWithSize}, asserts that exactly
 * {@code numSamples} elements come back, and passes them with the source strings to
 * {@code containsResultAsText}.
 *
 * @param withReplacement forwarded to {@code DataSetUtils.sampleWithSize}
 * @param numSamples      requested sample size and expected result size
 * @param seed            forwarded to {@code DataSetUtils.sampleWithSize}
 */
private void verifySamplerWithFixedSize(boolean withReplacement, int numSamples, long seed) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    FlatMapOperator<Tuple3<Integer, Long, String>, String> source = getSourceDataSet(env);

    DataSet<String> sampledSet = DataSetUtils.sampleWithSize(source, withReplacement, numSamples, seed);
    List<String> sampledElements = sampledSet.collect();

    assertEquals(numSamples, sampledElements.size());
    containsResultAsText(sampledElements, getSourceStrings());
}
Example #29
Source File: CustomDistributionITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Expects an {@link IllegalArgumentException} when {@code DataSetUtils.partitionByRange}
 * is called with three key fields and a {@code TestDataDist2} distribution.
 */
@Test(expected = IllegalArgumentException.class)
public void testPartitionMoreThanDistribution() throws Exception {
    final TestDataDist2 distribution = new TestDataDist2();

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

    // This call is expected to throw; its result is deliberately unused.
    DataSetUtils.partitionByRange(tuples, distribution, 0, 1, 2);
}
Example #30
Source File: DataSetUtilsITCase.java From flink with Apache License 2.0 | 5 votes |
/**
 * Checks that {@code DataSetUtils.countElementsPerPartition} emits one record per
 * parallel partition and that the per-partition counts add up to the input size.
 */
@Test
public void testCountElementsPerPartition() throws Exception {
    final long expectedSize = 100L;
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Long> numbers = env.generateSequence(0, expectedSize - 1);

    DataSet<Tuple2<Integer, Long>> countsPerPartition = DataSetUtils.countElementsPerPartition(numbers);

    // One record per partition.
    int parallelism = env.getParallelism();
    long recordCount = countsPerPartition.count();
    Assert.assertEquals(parallelism, recordCount);

    // Field 1 holds the element count; summed over all records it must cover the whole input.
    Tuple2<Integer, Long> summed = countsPerPartition.sum(1).collect().get(0);
    Assert.assertEquals(expectedSize, summed.f1.longValue());
}