Java Code Examples for org.apache.flink.api.java.DataSet#iterate()
The following examples show how to use org.apache.flink.api.java.DataSet#iterate().
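All of these programs follow the same bulk-iteration pattern: iterate(n) opens a loop over a DataSet and returns an IterativeDataSet, and closeWith(...) defines the data set that is fed back as the next partial solution, running for at most n supersteps (or until an optional termination criterion is met). The following self-contained sketch illustrates the pattern in isolation; the Increment step function and the sequence input are placeholders chosen for this sketch, not taken from any of the examples below.

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.IterativeDataSet;

public class BulkIterationSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Long> initial = env.generateSequence(1, 10);

        // open a bulk iteration that runs for at most 5 supersteps
        IterativeDataSet<Long> iteration = initial.iterate(5);

        // the step function: whatever transformation should be applied per superstep
        DataSet<Long> step = iteration.map(new Increment());

        // close the loop: 'step' is fed back as the next partial solution
        DataSet<Long> result = iteration.closeWith(step);

        result.print();
    }

    // hypothetical step function used only for this sketch
    public static class Increment implements MapFunction<Long, Long> {
        @Override
        public Long map(Long value) {
            return value + 1;
        }
    }
}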
Example 1
Source File: BranchingPlansCompilerTest.java From flink with Apache License 2.0 | 6 votes |
@Test
public void testBranchAfterIteration() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSet<Long> sourceA = env.generateSequence(0, 1);

    IterativeDataSet<Long> loopHead = sourceA.iterate(10);
    DataSet<Long> loopTail = loopHead.map(new IdentityMapper<Long>()).name("Mapper");
    DataSet<Long> loopRes = loopHead.closeWith(loopTail);

    loopRes.output(new DiscardingOutputFormat<Long>());
    loopRes.map(new IdentityMapper<Long>())
            .output(new DiscardingOutputFormat<Long>());

    Plan plan = env.createProgramPlan();

    try {
        compileNoStats(plan);
    }
    catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Example 2
Source File: NestedIterationsTest.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
@Test
public void testBulkIterationInClosure() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Long> data1 = env.generateSequence(1, 100);
        DataSet<Long> data2 = env.generateSequence(1, 100);

        IterativeDataSet<Long> firstIteration = data1.iterate(100);
        DataSet<Long> firstResult = firstIteration.closeWith(firstIteration.map(new IdentityMapper<Long>()));

        IterativeDataSet<Long> mainIteration = data2.map(new IdentityMapper<Long>()).iterate(100);
        DataSet<Long> joined = mainIteration.join(firstResult)
                .where(new IdentityKeyExtractor<Long>()).equalTo(new IdentityKeyExtractor<Long>())
                .with(new DummyFlatJoinFunction<Long>());

        DataSet<Long> mainResult = mainIteration.closeWith(joined);

        mainResult.output(new DiscardingOutputFormat<Long>());

        Plan p = env.createProgramPlan();

        // optimizer should be able to translate this
        OptimizedPlan op = compileNoStats(p);

        // job graph generator should be able to translate this
        new JobGraphGenerator().compileJobGraph(op);
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Example 3
Source File: IterationIncompleteStaticPathConsumptionITCase.java From flink with Apache License 2.0 | 5 votes |
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // the test data is constructed such that the merge join zig zag
    // has an early out, leaving elements on the static path input unconsumed
    DataSet<Path> edges = env.fromElements(
            new Path(2, 1), new Path(4, 1), new Path(6, 3), new Path(8, 3),
            new Path(10, 1), new Path(12, 1), new Path(14, 3), new Path(16, 3),
            new Path(18, 1), new Path(20, 1));

    IterativeDataSet<Path> currentPaths = edges.iterate(10);

    DataSet<Path> newPaths = currentPaths
            .join(edges, JoinHint.REPARTITION_SORT_MERGE).where("to").equalTo("from")
                .with(new PathConnector())
            .union(currentPaths).distinct("from", "to");

    DataSet<Path> result = currentPaths.closeWith(newPaths);

    result.output(new DiscardingOutputFormat<Path>());

    env.execute();
}
Example 4
Source File: IterationsCompilerTest.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
public static DataSet<Tuple2<Long, Long>> doSimpleBulkIteration(
        DataSet<Tuple2<Long, Long>> vertices, DataSet<Tuple2<Long, Long>> edges) {

    // open a bulk iteration
    IterativeDataSet<Tuple2<Long, Long>> iteration = vertices.iterate(20);

    DataSet<Tuple2<Long, Long>> changes = iteration
            .join(edges).where(0).equalTo(0)
            .flatMap(new FlatMapJoin());

    // close the bulk iteration
    return iteration.closeWith(changes);
}
Example 5
Source File: AggregatorsITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
@Test
public void testAggregatorWithoutParameterForIterate() throws Exception {
    /*
     * Test aggregator without parameter for iterate
     */

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);

    DataSet<Integer> initialSolutionSet = CollectionDataSets.getIntegerDataSet(env);
    IterativeDataSet<Integer> iteration = initialSolutionSet.iterate(MAX_ITERATIONS);

    // register aggregator
    LongSumAggregator aggr = new LongSumAggregator();
    iteration.registerAggregator(NEGATIVE_ELEMENTS_AGGR, aggr);

    // register convergence criterion
    iteration.registerAggregationConvergenceCriterion(NEGATIVE_ELEMENTS_AGGR, aggr,
            new NegativeElementsConvergenceCriterion());

    DataSet<Integer> updatedDs = iteration.map(new SubtractOneMap());
    List<Integer> result = iteration.closeWith(updatedDs).collect();
    Collections.sort(result);

    List<Integer> expected = Arrays.asList(-3, -2, -2, -1, -1, -1, 0, 0, 0, 0, 1, 1, 1, 1, 1);

    assertEquals(expected, result);
}
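The NegativeElementsConvergenceCriterion referenced above is defined elsewhere in the test class and is not shown here. A minimal sketch of what such an aggregator-driven convergence criterion might look like, assuming the standard ConvergenceCriterion interface over LongValue; the threshold of 3 is illustrative and not necessarily the value used by the real test class:

import org.apache.flink.api.common.aggregators.ConvergenceCriterion;
import org.apache.flink.types.LongValue;

// Hypothetical sketch: stop iterating once the LongSumAggregator registered on
// the iteration has counted more than three negative elements in a superstep.
public class NegativeElementsConvergenceCriterion implements ConvergenceCriterion<LongValue> {

    @Override
    public boolean isConverged(int iteration, LongValue value) {
        // 'value' is the aggregated sum produced during the last superstep
        return value.getValue() > 3;
    }
}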
Example 6
Source File: IterationIncompleteDynamicPathConsumptionITCase.java From flink with Apache License 2.0 | 5 votes |
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // the test data is constructed such that the merge join zig zag
    // has an early out, leaving elements on the dynamic path input unconsumed
    DataSet<Path> edges = env.fromElements(
            new Path(1, 2), new Path(1, 4), new Path(3, 6), new Path(3, 8),
            new Path(1, 10), new Path(1, 12), new Path(3, 14), new Path(3, 16),
            new Path(1, 18), new Path(1, 20));

    IterativeDataSet<Path> currentPaths = edges.iterate(10);

    DataSet<Path> newPaths = currentPaths
            .join(edges, JoinHint.REPARTITION_SORT_MERGE).where("to").equalTo("from")
                .with(new PathConnector())
            .union(currentPaths).distinct("from", "to");

    DataSet<Path> result = currentPaths.closeWith(newPaths);

    result.output(new DiscardingOutputFormat<Path>());

    env.execute();
}
Example 7
Source File: IterationIncompleteDynamicPathConsumptionITCase.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // the test data is constructed such that the merge join zig zag
    // has an early out, leaving elements on the dynamic path input unconsumed
    DataSet<Path> edges = env.fromElements(
            new Path(1, 2), new Path(1, 4), new Path(3, 6), new Path(3, 8),
            new Path(1, 10), new Path(1, 12), new Path(3, 14), new Path(3, 16),
            new Path(1, 18), new Path(1, 20));

    IterativeDataSet<Path> currentPaths = edges.iterate(10);

    DataSet<Path> newPaths = currentPaths
            .join(edges, JoinHint.REPARTITION_SORT_MERGE).where("to").equalTo("from")
                .with(new PathConnector())
            .union(currentPaths).distinct("from", "to");

    DataSet<Path> result = currentPaths.closeWith(newPaths);

    result.output(new DiscardingOutputFormat<Path>());

    env.execute();
}
Example 8
Source File: PipelineBreakerTest.java From flink with Apache License 2.0 | 4 votes |
/**
 * <pre>
 *                                +----------- ITERATION ---------+
 *                                |                               |
 *                               +--+                         +----+
 *  (source 1) ----------------->|PS| ------------ +      +-->|next|---> (sink)
 *                               +--+       (BC)   |      |   +----+
 *                                |                 V      |
 *  (source 2) --> (map) --+------|-----------> (MAPPER) ---+      |
 *                          |     |                 ^              |
 *                          |     |                 |    (BC)      |
 *                          |     +-----------------|--------------+
 *                          |                       |
 *                          +--(map) --> (reduce) --+
 * </pre>
 */
@Test
public void testPipelineBreakerBroadcastedPartialSolution() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.getConfig().setExecutionMode(ExecutionMode.PIPELINED);
        env.setParallelism(64);

        DataSet<Long> initialSource = env.generateSequence(1, 10);
        IterativeDataSet<Long> iteration = initialSource.iterate(100);

        DataSet<Long> sourceWithMapper = env.generateSequence(1, 10).map(new IdentityMapper<Long>());

        DataSet<Long> bcInput1 = sourceWithMapper
                .map(new IdentityMapper<Long>())
                .reduce(new SelectOneReducer<Long>());

        DataSet<Long> result = sourceWithMapper
                .map(new IdentityMapper<Long>())
                        .withBroadcastSet(iteration, "bc2")
                        .withBroadcastSet(bcInput1, "bc1");

        iteration.closeWith(result).output(new DiscardingOutputFormat<Long>());

        Plan p = env.createProgramPlan();
        OptimizedPlan op = compileNoStats(p);

        SinkPlanNode sink = op.getDataSinks().iterator().next();
        BulkIterationPlanNode iterationPlanNode = (BulkIterationPlanNode) sink.getInput().getSource();
        SingleInputPlanNode mapper = (SingleInputPlanNode) iterationPlanNode.getRootOfStepFunction();

        assertEquals(TempMode.CACHED, mapper.getInput().getTempMode());
        assertEquals(DataExchangeMode.BATCH, mapper.getInput().getDataExchangeMode());
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Example 9
Source File: BulkIterationWithAllReducerITCase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Integer> data = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);

    IterativeDataSet<Integer> iteration = data.iterate(10);

    DataSet<Integer> result = data.reduceGroup(new PickOneAllReduce()).withBroadcastSet(iteration, "bc");

    final List<Integer> resultList = new ArrayList<Integer>();
    iteration.closeWith(result).output(new LocalCollectionOutputFormat<Integer>(resultList));

    env.execute();

    Assert.assertEquals(8, resultList.get(0).intValue());
}
Example 10
Source File: IterationsCompilerTest.java From flink with Apache License 2.0 | 4 votes |
@Test
public void testResetPartialSolution() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Long> width = env.generateSequence(1, 10);
        DataSet<Long> update = env.generateSequence(1, 10);
        DataSet<Long> lastGradient = env.generateSequence(1, 10);

        DataSet<Long> init = width.union(update).union(lastGradient);

        IterativeDataSet<Long> iteration = init.iterate(10);

        width = iteration.filter(new IdFilter<Long>());
        update = iteration.filter(new IdFilter<Long>());
        lastGradient = iteration.filter(new IdFilter<Long>());

        DataSet<Long> gradient = width.map(new IdentityMapper<Long>());
        DataSet<Long> term = gradient.join(lastGradient)
                .where(new IdentityKeyExtractor<Long>())
                .equalTo(new IdentityKeyExtractor<Long>())
                .with(new JoinFunction<Long, Long, Long>() {
                    public Long join(Long first, Long second) {
                        return null;
                    }
                });

        update = update.map(new RichMapFunction<Long, Long>() {
            public Long map(Long value) {
                return null;
            }
        }).withBroadcastSet(term, "some-name");

        DataSet<Long> result = iteration.closeWith(width.union(update).union(lastGradient));

        result.output(new DiscardingOutputFormat<Long>());

        Plan p = env.createProgramPlan();
        OptimizedPlan op = compileNoStats(p);

        new JobGraphGenerator().compileJobGraph(op);
    }
    catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Example 11
Source File: BroadcastVarInitializationITCase.java From flink with Apache License 2.0 | 4 votes |
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(4);

    DataSet<Integer> data = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);

    IterativeDataSet<Integer> iteration = data.iterate(10);

    DataSet<Integer> result = data.reduceGroup(new PickOneAllReduce()).withBroadcastSet(iteration, "bc");

    final List<Integer> resultList = new ArrayList<Integer>();
    iteration.closeWith(result).output(new LocalCollectionOutputFormat<Integer>(resultList));

    env.execute();

    Assert.assertEquals(8, resultList.get(0).intValue());
}
Example 12
Source File: KMeansArbitraryDimension.java From flink-perf with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    if (!parseParameters(args)) {
        return;
    }

    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // get input data
    DataSet<Point> points = env
            .readTextFile(pointsPath)
            .map(new ConvertToPoint());

    DataSet<Centroid> centroids = env
            .readTextFile(centersPath)
            .map(new ConvertToCentroid());

    // set number of bulk iterations for KMeans algorithm
    IterativeDataSet<Centroid> loop = centroids.iterate(numIterations);

    DataSet<Centroid> newCentroids = points
            // compute closest centroid for each point
            .map(new SelectNearestCenter()).withBroadcastSet(loop, "centroids")
            // count and sum point coordinates for each centroid
            .map(new CountAppender())
            .groupBy(0).reduce(new CentroidAccumulator())
            // compute new centroids from point counts and coordinate sums
            .map(new CentroidAverager());

    // feed new centroids back into next iteration
    DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);

    DataSet<Tuple2<Integer, Point>> clusteredPoints = points
            // assign points to final clusters
            .map(new SelectNearestCenter()).withBroadcastSet(finalCentroids, "centroids");

    // emit result
    //clusteredPoints.writeAsCsv(outputPath, "\n", " ", FileSystem.WriteMode.OVERWRITE);
    clusteredPoints.writeAsText(outputPath, FileSystem.WriteMode.OVERWRITE);

    // execute program
    env.setParallelism(dop);
    env.execute("KMeans Multi-Dimension");
}
Example 13
Source File: KMeans.java From flink with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);

    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().setGlobalJobParameters(params); // make parameters available in the web interface

    // get input data:
    // read the points and centroids from the provided paths or fall back to default data
    DataSet<Point> points = getPointDataSet(params, env);
    DataSet<Centroid> centroids = getCentroidDataSet(params, env);

    // set number of bulk iterations for KMeans algorithm
    IterativeDataSet<Centroid> loop = centroids.iterate(params.getInt("iterations", 10));

    DataSet<Centroid> newCentroids = points
            // compute closest centroid for each point
            .map(new SelectNearestCenter()).withBroadcastSet(loop, "centroids")
            // count and sum point coordinates for each centroid
            .map(new CountAppender())
            .groupBy(0).reduce(new CentroidAccumulator())
            // compute new centroids from point counts and coordinate sums
            .map(new CentroidAverager());

    // feed new centroids back into next iteration
    DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);

    DataSet<Tuple2<Integer, Point>> clusteredPoints = points
            // assign points to final clusters
            .map(new SelectNearestCenter()).withBroadcastSet(finalCentroids, "centroids");

    // emit result
    if (params.has("output")) {
        clusteredPoints.writeAsCsv(params.get("output"), "\n", " ");

        // since file sinks are lazy, we trigger the execution explicitly
        env.execute("KMeans Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        clusteredPoints.print();
    }
}
Example 14
Source File: PageRank.java From flink with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    ParameterTool params = ParameterTool.fromArgs(args);

    final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
    final int maxIterations = params.getInt("iterations", 10);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make the parameters available to the web ui
    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Long> pagesInput = getPagesDataSet(env, params);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);

    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput
            .map(new RankAssigner((1.0d / numPages)));

    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
            linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);

    DataSet<Tuple2<Long, Double>> newRanks = iteration
            // join pages with outgoing edges and distribute rank
            .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
            // collect and sum ranks
            .groupBy(0).aggregate(SUM, 1)
            // apply dampening factor
            .map(new Dampener(DAMPENING_FACTOR, numPages));

    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
            newRanks,
            newRanks.join(iteration).where(0).equalTo(0)
            // termination condition
            .filter(new EpsilonFilter()));

    // emit result
    if (params.has("output")) {
        finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
        // execute program
        env.execute("Basic Page Rank Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        finalPageRanks.print();
    }
}
Example 15
Source File: LinearRegression.java From flink with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    final ParameterTool params = ParameterTool.fromArgs(args);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    final int iterations = params.getInt("iterations", 10);

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);

    // get input x data from elements
    DataSet<Data> data;
    if (params.has("input")) {
        // read data from CSV file
        data = env.readCsvFile(params.get("input"))
                .fieldDelimiter(" ")
                .includeFields(true, true)
                .pojoType(Data.class);
    } else {
        System.out.println("Executing LinearRegression example with default input data set.");
        System.out.println("Use --input to specify file input.");
        data = LinearRegressionData.getDefaultDataDataSet(env);
    }

    // get the parameters from elements
    DataSet<Params> parameters = LinearRegressionData.getDefaultParamsDataSet(env);

    // set number of bulk iterations for SGD linear Regression
    IterativeDataSet<Params> loop = parameters.iterate(iterations);

    DataSet<Params> newParameters = data
            // compute a single step using every sample
            .map(new SubUpdate()).withBroadcastSet(loop, "parameters")
            // sum up all the steps
            .reduce(new UpdateAccumulator())
            // average the steps and update all parameters
            .map(new Update());

    // feed new parameters back into next iteration
    DataSet<Params> result = loop.closeWith(newParameters);

    // emit result
    if (params.has("output")) {
        result.writeAsText(params.get("output"));
        // execute program
        env.execute("Linear Regression example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
Example 16
Source File: CachedMatchStrategyCompilerTest.java From flink with Apache License 2.0 | 3 votes |
private Plan getTestPlanRightStatic(String strategy) {

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(DEFAULT_PARALLELISM);

    DataSet<Tuple3<Long, Long, Long>> bigInput = env.readCsvFile("file://bigFile")
            .types(Long.class, Long.class, Long.class).name("bigFile");

    DataSet<Tuple3<Long, Long, Long>> smallInput = env.readCsvFile("file://smallFile")
            .types(Long.class, Long.class, Long.class).name("smallFile");

    IterativeDataSet<Tuple3<Long, Long, Long>> iteration = bigInput.iterate(10);

    Configuration joinStrategy = new Configuration();
    joinStrategy.setString(Optimizer.HINT_SHIP_STRATEGY, Optimizer.HINT_SHIP_STRATEGY_REPARTITION_HASH);

    if (!strategy.equals("")) {
        joinStrategy.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy);
    }

    DataSet<Tuple3<Long, Long, Long>> inner = iteration.join(smallInput).where(0).equalTo(0)
            .with(new DummyJoiner()).name("DummyJoiner").withParameters(joinStrategy);

    DataSet<Tuple3<Long, Long, Long>> output = iteration.closeWith(inner);

    output.output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());

    return env.createProgramPlan();
}
Example 17
Source File: IterationWithUnionITCase.java From flink with Apache License 2.0 | 3 votes |
@Override
protected void testProgram() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<Integer, CoordVector>> initialInput =
            env.readFile(new PointInFormat(), this.dataPath).setParallelism(1);

    IterativeDataSet<Tuple2<Integer, CoordVector>> iteration = initialInput.iterate(2);

    DataSet<Tuple2<Integer, CoordVector>> result = iteration.union(iteration).map(new IdentityMapper());

    iteration.closeWith(result).writeAsFormattedText(this.resultPath, new PointFormatter());

    env.execute();
}
Example 18
Source File: BranchingPlansCompilerTest.java From flink with Apache License 2.0 | 3 votes |
@Test
public void testBCVariableClosure() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.readTextFile(IN_FILE).name("source1");

    DataSet<String> reduced = input
            .map(new IdentityMapper<String>())
            .reduceGroup(new Top1GroupReducer<String>());

    DataSet<String> initialSolution = input.map(new IdentityMapper<String>()).withBroadcastSet(reduced, "bc");

    IterativeDataSet<String> iteration = initialSolution.iterate(100);

    iteration.closeWith(iteration.map(new IdentityMapper<String>()).withBroadcastSet(reduced, "red"))
            .output(new DiscardingOutputFormat<String>());

    Plan plan = env.createProgramPlan();

    try {
        compileNoStats(plan);
    }
    catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Example 19
Source File: BulkIterationTranslationTest.java From flink with Apache License 2.0 | 2 votes |
@Test
public void testCorrectTranslation() {
    final String jobName = "Test JobName";

    final int numIterations = 13;

    final int defaultParallelism = 133;
    final int iterationParallelism = 77;

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // ------------ construct the test program ------------------

    {
        env.setParallelism(defaultParallelism);

        @SuppressWarnings("unchecked")
        DataSet<Tuple3<Double, Long, String>> initialDataSet = env.fromElements(new Tuple3<>(3.44, 5L, "abc"));

        IterativeDataSet<Tuple3<Double, Long, String>> bulkIteration = initialDataSet.iterate(numIterations);
        bulkIteration.setParallelism(iterationParallelism);

        // test that multiple iteration consumers are supported
        DataSet<Tuple3<Double, Long, String>> identity = bulkIteration
                .map(new IdentityMapper<Tuple3<Double, Long, String>>());

        DataSet<Tuple3<Double, Long, String>> result = bulkIteration.closeWith(identity);

        result.output(new DiscardingOutputFormat<Tuple3<Double, Long, String>>());
        result.writeAsText("/dev/null");
    }

    Plan p = env.createProgramPlan(jobName);

    // ------------- validate the plan ----------------

    BulkIterationBase<?> iteration = (BulkIterationBase<?>) p.getDataSinks().iterator().next().getInput();

    assertEquals(jobName, p.getJobName());
    assertEquals(defaultParallelism, p.getDefaultParallelism());
    assertEquals(iterationParallelism, iteration.getParallelism());
}
Example 20
Source File: PageRankStephan.java From flink-perf with Apache License 2.0 | 2 votes |
public static void main(String[] args) throws Exception {

    String adjacencyPath = args[0]; //"/data/demodata/pagerank/adjacency/adjacency.csv";
    String outpath = args[1]; //"/home/cicero/Desktop/out.txt";
    int numIterations = Integer.valueOf(args[2]);
    long numVertices = Integer.valueOf(args[3]);

    final double threshold = 0.005 / numVertices;

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<Long, long[]>> adjacency = env.readTextFile(adjacencyPath).map(new AdjacencyBuilder());
    DataSet<Tuple2<Long, long[]>> adjacency2 = env.readTextFile(adjacencyPath).map(new AdjacencyBuilder());

    DataSet<Tuple2<Long, Double>> initialRanks = adjacency.map(new VertexInitializer(1.0 / numVertices));

    IterativeDataSet<Tuple2<Long, Double>> iteration = initialRanks.iterate(numIterations);

    DataSet<Tuple2<Long, Double>> newRanks = iteration
            .join(adjacency2, JoinHint.REPARTITION_HASH_SECOND).where(0).equalTo(0).with(new RankDistributor(0.85, numVertices))
            .groupBy(0)
            .reduceGroup(new Adder());

    DataSet<Integer> tc = iteration.join(newRanks).where(0).equalTo(0)
            .with(new FlatJoinFunction<Tuple2<Long, Double>, Tuple2<Long, Double>, Integer>() {
                @Override
                public void join(Tuple2<Long, Double> longDoubleTuple2, Tuple2<Long, Double> longDoubleTuple22, Collector<Integer> collector) throws Exception {
                    double delta = Math.abs(longDoubleTuple2.f1 - longDoubleTuple22.f1);
                    if (delta > threshold) {
                        collector.collect(1);
                    }
                }
            });

    iteration.closeWith(newRanks, tc).writeAsCsv(outpath + "_fastbulk", WriteMode.OVERWRITE);

    // System.out.println(env.getExecutionPlan());

    env.execute("Page Rank Optimized");
}