Java Code Examples for org.apache.flink.api.java.DataSet#print()
The following examples show how to use org.apache.flink.api.java.DataSet#print().
Each example is attributed to its original project and source file, together with its license, in the header above the code.
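Before the collected examples, here is a minimal, self-contained sketch of the typical usage pattern: build a DataSet, apply a transformation, and call print() to collect the result to the client and write it to standard output. This sketch is not taken from any of the projects below; the class name PrintSketch, the input strings, and the word-length mapping are illustrative assumptions. Note that in recent versions of the batch DataSet API, print() itself triggers job execution, so programs that end in print() do not need a separate env.execute() call.

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;

public class PrintSketch {

    public static void main(String[] args) throws Exception {
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // build a small in-memory DataSet (illustrative input)
        DataSet<String> words = env.fromElements("flink", "dataset", "print");

        // map each word to a (word, length) tuple
        DataSet<Tuple2<String, Integer>> lengths = words
                .map(new MapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> map(String word) {
                        return new Tuple2<>(word, word.length());
                    }
                });

        // print() collects the result to the client and writes it to stdout;
        // in recent DataSet API versions it also triggers execution, so no
        // separate env.execute() call is needed here.
        lengths.print();
    }
}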
Example 1
Source File: WordCountWithInnerClass.java From flink with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception {

    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // get input data
    DataSet<String> text = StaticData.getDefaultTextLineDataSet(env);

    DataSet<Tuple2<String, Integer>> counts =
            // split up the lines in pairs (2-tuples) containing: (word,1)
            text.flatMap(new Tokenizer())
            // group by the tuple field "0" and sum up tuple field "1"
            .groupBy(0)
            .sum(1);

    // emit result
    counts.print();

    // execute program
    env.execute("WordCount Example");
}
Example 2
Source File: DataFlinkLoaderTest.java From toolbox with Apache License 2.0 | 6 votes |
public static void test1() throws Exception {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFile(env,
            "../datasets/simulated/test_not_modify/SmallDataSet.arff", false);

    DataSet<DataInstance> data = dataFlink.getDataSet();
    data.print();

    List<DataInstance> instanceList = data.collect();
    assertEquals(16, instanceList.size());

    List<String> names = Arrays.asList("A", "B", "C", "D", "E", "G");
    List<Integer> states = Arrays.asList(2, 3, 2, 2, 2, -1);
    List<Attribute> atts = dataFlink.getAttributes().getListOfNonSpecialAttributes();

    for (int i = 0; i < names.size(); i++) {
        if (Main.VERBOSE) System.out.println(names.get(i));
        assertEquals(atts.get(i).getName(), names.get(i));
        assertEquals(atts.get(i).getNumberOfStates(), states.get(i).intValue());
    }
}
Example 3
Source File: EmptyFieldsCountAccumulator.java From flink with Apache License 2.0 | 5 votes |
public static void main(final String[] args) throws Exception {

    final ParameterTool params = ParameterTool.fromArgs(args);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);

    // get the data set
    final DataSet<StringTriple> file = getDataSet(env, params);

    // filter lines with empty fields
    final DataSet<StringTriple> filteredLines = file.filter(new EmptyFieldFilter());

    // Here, we could do further processing with the filtered lines...
    JobExecutionResult result;

    // output the filtered lines
    if (params.has("output")) {
        filteredLines.writeAsCsv(params.get("output"));

        // execute program
        result = env.execute("Accumulator example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        filteredLines.print();
        result = env.getLastJobExecutionResult();
    }

    // get the accumulator result via its registration key
    final List<Integer> emptyFields = result.getAccumulatorResult(EMPTY_FIELD_ACCUMULATOR);
    System.out.format("Number of detected empty fields per column: %s\n", emptyFields);
}
Example 4
Source File: FilterWithIndirection.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.fromElements("Please filter", "the words", "but not this");

    DataSet<String> output = input.filter(UtilFunctionWrapper.UtilFunction.getWordFilter());
    output.print();

    env.execute();
}
Example 5
Source File: TestParserMapFunctionAvroInline.java From logparser with Apache License 2.0 | 5 votes |
@Test
public void testInlineDefinitionAvro() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.fromElements(TestCase.getInputLine());

    DataSet<Click> filledTestRecords = input
        .map(new RichMapFunction<String, Click>() {
            private Parser<ClickSetter> parser;

            @Override
            public void open(org.apache.flink.configuration.Configuration parameters) {
                parser = new HttpdLoglineParser<>(ClickSetter.class, TestCase.getLogFormat())
                    .addDissector(new ScreenResolutionDissector())
                    .addTypeRemapping("request.firstline.uri.query.g", "HTTP.URI")
                    .addTypeRemapping("request.firstline.uri.query.r", "HTTP.URI")
                    .addTypeRemapping("request.firstline.uri.query.s", "SCREENRESOLUTION")
                    .addDissector(new GeoIPISPDissector(ISP_TEST_MMDB))
                    .addDissector(new GeoIPCityDissector(CITY_TEST_MMDB));
            }

            @Override
            public Click map(String line) throws Exception {
                return parser.parse(line).build();
            }
        }).name("Extract Elements from logline");

    filledTestRecords.print();

    List<Click> result = filledTestRecords.collect();
    assertEquals(1, result.size());
    assertEquals(ExpectedClick.create(), result.get(0));
}
Example 6
Source File: TestParserMapFunctionInline.java From logparser with Apache License 2.0 | 5 votes |
@Test
public void testInlineDefinition() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.fromElements(TestCase.getInputLine());

    DataSet<TestRecord> filledTestRecords = input
        .map(new RichMapFunction<String, TestRecord>() {
            private Parser<TestRecord> parser;

            @Override
            public void open(org.apache.flink.configuration.Configuration parameters) throws Exception {
                parser = TestCase.createTestParser();
            }

            @Override
            public TestRecord map(String line) throws Exception {
                return parser.parse(line);
            }
        }).name("Extract Elements from logline");

    filledTestRecords.print();

    List<TestRecord> result = filledTestRecords.collect();
    assertEquals(1, result.size());
    assertEquals(new TestRecord().setFullValid(), result.get(0));
}
Example 7
Source File: GSASingleSourceShortestPaths.java From flink with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {

    if (!parseParameters(args)) {
        return;
    }

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Edge<Long, Double>> edges = getEdgeDataSet(env);

    Graph<Long, Double, Double> graph = Graph.fromDataSet(edges, new InitVertices(srcVertexId), env);

    // Execute the GSA iteration
    Graph<Long, Double, Double> result = graph.runGatherSumApplyIteration(
            new CalculateDistances(), new ChooseMinDistance(), new UpdateDistance(), maxIterations);

    // Extract the vertices as the result
    DataSet<Vertex<Long, Double>> singleSourceShortestPaths = result.getVertices();

    // emit result
    if (fileOutput) {
        singleSourceShortestPaths.writeAsCsv(outputPath, "\n", ",");

        // since file sinks are lazy, we trigger the execution explicitly
        env.execute("GSA Single Source Shortest Paths");
    } else {
        singleSourceShortestPaths.print();
    }
}
Example 8
Source File: ParquetProtobufExample.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    //output
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<Void, Person>> data = generateDataSet(env);
    writeProtobuf(data, "newpath");
    data.print();
    env.execute("Parquet output");

    //input
    final ExecutionEnvironment env2 = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<Void, Person.Builder>> input = readProtobuf(env2, "newpath");
    input.map(new TupleToProto()).print();
    env2.execute("Parquet input");
}
Example 9
Source File: EuclideanGraphWeighing.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    if (!parseParameters(args)) {
        return;
    }

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Vertex<Long, Point>> vertices = getVerticesDataSet(env);

    DataSet<Edge<Long, Double>> edges = getEdgesDataSet(env);

    Graph<Long, Point, Double> graph = Graph.fromDataSet(vertices, edges, env);

    // the edge value will be the Euclidean distance between its src and trg vertex
    DataSet<Tuple3<Long, Long, Double>> edgesWithEuclideanWeight = graph.getTriplets()
            .map(new MapFunction<Triplet<Long, Point, Double>, Tuple3<Long, Long, Double>>() {

                @Override
                public Tuple3<Long, Long, Double> map(Triplet<Long, Point, Double> triplet) throws Exception {

                    Vertex<Long, Point> srcVertex = triplet.getSrcVertex();
                    Vertex<Long, Point> trgVertex = triplet.getTrgVertex();

                    return new Tuple3<>(srcVertex.getId(), trgVertex.getId(),
                            srcVertex.getValue().euclideanDistance(trgVertex.getValue()));
                }
            });

    Graph<Long, Point, Double> resultedGraph = graph.joinWithEdges(edgesWithEuclideanWeight,
            new EdgeJoinFunction<Double, Double>() {

                public Double edgeJoin(Double edgeValue, Double inputValue) {
                    return inputValue;
                }
            });

    // retrieve the edges from the final result
    DataSet<Edge<Long, Double>> result = resultedGraph.getEdges();

    // emit result
    if (fileOutput) {
        result.writeAsCsv(outputPath, "\n", ",");

        // since file sinks are lazy, we trigger the execution explicitly
        env.execute("Euclidean Graph Weighing Example");
    } else {
        result.print();
    }
}
Example 10
Source File: DataSetConversionUtilTest.java From flink with Apache License 2.0 | 4 votes |
@Test
public void testE2E() throws Exception {
    ExecutionEnvironment env = MLEnvironmentFactory.getDefault().getExecutionEnvironment();

    DataSet<Row> input = env.fromElements(Row.of("a"));

    Table table1 = DataSetConversionUtil.toTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, input, new String[]{"word"});
    Assert.assertEquals(
        new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(String.class)}),
        table1.getSchema()
    );

    DataSet<Row> genericInput1 = input.map(new GenericTypeMap());

    // Force type should go through with explicit type info.
    Table table2 = DataSetConversionUtil.toTable(
        MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
        genericInput1,
        new String[]{"word"},
        new TypeInformation[]{TypeInformation.of(Integer.class)}
    );
    Assert.assertEquals(
        new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(Integer.class)}),
        table2.getSchema()
    );

    DataSet<Row> genericInput2 = input.map(new GenericTypeMap());

    // Force type should go through with table schema.
    Table table3 = DataSetConversionUtil.toTable(
        MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
        genericInput2,
        new TableSchema(
            new String[]{"word"},
            new TypeInformation[]{TypeInformation.of(Integer.class)}
        )
    );
    Assert.assertEquals(
        new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(Integer.class)}),
        table3.getSchema()
    );

    // applying toTable again on the same input should fail
    thrown.expect(IllegalStateException.class);
    DataSetConversionUtil.toTable(
        MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
        genericInput2,
        new TableSchema(
            new String[]{"word"},
            new TypeInformation[]{TypeInformation.of(Integer.class)}
        )
    );

    // Validation should fail without correct type inference.
    DataSet<Row> genericInput3 = input.map(new GenericTypeMap());
    thrown.expect(ValidationException.class);
    DataSetConversionUtil.toTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, genericInput3, new String[]{"word"});

    // Output should go through when using correct type to output.
    DataSet<Row> output = DataSetConversionUtil.fromTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, table1);
    output.print();

    // Output should NOT go through when using incorrect type forcing.
    thrown.expect(ExecutionException.class);
    DataSetConversionUtil.fromTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, table2).print();
}
Example 11
Source File: IncrementalSSSP.java From flink with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    if (!parseParameters(args)) {
        return;
    }

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    Edge<Long, Double> edgeToBeRemoved = getEdgeToBeRemoved();

    Graph<Long, Double, Double> graph = IncrementalSSSP.getGraph(env);

    // Assumption: all minimum weight paths are kept
    Graph<Long, Double, Double> ssspGraph = IncrementalSSSP.getSSSPGraph(env);

    // remove the edge
    graph.removeEdge(edgeToBeRemoved);

    // configure the iteration
    ScatterGatherConfiguration parameters = new ScatterGatherConfiguration();

    if (isInSSSP(edgeToBeRemoved, ssspGraph.getEdges())) {

        parameters.setDirection(EdgeDirection.IN);
        parameters.setOptDegrees(true);

        // run the scatter-gather iteration to propagate info
        Graph<Long, Double, Double> result = ssspGraph.runScatterGatherIteration(
                new InvalidateMessenger(edgeToBeRemoved), new VertexDistanceUpdater(), maxIterations, parameters);

        DataSet<Vertex<Long, Double>> resultedVertices = result.getVertices();

        // Emit results
        if (fileOutput) {
            resultedVertices.writeAsCsv(outputPath, "\n", ",");
            env.execute("Incremental SSSP Example");
        } else {
            resultedVertices.print();
        }
    } else {
        // print the vertices
        if (fileOutput) {
            graph.getVertices().writeAsCsv(outputPath, "\n", ",");
            env.execute("Incremental SSSP Example");
        } else {
            graph.getVertices().print();
        }
    }
}
Example 12
Source File: ParallelVBTest.java From toolbox with Apache License 2.0 | 4 votes |
public void testingMLParallelPosteriors() throws Exception {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,
            "../datasets/simulated/test_not_modify/MONTH1.arff", true);

    //DataFlink<DataInstance> dataStream = DataFlinkLoader.loadDataFromFile(env,
    //        "./datasets/dataStream/test_not_modify/SmallDataSet.arff", false);

    //Structure learning is excluded from the test, i.e., we use directly the initial Asia network structure
    // and just learn then test the parameter learning

    //Parameter Learning
    ParallelVB parallelVB = new ParallelVB();
    parallelVB.setOutput(true);
    parallelVB.setSeed(5);
    parallelVB.setBatchSize(100);
    parallelVB.setLocalThreshold(0.001);
    parallelVB.setGlobalThreshold(0.05);
    parallelVB.setMaximumLocalIterations(100);
    parallelVB.setMaximumGlobalIterations(100);

    DAG dag = DAGGenerator.getHiddenNaiveBayesStructure(dataFlink.getAttributes(), "GlobalHidden", 2);
    if (Main.VERBOSE) System.out.println(dag.toString());

    parallelVB.setDAG(dag);
    parallelVB.initLearning();
    parallelVB.updateModel(dataFlink);
    BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

    if (Main.VERBOSE) System.out.println(bnet.toString());

    DataSet<DataPosterior> dataPosteriorDataSet = parallelVB.computePosterior(dataFlink,
            Arrays.asList(dag.getVariables().getVariableByName("GlobalHidden")));

    dataPosteriorDataSet.print();

    //DataSetSerializer.serializeDataSet(dataPosteriorDataSet, "./datasets/tmp.ser");
    //dataPosteriorDataSet = DataSetSerializer.deserializeDataSet("./datasets/tmp.ser");

    dataPosteriorDataSet.print();
}
Example 13
Source File: PageRank.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    ParameterTool params = ParameterTool.fromArgs(args);

    final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
    final int maxIterations = params.getInt("iterations", 10);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make the parameters available to the web ui
    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Long> pagesInput = getPagesDataSet(env, params);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);

    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.
            map(new RankAssigner((1.0d / numPages)));

    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
            linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);

    DataSet<Tuple2<Long, Double>> newRanks = iteration
            // join pages with outgoing edges and distribute rank
            .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
            // collect and sum ranks
            .groupBy(0).aggregate(SUM, 1)
            // apply dampening factor
            .map(new Dampener(DAMPENING_FACTOR, numPages));

    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
            newRanks,
            newRanks.join(iteration).where(0).equalTo(0)
            // termination condition
            .filter(new EpsilonFilter()));

    // emit result
    if (params.has("output")) {
        finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
        // execute program
        env.execute("Basic Page Rank Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        finalPageRanks.print();
    }
}
Example 14
Source File: EnumTriangles.java From flink with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);

    // read input data
    DataSet<Edge> edges;
    if (params.has("edges")) {
        edges = env.readCsvFile(params.get("edges"))
                .fieldDelimiter(" ")
                .includeFields(true, true)
                .types(Integer.class, Integer.class)
                .map(new TupleEdgeConverter());
    } else {
        System.out.println("Executing EnumTriangles example with default edges data set.");
        System.out.println("Use --edges to specify file input.");
        edges = EnumTrianglesData.getDefaultEdgeDataSet(env);
    }

    // project edges by vertex id
    DataSet<Edge> edgesById = edges
            .map(new EdgeByIdProjector());

    DataSet<Triad> triangles = edgesById
            // build triads
            .groupBy(Edge.V1).sortGroup(Edge.V2, Order.ASCENDING).reduceGroup(new TriadBuilder())
            // filter triads
            .join(edgesById).where(Triad.V2, Triad.V3).equalTo(Edge.V1, Edge.V2).with(new TriadFilter());

    // emit result
    if (params.has("output")) {
        triangles.writeAsCsv(params.get("output"), "\n", ",");
        // execute program
        env.execute("Basic Triangle Enumeration Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        triangles.print();
    }
}
Example 15
Source File: dVMPv1Test.java From toolbox with Apache License 2.0 | 4 votes |
public void testingMLParallelPosteriorsAssignment() throws Exception {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,
            "../datasets/simulated/test_not_modify/MONTH1.arff", true);

    //DataFlink<DataInstance> dataStream = DataFlinkLoader.loadDataFromFile(env,
    //        "./datasets/dataStream/test_not_modify/SmallDataSet.arff", false);

    //Structure learning is excluded from the test, i.e., we use directly the initial Asia network structure
    // and just learn then test the parameter learning

    //Parameter Learning
    dVMPv1 parallelVB = new dVMPv1();
    parallelVB.setOutput(true);
    parallelVB.setSeed(5);
    parallelVB.setBatchSize(100);
    parallelVB.setLocalThreshold(0.001);
    parallelVB.setGlobalThreshold(0.05);
    parallelVB.setMaximumLocalIterations(100);
    parallelVB.setMaximumGlobalIterations(100);

    DAG dag = DAGGenerator.getHiddenNaiveBayesStructure(dataFlink.getAttributes(), "GlobalHidden", 2);
    if (Main.VERBOSE) System.out.println(dag.toString());

    parallelVB.setDAG(dag);
    parallelVB.initLearning();
    parallelVB.updateModel(dataFlink);
    BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

    if (Main.VERBOSE) System.out.println(bnet.toString());

    List<Variable> list = new ArrayList<>();
    list.add(dag.getVariables().getVariableByName("GlobalHidden"));
    list.add(dag.getVariables().getVariableById(0));

    DataSet<DataPosteriorAssignment> dataPosteriorDataSet = parallelVB.computePosteriorAssignment(dataFlink, list);

    dataPosteriorDataSet.print();

    dataPosteriorDataSet.print();
}
Example 16
Source File: GeoTempFlatMapTest.java From OSTMap with Apache License 2.0 | 4 votes |
@Test
public void testFlatMap() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    ByteBuffer key1 = ByteBuffer.allocate(12);
    key1.putLong(1459350458).putInt(123);

    DataSet<Tuple2<Key, Value>> input = env.fromElements(
            new Tuple2<>(new Key(new Text(key1.array()), new Text("t"), new Text("")),
                    new Value(("{\n" +
                            " \"created_at\": \"Wed Mar 30 15:07:38 +0000 2016\",\n" +
                            " \"id\": 715193777833582592,\n" +
                            " \"user\": {\n" +
                            " \"id\": 2243967693,\n" +
                            " \"id_str\": \"2243967693\",\n" +
                            " },\n" +
                            " \"geo\": {\n" +
                            " \"type\": \"Point\",\n" +
                            " \"coordinates\": [\n" +
                            " 41.00870620,\n" +
                            " 29.21240342\n" +
                            " ]\n" +
                            " },\n" +
                            " \"coordinates\": {\n" +
                            " \"type\": \"Point\",\n" +
                            " \"coordinates\": [\n" +
                            " 29.21240342,\n" +
                            " 41.00870620\n" +
                            " ]\n" +
                            " },\n" +
                            " \"place\": {\n" +
                            " \"id\": \"5e02a0f0d91c76d2\",\n" +
                            " \"url\": \"https:\\/\\/api.twitter.com\\/1.1\\/geo\\/id\\/5e02a0f0d91c76d2.json\",\n" +
                            " \"place_type\": \"city\",\n" +
                            " \"name\": \"\\u0130stanbul\",\n" +
                            " \"full_name\": \"\\u0130stanbul, T\\u00fcrkiye\",\n" +
                            " \"country_code\": \"TR\",\n" +
                            " \"country\": \"T\\u00fcrkiye\",\n" +
                            " \"bounding_box\": {\n" +
                            " \"type\": \"Polygon\",\n" +
                            " \"coordinates\": [\n" +
                            " [\n" +
                            " [\n" +
                            " 28.632104,\n" +
                            " 40.802734\n" +
                            " ],\n" +
                            " [\n" +
                            " 28.632104,\n" +
                            " 41.239907\n" +
                            " ],\n" +
                            " [\n" +
                            " 29.378341,\n" +
                            " 41.239907\n" +
                            " ],\n" +
                            " [\n" +
                            " 29.378341,\n" +
                            " 40.802734\n" +
                            " ]\n" +
                            " ]\n" +
                            " ]\n" +
                            " },\n" +
                            " \"attributes\": {}\n" +
                            " },\n" +
                            " \"timestamp_ms\": \"1459350458950\"\n" +
                            "}\n").getBytes())));

    DataSet<Tuple2<Text, Mutation>> output = input.flatMap(new GeoTempFlatMap("table"));
    output.print();

    assertEquals(output.count(), 1);
}
Example 17
Source File: WebLogAnalysis.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    final ParameterTool params = ParameterTool.fromArgs(args);

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params);
    DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params);
    DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params);

    // Retain documents with keywords
    DataSet<Tuple1<String>> filterDocs = documents
            .filter(new FilterDocByKeyWords())
            .project(0);

    // Filter ranks by minimum rank
    DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks
            .filter(new FilterByRank());

    // Filter visits by visit date
    DataSet<Tuple1<String>> filterVisits = visits
            .filter(new FilterVisitsByDate())
            .project(0);

    // Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
    DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks =
            filterDocs.join(filterRanks)
                    .where(0).equalTo(1)
                    .projectSecond(0, 1, 2);

    // Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain time
    DataSet<Tuple3<Integer, String, Integer>> result =
            joinDocsRanks.coGroup(filterVisits)
                    .where(1).equalTo(0)
                    .with(new AntiJoinVisits());

    // emit result
    if (params.has("output")) {
        result.writeAsCsv(params.get("output"), "\n", "|");
        // execute program
        env.execute("WebLogAnalysis Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
Example 18
Source File: dVMPTest.java From toolbox with Apache License 2.0 | 4 votes |
public void testingMLParallelPosteriorsAssignment() throws Exception {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,
            "../datasets/simulated/test_not_modify/MONTH1.arff", true);

    //DataFlink<DataInstance> dataStream = DataFlinkLoader.loadDataFromFile(env,
    //        "./datasets/dataStream/test_not_modify/SmallDataSet.arff", false);

    //Structure learning is excluded from the test, i.e., we use directly the initial Asia network structure
    // and just learn then test the parameter learning

    //Parameter Learning
    dVMP parallelVB = new dVMP();
    parallelVB.setOutput(true);
    parallelVB.setSeed(5);
    parallelVB.setBatchSize(100);
    parallelVB.setLocalThreshold(0.001);
    parallelVB.setGlobalThreshold(0.05);
    parallelVB.setMaximumLocalIterations(100);
    parallelVB.setMaximumGlobalIterations(100);

    DAG dag = DAGGenerator.getHiddenNaiveBayesStructure(dataFlink.getAttributes(), "GlobalHidden", 2);
    if (Main.VERBOSE) System.out.println(dag.toString());

    parallelVB.setDAG(dag);
    parallelVB.initLearning();
    parallelVB.updateModel(dataFlink);
    BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

    if (Main.VERBOSE) System.out.println(bnet.toString());

    List<Variable> list = new ArrayList<>();
    list.add(dag.getVariables().getVariableByName("GlobalHidden"));
    list.add(dag.getVariables().getVariableById(0));

    DataSet<DataPosteriorAssignment> dataPosteriorDataSet = parallelVB.computePosteriorAssignment(dataFlink, list);

    dataPosteriorDataSet.print();

    dataPosteriorDataSet.print();
}
Example 19
Source File: PageRank.java From flink with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    ParameterTool params = ParameterTool.fromArgs(args);

    final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
    final int maxIterations = params.getInt("iterations", 10);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make the parameters available to the web ui
    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Long> pagesInput = getPagesDataSet(env, params);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);

    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.
            map(new RankAssigner((1.0d / numPages)));

    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
            linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);

    DataSet<Tuple2<Long, Double>> newRanks = iteration
            // join pages with outgoing edges and distribute rank
            .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
            // collect and sum ranks
            .groupBy(0).aggregate(SUM, 1)
            // apply dampening factor
            .map(new Dampener(DAMPENING_FACTOR, numPages));

    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
            newRanks,
            newRanks.join(iteration).where(0).equalTo(0)
            // termination condition
            .filter(new EpsilonFilter()));

    // emit result
    if (params.has("output")) {
        finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
        // execute program
        env.execute("Basic Page Rank Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        finalPageRanks.print();
    }
}