Java Code Examples for org.apache.flink.api.java.DataSet#print()
The following examples show how to use org.apache.flink.api.java.DataSet#print().
Each example is attributed to its original project and source file, together with its license, in the header above the code.
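Before the collected examples, here is a minimal, self-contained sketch of the typical usage pattern: build a DataSet, apply a transformation, and call print() to collect the result to the client and write it to standard output. This sketch is not taken from any of the projects below; the class name PrintSketch, the input strings, and the word-length mapping are illustrative assumptions. Note that in recent versions of the batch DataSet API, print() itself triggers job execution, so programs that end in print() do not need a separate env.execute() call.

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;

public class PrintSketch {

    public static void main(String[] args) throws Exception {
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // build a small in-memory DataSet (illustrative input)
        DataSet<String> words = env.fromElements("flink", "dataset", "print");

        // map each word to a (word, length) tuple
        DataSet<Tuple2<String, Integer>> lengths = words
                .map(new MapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> map(String word) {
                        return new Tuple2<>(word, word.length());
                    }
                });

        // print() collects the result to the client and writes it to stdout;
        // in recent DataSet API versions it also triggers execution, so no
        // separate env.execute() call is needed here.
        lengths.print();
    }
}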
Example 1
Source File: WordCountWithInnerClass.java From flink with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception {

    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // get input data
    DataSet<String> text = StaticData.getDefaultTextLineDataSet(env);

    DataSet<Tuple2<String, Integer>> counts =
            // split up the lines in pairs (2-tuples) containing: (word,1)
            text.flatMap(new Tokenizer())
            // group by the tuple field "0" and sum up tuple field "1"
            .groupBy(0)
            .sum(1);

    // emit result
    counts.print();

    // execute program
    env.execute("WordCount Example");
}
Example 2
Source File: DataFlinkLoaderTest.java From toolbox with Apache License 2.0 | 6 votes |
public static void test1() throws Exception {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFile(env,
            "../datasets/simulated/test_not_modify/SmallDataSet.arff", false);

    DataSet<DataInstance> data = dataFlink.getDataSet();
    data.print();

    List<DataInstance> instanceList = data.collect();
    assertEquals(16, instanceList.size());

    List<String> names = Arrays.asList("A", "B", "C", "D", "E", "G");
    List<Integer> states = Arrays.asList(2, 3, 2, 2, 2, -1);
    List<Attribute> atts = dataFlink.getAttributes().getListOfNonSpecialAttributes();

    for (int i = 0; i < names.size(); i++) {
        if (Main.VERBOSE) System.out.println(names.get(i));
        assertEquals(atts.get(i).getName(), names.get(i));
        assertEquals(atts.get(i).getNumberOfStates(), states.get(i).intValue());
    }
}
Example 3
Source File: EmptyFieldsCountAccumulator.java From flink with Apache License 2.0 | 5 votes |
public static void main(final String[] args) throws Exception {

    final ParameterTool params = ParameterTool.fromArgs(args);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);

    // get the data set
    final DataSet<StringTriple> file = getDataSet(env, params);

    // filter lines with empty fields
    final DataSet<StringTriple> filteredLines = file.filter(new EmptyFieldFilter());

    // Here, we could do further processing with the filtered lines...
    JobExecutionResult result;

    // output the filtered lines
    if (params.has("output")) {
        filteredLines.writeAsCsv(params.get("output"));

        // execute program
        result = env.execute("Accumulator example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        filteredLines.print();
        result = env.getLastJobExecutionResult();
    }

    // get the accumulator result via its registration key
    final List<Integer> emptyFields = result.getAccumulatorResult(EMPTY_FIELD_ACCUMULATOR);
    System.out.format("Number of detected empty fields per column: %s\n", emptyFields);
}
Example 4
Source File: FilterWithIndirection.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.fromElements("Please filter", "the words", "but not this");

    DataSet<String> output = input.filter(UtilFunctionWrapper.UtilFunction.getWordFilter());
    output.print();

    env.execute();
}
Example 5
Source File: TestParserMapFunctionAvroInline.java From logparser with Apache License 2.0 | 5 votes |
@Test
public void testInlineDefinitionAvro() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.fromElements(TestCase.getInputLine());

    DataSet<Click> filledTestRecords = input
        .map(new RichMapFunction<String, Click>() {
            private Parser<ClickSetter> parser;

            @Override
            public void open(org.apache.flink.configuration.Configuration parameters) {
                parser = new HttpdLoglineParser<>(ClickSetter.class, TestCase.getLogFormat())
                    .addDissector(new ScreenResolutionDissector())
                    .addTypeRemapping("request.firstline.uri.query.g", "HTTP.URI")
                    .addTypeRemapping("request.firstline.uri.query.r", "HTTP.URI")
                    .addTypeRemapping("request.firstline.uri.query.s", "SCREENRESOLUTION")
                    .addDissector(new GeoIPISPDissector(ISP_TEST_MMDB))
                    .addDissector(new GeoIPCityDissector(CITY_TEST_MMDB));
            }

            @Override
            public Click map(String line) throws Exception {
                return parser.parse(line).build();
            }
        }).name("Extract Elements from logline");

    filledTestRecords.print();

    List<Click> result = filledTestRecords.collect();
    assertEquals(1, result.size());
    assertEquals(ExpectedClick.create(), result.get(0));
}
Example 6
Source File: TestParserMapFunctionInline.java From logparser with Apache License 2.0 | 5 votes |
@Test
public void testInlineDefinition() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.fromElements(TestCase.getInputLine());

    DataSet<TestRecord> filledTestRecords = input
        .map(new RichMapFunction<String, TestRecord>() {
            private Parser<TestRecord> parser;

            @Override
            public void open(org.apache.flink.configuration.Configuration parameters) throws Exception {
                parser = TestCase.createTestParser();
            }

            @Override
            public TestRecord map(String line) throws Exception {
                return parser.parse(line);
            }
        }).name("Extract Elements from logline");

    filledTestRecords.print();

    List<TestRecord> result = filledTestRecords.collect();
    assertEquals(1, result.size());
    assertEquals(new TestRecord().setFullValid(), result.get(0));
}
Example 7
Source File: GSASingleSourceShortestPaths.java From flink with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {

    if (!parseParameters(args)) {
        return;
    }

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Edge<Long, Double>> edges = getEdgeDataSet(env);

    Graph<Long, Double, Double> graph = Graph.fromDataSet(edges, new InitVertices(srcVertexId), env);

    // Execute the GSA iteration
    Graph<Long, Double, Double> result = graph.runGatherSumApplyIteration(
            new CalculateDistances(), new ChooseMinDistance(), new UpdateDistance(), maxIterations);

    // Extract the vertices as the result
    DataSet<Vertex<Long, Double>> singleSourceShortestPaths = result.getVertices();

    // emit result
    if (fileOutput) {
        singleSourceShortestPaths.writeAsCsv(outputPath, "\n", ",");

        // since file sinks are lazy, we trigger the execution explicitly
        env.execute("GSA Single Source Shortest Paths");
    } else {
        singleSourceShortestPaths.print();
    }
}
Example 8
Source File: ParquetProtobufExample.java From parquet-flinktacular with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
    //output
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<Void, Person>> data = generateDataSet(env);
    writeProtobuf(data, "newpath");
    data.print();
    env.execute("Parquet output");

    //input
    final ExecutionEnvironment env2 = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<Void, Person.Builder>> input = readProtobuf(env2, "newpath");
    input.map(new TupleToProto()).print();
    env2.execute("Parquet input");
}
Example 9
Source File: EuclideanGraphWeighing.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    if (!parseParameters(args)) {
        return;
    }

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Vertex<Long, Point>> vertices = getVerticesDataSet(env);

    DataSet<Edge<Long, Double>> edges = getEdgesDataSet(env);

    Graph<Long, Point, Double> graph = Graph.fromDataSet(vertices, edges, env);

    // the edge value will be the Euclidean distance between its src and trg vertex
    DataSet<Tuple3<Long, Long, Double>> edgesWithEuclideanWeight = graph.getTriplets()
            .map(new MapFunction<Triplet<Long, Point, Double>, Tuple3<Long, Long, Double>>() {

                @Override
                public Tuple3<Long, Long, Double> map(Triplet<Long, Point, Double> triplet) throws Exception {

                    Vertex<Long, Point> srcVertex = triplet.getSrcVertex();
                    Vertex<Long, Point> trgVertex = triplet.getTrgVertex();

                    return new Tuple3<>(srcVertex.getId(), trgVertex.getId(),
                            srcVertex.getValue().euclideanDistance(trgVertex.getValue()));
                }
            });

    Graph<Long, Point, Double> resultedGraph = graph.joinWithEdges(edgesWithEuclideanWeight,
            new EdgeJoinFunction<Double, Double>() {

                public Double edgeJoin(Double edgeValue, Double inputValue) {
                    return inputValue;
                }
            });

    // retrieve the edges from the final result
    DataSet<Edge<Long, Double>> result = resultedGraph.getEdges();

    // emit result
    if (fileOutput) {
        result.writeAsCsv(outputPath, "\n", ",");

        // since file sinks are lazy, we trigger the execution explicitly
        env.execute("Euclidean Graph Weighing Example");
    } else {
        result.print();
    }
}
Example 10
Source File: DataSetConversionUtilTest.java From flink with Apache License 2.0 | 4 votes |
@Test
public void testE2E() throws Exception {
    ExecutionEnvironment env = MLEnvironmentFactory.getDefault().getExecutionEnvironment();

    DataSet<Row> input = env.fromElements(Row.of("a"));

    Table table1 = DataSetConversionUtil.toTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, input, new String[]{"word"});
    Assert.assertEquals(
        new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(String.class)}),
        table1.getSchema()
    );

    DataSet<Row> genericInput1 = input.map(new GenericTypeMap());

    // Force type should go through with explicit type info.
    Table table2 = DataSetConversionUtil.toTable(
        MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
        genericInput1,
        new String[]{"word"},
        new TypeInformation[]{TypeInformation.of(Integer.class)}
    );
    Assert.assertEquals(
        new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(Integer.class)}),
        table2.getSchema()
    );

    DataSet<Row> genericInput2 = input.map(new GenericTypeMap());

    // Force type should go through with table schema.
    Table table3 = DataSetConversionUtil.toTable(
        MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
        genericInput2,
        new TableSchema(
            new String[]{"word"},
            new TypeInformation[]{TypeInformation.of(Integer.class)}
        )
    );
    Assert.assertEquals(
        new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(Integer.class)}),
        table3.getSchema()
    );

    // applying toTable again on the same input should fail
    thrown.expect(IllegalStateException.class);
    DataSetConversionUtil.toTable(
        MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
        genericInput2,
        new TableSchema(
            new String[]{"word"},
            new TypeInformation[]{TypeInformation.of(Integer.class)}
        )
    );

    // Validation should fail without correct type inference.
    DataSet<Row> genericInput3 = input.map(new GenericTypeMap());
    thrown.expect(ValidationException.class);
    DataSetConversionUtil.toTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, genericInput3, new String[]{"word"});

    // Output should go through when using correct type to output.
    DataSet<Row> output = DataSetConversionUtil.fromTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, table1);
    output.print();

    // Output should NOT go through when using incorrect type forcing.
    thrown.expect(ExecutionException.class);
    DataSetConversionUtil.fromTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, table2).print();
}
Example 11
Source File: IncrementalSSSP.java From flink with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    if (!parseParameters(args)) {
        return;
    }

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    Edge<Long, Double> edgeToBeRemoved = getEdgeToBeRemoved();

    Graph<Long, Double, Double> graph = IncrementalSSSP.getGraph(env);

    // Assumption: all minimum weight paths are kept
    Graph<Long, Double, Double> ssspGraph = IncrementalSSSP.getSSSPGraph(env);

    // remove the edge
    graph.removeEdge(edgeToBeRemoved);

    // configure the iteration
    ScatterGatherConfiguration parameters = new ScatterGatherConfiguration();

    if (isInSSSP(edgeToBeRemoved, ssspGraph.getEdges())) {

        parameters.setDirection(EdgeDirection.IN);
        parameters.setOptDegrees(true);

        // run the scatter-gather iteration to propagate info
        Graph<Long, Double, Double> result = ssspGraph.runScatterGatherIteration(
                new InvalidateMessenger(edgeToBeRemoved), new VertexDistanceUpdater(), maxIterations, parameters);

        DataSet<Vertex<Long, Double>> resultedVertices = result.getVertices();

        // Emit results
        if (fileOutput) {
            resultedVertices.writeAsCsv(outputPath, "\n", ",");
            env.execute("Incremental SSSP Example");
        } else {
            resultedVertices.print();
        }
    } else {
        // print the vertices
        if (fileOutput) {
            graph.getVertices().writeAsCsv(outputPath, "\n", ",");
            env.execute("Incremental SSSP Example");
        } else {
            graph.getVertices().print();
        }
    }
}
Example 12
Source File: ParallelVBTest.java From toolbox with Apache License 2.0 | 4 votes |
public void testingMLParallelPosteriors() throws Exception {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,
            "../datasets/simulated/test_not_modify/MONTH1.arff", true);

    //DataFlink<DataInstance> dataStream = DataFlinkLoader.loadDataFromFile(env,
    //        "./datasets/dataStream/test_not_modify/SmallDataSet.arff", false);

    //Structure learning is excluded from the test, i.e., we use directly the initial Asia network structure
    // and just learn then test the parameter learning

    //Parameter Learning
    ParallelVB parallelVB = new ParallelVB();
    parallelVB.setOutput(true);
    parallelVB.setSeed(5);
    parallelVB.setBatchSize(100);
    parallelVB.setLocalThreshold(0.001);
    parallelVB.setGlobalThreshold(0.05);
    parallelVB.setMaximumLocalIterations(100);
    parallelVB.setMaximumGlobalIterations(100);

    DAG dag = DAGGenerator.getHiddenNaiveBayesStructure(dataFlink.getAttributes(), "GlobalHidden", 2);
    if (Main.VERBOSE) System.out.println(dag.toString());

    parallelVB.setDAG(dag);
    parallelVB.initLearning();
    parallelVB.updateModel(dataFlink);
    BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

    if (Main.VERBOSE) System.out.println(bnet.toString());

    DataSet<DataPosterior> dataPosteriorDataSet = parallelVB.computePosterior(dataFlink,
            Arrays.asList(dag.getVariables().getVariableByName("GlobalHidden")));

    dataPosteriorDataSet.print();

    //DataSetSerializer.serializeDataSet(dataPosteriorDataSet, "./datasets/tmp.ser");
    //dataPosteriorDataSet = DataSetSerializer.deserializeDataSet("./datasets/tmp.ser");

    dataPosteriorDataSet.print();
}
Example 13
Source File: PageRank.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    ParameterTool params = ParameterTool.fromArgs(args);

    final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
    final int maxIterations = params.getInt("iterations", 10);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make the parameters available to the web ui
    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Long> pagesInput = getPagesDataSet(env, params);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);

    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.
            map(new RankAssigner((1.0d / numPages)));

    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
            linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);

    DataSet<Tuple2<Long, Double>> newRanks = iteration
            // join pages with outgoing edges and distribute rank
            .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
            // collect and sum ranks
            .groupBy(0).aggregate(SUM, 1)
            // apply dampening factor
            .map(new Dampener(DAMPENING_FACTOR, numPages));

    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
            newRanks,
            newRanks.join(iteration).where(0).equalTo(0)
            // termination condition
            .filter(new EpsilonFilter()));

    // emit result
    if (params.has("output")) {
        finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
        // execute program
        env.execute("Basic Page Rank Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        finalPageRanks.print();
    }
}
Example 14
Source File: EnumTriangles.java From flink with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);

    // read input data
    DataSet<Edge> edges;
    if (params.has("edges")) {
        edges = env.readCsvFile(params.get("edges"))
                .fieldDelimiter(" ")
                .includeFields(true, true)
                .types(Integer.class, Integer.class)
                .map(new TupleEdgeConverter());
    } else {
        System.out.println("Executing EnumTriangles example with default edges data set.");
        System.out.println("Use --edges to specify file input.");
        edges = EnumTrianglesData.getDefaultEdgeDataSet(env);
    }

    // project edges by vertex id
    DataSet<Edge> edgesById = edges
            .map(new EdgeByIdProjector());

    DataSet<Triad> triangles = edgesById
            // build triads
            .groupBy(Edge.V1).sortGroup(Edge.V2, Order.ASCENDING).reduceGroup(new TriadBuilder())
            // filter triads
            .join(edgesById).where(Triad.V2, Triad.V3).equalTo(Edge.V1, Edge.V2).with(new TriadFilter());

    // emit result
    if (params.has("output")) {
        triangles.writeAsCsv(params.get("output"), "\n", ",");
        // execute program
        env.execute("Basic Triangle Enumeration Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        triangles.print();
    }
}
Example 15
Source File: dVMPv1Test.java From toolbox with Apache License 2.0 | 4 votes |
public void testingMLParallelPosteriorsAssignment() throws Exception {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,
            "../datasets/simulated/test_not_modify/MONTH1.arff", true);

    //DataFlink<DataInstance> dataStream = DataFlinkLoader.loadDataFromFile(env,
    //        "./datasets/dataStream/test_not_modify/SmallDataSet.arff", false);

    //Structure learning is excluded from the test, i.e., we use directly the initial Asia network structure
    // and just learn then test the parameter learning

    //Parameter Learning
    dVMPv1 parallelVB = new dVMPv1();
    parallelVB.setOutput(true);
    parallelVB.setSeed(5);
    parallelVB.setBatchSize(100);
    parallelVB.setLocalThreshold(0.001);
    parallelVB.setGlobalThreshold(0.05);
    parallelVB.setMaximumLocalIterations(100);
    parallelVB.setMaximumGlobalIterations(100);

    DAG dag = DAGGenerator.getHiddenNaiveBayesStructure(dataFlink.getAttributes(), "GlobalHidden", 2);
    if (Main.VERBOSE) System.out.println(dag.toString());

    parallelVB.setDAG(dag);
    parallelVB.initLearning();
    parallelVB.updateModel(dataFlink);
    BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

    if (Main.VERBOSE) System.out.println(bnet.toString());

    List<Variable> list = new ArrayList<>();
    list.add(dag.getVariables().getVariableByName("GlobalHidden"));
    list.add(dag.getVariables().getVariableById(0));

    DataSet<DataPosteriorAssignment> dataPosteriorDataSet = parallelVB.computePosteriorAssignment(dataFlink, list);

    dataPosteriorDataSet.print();

    dataPosteriorDataSet.print();
}
Example 16
Source File: GeoTempFlatMapTest.java From OSTMap with Apache License 2.0 | 4 votes |
@Test
public void testFlatMap() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    ByteBuffer key1 = ByteBuffer.allocate(12);
    key1.putLong(1459350458).putInt(123);

    DataSet<Tuple2<Key, Value>> input = env.fromElements(
            new Tuple2<>(new Key(new Text(key1.array()), new Text("t"), new Text("")),
                    new Value(("{\n" +
                            " \"created_at\": \"Wed Mar 30 15:07:38 +0000 2016\",\n" +
                            " \"id\": 715193777833582592,\n" +
                            " \"user\": {\n" +
                            " \"id\": 2243967693,\n" +
                            " \"id_str\": \"2243967693\",\n" +
                            " },\n" +
                            " \"geo\": {\n" +
                            " \"type\": \"Point\",\n" +
                            " \"coordinates\": [\n" +
                            " 41.00870620,\n" +
                            " 29.21240342\n" +
                            " ]\n" +
                            " },\n" +
                            " \"coordinates\": {\n" +
                            " \"type\": \"Point\",\n" +
                            " \"coordinates\": [\n" +
                            " 29.21240342,\n" +
                            " 41.00870620\n" +
                            " ]\n" +
                            " },\n" +
                            " \"place\": {\n" +
                            " \"id\": \"5e02a0f0d91c76d2\",\n" +
                            " \"url\": \"https:\\/\\/api.twitter.com\\/1.1\\/geo\\/id\\/5e02a0f0d91c76d2.json\",\n" +
                            " \"place_type\": \"city\",\n" +
                            " \"name\": \"\\u0130stanbul\",\n" +
                            " \"full_name\": \"\\u0130stanbul, T\\u00fcrkiye\",\n" +
                            " \"country_code\": \"TR\",\n" +
                            " \"country\": \"T\\u00fcrkiye\",\n" +
                            " \"bounding_box\": {\n" +
                            " \"type\": \"Polygon\",\n" +
                            " \"coordinates\": [\n" +
                            " [\n" +
                            " [\n" +
                            " 28.632104,\n" +
                            " 40.802734\n" +
                            " ],\n" +
                            " [\n" +
                            " 28.632104,\n" +
                            " 41.239907\n" +
                            " ],\n" +
                            " [\n" +
                            " 29.378341,\n" +
                            " 41.239907\n" +
                            " ],\n" +
                            " [\n" +
                            " 29.378341,\n" +
                            " 40.802734\n" +
                            " ]\n" +
                            " ]\n" +
                            " ]\n" +
                            " },\n" +
                            " \"attributes\": {}\n" +
                            " },\n" +
                            " \"timestamp_ms\": \"1459350458950\"\n" +
                            "}\n").getBytes())));

    DataSet<Tuple2<Text, Mutation>> output = input.flatMap(new GeoTempFlatMap("table"));
    output.print();

    assertEquals(output.count(), 1);
}
Example 17
Source File: WebLogAnalysis.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    final ParameterTool params = ParameterTool.fromArgs(args);

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params);
    DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params);
    DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params);

    // Retain documents with keywords
    DataSet<Tuple1<String>> filterDocs = documents
            .filter(new FilterDocByKeyWords())
            .project(0);

    // Filter ranks by minimum rank
    DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks
            .filter(new FilterByRank());

    // Filter visits by visit date
    DataSet<Tuple1<String>> filterVisits = visits
            .filter(new FilterVisitsByDate())
            .project(0);

    // Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
    DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks =
            filterDocs.join(filterRanks)
                    .where(0).equalTo(1)
                    .projectSecond(0, 1, 2);

    // Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain time
    DataSet<Tuple3<Integer, String, Integer>> result =
            joinDocsRanks.coGroup(filterVisits)
                    .where(1).equalTo(0)
                    .with(new AntiJoinVisits());

    // emit result
    if (params.has("output")) {
        result.writeAsCsv(params.get("output"), "\n", "|");
        // execute program
        env.execute("WebLogAnalysis Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        result.print();
    }
}
Example 18
Source File: dVMPTest.java From toolbox with Apache License 2.0 | 4 votes |
public void testingMLParallelPosteriorsAssignment() throws Exception {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,
            "../datasets/simulated/test_not_modify/MONTH1.arff", true);

    //DataFlink<DataInstance> dataStream = DataFlinkLoader.loadDataFromFile(env,
    //        "./datasets/dataStream/test_not_modify/SmallDataSet.arff", false);

    //Structure learning is excluded from the test, i.e., we use directly the initial Asia network structure
    // and just learn then test the parameter learning

    //Parameter Learning
    dVMP parallelVB = new dVMP();
    parallelVB.setOutput(true);
    parallelVB.setSeed(5);
    parallelVB.setBatchSize(100);
    parallelVB.setLocalThreshold(0.001);
    parallelVB.setGlobalThreshold(0.05);
    parallelVB.setMaximumLocalIterations(100);
    parallelVB.setMaximumGlobalIterations(100);

    DAG dag = DAGGenerator.getHiddenNaiveBayesStructure(dataFlink.getAttributes(), "GlobalHidden", 2);
    if (Main.VERBOSE) System.out.println(dag.toString());

    parallelVB.setDAG(dag);
    parallelVB.initLearning();
    parallelVB.updateModel(dataFlink);
    BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

    if (Main.VERBOSE) System.out.println(bnet.toString());

    List<Variable> list = new ArrayList<>();
    list.add(dag.getVariables().getVariableByName("GlobalHidden"));
    list.add(dag.getVariables().getVariableById(0));

    DataSet<DataPosteriorAssignment> dataPosteriorDataSet = parallelVB.computePosteriorAssignment(dataFlink, list);

    dataPosteriorDataSet.print();

    dataPosteriorDataSet.print();
}
Example 19
Source File: PageRank.java From flink with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {

    ParameterTool params = ParameterTool.fromArgs(args);

    final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
    final int maxIterations = params.getInt("iterations", 10);

    // set up execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // make the parameters available to the web ui
    env.getConfig().setGlobalJobParameters(params);

    // get input data
    DataSet<Long> pagesInput = getPagesDataSet(env, params);
    DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);

    // assign initial rank to pages
    DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.
            map(new RankAssigner((1.0d / numPages)));

    // build adjacency list from link input
    DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
            linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

    // set iterative data set
    IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);

    DataSet<Tuple2<Long, Double>> newRanks = iteration
            // join pages with outgoing edges and distribute rank
            .join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
            // collect and sum ranks
            .groupBy(0).aggregate(SUM, 1)
            // apply dampening factor
            .map(new Dampener(DAMPENING_FACTOR, numPages));

    DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
            newRanks,
            newRanks.join(iteration).where(0).equalTo(0)
            // termination condition
            .filter(new EpsilonFilter()));

    // emit result
    if (params.has("output")) {
        finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
        // execute program
        env.execute("Basic Page Rank Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        finalPageRanks.print();
    }
}