org.apache.spark.streaming.api.java.JavaReceiverInputDStream Java Examples
The following examples show how to use
org.apache.spark.streaming.api.java.JavaReceiverInputDStream.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SparkStreamDemo.java From sparkResearch with Apache License 2.0 | 6 votes |
public static void main(String[] args) { //创建两个核心的本地线程,批处理的间隔为1秒 SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamIng"); JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Durations.seconds(1)); //创建一个连接到IP:localhost,PORT:8080的DStream JavaReceiverInputDStream<String> dStream = javaStreamingContext.socketTextStream("localhost", 8080); JavaDStream<String> errorLine = dStream.filter(new Function<String, Boolean>() { @Override public Boolean call(String v1) throws Exception { return v1.contains("error"); } }); //打印包含error的行 errorLine.print(); try { //开始计算 javaStreamingContext.start(); //等待计算完成 javaStreamingContext.awaitTermination(); } catch (InterruptedException e) { e.printStackTrace(); } }
Example #2
Source File: WordCountRecoverableEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) { SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); streamingContext.checkpoint(checkpointDirectory); // Initial state RDD input to mapWithState @SuppressWarnings("unchecked") List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip,port, StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator()); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1)) .reduceByKey((count1, count2) -> count1 + count2); // Update the cumulative count function Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() { @Override public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) { int sum = one.orElse(0) + (state.exists() ? state.get() : 0); Tuple2<String, Integer> output = new Tuple2<>(word, sum); state.update(sum); return output; } }; // DStream made of get cumulative counts that get updated in every batch JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts .mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD)); stateDstream.print(); return streamingContext; }
Example #3
Source File: ReaderWriterExample.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
public static void main(String[] args) throws Exception { final String dbUrl = args[0]; final String hostname = args[1]; final String port = args[2]; final String inTargetSchema = args[3]; final String inTargetTable = args[4]; SparkConf conf = new SparkConf(); JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500)); SpliceSpark.setContext(ssc.sparkContext()); SparkSession spark = SpliceSpark.getSessionUnsafe(); JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port)); // Create a SplicemachineContext based on the provided DB connection SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl); // Set target tablename and schemaname final String table = inTargetSchema + "." + inTargetTable; stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> { JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s)); Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table)); splicemachineContext.insert(df, table); }); ssc.start(); ssc.awaitTermination(); }
Example #4
Source File: ReaderWriterExample.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
public static void main(String[] args) throws Exception { final String dbUrl = args[0]; final String hostname = args[1]; final String port = args[2]; final String inTargetSchema = args[3]; final String inTargetTable = args[4]; SparkConf conf = new SparkConf(); JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500)); JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port)); SparkSession spark = SparkSession.builder().getOrCreate(); // Create a SplicemachineContext based on the provided DB connection SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl); // Set target tablename and schemaname final String table = inTargetSchema + "." + inTargetTable; stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> { JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s)); Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table)); splicemachineContext.insert(df, table); }); ssc.start(); ssc.awaitTermination(); }
Example #5
Source File: JavaHBaseStreamingBulkPutExample.java From learning-hadoop with Apache License 2.0 | 5 votes |
/**
 * Wires a socket text stream into an HBase bulk put.
 *
 * <p>Positional arguments: master, host, port, tableName, columnFamily. When
 * fewer than five are supplied the usage line is printed and the method
 * returns, instead of failing later with an
 * {@code ArrayIndexOutOfBoundsException}.
 *
 * @param args the five positional arguments listed above
 */
public static void main(String[] args) {
    if (args.length < 5) {
        System.out
                .println("JavaHBaseBulkPutExample {master} {host} {port} {tableName} {columnFamily}");
        return;
    }

    String master = args[0];
    String host = args[1];
    String port = args[2];
    String tableName = args[3];
    String columnFamily = args[4];

    System.out.println("master:" + master);
    System.out.println("host:" + host);
    System.out.println("port:" + Integer.parseInt(port));
    System.out.println("tableName:" + tableName);
    System.out.println("columnFamily:" + columnFamily);

    // NOTE(review): this SparkConf is configured but never handed to the context below — confirm intent.
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.cleaner.ttl", "120000");

    JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseBulkPutExample");
    jsc.addJar("SparkHBase.jar");

    // One-second batches.
    JavaStreamingContext jssc = new JavaStreamingContext(jsc, new Duration(1000));

    JavaReceiverInputDStream<String> javaDstream = jssc.socketTextStream(host, Integer.parseInt(port));

    // HBase configuration loaded from the cluster's site files.
    Configuration conf = HBaseConfiguration.create();
    conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
    conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

    JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

    hbaseContext.streamBulkPut(javaDstream, tableName, new PutFunction(), true);
    // NOTE(review): jssc.start() is not called anywhere in this method — verify whether that is intended.
}
Example #6
Source File: JavaCustomReceiver.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaCustomReceiver <hostname> <port>"); System.exit(1); } StreamingExamples.setStreamingLogLevels(); // Create the context with a 1 second batch size SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver"); JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000)); // Create an input stream with the custom receiver on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') JavaReceiverInputDStream<String> lines = ssc.receiverStream( new JavaCustomReceiver(args[0], Integer.parseInt(args[1]))); JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String x) { return Arrays.asList(SPACE.split(x)).iterator(); } }); JavaPairDStream<String, Integer> wordCounts = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<>(s, 1); } }).reduceByKey(new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.print(); ssc.start(); ssc.awaitTermination(); }
Example #7
Source File: WordCountSocketJava8Ex.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); wordCounts.print(); JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair( new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() { @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception { rdd.join(initialRDD).mapToPair(new PairFunction<Tuple2<String,Tuple2<Integer,Integer>>, String, Integer>() { @Override public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception { // TODO Auto-generated method stub return new Tuple2<>( joinedTuple._1(), (joinedTuple._2()._1()+joinedTuple._2()._2()) ); } }); return rdd; } }); joinedDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #8
Source File: WordCountSocketStateful.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); streamingContext.checkpoint("E:\\hadoop\\checkpoint"); // Initial state RDD input to mapWithState @SuppressWarnings("unchecked") List<Tuple2<String, Integer>> tuples =Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); // Update the cumulative count function Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() { @Override public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) { int sum = one.orElse(0) + (state.exists() ? state.get() : 0); Tuple2<String, Integer> output = new Tuple2<>(word, sum); state.update(sum); return output; } }; // DStream made of get cumulative counts that get updated in every batch JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD)); stateDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #9
Source File: WordCountTransformOpEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
/**
 * Counts words read from a socket, prints the per-batch counts, and prints
 * the same counts joined with a fixed initial RDD, adding the two counts for
 * each word found on both sides.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if the streaming job fails
 */
public static void main(String[] args) throws Exception {
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    SparkConf conf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
    JavaStreamingContext context = new JavaStreamingContext(conf, Durations.seconds(1));

    // Limit root logging to WARN and above.
    Logger root = LogManager.getRootLogger();
    root.setLevel(Level.WARN);

    // Fixed counts that every batch is joined against.
    List<Tuple2<String, Integer>> seedCounts =
            Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
    JavaPairRDD<String, Integer> initialRDD = context.sparkContext().parallelizePairs(seedCounts);

    JavaReceiverInputDStream<String> lines = context.socketTextStream(
            "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);

    JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());

    JavaPairDStream<String, Integer> wordCounts = words
            .mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey((left, right) -> left + right);
    wordCounts.print();

    // Adds the batch count and the initial count of a joined entry.
    PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer> sumBothSides =
            joined -> new Tuple2<>(joined._1(), (joined._2()._1() + joined._2()._2()));

    // Joins one batch with the fixed RDD and collapses each pair of counts.
    Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>> joinWithInitial =
            batch -> {
                JavaPairRDD<String, Integer> combined = batch.join(initialRDD).mapToPair(sumBothSides);
                return combined;
            };

    JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(joinWithInitial);
    joinedDstream.print();

    context.start();
    context.awaitTermination();
}
Example #10
Source File: StateFulProcessingExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
/**
 * Reads JSON flight records from a socket, groups them by flight id, and
 * keeps the records of each flight in {@code mapWithState} state until a
 * record reports the flight as landed. Every update emits the flight id and
 * an average over the stored records.
 *
 * @param args command-line arguments (not used)
 * @throws InterruptedException if waiting for termination is interrupted
 */
public static void main(String[] args) throws InterruptedException {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");

    SparkSession session = SparkSession.builder()
            .master("local[*]")
            .appName("Stateful Streaming Example")
            .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse")
            .getOrCreate();

    JavaSparkContext sparkContext = new JavaSparkContext(session.sparkContext());
    JavaStreamingContext jssc = new JavaStreamingContext(sparkContext, Durations.milliseconds(1000));
    JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999);
    jssc.checkpoint("C:\\Users\\sgulati\\spark-checkpoint");

    // Each incoming line is parsed as JSON into a FlightDetails with a fresh ObjectMapper.
    JavaDStream<FlightDetails> flights =
            inStream.map(json -> new ObjectMapper().readValue(json, FlightDetails.class));

    JavaPairDStream<String, FlightDetails> flightsById =
            flights.mapToPair(flight -> new Tuple2<String, FlightDetails>(flight.getFlightId(), flight));

    Function3<String, Optional<FlightDetails>, State<List<FlightDetails>>, Tuple2<String, Double>> mappingFunc =
            (flightId, curFlightDetail, state) -> {
                List<FlightDetails> history = state.exists() ? state.get() : new ArrayList<>();

                boolean landed = false;
                if (curFlightDetail.isPresent()) {
                    history.add(curFlightDetail.get());
                    landed = curFlightDetail.get().isLanded();
                }

                // NOTE(review): the original local was named avgSpeed, yet the value averaged is
                // getTemperature() — confirm which quantity is intended.
                Double average = history.stream()
                        .mapToDouble(FlightDetails::getTemperature)
                        .average()
                        .orElse(0.0);

                // A landed flight drops its state; otherwise the grown history is stored back.
                if (!landed) {
                    state.update(history);
                } else {
                    state.remove();
                }
                return new Tuple2<String, Double>(flightId, average);
            };

    JavaMapWithStateDStream<String, FlightDetails, List<FlightDetails>, Tuple2<String, Double>> streamWithState =
            flightsById.mapWithState(StateSpec.function(mappingFunc).timeout(Durations.minutes(5)));

    streamWithState.print();
    jssc.start();
    jssc.awaitTermination();
}
Example #11
Source File: JavaRecoverableNetworkWordCount.java From SparkDemo with MIT License | 4 votes |
private static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory, String outputPath) { // If you do not see this printed, that means the StreamingContext has been loaded // from the new checkpoint System.out.println("Creating new context"); final File outputFile = new File(outputPath); if (outputFile.exists()) { outputFile.delete(); } SparkConf sparkConf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount"); // Create the context with a 1 second batch size JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); ssc.checkpoint(checkpointDirectory); // Create a socket stream on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') JavaReceiverInputDStream<String> lines = ssc.socketTextStream(ip, port); JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String x) { return Arrays.asList(SPACE.split(x)).iterator(); } }); JavaPairDStream<String, Integer> wordCounts = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<>(s, 1); } }).reduceByKey(new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() { @Override public void call(JavaPairRDD<String, Integer> rdd, Time time) throws IOException { // Get or register the blacklist Broadcast final Broadcast<List<String>> blacklist = JavaWordBlacklist.getInstance(new JavaSparkContext(rdd.context())); // Get or register the droppedWordsCounter Accumulator final LongAccumulator droppedWordsCounter = JavaDroppedWordsCounter.getInstance(new JavaSparkContext(rdd.context())); // Use blacklist to drop words and use droppedWordsCounter to count them String counts = rdd.filter(new Function<Tuple2<String, Integer>, Boolean>() { 
@Override public Boolean call(Tuple2<String, Integer> wordCount) { if (blacklist.value().contains(wordCount._1())) { droppedWordsCounter.add(wordCount._2()); return false; } else { return true; } } }).collect().toString(); String output = "Counts at time " + time + " " + counts; System.out.println(output); System.out.println("Dropped " + droppedWordsCounter.value() + " word(s) totally"); System.out.println("Appending to " + outputFile.getAbsolutePath()); Files.append(output + "\n", outputFile, Charset.defaultCharset()); } }); return ssc; }
Example #12
Source File: WindowBatchInterval.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) { //Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set System.setProperty("hadoop.home.dir", "E:\\hadoop"); //Logger rootLogger = LogManager.getRootLogger(); //rootLogger.setLevel(Level.WARN); SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]"); JavaSparkContext sc = new JavaSparkContext(conf); JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2)); streamingContext.checkpoint("E:\\hadoop\\checkpoint"); Logger rootLogger = LogManager.getRootLogger(); rootLogger.setLevel(Level.WARN); List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); wordCounts.print(); wordCounts.window(Durations.minutes(8)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); wordCounts.window(Durations.minutes(8),Durations.minutes(2)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); wordCounts.window(Durations.minutes(12),Durations.minutes(8)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); wordCounts.window(Durations.minutes(2),Durations.minutes(2)).countByValue() .foreachRDD(tRDD -> 
tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); wordCounts.window(Durations.minutes(12),Durations.minutes(12)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); //comment these two operation to make it run wordCounts.window(Durations.minutes(5),Durations.minutes(2)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); wordCounts.window(Durations.minutes(10),Durations.minutes(1)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); streamingContext.start(); try { streamingContext.awaitTermination(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
Example #13
Source File: JavaNetworkWordCount.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaNetworkWordCount <hostname> <port>"); System.exit(1); } StreamingExamples.setStreamingLogLevels(); // Create the context with a 1 second batch size SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount"); JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); // Create a JavaReceiverInputDStream on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') // Note that no duplication in storage level only for running locally. // Replication necessary in distributed scenario for fault tolerance. JavaReceiverInputDStream<String> lines = ssc.socketTextStream( args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String x) { return Arrays.asList(SPACE.split(x)).iterator(); } }); JavaPairDStream<String, Integer> wordCounts = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<>(s, 1); } }).reduceByKey(new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.print(); ssc.start(); ssc.awaitTermination(); }
Example #14
Source File: JavaSqlNetworkWordCount.java From SparkDemo with MIT License | 4 votes |
/**
 * Splits text received from a socket into words and, for every batch, runs a
 * SQL word count over those words and shows the result.
 *
 * @param args {@code <hostname> <port>} of the text source
 * @throws Exception if the streaming job fails
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
        System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    // Streaming context with a one-second batch size.
    SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    // Receiver stream on the target ip:port carrying \n delimited text (eg. generated by 'nc').
    // The storage level has no duplication, which is only suitable for running locally;
    // replication is necessary in a distributed scenario for fault tolerance.
    String host = args[0];
    int port = Integer.parseInt(args[1]);
    JavaReceiverInputDStream<String> lines =
            ssc.socketTextStream(host, port, StorageLevels.MEMORY_AND_DISK_SER);

    // Splits one line into words using the SPACE pattern.
    FlatMapFunction<String, String> splitWords = new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String line) {
            return Arrays.asList(SPACE.split(line)).iterator();
        }
    };
    JavaDStream<String> words = lines.flatMap(splitWords);

    // Wraps a word in the JavaRecord bean that the DataFrame is built from.
    final Function<String, JavaRecord> toRecord = new Function<String, JavaRecord>() {
        @Override
        public JavaRecord call(String word) {
            JavaRecord bean = new JavaRecord();
            bean.setWord(word);
            return bean;
        }
    };

    // Turn each batch of words into a DataFrame and query it with SQL.
    words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
        @Override
        public void call(JavaRDD<String> rdd, Time time) {
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

            // JavaRDD[String] -> JavaRDD[bean class] -> DataFrame
            JavaRDD<JavaRecord> records = rdd.map(toRecord);
            Dataset<Row> wordsDataFrame = spark.createDataFrame(records, JavaRecord.class);

            // Expose the DataFrame to SQL as the temporary view "words".
            wordsDataFrame.createOrReplaceTempView("words");

            // Word count over the view, printed below a header carrying the batch time.
            Dataset<Row> wordCountsDataFrame =
                    spark.sql("select word, count(*) as total from words group by word");
            System.out.println("========= " + time + "=========");
            wordCountsDataFrame.show();
        }
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #15
Source File: JavaNetworkWordCount.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { /** * 资源.setMaster("local[2]")必须大于1 一个负责取数据 其他负责计算 */ // if (args.length < 2) { // System.err.println("Usage: JavaNetworkWordCount <hostname> <port>"); // System.exit(1); // } StreamingExamples.setStreamingLogLevels(); // Create the context with a 1 second batch size SparkConf sparkConf = SparkUtils.getLocalSparkConf(JavaNetworkWordCount.class); /* * 创建该对象类似于spark core中的JavaSparkContext * 该对象除了接受SparkConf对象,还接收了一个BatchInterval参数,就算说,每收集多长时间去划分一个人Batch即RDD去执行 */ JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(2)); /* * 首先创建输入DStream,代表一个数据比如这里从socket或KafKa来持续不断的进入实时数据流 * 创建一个监听Socket数据量,RDD里面的每一个元素就是一行行的文本 */ JavaReceiverInputDStream<String> lines = ssc.socketTextStream("192.168.2.1", 9999, StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String x) { return Lists.newArrayList(SPACE.split(x)).iterator(); } }); JavaPairDStream<String, Integer> wordCounts = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }).reduceByKey(new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.print(); ssc.start(); try { ssc.awaitTermination(); } catch (Exception e) { e.printStackTrace(); } }
Example #16
Source File: SparkStreamingPulsarReceiverExample.java From pulsar with Apache License 2.0 | 4 votes |
/**
 * Counts the words in messages received from a Pulsar topic and prints the
 * counts of every 60-second batch.
 *
 * <p>Change from the previous version: the consumer configuration is created
 * with the diamond operator instead of a raw type, which removes the
 * unchecked assignment.
 *
 * @param args {@code <pulsar-service-url> <topic> <sub>}
 * @throws Exception if the streaming job fails
 */
public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.println("Missing parameters!");
        System.err.println("Usage: <pulsar-service-url> <topic> <sub>");
        return;
    }

    String serviceUrl = args[0];
    String inputTopic = args[1];
    String subscription = args[2];

    System.out.println("Parameters:");
    System.out.println("\tServiceUrl:\t" + serviceUrl);
    System.out.println("\tTopic:\t" + inputTopic);
    System.out.println("\tSubscription:\t" + subscription);

    SparkConf sparkConf = new SparkConf().setAppName("Pulsar Spark Example");
    JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(60));

    // Consumer settings: a single topic and the given subscription name.
    ConsumerConfigurationData<byte[]> pulsarConf = new ConsumerConfigurationData<>();
    Set<String> set = new HashSet<>();
    set.add(inputTopic);
    pulsarConf.setTopicNames(set);
    pulsarConf.setSubscriptionName(subscription);

    SparkStreamingPulsarReceiver pulsarReceiver = new SparkStreamingPulsarReceiver(
            serviceUrl,
            pulsarConf,
            new AuthenticationDisabled());

    JavaReceiverInputDStream<byte[]> lineDStream = jsc.receiverStream(pulsarReceiver);

    // Decode each payload as UTF-8 text, split it on spaces, and count the words.
    JavaPairDStream<String, Integer> result = lineDStream.flatMap(x -> {
        String line = new String(x, StandardCharsets.UTF_8);
        List<String> list = Arrays.asList(line.split(" "));
        return list.iterator();
    })
            .mapToPair(x -> new Tuple2<String, Integer>(x, 1))
            .reduceByKey((x, y) -> x + y);

    result.print();

    jsc.start();
    jsc.awaitTermination();
}
Example #17
Source File: CloudPubSubStreamingWordCount.java From spark-on-k8s-gcp-examples with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws InterruptedException { if (args.length != 4) { System.err.println("Usage: CloudPubSubStreamingWordCount <GCP project ID> " + "<Cloud PubSub subscription> <GCS output dir path> <job duration in seconds>"); System.exit(1); } Preconditions.checkArgument( !Strings.isNullOrEmpty(args[0]), "GCP project ID must not be null or empty"); Preconditions.checkArgument( !Strings.isNullOrEmpty(args[1]), "Cloud PubSub topic name must not be empty"); JavaStreamingContext jsc = new JavaStreamingContext( new SparkConf().setAppName("Cloud PubSub Spark Streaming Word Count"), Seconds.apply(30) // Batch duration ); Configuration hadoopConf = jsc.sparkContext().hadoopConfiguration(); // Use service account for authentication. The service account key file is located at the path // specified by the configuration property google.cloud.auth.service.account.json.keyfile. hadoopConf.set( EntriesCredentialConfiguration.BASE_KEY_PREFIX + EntriesCredentialConfiguration.ENABLE_SERVICE_ACCOUNTS_SUFFIX, "true"); // Use the service account Json key file shared with the GCS connector. String serviceAccountJsonKeyFilePath = hadoopConf.get( EntriesCredentialConfiguration.BASE_KEY_PREFIX + EntriesCredentialConfiguration.JSON_KEYFILE_SUFFIX); Preconditions.checkArgument(!Strings.isNullOrEmpty(serviceAccountJsonKeyFilePath), "Service account Json key file path must be specified"); // This will create a subscription to the given topic. 
JavaReceiverInputDStream<SparkPubsubMessage> pubSubStream = PubsubUtils.createStream( jsc, args[0], // GCP project ID args[1], // Cloud PubSub subscription new SparkGCPCredentials.Builder() .jsonServiceAccount(serviceAccountJsonKeyFilePath) .build(), StorageLevel.MEMORY_AND_DISK_SER()); JavaPairDStream<String, Long> wordCounts = pubSubStream .mapToPair(message -> new Tuple2<>(new String(message.getData()), 1L)) .reduceByKey((count1, count2) -> count1 + count2); final String gcsFilePathTemplate = args[2] + "/batch-%d"; wordCounts .mapToPair(tuple -> new Tuple2<>(new Text(tuple._1), new LongWritable(tuple._2))) .foreachRDD(rdd -> rdd .saveAsNewAPIHadoopFile(String.format(gcsFilePathTemplate, rdd.id()), Text.class, LongWritable.class, TextOutputFormat.class)); try { jsc.start(); // Let the job run for the given duration and then terminate it. jsc.awaitTerminationOrTimeout(TimeUnit.SECONDS.toMillis(Long.parseLong(args[3]))); } finally { jsc.stop(true, true); } }
Example #18
Source File: Throughput.java From flink-perf with Apache License 2.0 | 4 votes |
public static void main(String[] args) { SparkConf conf = new SparkConf().setAppName("throughput").setMaster("local[8]"); JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(2000)); JavaReceiverInputDStream<Tuple4<Long, Integer, Long, byte[]>> source = ssc.receiverStream(new Source(StorageLevel.MEMORY_ONLY())); JavaPairDStream<Long, Tuple3<Integer, Long, byte[]>> kvsource = source.mapToPair(new PairFunction<Tuple4<Long, Integer, Long, byte[]>, Long, Tuple3<Integer, Long, byte[]>>() { @Override public Tuple2<Long, Tuple3<Integer, Long, byte[]>> call(Tuple4<Long, Integer, Long, byte[]> longIntegerLongTuple4) throws Exception { return new Tuple2<Long, Tuple3<Integer, Long, byte[]>>(longIntegerLongTuple4._1(), new Tuple3<Integer, Long, byte[]>(longIntegerLongTuple4._2(), longIntegerLongTuple4._3(), longIntegerLongTuple4._4())); } }); JavaDStream<Long> res = kvsource.repartition(3).mapPartitions(new FlatMapFunction<Iterator<Tuple2<Long,Tuple3<Integer,Long,byte[]>>>, Long>() { @Override public Iterable<Long> call(Iterator<Tuple2<Long, Tuple3<Integer, Long, byte[]>>> tuple2Iterator) throws Exception { long start = System.currentTimeMillis(); long received = 0; while(tuple2Iterator.hasNext()) { received++; Tuple2<Long, Tuple3<Integer, Long, byte[]>> el = tuple2Iterator.next(); if (el._2()._2() != 0) { long lat = System.currentTimeMillis() - el._2()._2(); System.out.println("Latency " + lat + " ms"); } } long sinceMs = (System.currentTimeMillis() - start); System.out.println("Finished Batch. 
Processed "+received+" elements in "+sinceMs+" ms."); return new Iterable<Long>() { @Override public Iterator<Long> iterator() { return new Iterator<Long>() { @Override public boolean hasNext() { return false; } @Override public Long next() { return 1L; } @Override public void remove() { } }; } }; } /* @Override public Long call(Tuple2<Long, Tuple3<Integer, Long, byte[]>> v1) throws Exception { // System.out.println("Recevied " + v1); if (start == 0) { } received++; if (received % logfreq == 0) { if (sinceSec == 0) { System.out.println("received " + received + " elements since 0"); return 0L; } System.out.println("Received " + received + " elements since " + sinceSec + ". " + "Elements per second " + received / sinceSec + ", GB received " + ((received * (8 + 4 + 12)) / 1024 / 1024 / 1024)); } if (v1._2()._2() != 0) { long lat = System.currentTimeMillis() - v1._2()._2(); System.out.println("Latency " + lat + " ms"); } return received; } */ }); //res.print(); /*res.foreachRDD(new Function2<JavaRDD<Long>, Time, Void>() { @Override public Void call(JavaRDD<Long> integerJavaRDD, Time t) throws Exception { integerJavaRDD.saveAsTextFile("/home/robert/flink-workdir/flink-perf/out/"+t.toString()); return null; } }); */ res.print(); // res.print(); ssc.start(); }
Example #19
Source File: StateLess.java From sparkResearch with Apache License 2.0 | 4 votes |
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("StateLess"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); JavaReceiverInputDStream<String> inputDStream = streamingContext.socketTextStream("localhost", 8080); JavaDStream<String> dStream = inputDStream.flatMap((FlatMapFunction<String, String>) s -> Arrays.asList(SPACE.split(s)).iterator()); JavaPairDStream<String, Integer> pairDStream = dStream.mapToPair(new LogTuple()); JavaPairDStream<String, Integer> result = pairDStream.reduceByKey(new ReduceIsKey()); //JOIN JavaPairDStream<String, Integer> pairDStream1 = dStream.mapToPair(new LogTuple()); JavaPairDStream<String, Integer> result1 = pairDStream.reduceByKey(new ReduceIsKey()); JavaPairDStream<String, Tuple2<Integer, Integer>> c = result.join(result); result.foreachRDD(rdd -> { rdd.foreachPartition(partitionOfRecords -> { Connection connection = ConnectionPool.getConnection(); Tuple2<String, Integer> wordCount; while (partitionOfRecords.hasNext()) { wordCount = partitionOfRecords.next(); String sql = "insert into wordcount(word,count) " + "values('" + wordCount._1 + "'," + wordCount._2 + ")"; Statement stmt = connection.createStatement(); stmt.executeUpdate(sql); } ConnectionPool.returnConnection(connection); }); }); try { streamingContext.start(); streamingContext.awaitTermination(); streamingContext.close(); } catch (InterruptedException e) { e.printStackTrace(); } }
Example #20
Source File: StateLessProcessingExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 3 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("stateless Streaming Example") .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()), Durations.milliseconds(1000)); JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999); JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> { ObjectMapper mapper = new ObjectMapper(); return mapper.readValue(x, FlightDetails.class); }); //flightDetailsStream.print(); //flightDetailsStream.foreachRDD((VoidFunction<JavaRDD<FlightDetails>>) rdd -> rdd.saveAsTextFile("hdfs://namenode:port/path")); JavaDStream<FlightDetails> window = flightDetailsStream.window(Durations.minutes(5),Durations.minutes(1)); JavaPairDStream<String, Double> transfomedWindow = window.mapToPair(f->new Tuple2<String,Double>(f.getFlightId(),f.getTemperature())). mapValues(t->new Tuple2<Double,Integer>(t,1)) .reduceByKey((t1, t2) -> new Tuple2<Double, Integer>(t1._1()+t2._1(), t1._2()+t2._2())).mapValues(t -> t._1()/t._2()); transfomedWindow.cache(); transfomedWindow.print(); jssc.start(); jssc.awaitTermination(); }
Example #21
Source File: Join.java From sparkResearch with Apache License 2.0 | 3 votes |
/**
 * Creates a local streaming context with 1-second batches and registers two socket text
 * streams on {@code localhost}, ports 8080 and 8081. The context is not started here.
 *
 * @param args unused
 */
public static void main(String[] args) {
    SparkConf conf = new SparkConf()
            .setMaster("local[2]")
            .setAppName("StateLess");
    JavaStreamingContext context = new JavaStreamingContext(conf, Durations.seconds(1));

    // Two independent receivers, one per port.
    JavaReceiverInputDStream<String> firstSocketStream = context.socketTextStream("localhost", 8080);
    JavaReceiverInputDStream<String> secondSocketStream = context.socketTextStream("localhost", 8081);
}