Java Code Examples for org.apache.spark.streaming.api.java.JavaDStream#mapToPair()
The following examples show how to use
org.apache.spark.streaming.api.java.JavaDStream#mapToPair().
Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
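Before the project examples, here is a minimal, self-contained sketch of mapToPair() turning a socket text stream into (word, 1) pairs and counting them per batch. The host, port, and application name are placeholders, not taken from any of the projects below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

public class MapToPairExample {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("mapToPairExample");
        JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // Split each line into words, then map every word to a (word, 1) pair
        JavaDStream<String> lines = ssc.socketTextStream("localhost", 9999);
        JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        JavaPairDStream<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));

        // Sum the counts per word and print each batch
        pairs.reduceByKey(Integer::sum).print();

        ssc.start();
        ssc.awaitTermination();
    }
}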
Example 1
Source File: ReduceByKeyAndWindow.java From sparkResearch with Apache License 2.0
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10)); //检查点设置 streamingContext.checkpoint("hdfs://localhost:9300"); //数据源 JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080); JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp()); JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(), new SubtractLongs(), Durations.seconds(30), Durations.seconds(10)); try { streamingContext.start(); streamingContext.awaitTermination(); } catch (InterruptedException e) { e.printStackTrace(); } }
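The helper classes GetIp, AddLongs, and SubtractLongs are not shown in the snippet above. A minimal sketch of what they might look like, assuming GetIp simply emits each incoming line as the key with an initial count of 1 (the real project's parsing logic may differ):

import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

// Hypothetical implementations of the helpers referenced above.
class GetIp implements PairFunction<String, String, Long> {
    @Override
    public Tuple2<String, Long> call(String line) {
        // Assumption: the incoming line is (or contains) the IP address to count.
        return new Tuple2<>(line, 1L);
    }
}

class AddLongs implements Function2<Long, Long, Long> {
    @Override
    public Long call(Long a, Long b) {
        return a + b; // combine counts entering the window
    }
}

class SubtractLongs implements Function2<Long, Long, Long> {
    @Override
    public Long call(Long a, Long b) {
        return a - b; // remove counts leaving the window (inverse function)
    }
}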
Example 2
Source File: BlurLoadSparkProcessor.java From incubator-retired-blur with Apache License 2.0
public void run() throws IOException {
    SparkConf conf = new SparkConf();
    conf.setAppName(getAppName());
    conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
    JavaSparkUtil.packProjectJars(conf);
    setupSparkConf(conf);

    JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
    List<JavaDStream<T>> streamsList = getStreamsList(ssc);

    // Union all the streams if there is more than 1 stream
    JavaDStream<T> streams = unionStreams(ssc, streamsList);

    JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(new PairFunction<T, String, RowMutation>() {
        public Tuple2<String, RowMutation> call(T t) {
            RowMutation rowMutation = convert(t);
            return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
        }
    });

    pairDStream.foreachRDD(getFunction());
    ssc.start();
    ssc.awaitTermination();
}
Example 3
Source File: RealtimeTrafficDataProcessor.java From lambda-arch with Apache License 2.0
/**
 * Method to get the vehicles which are within the radius of a POI and their distance from the POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues       broadcast variable containing POI coordinates, route, and vehicle types to monitor.
 */
public void processPOIData(
        JavaDStream<IoTData> nonFilteredIotDataStream,
        Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues
) {
    // Filter by routeId, vehicleType, and POI range
    JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
            .filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
                    && iot.getVehicleType().contains(broadcastPOIValues.value()._3())
                    && GeoDistanceCalculator.isInPOIRadius(
                            Double.valueOf(iot.getLatitude()),
                            Double.valueOf(iot.getLongitude()),
                            broadcastPOIValues.value()._1().getLatitude(),
                            broadcastPOIValues.value()._1().getLongitude(),
                            broadcastPOIValues.value()._1().getRadius())));

    // Pair each filtered record with the POI
    JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered.mapToPair(
            iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1())
    );

    // Transform to a DStream of POITrafficData
    JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    // Map Cassandra table columns
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("vehicleId", "vehicleid");
    columnNameMappings.put("distance", "distance");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("timeStamp", "timestamp");

    // Call the CassandraStreamingJavaUtil function to save to the DB
    javaFunctions(trafficDStream)
            .writerBuilder(
                    "traffickeyspace",
                    "poi_traffic",
                    CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings)
            )
            .withConstantTTL(120) // keep data for 2 minutes
            .saveToCassandra();
}
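The pair stream above is mapped through poiTrafficDataFunc, which is defined elsewhere in the class. A minimal sketch of such a function, assuming POITrafficData is a plain bean whose setters match the column mappings above and that GeoDistanceCalculator exposes a getDistance(lat1, lon1, lat2, lon2) helper (both assumptions, not confirmed by the snippet):

import org.apache.spark.api.java.function.Function;

import scala.Tuple2;

// Hypothetical stand-in for the project's poiTrafficDataFunc; field and
// method names below are assumptions based on the column mappings above.
private final Function<Tuple2<IoTData, POIData>, POITrafficData> poiTrafficDataFunc = tuple -> {
    IoTData iot = tuple._1();
    POIData poi = tuple._2();

    POITrafficData poiTraffic = new POITrafficData();
    poiTraffic.setVehicleId(iot.getVehicleId());
    poiTraffic.setVehicleType(iot.getVehicleType());
    poiTraffic.setTimeStamp(iot.getTimestamp());
    // Assumed helper: distance between the vehicle and the POI coordinates.
    poiTraffic.setDistance(GeoDistanceCalculator.getDistance(
            Double.valueOf(iot.getLatitude()), Double.valueOf(iot.getLongitude()),
            poi.getLatitude(), poi.getLongitude()));
    return poiTraffic;
};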
Example 4
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0
/**
 * Method to perform the query given an input JavaDStream of JSON
 */
public void performQuery(JavaDStream<MapWritable> input)
{
    logger.info("Performing query: ");

    // Process non-overlapping windows of data of duration windowLength seconds
    // If we are using queue streams, there is no need to window
    if (!useQueueStream)
    {
        // window() returns a new DStream, so reassign it for the windowing to take effect
        input = input.window(Durations.seconds(windowLength), Durations.seconds(windowLength));
    }

    // Extract the selectors for each dataElement based upon the query type
    // and perform a keyed hash of the selectors
    JavaPairDStream<Integer, List<BigInteger>> selectorHashToDocRDD = input.mapToPair(new HashSelectorsAndPartitionData(bVars));

    // Group by hashed selector (row) -- can combine with the line above, separating for testing and benchmarking...
    JavaPairDStream<Integer, Iterable<List<BigInteger>>> selectorGroupRDD = selectorHashToDocRDD.groupByKey();

    // Calculate the encrypted row values for each row, emit <colNum, colVal> for each row
    JavaPairDStream<Long, BigInteger> encRowRDD = selectorGroupRDD.flatMapToPair(new EncRowCalc(accum, bVars));

    // Multiply the column values by colNum: emit <colNum, finalColVal> and write the final result object
    encryptedColumnCalc(encRowRDD);

    // Start the streaming computation
    start();
}
Example 5
Source File: IoTTrafficDataProcessor.java From iot-traffic-monitor with Apache License 2.0
/**
 * Method to get the vehicles which are within the radius of a POI and their distance from the POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues       broadcast variable containing POI coordinates, route, and vehicle types to monitor.
 */
public void processPOIData(JavaDStream<IoTData> nonFilteredIotDataStream,
        Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues) {
    // Filter by routeId, vehicleType, and POI range
    JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
            .filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
                    && iot.getVehicleType().contains(broadcastPOIValues.value()._3())
                    && GeoDistanceCalculator.isInPOIRadius(
                            Double.valueOf(iot.getLatitude()),
                            Double.valueOf(iot.getLongitude()),
                            broadcastPOIValues.value()._1().getLatitude(),
                            broadcastPOIValues.value()._1().getLongitude(),
                            broadcastPOIValues.value()._1().getRadius())));

    // Pair each filtered record with the POI
    JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered
            .mapToPair(iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1()));

    // Transform to a DStream of POITrafficData
    JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    // Map Cassandra table columns
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("vehicleId", "vehicleid");
    columnNameMappings.put("distance", "distance");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("timeStamp", "timestamp");

    // Call the CassandraStreamingJavaUtil function to save to the DB
    javaFunctions(trafficDStream)
            .writerBuilder("traffickeyspace", "poi_traffic",
                    CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings))
            .withConstantTTL(120) // keep data for 2 minutes
            .saveToCassandra();
}
Example 6
Source File: SparkUtils.java From cxf with Apache License 2.0
public static JavaPairDStream<String, Integer> createOutputDStream(
        JavaDStream<String> receiverStream, boolean withId) {
    final JavaDStream<String> words =
            receiverStream.flatMap(x -> withId ? splitInputStringWithId(x) : splitInputString(x));

    final JavaPairDStream<String, Integer> pairs = words.mapToPair(s -> {
        return new Tuple2<String, Integer>(s, 1);
    });

    return pairs.reduceByKey((i1, i2) -> {
        return i1 + i2;
    });
}
Example 7
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-streaming-test-scala-dyn-id-write", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-streaming-test-scala-dyn-id-write", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/3"));
    assertTrue(RestUtils.exists(docEndpoint + "/4"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
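ExtractIDFunction is a helper defined elsewhere in the test suite. Judging by the assertions on /3 and /4, it keys each document by its "number" field before saveToEsWithMeta is called. A minimal sketch, assuming that behavior (the real helper may differ in detail):

import java.util.Map;

import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

// Hypothetical reconstruction of the test helper.
public static class ExtractIDFunction implements PairFunction<Map<String, Object>, Integer, Map<String, Object>> {
    @Override
    public Tuple2<Integer, Map<String, Object>> call(Map<String, Object> doc) {
        // Use the "number" field as the document id for saveToEsWithMeta
        return new Tuple2<>((Integer) doc.get("number"), doc);
    }
}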
Example 8
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-test-scala-dyn-id-write", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-test-scala-dyn-id-write", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/3"));
    assertTrue(RestUtils.exists(docEndpoint + "/4"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
Example 9
Source File: StateLess.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("StateLess");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    JavaReceiverInputDStream<String> inputDStream = streamingContext.socketTextStream("localhost", 8080);
    JavaDStream<String> dStream = inputDStream.flatMap(
            (FlatMapFunction<String, String>) s -> Arrays.asList(SPACE.split(s)).iterator());

    JavaPairDStream<String, Integer> pairDStream = dStream.mapToPair(new LogTuple());
    JavaPairDStream<String, Integer> result = pairDStream.reduceByKey(new ReduceIsKey());

    // JOIN: build a second word-count stream and join it with the first
    JavaPairDStream<String, Integer> pairDStream1 = dStream.mapToPair(new LogTuple());
    JavaPairDStream<String, Integer> result1 = pairDStream1.reduceByKey(new ReduceIsKey());
    JavaPairDStream<String, Tuple2<Integer, Integer>> joined = result.join(result1);

    result.foreachRDD(rdd -> {
        rdd.foreachPartition(partitionOfRecords -> {
            Connection connection = ConnectionPool.getConnection();
            Tuple2<String, Integer> wordCount;
            while (partitionOfRecords.hasNext()) {
                wordCount = partitionOfRecords.next();
                String sql = "insert into wordcount(word,count) "
                        + "values('" + wordCount._1 + "'," + wordCount._2 + ")";
                Statement stmt = connection.createStatement();
                stmt.executeUpdate(sql);
            }
            ConnectionPool.returnConnection(connection);
        });
    });

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
        streamingContext.close();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
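LogTuple and ReduceIsKey are small helpers from the same project that are not shown here. A plausible sketch, assuming LogTuple emits each word with a count of 1 and ReduceIsKey simply sums counts (the originals may differ):

import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

// Hypothetical implementations of the helpers used above.
class LogTuple implements PairFunction<String, String, Integer> {
    @Override
    public Tuple2<String, Integer> call(String word) {
        return new Tuple2<>(word, 1); // pair each word with an initial count of 1
    }
}

class ReduceIsKey implements Function2<Integer, Integer, Integer> {
    @Override
    public Integer call(Integer a, Integer b) {
        return a + b; // sum the counts for a given key
    }
}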
Example 10
Source File: SparkMLTrainingAndScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.sql.caseSensitive", CASE_SENSITIVE); JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); JavaDStream<String> meetupStreamValues = meetupStream.map(v -> { return v.value(); }); // Prepare the training data as strings of type: (y,[x1,x2,x3,...,xn]) // Where n is the number of features, y is a binary label, // and n must be the same for train and test. // e.g. "(response, [group_lat, group_long])"; JavaDStream<String> trainData = meetupStreamValues.map(e -> { JSONParser jsonParser = new JSONParser(); JSONObject json = (JSONObject)jsonParser.parse(e); String result = "(" + (String.valueOf(json.get("response")).equals("yes") ? "1.0,[":"0.0,[") + ((JSONObject)json.get("group")).get("group_lat") + "," + ((JSONObject)json.get("group")).get("group_lon") + "])"; return result; }); trainData.print(); JavaDStream<LabeledPoint> labeledPoints = trainData.map(LabeledPoint::parse); StreamingLogisticRegressionWithSGD streamingLogisticRegressionWithSGD = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(2)); streamingLogisticRegressionWithSGD.trainOn(labeledPoints); JavaPairDStream<Double, Vector> values = labeledPoints.mapToPair(f -> new Tuple2<>(f.label(), f.features())); streamingLogisticRegressionWithSGD.predictOnValues(values).print(); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }
Example 11
Source File: StateFulProcessingExample.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils"); SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("Stateful Streaming Example") .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); JavaStreamingContext jssc= new JavaStreamingContext(new JavaSparkContext(sparkSession.sparkContext()), Durations.milliseconds(1000)); JavaReceiverInputDStream<String> inStream = jssc.socketTextStream("10.204.136.223", 9999); jssc.checkpoint("C:\\Users\\sgulati\\spark-checkpoint"); JavaDStream<FlightDetails> flightDetailsStream = inStream.map(x -> { ObjectMapper mapper = new ObjectMapper(); return mapper.readValue(x, FlightDetails.class); }); JavaPairDStream<String, FlightDetails> flightDetailsPairStream = flightDetailsStream .mapToPair(f -> new Tuple2<String, FlightDetails>(f.getFlightId(), f)); Function3<String, Optional<FlightDetails>, State<List<FlightDetails>>, Tuple2<String, Double>> mappingFunc = ( flightId, curFlightDetail, state) -> { List<FlightDetails> details = state.exists() ? state.get() : new ArrayList<>(); boolean isLanded = false; if (curFlightDetail.isPresent()) { details.add(curFlightDetail.get()); if (curFlightDetail.get().isLanded()) { isLanded = true; } } Double avgSpeed = details.stream().mapToDouble(f -> f.getTemperature()).average().orElse(0.0); if (isLanded) { state.remove(); } else { state.update(details); } return new Tuple2<String, Double>(flightId, avgSpeed); }; JavaMapWithStateDStream<String, FlightDetails, List<FlightDetails>, Tuple2<String, Double>> streamWithState = flightDetailsPairStream .mapWithState(StateSpec.function(mappingFunc).timeout(Durations.minutes(5))); streamWithState.print(); jssc.start(); jssc.awaitTermination(); }
Example 12
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0
@Test
public void testEsRDDWriteWithDynamicMapMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("id", 5);
    doc1.put("version", "3");
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("id", 6);
    doc1.put("version", "5");
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-streaming-test-scala-dyn-id-write-map", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-streaming-test-scala-dyn-id-write-map", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Map<Metadata, Object>, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractMetaMap());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/5"));
    assertTrue(RestUtils.exists(docEndpoint + "/6"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
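ExtractMetaMap is another test helper not shown in the listing. Based on the id and version fields above and the assertions on /5 and /6, it presumably builds a metadata map keyed by the Metadata enum for each document. A minimal sketch, assuming that behavior (the real helper may differ):

import java.util.HashMap;
import java.util.Map;

import org.apache.spark.api.java.function.PairFunction;
import org.elasticsearch.spark.rdd.Metadata;

import scala.Tuple2;

// Hypothetical reconstruction of the test helper.
public static class ExtractMetaMap implements PairFunction<Map<String, Object>, Map<Metadata, Object>, Map<String, Object>> {
    @Override
    public Tuple2<Map<Metadata, Object>, Map<String, Object>> call(Map<String, Object> doc) {
        Map<Metadata, Object> meta = new HashMap<>();
        meta.put(Metadata.ID, doc.get("id"));           // document id (5 or 6 above)
        meta.put(Metadata.VERSION, doc.get("version")); // external version, if present
        return new Tuple2<>(meta, doc);
    }
}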
Example 13
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0
@Test
public void testEsRDDWriteWithDynamicMapMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("id", 5);
    doc1.put("version", "3");
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("id", 6);
    doc1.put("version", "5");
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-test-scala-dyn-id-write-map", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-test-scala-dyn-id-write-map", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Map<Metadata, Object>, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractMetaMap());

    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/5"));
    assertTrue(RestUtils.exists(docEndpoint + "/6"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}