org.apache.spark.streaming.api.java.JavaPairDStream Java Examples
The following examples show how to use
org.apache.spark.streaming.api.java.JavaPairDStream.
Each example is taken from an open-source project; the project, source file, and license are listed in the header above the code.
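Before the project examples, here is a minimal, self-contained sketch of the most common JavaPairDStream pattern: pair up records with mapToPair, aggregate with reduceByKey, and print each batch. The application name, host, port, and batch interval below are illustrative assumptions, not taken from any of the projects that follow.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.StorageLevels;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

public class JavaPairDStreamSketch {
    public static void main(String[] args) throws InterruptedException {
        // Local two-thread context with a 1-second batch interval (illustrative values).
        SparkConf conf = new SparkConf().setAppName("JavaPairDStreamSketch").setMaster("local[2]");
        JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(1));

        // Read lines from a socket; host and port are placeholders.
        JavaReceiverInputDStream<String> lines =
                ssc.socketTextStream("localhost", 9999, StorageLevels.MEMORY_AND_DISK_SER);

        // Split lines into words, pair each word with a count of 1, then sum per key.
        JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        JavaPairDStream<String, Integer> counts = words
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((a, b) -> a + b);

        counts.print();
        ssc.start();
        ssc.awaitTermination();
    }
}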
Example #1
Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0 | 8 votes |
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000)); //设置检查点 streamingContext.checkpoint("HDFS URL"); Map<String, Integer> topicThread = new HashMap<>(1); topicThread.put(TOPIC, THREAD); JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread); JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator()); //统计 JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2); try { result.print(); streamingContext.start(); streamingContext.awaitTermination(); } catch (InterruptedException e) { e.printStackTrace(); } }
Example #2
Source File: StreamingEngine.java From spark-streaming-direct-kafka with Apache License 2.0 | 6 votes |
public void start() {
    SparkConf sparkConf = getSparkConf();
    streamingContext = new JavaStreamingContext(sparkConf,
            Durations.seconds(Long.parseLong(config.getStreamingBatchIntervalInSec())));

    JavaInputDStream<MessageAndMetadata<String, byte[]>> dStream = buildInputDStream(streamingContext);
    JavaPairDStream<String, byte[]> pairDStream = dStream.mapToPair(km -> new Tuple2<>(km.key(), km.message()));

    pairDStream.foreachRDD(new ProcessStreamingData<>(config)); // process data
    dStream.foreachRDD(new UpdateOffsetsFn<>(config.getKafkaGroupId(), config.getZkOffsetManager()));
    streamingContext.start();
}
Example #3
Source File: ReduceByKeyAndWindow.java From sparkResearch with Apache License 2.0 | 6 votes |
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10)); //检查点设置 streamingContext.checkpoint("hdfs://localhost:9300"); //数据源 JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080); JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp()); JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(), new SubtractLongs(), Durations.seconds(30), Durations.seconds(10)); try { streamingContext.start(); streamingContext.awaitTermination(); } catch (InterruptedException e) { e.printStackTrace(); } }
Example #4
Source File: BlurLoadSparkProcessor.java From incubator-retired-blur with Apache License 2.0 | 6 votes |
public void run() throws IOException {
    SparkConf conf = new SparkConf();
    conf.setAppName(getAppName());
    conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
    JavaSparkUtil.packProjectJars(conf);
    setupSparkConf(conf);

    JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
    List<JavaDStream<T>> streamsList = getStreamsList(ssc);

    // Union all the streams if there is more than 1 stream
    JavaDStream<T> streams = unionStreams(ssc, streamsList);

    JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(new PairFunction<T, String, RowMutation>() {
        public Tuple2<String, RowMutation> call(T t) {
            RowMutation rowMutation = convert(t);
            return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
        }
    });

    pairDStream.foreachRDD(getFunction());
    ssc.start();
    ssc.awaitTermination();
}
Example #5
Source File: IoTTrafficDataProcessor.java From iot-traffic-monitor with Apache License 2.0 | 6 votes |
/**
 * Method to get window traffic counts of different type of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // reduce by key and window (30 sec window and 10 sec slide).
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
            .reduceByKeyAndWindow((a, b) -> a + b, Durations.seconds(30), Durations.seconds(10));

    // Transform to dstream of TrafficData
    JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "window_traffic",
            CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)).saveToCassandra();
}
Example #6
Source File: StreamingService.java From cxf with Apache License 2.0 | 6 votes |
private void processStreamOneWay(List<String> inputStrings) { try { SparkConf sparkConf = new SparkConf().setMaster("local[*]") .setAppName("JAX-RS Spark Connect OneWay " + SparkUtils.getRandomId()); JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); JavaDStream<String> receiverStream = null; if ("queue".equals(receiverType)) { Queue<JavaRDD<String>> rddQueue = new LinkedList<>(); for (int i = 0; i < 30; i++) { rddQueue.add(jssc.sparkContext().parallelize(inputStrings)); } receiverStream = jssc.queueStream(rddQueue); } else { receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings)); } JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false); wordCounts.foreachRDD(new PrintOutputFunction(jssc)); jssc.start(); } catch (Exception ex) { // ignore } }
Example #7
Source File: SparkStreamingJob.java From zipkin-sparkstreaming with Apache License 2.0 | 6 votes |
static void streamSpansToStorage(
        JavaDStream<byte[]> stream,
        ReadSpans readSpans,
        AdjustAndConsumeSpansSharingTraceId adjustAndConsumeSpansSharingTraceId
) {
    JavaDStream<Span> spans = stream.flatMap(readSpans);

    // TODO: plug in some filter to drop spans regardless of trace ID
    // spans = spans.filter(spanFilter);

    JavaPairDStream<String, Iterable<Span>> tracesById = spans
            .mapToPair(s -> new Tuple2<>(Util.toLowerHex(s.traceIdHigh, s.traceId), s))
            .groupByKey();

    tracesById.foreachRDD(rdd -> {
        rdd.values().foreachPartition(adjustAndConsumeSpansSharingTraceId);
    });
}
Example #8
Source File: IoTTrafficDataProcessor.java From iot-traffic-monitor with Apache License 2.0 | 5 votes |
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(JavaDStream<IoTData> nonFilteredIotDataStream, Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues) {
    // Filter by routeId, vehicleType and in POI range
    JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
            .filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
                    && iot.getVehicleType().contains(broadcastPOIValues.value()._3())
                    && GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
                            Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
                            broadcastPOIValues.value()._1().getLongitude(),
                            broadcastPOIValues.value()._1().getRadius())));

    // pair with poi
    JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered
            .mapToPair(iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1()));

    // Transform to dstream of POITrafficData
    JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("vehicleId", "vehicleid");
    columnNameMappings.put("distance", "distance");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("timeStamp", "timestamp");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream)
            .writerBuilder("traffickeyspace", "poi_traffic", CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings))
            .withConstantTTL(120) // keeping data for 2 minutes
            .saveToCassandra();
}
Example #9
Source File: RealtimeTrafficDataProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Method to get window traffic counts of different type of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // reduce by key and window (30 sec window and 10 sec slide).
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(
                    new AggregateKey(iot.getRouteId(), iot.getVehicleType()),
                    1L
            ))
            .reduceByKeyAndWindow((a, b) -> a + b,
                    Durations.seconds(30),
                    Durations.seconds(10)
            );

    // Transform to dstream of TrafficData
    JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
            "traffickeyspace",
            "window_traffic",
            CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
Example #10
Source File: RealTimeHeatMapProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
public void processHeatMap(JavaDStream<IoTData> filteredIotDataStream) throws IOException { JavaDStream<Measurement> measurements = iotDataToMeasurements(filteredIotDataStream); JavaDStream<Measurement> measurementsWithRoundedCoordinates = roundCoordinates(measurements); JavaPairDStream<Coordinate, Integer> counts = countPerGridBox(measurementsWithRoundedCoordinates); JavaDStream<HeatMapData> heatMapStream = getHeatMap(counts); save(heatMapStream); }
Example #11
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-streaming-test-scala-dyn-id-write", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-streaming-test-scala-dyn-id-write", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());
    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);

    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/3"));
    assertTrue(RestUtils.exists(docEndpoint + "/4"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
Example #12
Source File: RealTimeHeatMapProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Reduces the dataset by counting the number of measurements for a specific grid box (rounded coordinate)
 *
 * @param measurements | The dataset of measurements
 * @return A set of tuples linking rounded coordinates to their number of occurrences
 */
private JavaPairDStream<Coordinate, Integer> countPerGridBox(JavaDStream<Measurement> measurements) {
    // reduce by key and window (30 sec window and 10 sec slide).
    return measurements.mapToPair(
            measurement -> new Tuple2<>(
                    measurement.getRoundedCoordinate(),
                    1
            )
    ).reduceByKeyAndWindow((a, b) -> a + b,
            Durations.seconds(30),
            Durations.seconds(10)
    );
}
Example #13
Source File: RealtimeTrafficDataProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Method to get total traffic counts of different type of vehicles for each route.
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processTotalTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // We need to get count of vehicle group by routeId and vehicleType
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
            .reduceByKey((a, b) -> a + b);

    // Need to keep state for total count
    StateSpec<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> stateSpec =
            StateSpec.function(totalSumFunc).timeout(Durations.seconds(3600));

    JavaMapWithStateDStream<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> countDStreamWithStatePair =
            countDStreamPair.mapWithState(stateSpec); // maintain state for one hour

    // Transform to dstream of TrafficData
    JavaDStream<Tuple2<AggregateKey, Long>> countDStream = countDStreamWithStatePair.map(tuple2 -> tuple2);
    JavaDStream<TotalTrafficData> trafficDStream = countDStream.map(totalTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
            "traffickeyspace",
            "total_traffic",
            CassandraJavaUtil.mapToRow(TotalTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
Example #14
Source File: RealtimeTrafficDataProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(
        JavaDStream<IoTData> nonFilteredIotDataStream,
        Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues
) {
    // Filter by routeId, vehicleType and in POI range
    JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
            .filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
                    && iot.getVehicleType().contains(broadcastPOIValues.value()._3())
                    && GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
                            Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
                            broadcastPOIValues.value()._1().getLongitude(),
                            broadcastPOIValues.value()._1().getRadius())));

    // pair with poi
    JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered.mapToPair(
            iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1())
    );

    // Transform to dstream of POITrafficData
    JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("vehicleId", "vehicleid");
    columnNameMappings.put("distance", "distance");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("timeStamp", "timestamp");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream)
            .writerBuilder(
                    "traffickeyspace",
                    "poi_traffic",
                    CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings)
            )
            .withConstantTTL(120) // keeping data for 2 minutes
            .saveToCassandra();
}
Example #15
Source File: FileStreamingEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) {
    // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    //Logger rootLogger = LogManager.getRootLogger();
    //rootLogger.setLevel(Level.WARN);

    SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
    String inputDirectory = "E:\\hadoop\\streamFolder\\";

    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(1));
    // streamingContext.checkpoint("E:\\hadoop\\checkpoint");

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    JavaDStream<String> streamfile = streamingContext.textFileStream(inputDirectory);
    streamfile.print();
    streamfile.foreachRDD(rdd -> rdd.foreach(x -> System.out.println(x)));

    JavaPairDStream<LongWritable, Text> streamedFile = streamingContext.fileStream(inputDirectory,
            LongWritable.class, Text.class, TextInputFormat.class);
    streamedFile.print();

    streamingContext.start();

    try {
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #16
Source File: WordCountTransformOpEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); Logger rootLogger = LogManager.getRootLogger(); rootLogger.setLevel(Level.WARN); List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); wordCounts.print(); JavaPairDStream<String, Integer> joinedDstream = wordCounts .transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() { @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception { JavaPairRDD<String, Integer> modRDD = rdd.join(initialRDD).mapToPair( new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() { @Override public Tuple2<String, Integer> call( Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception { return new Tuple2<>(joinedTuple._1(),(joinedTuple._2()._1() + joinedTuple._2()._2())); } }); return modRDD; } }); joinedDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #17
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation") public static void persists(JavaPairDStream<String, Iterable<Long>> partitonOffset, Properties props) { partitonOffset.foreachRDD(new VoidFunction<JavaPairRDD<String,Iterable<Long>>>() { @Override public void call(JavaPairRDD<String, Iterable<Long>> po) throws Exception { List<Tuple2<String, Iterable<Long>>> poList = po.collect(); doPersists(poList, props); } }); }
Example #18
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
public static <T> DStream<Tuple2<String, Iterable<Long>>> getPartitionOffset( DStream<MessageAndMetadata<T>> unionStreams, Properties props) { ClassTag<MessageAndMetadata<T>> messageMetaClassTag = ScalaUtil.<T>getMessageAndMetadataClassTag(); JavaDStream<MessageAndMetadata<T>> javaDStream = new JavaDStream<MessageAndMetadata<T>>(unionStreams, messageMetaClassTag); JavaPairDStream<String, Iterable<Long>> partitonOffset = getPartitionOffset(javaDStream, props); return partitonOffset.dstream(); }
Example #19
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
/**
 * Method to perform the query given an input JavaDStream of JSON
 */
public void performQuery(JavaDStream<MapWritable> input) {
    logger.info("Performing query: ");

    // Process non-overlapping windows of data of duration windowLength seconds
    // If we are using queue streams, there is no need to window
    if (!useQueueStream) {
        input = input.window(Durations.seconds(windowLength), Durations.seconds(windowLength));
    }

    // Extract the selectors for each dataElement based upon the query type
    // and perform a keyed hash of the selectors
    JavaPairDStream<Integer, List<BigInteger>> selectorHashToDocRDD = input.mapToPair(new HashSelectorsAndPartitionData(bVars));

    // Group by hashed selector (row) -- can combine with the line above, separating for testing and benchmarking...
    JavaPairDStream<Integer, Iterable<List<BigInteger>>> selectorGroupRDD = selectorHashToDocRDD.groupByKey();

    // Calculate the encrypted row values for each row, emit <colNum, colVal> for each row
    JavaPairDStream<Long, BigInteger> encRowRDD = selectorGroupRDD.flatMapToPair(new EncRowCalc(accum, bVars));

    // Multiply the column values by colNum: emit <colNum, finalColVal> and write the final result object
    encryptedColumnCalc(encRowRDD);

    // Start the streaming computation
    start();
}
Example #20
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-test-scala-dyn-id-write", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-test-scala-dyn-id-write", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());
    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);

    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/3"));
    assertTrue(RestUtils.exists(docEndpoint + "/4"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
Example #21
Source File: WordCountSocketStateful.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); streamingContext.checkpoint("E:\\hadoop\\checkpoint"); // Initial state RDD input to mapWithState @SuppressWarnings("unchecked") List<Tuple2<String, Integer>> tuples =Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); // Update the cumulative count function Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() { @Override public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) { int sum = one.orElse(0) + (state.exists() ? state.get() : 0); Tuple2<String, Integer> output = new Tuple2<>(word, sum); state.update(sum); return output; } }; // DStream made of get cumulative counts that get updated in every batch JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD)); stateDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #22
Source File: JavaDirectTalosStreamSuite.java From galaxy-sdk-java with Apache License 2.0 | 5 votes |
@Test
public void testDirectStream() throws InterruptedException {
    HashSet<String> sentMessages = new HashSet<String>();
    for (int i = 1; i <= 100; i++)
        sentMessages.add(i + "");
    talosTestUtils.sendMessagesAndWaitForReceive(topic, sentMessages);

    JavaPairDStream<String, String> stream = TalosUtils.createDirectStream(
            ssc, talosTestUtils.javaTalosParams(), talosTestUtils.credential(),
            new HashSet<String>() {{ add(topic); }}
    );

    final Set<String> result = Collections.synchronizedSet(new HashSet<String>());
    stream.foreachRDD(
            new Function<JavaPairRDD<String, String>, Void>() {
                public Void call(JavaPairRDD<String, String> rdd) throws Exception {
                    Iterator<Tuple2<String, String>> iterator = rdd.collect().iterator();
                    while (iterator.hasNext()) {
                        result.add(iterator.next()._2);
                    }
                    return null;
                }
            }
    );

    ssc.start();
    long startTime = System.currentTimeMillis();
    boolean matches = false;
    while (!matches && System.currentTimeMillis() - startTime < 20000) {
        matches = sentMessages.size() == result.size();
        Thread.sleep(50);
    }
    Assert.assertEquals(sentMessages, result);
    ssc.stop();
}
Example #23
Source File: SparkUtils.java From cxf with Apache License 2.0 | 5 votes |
public static JavaPairDStream<String, Integer> createOutputDStream( JavaDStream<String> receiverStream, boolean withId) { final JavaDStream<String> words = receiverStream.flatMap(x -> withId ? splitInputStringWithId(x) : splitInputString(x)); final JavaPairDStream<String, Integer> pairs = words.mapToPair(s -> { return new Tuple2<String, Integer>(s, 1); }); return pairs.reduceByKey((i1, i2) -> { return i1 + i2; }); }
Example #24
Source File: WordCountSocketJava8Ex.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); wordCounts.print(); JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair( new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() { @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception { rdd.join(initialRDD).mapToPair(new PairFunction<Tuple2<String,Tuple2<Integer,Integer>>, String, Integer>() { @Override public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception { // TODO Auto-generated method stub return new Tuple2<>( joinedTuple._1(), (joinedTuple._2()._1()+joinedTuple._2()._2()) ); } }); return rdd; } }); joinedDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #25
Source File: WordCountRecoverableEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
    SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
    streamingContext.checkpoint(checkpointDirectory);

    // Initial state RDD input to mapWithState
    @SuppressWarnings("unchecked")
    List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
    JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

    JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip, port,
            StorageLevels.MEMORY_AND_DISK_SER);

    JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
            .reduceByKey((count1, count2) -> count1 + count2);

    // Update the cumulative count function
    Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
            new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
                @Override
                public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
                    int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
                    Tuple2<String, Integer> output = new Tuple2<>(word, sum);
                    state.update(sum);
                    return output;
                }
            };

    // DStream made of get cumulative counts that get updated in every batch
    JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
            .mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

    stateDstream.print();
    return streamingContext;
}
Example #26
Source File: JavaCustomReceiver.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaCustomReceiver <hostname> <port>"); System.exit(1); } StreamingExamples.setStreamingLogLevels(); // Create the context with a 1 second batch size SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver"); JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000)); // Create an input stream with the custom receiver on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') JavaReceiverInputDStream<String> lines = ssc.receiverStream( new JavaCustomReceiver(args[0], Integer.parseInt(args[1]))); JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String x) { return Arrays.asList(SPACE.split(x)).iterator(); } }); JavaPairDStream<String, Integer> wordCounts = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<>(s, 1); } }).reduceByKey(new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.print(); ssc.start(); ssc.awaitTermination(); }
Example #27
Source File: StreamingService.java From cxf with Apache License 2.0 | 5 votes |
private void processStream(AsyncResponse async, List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Connect " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        SparkStreamingOutput streamOut = new SparkStreamingOutput(jssc);
        SparkStreamingListener sparkListener = new SparkStreamingListener(streamOut);
        jssc.addStreamingListener(sparkListener);

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new OutputFunction(streamOut));
        jssc.start();

        executor.execute(new SparkJob(async, sparkListener));
    } catch (Exception ex) {
        // the compiler does not allow to catch SparkException directly
        if (ex instanceof SparkException) {
            async.cancel(60);
        } else {
            async.resume(new WebApplicationException(ex));
        }
    }
}
Example #28
Source File: IoTTrafficDataProcessor.java From iot-traffic-monitor with Apache License 2.0 | 5 votes |
/**
 * Method to get total traffic counts of different type of vehicles for each route.
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processTotalTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // We need to get count of vehicle group by routeId and vehicleType
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
            .reduceByKey((a, b) -> a + b);

    // Need to keep state for total count
    JavaMapWithStateDStream<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> countDStreamWithStatePair = countDStreamPair
            .mapWithState(StateSpec.function(totalSumFunc).timeout(Durations.seconds(3600))); // maintain state for one hour

    // Transform to dstream of TrafficData
    JavaDStream<Tuple2<AggregateKey, Long>> countDStream = countDStreamWithStatePair.map(tuple2 -> tuple2);
    JavaDStream<TotalTrafficData> trafficDStream = countDStream.map(totalTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "total_traffic",
            CassandraJavaUtil.mapToRow(TotalTrafficData.class, columnNameMappings)).saveToCassandra();
}
Example #29
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0 | 5 votes |
private void configureDataContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();

    configurationContext.getDataTopics().forEach(dataTopic -> baseTopicMap.put(dataTopic, 1));

    kafkaTopicService.createTopicsIfNotExist(configurationContext.getDataTopics(),
            configurationContext.getKafkaReplicationFactor(), configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());
    /*
     groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and one only, consumer of
     the group.
     Decision topics have only one partition (by default), so if we have two or more decision instances (consumers)
     reading the same topic with the same groupId, only one instance will be able to read from the topic.
     */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams, baseTopicMap,
            StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
    JavaDStream<StratioStreamingMessage> insertRequests = messages
            .filter(new FilterAvroMessagesByOperationFunction(STREAM_OPERATIONS.MANIPULATION.INSERT))
            .map(avroDeserializeMessageFunction);

    InsertIntoStreamFunction insertIntoStreamFunction = new InsertIntoStreamFunction(streamOperationService,
            configurationContext.getZookeeperHostsQuorum());
    insertRequests.foreachRDD(insertIntoStreamFunction);
}
Example #30
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
private void encryptedColumnCalc(JavaPairDStream<Long,BigInteger> encRowRDD) { // Multiply the column values by colNum: emit <colNum, finalColVal> JavaPairDStream<Long,BigInteger> encColRDD; if (colMultReduceByKey) { encColRDD = encRowRDD.reduceByKey(new EncColMultReducer(bVars), numColMultPartitions); } else { encColRDD = encRowRDD.groupByKey(numColMultPartitions).mapToPair(new EncColMultGroupedMapper(bVars)); } // Update the output name, by batch number bVars.setOutput(outputFile + "_" + accum.numBatchesGetValue()); // Form and write the response object encColRDD.repartition(1).foreachRDD((VoidFunction<JavaPairRDD<Long,BigInteger>>) rdd -> { rdd.foreachPartition(new FinalResponseFunction(accum, bVars)); int maxBatchesVar = bVars.getMaxBatches(); if (maxBatchesVar != -1 && accum.numBatchesGetValue() == maxBatchesVar) { logger.info("num batches = maxBatches = " + maxBatchesVar + "; shutting down"); System.exit(0); } }); }