Java Code Examples for org.apache.spark.streaming.api.java.JavaDStream#map()
The following examples show how to use org.apache.spark.streaming.api.java.JavaDStream#map().
The original project, source file, and license are noted above each example.
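Before the project-specific examples, here is a minimal, self-contained sketch of the call itself: map() applies a function to every record of the stream and yields a new JavaDStream of the results. The class name and the socket source below are illustrative only and not taken from any of the projects that follow.

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class JavaDStreamMapSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("JavaDStreamMapSketch");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // assumed source: lines of text arriving on a local socket (e.g. started with `nc -lk 9999`)
        JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999);

        // map() transforms every record; here each line is mapped to its length
        JavaDStream<Integer> lineLengths = lines.map(String::length);
        lineLengths.print();

        jssc.start();
        jssc.awaitTermination();
    }
}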
Example 1
Source File: RealTimeHeatMapProcessor.java From lambda-arch with Apache License 2.0
/**
 * Converts each IoTData record of the stream to a Measurement.
 *
 * @param streaming | The incoming stream of IoTData records
 * @return A stream containing the data of each record as a Measurement
 */
private JavaDStream<Measurement> iotDataToMeasurements(JavaDStream<IoTData> streaming) {
    JavaDStream<Measurement> map = streaming.map(row -> {
        Coordinate coordinate = new Coordinate(
                Double.valueOf(row.getLatitude()),
                Double.valueOf(row.getLongitude())
        );
        return new Measurement(coordinate, row.getTimestamp());
    });
    return map;
}
Example 2
Source File: RealTimeHeatMapProcessor.java From lambda-arch with Apache License 2.0
/**
 * Maps the measurements by rounding the coordinate.
 * The world is defined by a grid of boxes, each box has a size of 0.0005 by 0.0005.
 * Every mapping will be rounded to the center of the box it is part of.
 * Boundary cases will be rounded up, so a coordinate on (-0.00025,0) will be rounded to (0,0),
 * while the coordinate (0.00025,0) will be rounded to (0.0005,0).
 *
 * @param measurements | The dataset of measurements
 * @return A set of measurements with rounded coordinates
 */
private JavaDStream<Measurement> roundCoordinates(JavaDStream<Measurement> measurements) {
    return measurements.map(measurement -> {
                double roundedLatitude = (double) (5 * Math.round((measurement.getCoordinate().getLatitude() * 10000) / 5)) / 10000;
                double roundedLongitude = (double) (5 * Math.round((measurement.getCoordinate().getLongitude() * 10000) / 5)) / 10000;
                Coordinate roundedCoordinate = new Coordinate(roundedLatitude, roundedLongitude);
                measurement.setRoundedCoordinate(roundedCoordinate);
                return measurement;
            }
    );
}
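To make the rounding formula concrete: 5 * Math.round(value * 10000 / 5) / 10000 snaps a coordinate component to the nearest multiple of 0.0005. A small illustrative helper (the method name and sample value are not part of the project):

// illustrative helper using the same formula as roundCoordinates above
static double roundToGrid(double value) {
    return (double) (5 * Math.round((value * 10000) / 5)) / 10000;
}

// roundToGrid(52.37228) returns 52.3725, the nearest multiple of 0.0005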
Example 3
Source File: RealtimeTrafficDataProcessor.java From lambda-arch with Apache License 2.0
/**
 * Method to get total traffic counts of different types of vehicles for each route.
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processTotalTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // We need the count of vehicles grouped by routeId and vehicleType
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
            .reduceByKey((a, b) -> a + b);

    // Need to keep state for total count
    StateSpec<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> stateSpec =
            StateSpec.function(totalSumFunc).timeout(Durations.seconds(3600));

    JavaMapWithStateDStream<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> countDStreamWithStatePair =
            countDStreamPair.mapWithState(stateSpec); // maintain state for one hour

    // Transform to dstream of TrafficData
    JavaDStream<Tuple2<AggregateKey, Long>> countDStream = countDStreamWithStatePair.map(tuple2 -> tuple2);
    JavaDStream<TotalTrafficData> trafficDStream = countDStream.map(totalTrafficDataFunc);

    // Map Cassandra table columns
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
            "traffickeyspace",
            "total_traffic",
            CassandraJavaUtil.mapToRow(TotalTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
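The totalSumFunc passed to StateSpec.function above is defined elsewhere in the project and is not shown here. StateSpec.function expects a Function3 over the key, the new value for the current batch, and the stored state; the following is only a sketch of what such a function could look like, with the field name and body assumed:

import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function3;
import org.apache.spark.streaming.State;
import scala.Tuple2;

// assumed shape: add the per-batch count to the running total kept in Spark's
// state store and emit the updated (key, total) pair
private static final Function3<AggregateKey, Optional<Long>, State<Long>, Tuple2<AggregateKey, Long>> totalSumFunc =
        (key, currentSum, state) -> {
            long totalSum = currentSum.orElse(0L) + (state.exists() ? state.get() : 0L);
            Tuple2<AggregateKey, Long> total = new Tuple2<>(key, totalSum);
            state.update(totalSum);
            return total;
        };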
Example 4
Source File: IoTTrafficDataProcessor.java From iot-traffic-monitor with Apache License 2.0
/**
 * Method to get total traffic counts of different types of vehicles for each route.
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processTotalTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // We need the count of vehicles grouped by routeId and vehicleType
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
            .reduceByKey((a, b) -> a + b);

    // Need to keep state for total count
    JavaMapWithStateDStream<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> countDStreamWithStatePair = countDStreamPair
            .mapWithState(StateSpec.function(totalSumFunc).timeout(Durations.seconds(3600))); // maintain state for one hour

    // Transform to dstream of TrafficData
    JavaDStream<Tuple2<AggregateKey, Long>> countDStream = countDStreamWithStatePair.map(tuple2 -> tuple2);
    JavaDStream<TotalTrafficData> trafficDStream = countDStream.map(totalTrafficDataFunc);

    // Map Cassandra table columns
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "total_traffic",
            CassandraJavaUtil.mapToRow(TotalTrafficData.class, columnNameMappings)).saveToCassandra();
}
Example 5
Source File: CountByWindow.java From sparkResearch with Apache License 2.0
public static void countWindow(JavaDStream<String> javaDStream) {
    // identity map(): returns a new DStream with the same String records
    JavaDStream<String> ip = javaDStream.map(v1 -> v1);
    // total number of records in the last 30 seconds, recomputed every 10 seconds
    JavaDStream<Long> ipCount = ip.countByWindow(Durations.seconds(30), Durations.seconds(10));
    // count per distinct value over the same sliding window
    JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(Durations.seconds(30), Durations.seconds(10));
}
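The helper above only builds the windowed streams; nothing is printed and no context is started. A minimal sketch of how the same window counts could be driven end to end (the class name, socket source, and checkpoint path are assumptions): countByWindow and countByValueAndWindow maintain their counts incrementally with an inverse reduce, so the streaming context needs a checkpoint directory.

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class CountByWindowSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("CountByWindowSketch");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));
        jssc.checkpoint("/tmp/count-by-window-checkpoint");   // assumed path; required for windowed counts

        JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999);  // assumed source

        // number of records in the last 30 seconds, recomputed every 10 seconds
        lines.countByWindow(Durations.seconds(30), Durations.seconds(10)).print();
        // count per distinct value over the same sliding window
        lines.countByValueAndWindow(Durations.seconds(30), Durations.seconds(10)).print();

        jssc.start();
        jssc.awaitTermination();
    }
}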
Example 6
Source File: SparkMLTrainingAndScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME)
            .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS));

    JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream(
            streamingContext,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
    );

    JavaDStream<String> meetupStreamValues = meetupStream.map(v -> {
        return v.value();
    });

    // Prepare the training data as strings of type: (y,[x1,x2,x3,...,xn])
    // Where n is the number of features, y is a binary label,
    // and n must be the same for train and test.
    // e.g. "(response, [group_lat, group_long])";
    JavaDStream<String> trainData = meetupStreamValues.map(e -> {
        JSONParser jsonParser = new JSONParser();
        JSONObject json = (JSONObject) jsonParser.parse(e);

        String result = "(" + (String.valueOf(json.get("response")).equals("yes") ? "1.0,[" : "0.0,[")
                + ((JSONObject) json.get("group")).get("group_lat") + ","
                + ((JSONObject) json.get("group")).get("group_lon") + "])";

        return result;
    });

    trainData.print();

    JavaDStream<LabeledPoint> labeledPoints = trainData.map(LabeledPoint::parse);

    StreamingLogisticRegressionWithSGD streamingLogisticRegressionWithSGD =
            new StreamingLogisticRegressionWithSGD().setInitialWeights(Vectors.zeros(2));

    streamingLogisticRegressionWithSGD.trainOn(labeledPoints);

    JavaPairDStream<Double, Vector> values = labeledPoints.mapToPair(f -> new Tuple2<>(f.label(), f.features()));

    streamingLogisticRegressionWithSGD.predictOnValues(values).print();

    // some time later, after outputs have completed
    meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
        OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

        ((CanCommitOffsets) meetupStream.inputDStream())
                .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}
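The strings built in trainData are consumed by LabeledPoint.parse, which expects exactly this "(label,[feature1,feature2,...])" layout. A quick illustrative check (the coordinate values are made up):

// requires: import org.apache.spark.mllib.regression.LabeledPoint;
LabeledPoint point = LabeledPoint.parse("(1.0,[40.7128,-74.006])");    // label 1.0 = a "yes" RSVP
double label = point.label();                                          // 1.0
org.apache.spark.mllib.linalg.Vector features = point.features();      // [40.7128,-74.006] = (group_lat, group_lon)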
Example 7
Source File: KafkaSource08.java From sylph with Apache License 2.0
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig08 config, SourceContext context)
{
    String topics = requireNonNull(config.getTopics(), "topics not setting");
    String brokers = requireNonNull(config.getBrokers(), "brokers not setting"); // the Kafka brokers' hosts must be resolvable from the machine running this program
    String groupId = requireNonNull(config.getGroupid(), "group.id not setting"); // consumer group name
    String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting");

    Map<String, String> otherConfig = config.getOtherConfig().entrySet()
            .stream()
            .filter(x -> x.getValue() != null)
            .collect(Collectors.toMap(Map.Entry::getKey, v -> v.getValue().toString()));

    Map<String, String> kafkaParams = new HashMap<>(otherConfig);
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    //kafkaParams.put("auto.commit.enable", true); // do not auto-commit offsets
    // "fetch.message.max.bytes" ->
    // "session.timeout.ms" -> "30000",     // session timeout, defaults to 30 seconds
    // "heartbeat.interval.ms" -> "5000",   // heartbeat interval
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); // note: each stream must use its own group.id, otherwise offset commits will fail
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode); // largest / smallest

    //----get fromOffsets
    @SuppressWarnings("unchecked")
    scala.collection.immutable.Map<String, String> map = (scala.collection.immutable.Map<String, String>) Map$.MODULE$.apply(JavaConverters.mapAsScalaMapConverter(kafkaParams).asScala().toSeq());
    final KafkaCluster kafkaCluster = new KafkaCluster(map);
    Map<TopicAndPartition, Long> fromOffsets = getFromOffset(kafkaCluster, topics, groupId);

    //--- createDirectStream  DirectKafkaInputDStream.class
    org.apache.spark.api.java.function.Function<MessageAndMetadata<byte[], byte[]>, ConsumerRecord<byte[], byte[]>> messageHandler =
            mmd -> new ConsumerRecord<>(mmd.topic(), mmd.partition(), mmd.key(), mmd.message(), mmd.offset());
    @SuppressWarnings("unchecked")
    Class<ConsumerRecord<byte[], byte[]>> recordClass = (Class<ConsumerRecord<byte[], byte[]>>) ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class).runtimeClass();
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(ssc,
            byte[].class, byte[].class, DefaultDecoder.class, DefaultDecoder.class, recordClass,
            kafkaParams, fromOffsets,
            messageHandler
    );
    JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = settingCommit(inputStream, kafkaParams, kafkaCluster, groupId);

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return dStream
                .map(record -> {
                    return jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset());
                });
    }
    else {
        StructType structType = schemaToSparkType(context.getSchema());
        return dStream
                .map(record -> {
                    String[] names = structType.names();
                    Object[] values = new Object[names.length];
                    for (int i = 0; i < names.length; i++) {
                        switch (names[i]) {
                            case "_topic":
                                values[i] = record.topic();
                                continue;
                            case "_message":
                                values[i] = new String(record.value(), UTF_8);
                                continue;
                            case "_key":
                                values[i] = new String(record.key(), UTF_8);
                                continue;
                            case "_partition":
                                values[i] = record.partition();
                                continue;
                            case "_offset":
                                values[i] = record.offset();
                                continue;
                            default:
                                values[i] = null;
                        }
                    }
                    return (Row) new GenericRowWithSchema(values, structType);
                });  //.window(Duration(10 * 1000))
    }
}
Example 8
Source File: KafkaSource.java From sylph with Apache License 2.0
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig config, SourceContext context)
{
    String topics = config.getTopics();
    String brokers = config.getBrokers(); // the Kafka brokers' hosts must be resolvable from the machine running this program
    String groupId = config.getGroupid(); // consumer group name
    String offsetMode = config.getOffsetMode();

    Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig());
    kafkaParams.put("bootstrap.servers", brokers);
    kafkaParams.put("key.deserializer", ByteArrayDeserializer.class);   // StringDeserializer
    kafkaParams.put("value.deserializer", ByteArrayDeserializer.class); // StringDeserializer
    kafkaParams.put("enable.auto.commit", false); // do not auto-commit offsets
    // "fetch.message.max.bytes" ->
    // "session.timeout.ms" -> "30000",     // session timeout, defaults to 30 seconds
    // "heartbeat.interval.ms" -> "5000",   // heartbeat interval
    kafkaParams.put("group.id", groupId); // note: each stream must use its own group.id, otherwise offset commits will fail
    kafkaParams.put("auto.offset.reset", offsetMode); // latest / earliest

    List<String> topicSets = Arrays.asList(topics.split(","));

    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(
            ssc,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.Subscribe(topicSets, kafkaParams));

    DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream())
    {
        @Override
        public void commitOffsets(RDD<?> kafkaRdd)
        {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
            log().info("commitKafkaOffsets {}", (Object) offsetRanges);
            DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream());
            ((CanCommitOffsets) firstDStream).commitAsync(offsetRanges);
        }
    };
    JavaDStream<ConsumerRecord<byte[], byte[]>> javaDStream = new JavaDStream<>(sylphKafkaOffset, ClassTag$.MODULE$.apply(ConsumerRecord.class));

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return javaDStream
                .map(record -> jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset()));
    }
    else {
        List<String> names = context.getSchema().getFieldNames();
        return javaDStream
                .map(record -> {
                    Object[] values = new Object[names.size()];
                    for (int i = 0; i < names.size(); i++) {
                        switch (names.get(i)) {
                            case "_topic":
                                values[i] = record.topic();
                                continue;
                            case "_message":
                                values[i] = new String(record.value(), UTF_8);
                                continue;
                            case "_key":
                                values[i] = record.key() == null ? null : new String(record.key(), UTF_8);
                                continue;
                            case "_partition":
                                values[i] = record.partition();
                                continue;
                            case "_offset":
                                values[i] = record.offset();
                                continue;
                            case "_timestamp":
                                values[i] = record.timestamp();
                                continue;
                            case "_timestampType":
                                values[i] = record.timestampType().id;
                                continue;
                            default:
                                values[i] = null;
                        }
                    }
                    return new GenericRow(values);  // GenericRowWithSchema
                });  //.window(Duration(10 * 1000))
    }
}
Example 9
Source File: IoTDataProcessor.java From iot-traffic-monitor with Apache License 2.0
public static void main(String[] args) throws Exception {
    // read Spark and Cassandra properties and create SparkConf
    Properties prop = PropertyFileReader.readPropertyFile();
    SparkConf conf = new SparkConf()
            .setAppName(prop.getProperty("com.iot.app.spark.app.name"))
            .setMaster(prop.getProperty("com.iot.app.spark.master"))
            .set("spark.cassandra.connection.host", prop.getProperty("com.iot.app.cassandra.host"))
            .set("spark.cassandra.connection.port", prop.getProperty("com.iot.app.cassandra.port"))
            .set("spark.cassandra.connection.keep_alive_ms", prop.getProperty("com.iot.app.cassandra.keep_alive"));
    // batch interval of 5 seconds for incoming stream
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    // add check point directory
    jssc.checkpoint(prop.getProperty("com.iot.app.spark.checkpoint.dir"));

    // read and set Kafka properties
    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("zookeeper.connect", prop.getProperty("com.iot.app.kafka.zookeeper"));
    kafkaParams.put("metadata.broker.list", prop.getProperty("com.iot.app.kafka.brokerlist"));
    String topic = prop.getProperty("com.iot.app.kafka.topic");
    Set<String> topicsSet = new HashSet<String>();
    topicsSet.add(topic);

    // create direct kafka stream
    JavaPairInputDStream<String, IoTData> directKafkaStream = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            IoTData.class,
            StringDecoder.class,
            IoTDataDecoder.class,
            kafkaParams,
            topicsSet
    );
    logger.info("Starting Stream Processing");

    // We need non filtered stream for poi traffic data calculation
    JavaDStream<IoTData> nonFilteredIotDataStream = directKafkaStream.map(tuple -> tuple._2());

    // We need filtered stream for total and traffic data calculation
    JavaPairDStream<String, IoTData> iotDataPairStream = nonFilteredIotDataStream
            .mapToPair(iot -> new Tuple2<String, IoTData>(iot.getVehicleId(), iot))
            .reduceByKey((a, b) -> a);

    // Check vehicle Id is already processed
    JavaMapWithStateDStream<String, IoTData, Boolean, Tuple2<IoTData, Boolean>> iotDStreamWithStatePairs = iotDataPairStream
            .mapWithState(StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600))); // maintain state for one hour

    // Filter processed vehicle ids and keep un-processed
    JavaDStream<Tuple2<IoTData, Boolean>> filteredIotDStreams = iotDStreamWithStatePairs.map(tuple2 -> tuple2)
            .filter(tuple -> tuple._2.equals(Boolean.FALSE));

    // Get stream of IoTdata
    JavaDStream<IoTData> filteredIotDataStream = filteredIotDStreams.map(tuple -> tuple._1);

    // cache stream as it is used in total and window based computation
    filteredIotDataStream.cache();

    // process data
    IoTTrafficDataProcessor iotTrafficProcessor = new IoTTrafficDataProcessor();
    iotTrafficProcessor.processTotalTrafficData(filteredIotDataStream);
    iotTrafficProcessor.processWindowTrafficData(filteredIotDataStream);

    // poi data
    POIData poiData = new POIData();
    poiData.setLatitude(33.877495);
    poiData.setLongitude(-95.50238);
    poiData.setRadius(30); // 30 km

    // broadcast variables. We will monitor vehicles on Route 37 which are of type Truck
    Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues =
            jssc.sparkContext().broadcast(new Tuple3<>(poiData, "Route-37", "Truck"));
    // call method to process stream
    iotTrafficProcessor.processPOIData(nonFilteredIotDataStream, broadcastPOIValues);

    // start context
    jssc.start();
    jssc.awaitTermination();
}