org.apache.spark.streaming.api.java.JavaPairInputDStream Java Examples
The following examples show how to use
org.apache.spark.streaming.api.java.JavaPairInputDStream.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0 | 8 votes |
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000)); //设置检查点 streamingContext.checkpoint("HDFS URL"); Map<String, Integer> topicThread = new HashMap<>(1); topicThread.put(TOPIC, THREAD); JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread); JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator()); //统计 JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2); try { result.print(); streamingContext.start(); streamingContext.awaitTermination(); } catch (InterruptedException e) { e.printStackTrace(); } }
Example #2
Source File: SparkStreaming.java From kafka-spark-avro-example with Apache License 2.0 | 6 votes |
/**
 * Creates a direct Kafka stream on topic {@code "test"} and prints every record as
 * {@code key --> model}, casting each record value to {@code Model}.
 *
 * @param ssc streaming context the direct stream is created on
 * @param sc  Spark context; not used by this method
 */
private static void processStream(JavaStreamingContext ssc, JavaSparkContext sc) {
    System.out.println("--> Processing stream");

    // Kafka / schema-registry settings handed to the direct stream.
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "localhost:9092");
    kafkaParams.put("schema.registry.url", "http://localhost:8081");
    kafkaParams.put("group.id", "spark");
    kafkaParams.put("specific.avro.reader", "true");
    kafkaParams.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
    kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

    Set<String> topics = new HashSet<>();
    topics.add("test");

    JavaPairInputDStream<String, Object> records = KafkaUtils.createDirectStream(
            ssc,
            String.class,
            Object.class,
            StringDecoder.class,
            KafkaAvroDecoder.class,
            kafkaParams,
            topics);

    records.foreachRDD(batch -> {
        batch.foreachPartition(partition -> {
            while (partition.hasNext()) {
                Tuple2<String, Object> record = partition.next();
                Model decoded = (Model) record._2();
                System.out.println(record._1() + " --> " + decoded);
            }
        });
    });
}
Example #3
Source File: JavaKafkaDirectWordCount.java From SparkDemo with MIT License | 4 votes |
/** * 1.一对一 * 2.高效 * 3.准确的只计算一次 * * @param args */ public static void main(String[] args) { StreamingExamples.setStreamingLogLevels(); SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaDirectWordCount").setMaster("local[1]"); JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(6)); Map<String, String> kafkaParams = new HashMap<String, String>(); // key是topic名称,value是线程数量 kafkaParams.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092"); // 指定broker在哪 HashSet<String> topicsSet = new HashSet<String>(); topicsSet.add("2017-7-26"); // 指定操作的topic // Create direct kafka stream with brokers and topics createDirectStream() JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream( jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet ); JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() { @Override public String call(Tuple2<String, String> tuple2) { return tuple2._2(); } }); JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String x) { return Lists.newArrayList(SPACE.split(x)).iterator(); } }); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } }).reduceByKey(new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.print(); jssc.start(); try { jssc.awaitTermination(); } catch (Exception e) { e.printStackTrace(); } }
Example #4
Source File: AppMain.java From SparkToParquet with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws IOException { Flags.setFromCommandLineArgs(THE_OPTIONS, args); // 初始化Spark Conf. SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark"); JavaSparkContext sc = new JavaSparkContext(conf); JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval()); SQLContext sqlContext = new SQLContext(sc); // 初始化参数 HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(","))); HashMap<String, String> kafkaParams = new HashMap<String, String>(); kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker()); // 从Kafka Stream获取数据 JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet); JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() { private static final long serialVersionUID = 5266880065425088203L; public String call(Tuple2<String, String> tuple2) { return tuple2._2(); } }); JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> { List<ApacheAccessLog> list = new ArrayList<>(); try { // 映射每一行 list.add(ApacheAccessLog.parseFromLogLine(line)); return list; } catch (RuntimeException e) { return list; } }).cache(); accessLogsDStream.foreachRDD(rdd -> { // rdd to DataFrame DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class); // 写入Parquet文件 df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile()); return null; }); // 启动Streaming服务器 jssc.start(); // 启动计算 jssc.awaitTermination(); // 等待终止 }
Example #5
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 4 votes |
/** * Method to read in the data from an allowed input format, filter, and return a RDD of MapWritable data elements */ @SuppressWarnings("unchecked") public JavaDStream<MapWritable> readData() throws IOException, PIRException { logger.info("Reading data "); Job job = Job.getInstance(); String baseQuery = SystemConfiguration.getProperty("pir.baseQuery"); String jobName = "pirSpark_base_" + baseQuery + "_" + System.currentTimeMillis(); job.setJobName(jobName); job.getConfiguration().setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true); job.getConfiguration().set("query", baseQuery); job.getConfiguration().set("dataSchemaName", qSchema.getDataSchemaName()); job.getConfiguration().set("data.schemas", SystemConfiguration.getProperty("data.schemas")); // Set the inputFormatClass based upon the baseInputFormat property String classString = SystemConfiguration.getProperty("pir.baseInputFormat"); Class<? extends BaseInputFormat<Text,MapWritable>> inputClass; try { inputClass = (Class<? 
extends BaseInputFormat<Text,MapWritable>>) Class.forName(classString); } catch (ClassNotFoundException | ClassCastException e) { throw new PIRException(classString + " cannot be instantiated or does not extend BaseInputFormat", e); } job.setInputFormatClass(inputClass); FileInputFormat.setInputPaths(job, inputData); // Read data from hdfs logger.info("useQueueStream = " + useQueueStream); JavaDStream<MapWritable> mwStream; if (useQueueStream) { Queue<JavaRDD<MapWritable>> rddQueue = new LinkedList<>(); JavaRDD<MapWritable> rddIn = jssc.sparkContext().newAPIHadoopRDD(job.getConfiguration(), inputClass, Text.class, MapWritable.class).values() .coalesce(numDataPartitions); rddQueue.add(rddIn); mwStream = jssc.queueStream(rddQueue); } else { JavaPairInputDStream<Text,MapWritable> inputRDD = jssc.fileStream(inputData, Text.class, MapWritable.class, inputClass); mwStream = inputRDD.transform(new Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>>() { private static final long serialVersionUID = 1L; @Override public JavaRDD<MapWritable> call(JavaPairRDD<Text,MapWritable> pair) throws Exception { return pair.values(); } }).repartition(numDataPartitions); } // Filter out by the provided stopListFile entries if (qSchema.getFilter() != null) { return mwStream.filter(new FilterData(accum, bVars)); } return mwStream; }
Example #6
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 4 votes |
/** * Method to read in the data from elasticsearch, filter, and return a RDD of MapWritable data elements */ @SuppressWarnings("unchecked") public JavaDStream<MapWritable> readDataES() throws IOException { logger.info("Reading data "); Job job = Job.getInstance(); String jobName = "pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis(); job.setJobName(jobName); job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes")); job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port")); job.getConfiguration().set("es.resource", esResource); job.getConfiguration().set("es.query", esQuery); // Read data from hdfs JavaDStream<MapWritable> mwStream; if (useQueueStream) { Queue<JavaRDD<MapWritable>> rddQueue = new LinkedList<>(); JavaRDD<MapWritable> rddIn = jssc.sparkContext().newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class).values() .coalesce(numDataPartitions); rddQueue.add(rddIn); mwStream = jssc.queueStream(rddQueue); } else { JavaPairInputDStream<Text,MapWritable> inputRDD = jssc.fileStream(inputData, Text.class, MapWritable.class, EsInputFormat.class); mwStream = inputRDD.transform(new Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>>() { private static final long serialVersionUID = 1L; @Override public JavaRDD<MapWritable> call(JavaPairRDD<Text,MapWritable> pair) throws Exception { return pair.values(); } }).repartition(numDataPartitions); } // Filter out by the provided stopListFile entries if (qSchema.getFilter() != null) { return mwStream.filter(new FilterData(accum, bVars)); } else { return mwStream; } }
Example #7
Source File: IoTDataProcessor.java From iot-traffic-monitor with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception { //read Spark and Cassandra properties and create SparkConf Properties prop = PropertyFileReader.readPropertyFile(); SparkConf conf = new SparkConf() .setAppName(prop.getProperty("com.iot.app.spark.app.name")) .setMaster(prop.getProperty("com.iot.app.spark.master")) .set("spark.cassandra.connection.host", prop.getProperty("com.iot.app.cassandra.host")) .set("spark.cassandra.connection.port", prop.getProperty("com.iot.app.cassandra.port")) .set("spark.cassandra.connection.keep_alive_ms", prop.getProperty("com.iot.app.cassandra.keep_alive")); //batch interval of 5 seconds for incoming stream JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5)); //add check point directory jssc.checkpoint(prop.getProperty("com.iot.app.spark.checkpoint.dir")); //read and set Kafka properties Map<String, String> kafkaParams = new HashMap<String, String>(); kafkaParams.put("zookeeper.connect", prop.getProperty("com.iot.app.kafka.zookeeper")); kafkaParams.put("metadata.broker.list", prop.getProperty("com.iot.app.kafka.brokerlist")); String topic = prop.getProperty("com.iot.app.kafka.topic"); Set<String> topicsSet = new HashSet<String>(); topicsSet.add(topic); //create direct kafka stream JavaPairInputDStream<String, IoTData> directKafkaStream = KafkaUtils.createDirectStream( jssc, String.class, IoTData.class, StringDecoder.class, IoTDataDecoder.class, kafkaParams, topicsSet ); logger.info("Starting Stream Processing"); //We need non filtered stream for poi traffic data calculation JavaDStream<IoTData> nonFilteredIotDataStream = directKafkaStream.map(tuple -> tuple._2()); //We need filtered stream for total and traffic data calculation JavaPairDStream<String,IoTData> iotDataPairStream = nonFilteredIotDataStream.mapToPair(iot -> new Tuple2<String,IoTData>(iot.getVehicleId(),iot)).reduceByKey((a, b) -> a ); // Check vehicle Id is already processed JavaMapWithStateDStream<String, IoTData, Boolean, 
Tuple2<IoTData,Boolean>> iotDStreamWithStatePairs = iotDataPairStream .mapWithState(StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600)));//maintain state for one hour // Filter processed vehicle ids and keep un-processed JavaDStream<Tuple2<IoTData,Boolean>> filteredIotDStreams = iotDStreamWithStatePairs.map(tuple2 -> tuple2) .filter(tuple -> tuple._2.equals(Boolean.FALSE)); // Get stream of IoTdata JavaDStream<IoTData> filteredIotDataStream = filteredIotDStreams.map(tuple -> tuple._1); //cache stream as it is used in total and window based computation filteredIotDataStream.cache(); //process data IoTTrafficDataProcessor iotTrafficProcessor = new IoTTrafficDataProcessor(); iotTrafficProcessor.processTotalTrafficData(filteredIotDataStream); iotTrafficProcessor.processWindowTrafficData(filteredIotDataStream); //poi data POIData poiData = new POIData(); poiData.setLatitude(33.877495); poiData.setLongitude(-95.50238); poiData.setRadius(30);//30 km //broadcast variables. We will monitor vehicles on Route 37 which are of type Truck Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues = jssc.sparkContext().broadcast(new Tuple3<>(poiData,"Route-37","Truck")); //call method to process stream iotTrafficProcessor.processPOIData(nonFilteredIotDataStream,broadcastPOIValues); //start context jssc.start(); jssc.awaitTermination(); }
Example #8
Source File: SparkUnboundedSource.java From beam with Apache License 2.0 | 4 votes |
/**
 * Builds the streaming read of an {@code UnboundedSource}.
 *
 * <p>The source is wrapped in a {@code SourceDStream}, adapted to a {@code JavaPairInputDStream},
 * passed through {@code mapWithState} using {@code StateSpecFunctions.mapSourceFunction}, and the
 * resulting byte arrays are decoded with a {@code FullWindowedValueCoder}. A
 * {@code ReadReportDStream} is registered on the metadata produced by the stateful step.
 *
 * @param jssc streaming context whose underlying {@code ssc()} hosts the source stream
 * @param rc serializable pipeline options, read as {@code SparkPipelineOptions}
 * @param source the unbounded source to read
 * @param stepName step name handed to the state function and to the read report
 * @return an {@code UnboundedDataset} over the decoded stream, tagged with the id of the single
 *     input stream created here
 */
public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(
    JavaStreamingContext jssc,
    SerializablePipelineOptions rc,
    UnboundedSource<T, CheckpointMarkT> source,
    String stepName) {
  SparkPipelineOptions options = rc.get().as(SparkPipelineOptions.class);
  Long maxRecordsPerBatch = options.getMaxRecordsPerBatch();
  SourceDStream<T, CheckpointMarkT> sourceDStream =
      new SourceDStream<>(jssc.ssc(), source, rc, maxRecordsPerBatch);

  // Adapt the SourceDStream to the Java pair API; both class tags are obtained from
  // JavaSparkContext$.fakeClassTag().
  JavaPairInputDStream<Source<T>, CheckpointMarkT> inputDStream =
      JavaPairInputDStream$.MODULE$.fromInputDStream(
          sourceDStream,
          JavaSparkContext$.MODULE$.fakeClassTag(),
          JavaSparkContext$.MODULE$.fakeClassTag());

  // call mapWithState to read from a checkpointable sources.
  // The state spec uses the number of partitions reported by the source stream.
  JavaMapWithStateDStream<
          Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>, Tuple2<Iterable<byte[]>, Metadata>>
      mapWithStateDStream =
          inputDStream.mapWithState(
              StateSpec.function(
                      StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName))
                  .numPartitions(sourceDStream.getNumPartitions()));

  // set checkpoint duration for read stream, if set.
  checkpointStream(mapWithStateDStream, options);

  // report the number of input elements for this InputDStream to the InputInfoTracker.
  int id = inputDStream.inputDStream().id();
  JavaDStream<Metadata> metadataDStream = mapWithStateDStream.map(new Tuple2MetadataFunction());

  // register ReadReportDStream to report information related to this read.
  new ReadReportDStream(metadataDStream.dstream(), id, getSourceName(source, id), stepName)
      .register();

  // output the actual (deserialized) stream.
  // The coder combines the source's output coder with the global window coder.
  WindowedValue.FullWindowedValueCoder<T> coder =
      WindowedValue.FullWindowedValueCoder.of(
          source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);
  JavaDStream<WindowedValue<T>> readUnboundedStream =
      mapWithStateDStream
          .flatMap(new Tuple2byteFlatMapFunction())
          .map(CoderHelpers.fromByteFunction(coder));
  return new UnboundedDataset<>(readUnboundedStream, Collections.singletonList(id));
}