org.apache.spark.streaming.api.java.JavaPairInputDStream Java Examples

The following examples show how to use org.apache.spark.streaming.api.java.JavaPairInputDStream. Each example is taken from an open-source project; the project and source file are noted above the code.
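A minimal sketch first, to show what the class is: JavaPairInputDStream<K, V> is the Java-friendly input DStream of key/value pairs returned by sources such as JavaStreamingContext.fileStream and the older (Kafka 0.8-style) KafkaUtils.createDirectStream. The sketch below monitors a directory for new text files; the directory path, batch interval, and local master are illustrative assumptions, not values taken from the projects that follow.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class PairInputDStreamSketch {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("PairInputDStreamSketch").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

        // fileStream returns a JavaPairInputDStream<K, V> of records read from files
        // that appear in the monitored directory during each batch interval.
        JavaPairInputDStream<LongWritable, Text> pairs = jssc.fileStream(
                "hdfs:///data/incoming",            // assumed input directory
                LongWritable.class, Text.class, TextInputFormat.class);

        // A simple output operation so the stream is actually computed each batch.
        pairs.count().print();

        jssc.start();
        jssc.awaitTermination();
    }
}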
Example #1
Source File: KafkaStreaming.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // Set the checkpoint directory (the HDFS URL below is a placeholder)
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // Count each word
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
 
Example #2
Source File: SparkStreaming.java    From kafka-spark-avro-example with Apache License 2.0
private static void processStream(JavaStreamingContext ssc, JavaSparkContext sc) {
  System.out.println("--> Processing stream");

  Map<String, String> props = new HashMap<>();
  props.put("bootstrap.servers", "localhost:9092");
  props.put("schema.registry.url", "http://localhost:8081");
  props.put("group.id", "spark");
  props.put("specific.avro.reader", "true");

  props.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
  props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

  Set<String> topicsSet = new HashSet<>(Collections.singletonList("test"));

  JavaPairInputDStream<String, Object> stream = KafkaUtils.createDirectStream(ssc, String.class, Object.class,
    StringDecoder.class, KafkaAvroDecoder.class, props, topicsSet);

  stream.foreachRDD(rdd -> {
    rdd.foreachPartition(iterator -> {
        while (iterator.hasNext()) {
          Tuple2<String, Object> next = iterator.next();
          Model model = (Model) next._2();
          System.out.println(next._1() + " --> " + model);
        }
      }
    );
  });
}
 
Example #3
Source File: JavaKafkaDirectWordCount.java    From SparkDemo with MIT License
/**
 * 1. One-to-one mapping between Kafka partitions and RDD partitions
 * 2. Efficient
 * 3. Exactly-once semantics
 *
 * @param args
 */
public static void main(String[] args) {
    StreamingExamples.setStreamingLogLevels();
    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaDirectWordCount").setMaster("local[1]");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(6));

    Map<String, String> kafkaParams = new HashMap<String, String>(); // Kafka configuration parameters
    kafkaParams.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092"); // where the Kafka brokers are
    HashSet<String> topicsSet = new HashSet<String>();
    topicsSet.add("2017-7-26"); // the topic to consume

    // Create a direct Kafka stream with the given brokers and topics via createDirectStream()
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            jssc,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaParams,
            topicsSet
    );

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example #4
Source File: AppMain.java    From SparkToParquet with Apache License 2.0
public static void main(String[] args) throws IOException {
	Flags.setFromCommandLineArgs(THE_OPTIONS, args);

	// Initialize the Spark conf
	SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
	JavaSparkContext sc = new JavaSparkContext(conf);
	JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
	SQLContext sqlContext = new SQLContext(sc);

	// Initialize Kafka parameters
	HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
	HashMap<String, String> kafkaParams = new HashMap<String, String>();
	kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

	// Read data from the Kafka stream
	JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class,
			StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

	JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
		private static final long serialVersionUID = 5266880065425088203L;

		public String call(Tuple2<String, String> tuple2) {
			return tuple2._2();
		}
	});

	JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
		List<ApacheAccessLog> list = new ArrayList<>();
		try {
			// Parse each line into an ApacheAccessLog
			list.add(ApacheAccessLog.parseFromLogLine(line));
			return list;
		} catch (RuntimeException e) {
			return list;
		}
	}).cache();

	accessLogsDStream.foreachRDD(rdd -> {

		// rdd to DataFrame
		DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
		// Write out as Parquet files
		df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append).parquet(Flags.getInstance().getParquetFile());

		return null;
	});

	// Start the streaming context
	jssc.start(); // start the computation
	jssc.awaitTermination(); // wait for the computation to terminate
}
 
Example #5
Source File: ComputeStreamingResponse.java    From incubator-retired-pirk with Apache License 2.0
/**
 * Method to read in the data from an allowed input format, filter, and return an RDD of MapWritable data elements
 */
@SuppressWarnings("unchecked")
public JavaDStream<MapWritable> readData() throws IOException, PIRException
{
  logger.info("Reading data ");

  Job job = Job.getInstance();
  String baseQuery = SystemConfiguration.getProperty("pir.baseQuery");
  String jobName = "pirSpark_base_" + baseQuery + "_" + System.currentTimeMillis();
  job.setJobName(jobName);
  job.getConfiguration().setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
  job.getConfiguration().set("query", baseQuery);

  job.getConfiguration().set("dataSchemaName", qSchema.getDataSchemaName());
  job.getConfiguration().set("data.schemas", SystemConfiguration.getProperty("data.schemas"));

  // Set the inputFormatClass based upon the baseInputFormat property
  String classString = SystemConfiguration.getProperty("pir.baseInputFormat");
  Class<? extends BaseInputFormat<Text,MapWritable>> inputClass;
  try
  {
    inputClass = (Class<? extends BaseInputFormat<Text,MapWritable>>) Class.forName(classString);
  } catch (ClassNotFoundException | ClassCastException e)
  {
    throw new PIRException(classString + " cannot be instantiated or does not extend BaseInputFormat", e);
  }
  job.setInputFormatClass(inputClass);

  FileInputFormat.setInputPaths(job, inputData);

  // Read data from hdfs
  logger.info("useQueueStream = " + useQueueStream);
  JavaDStream<MapWritable> mwStream;
  if (useQueueStream)
  {
    Queue<JavaRDD<MapWritable>> rddQueue = new LinkedList<>();
    JavaRDD<MapWritable> rddIn = jssc.sparkContext().newAPIHadoopRDD(job.getConfiguration(), inputClass, Text.class, MapWritable.class).values()
        .coalesce(numDataPartitions);

    rddQueue.add(rddIn);
    mwStream = jssc.queueStream(rddQueue);
  }
  else
  {
    JavaPairInputDStream<Text,MapWritable> inputRDD = jssc.fileStream(inputData, Text.class, MapWritable.class, inputClass);
    mwStream = inputRDD.transform(new Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>>()
    {
      private static final long serialVersionUID = 1L;

      @Override
      public JavaRDD<MapWritable> call(JavaPairRDD<Text,MapWritable> pair) throws Exception
      {
        return pair.values();
      }
    }).repartition(numDataPartitions);
  }

  // Filter out by the provided stopListFile entries
  if (qSchema.getFilter() != null)
  {
    return mwStream.filter(new FilterData(accum, bVars));
  }

  return mwStream;
}
 
Example #6
Source File: ComputeStreamingResponse.java    From incubator-retired-pirk with Apache License 2.0
/**
 * Method to read in the data from Elasticsearch, filter, and return an RDD of MapWritable data elements
 */
@SuppressWarnings("unchecked")
public JavaDStream<MapWritable> readDataES() throws IOException
{
  logger.info("Reading data ");

  Job job = Job.getInstance();
  String jobName = "pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis();
  job.setJobName(jobName);
  job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes"));
  job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port"));
  job.getConfiguration().set("es.resource", esResource);
  job.getConfiguration().set("es.query", esQuery);

  // Read data from Elasticsearch
  JavaDStream<MapWritable> mwStream;
  if (useQueueStream)
  {
    Queue<JavaRDD<MapWritable>> rddQueue = new LinkedList<>();
    JavaRDD<MapWritable> rddIn = jssc.sparkContext().newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class).values()
        .coalesce(numDataPartitions);
    rddQueue.add(rddIn);

    mwStream = jssc.queueStream(rddQueue);
  }
  else
  {
    JavaPairInputDStream<Text,MapWritable> inputRDD = jssc.fileStream(inputData, Text.class, MapWritable.class, EsInputFormat.class);
    mwStream = inputRDD.transform(new Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>>()
    {
      private static final long serialVersionUID = 1L;

      @Override
      public JavaRDD<MapWritable> call(JavaPairRDD<Text,MapWritable> pair) throws Exception
      {
        return pair.values();
      }
    }).repartition(numDataPartitions);
  }

  // Filter out by the provided stopListFile entries
  if (qSchema.getFilter() != null)
  {
    return mwStream.filter(new FilterData(accum, bVars));
  }
  else
  {
    return mwStream;
  }
}
 
Example #7
Source File: IoTDataProcessor.java    From iot-traffic-monitor with Apache License 2.0
public static void main(String[] args) throws Exception {
 //read Spark and Cassandra properties and create SparkConf
 Properties prop = PropertyFileReader.readPropertyFile();		
 SparkConf conf = new SparkConf()
		 .setAppName(prop.getProperty("com.iot.app.spark.app.name"))
		 .setMaster(prop.getProperty("com.iot.app.spark.master"))
		 .set("spark.cassandra.connection.host", prop.getProperty("com.iot.app.cassandra.host"))
		 .set("spark.cassandra.connection.port", prop.getProperty("com.iot.app.cassandra.port"))
		 .set("spark.cassandra.connection.keep_alive_ms", prop.getProperty("com.iot.app.cassandra.keep_alive"));		 
 //batch interval of 5 seconds for incoming stream		 
 JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));	
 // add checkpoint directory
 jssc.checkpoint(prop.getProperty("com.iot.app.spark.checkpoint.dir"));
 
 //read and set Kafka properties
 Map<String, String> kafkaParams = new HashMap<String, String>();
 kafkaParams.put("zookeeper.connect", prop.getProperty("com.iot.app.kafka.zookeeper"));
 kafkaParams.put("metadata.broker.list", prop.getProperty("com.iot.app.kafka.brokerlist"));
 String topic = prop.getProperty("com.iot.app.kafka.topic");
 Set<String> topicsSet = new HashSet<String>();
 topicsSet.add(topic);
 //create direct kafka stream
 JavaPairInputDStream<String, IoTData> directKafkaStream = KafkaUtils.createDirectStream(
	        jssc,
	        String.class,
	        IoTData.class,
	        StringDecoder.class,
	        IoTDataDecoder.class,
	        kafkaParams,
	        topicsSet
	    );
 logger.info("Starting Stream Processing");
 
 // We need the non-filtered stream for POI traffic data calculation
 JavaDStream<IoTData> nonFilteredIotDataStream = directKafkaStream.map(tuple -> tuple._2());
 
 // We need the filtered stream for total and window-based traffic data calculation
 JavaPairDStream<String,IoTData> iotDataPairStream = nonFilteredIotDataStream.mapToPair(iot -> new Tuple2<String,IoTData>(iot.getVehicleId(),iot)).reduceByKey((a, b) -> a );

 // Check whether the vehicle id has already been processed
 JavaMapWithStateDStream<String, IoTData, Boolean, Tuple2<IoTData,Boolean>> iotDStreamWithStatePairs = iotDataPairStream
					.mapWithState(StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600)));//maintain state for one hour

 // Filter out already-processed vehicle ids and keep only un-processed ones
 JavaDStream<Tuple2<IoTData,Boolean>> filteredIotDStreams = iotDStreamWithStatePairs.map(tuple2 -> tuple2)
					.filter(tuple -> tuple._2.equals(Boolean.FALSE));

 // Get stream of IoTdata
 JavaDStream<IoTData> filteredIotDataStream = filteredIotDStreams.map(tuple -> tuple._1);
 
 //cache stream as it is used in total and window based computation
 filteredIotDataStream.cache();
 	 
 //process data
 IoTTrafficDataProcessor iotTrafficProcessor = new IoTTrafficDataProcessor();
 iotTrafficProcessor.processTotalTrafficData(filteredIotDataStream);
 iotTrafficProcessor.processWindowTrafficData(filteredIotDataStream);

 //poi data
 POIData poiData = new POIData();
 poiData.setLatitude(33.877495);
 poiData.setLongitude(-95.50238);
 poiData.setRadius(30);//30 km
 
 //broadcast variables. We will monitor vehicles on Route 37 which are of type Truck
 Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues = jssc.sparkContext().broadcast(new Tuple3<>(poiData,"Route-37","Truck"));
 //call method  to process stream
 iotTrafficProcessor.processPOIData(nonFilteredIotDataStream,broadcastPOIValues);
 
 //start context
 jssc.start();            
 jssc.awaitTermination();  
}
 
Example #8
Source File: SparkUnboundedSource.java    From beam with Apache License 2.0
public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(
    JavaStreamingContext jssc,
    SerializablePipelineOptions rc,
    UnboundedSource<T, CheckpointMarkT> source,
    String stepName) {

  SparkPipelineOptions options = rc.get().as(SparkPipelineOptions.class);
  Long maxRecordsPerBatch = options.getMaxRecordsPerBatch();
  SourceDStream<T, CheckpointMarkT> sourceDStream =
      new SourceDStream<>(jssc.ssc(), source, rc, maxRecordsPerBatch);

  JavaPairInputDStream<Source<T>, CheckpointMarkT> inputDStream =
      JavaPairInputDStream$.MODULE$.fromInputDStream(
          sourceDStream,
          JavaSparkContext$.MODULE$.fakeClassTag(),
          JavaSparkContext$.MODULE$.fakeClassTag());

  // call mapWithState to read from checkpointable sources.
  JavaMapWithStateDStream<
          Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>, Tuple2<Iterable<byte[]>, Metadata>>
      mapWithStateDStream =
          inputDStream.mapWithState(
              StateSpec.function(
                      StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName))
                  .numPartitions(sourceDStream.getNumPartitions()));

  // set checkpoint duration for read stream, if set.
  checkpointStream(mapWithStateDStream, options);

  // report the number of input elements for this InputDStream to the InputInfoTracker.
  int id = inputDStream.inputDStream().id();
  JavaDStream<Metadata> metadataDStream = mapWithStateDStream.map(new Tuple2MetadataFunction());

  // register ReadReportDStream to report information related to this read.
  new ReadReportDStream(metadataDStream.dstream(), id, getSourceName(source, id), stepName)
      .register();

  // output the actual (deserialized) stream.
  WindowedValue.FullWindowedValueCoder<T> coder =
      WindowedValue.FullWindowedValueCoder.of(
          source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);
  JavaDStream<WindowedValue<T>> readUnboundedStream =
      mapWithStateDStream
          .flatMap(new Tuple2byteFlatMapFunction())
          .map(CoderHelpers.fromByteFunction(coder));
  return new UnboundedDataset<>(readUnboundedStream, Collections.singletonList(id));
}