org.apache.spark.streaming.api.java.JavaPairInputDStream Java Examples

Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0

8 votes

/**
 * Word-count demo over a Kafka stream created with {@code KafkaUtils.createStream}.
 *
 * <p>Builds a streaming context with master {@code local[2]}, reads (key, message) pairs from
 * Kafka, splits every message value with {@code SPACE}, counts each token per batch and prints
 * the counts.
 *
 * <p>{@code TOPIC}, {@code THREAD}, {@code HOST}, {@code GROP} and {@code SPACE} are defined
 * outside this snippet.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    // NOTE(review): the batch interval is 10000 *seconds* (~2.8 hours); confirm this was not
    // meant to be 10 seconds or 10000 milliseconds.
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // Set the checkpoint directory ("HDFS URL" is a placeholder literal in this example).
    streamingContext.checkpoint("HDFS URL");
    // Maps each topic to the value passed for it to createStream (here: TOPIC -> THREAD).
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    // Only the message value (_2) is tokenized; the key is ignored.
    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // Count occurrences of each word.
    JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        // Restore the interrupt status so code further up the stack can still observe it.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}

Source File: SparkStreaming.java From kafka-spark-avro-example with Apache License 2.0

6 votes

/**
 * Opens a direct Kafka stream on the "test" topic and prints every record of every batch as
 * {@code key --> model}, casting each value to {@code Model}.
 *
 * @param ssc streaming context the direct stream is attached to
 * @param sc  Spark context (not used by this method)
 */
private static void processStream(JavaStreamingContext ssc, JavaSparkContext sc) {
  System.out.println("--> Processing stream");

  // Consumer / schema-registry settings handed to the direct stream.
  Map<String, String> kafkaConfig = new HashMap<>();
  kafkaConfig.put("bootstrap.servers", "localhost:9092");
  kafkaConfig.put("schema.registry.url", "http://localhost:8081");
  kafkaConfig.put("group.id", "spark");
  kafkaConfig.put("specific.avro.reader", "true");
  kafkaConfig.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
  kafkaConfig.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

  Set<String> topics = new HashSet<>();
  topics.add("test");

  JavaPairInputDStream<String, Object> records = KafkaUtils.createDirectStream(
      ssc,
      String.class,
      Object.class,
      StringDecoder.class,
      KafkaAvroDecoder.class,
      kafkaConfig,
      topics);

  // Block-bodied lambdas are kept on purpose so they stay void-compatible.
  records.foreachRDD(batch -> {
    batch.foreachPartition(partition -> {
      partition.forEachRemaining(record -> {
        Model model = (Model) record._2();
        System.out.println(record._1() + " --> " + model);
      });
    });
  });
}

Source File: JavaKafkaDirectWordCount.java From SparkDemo with MIT License

4 votes

/**
 * Word count over a direct Kafka stream ({@code KafkaUtils.createDirectStream}).
 *
 * <p>Author's notes on the direct approach:
 * <ol>
 *   <li>one-to-one</li>
 *   <li>efficient</li>
 *   <li>each record is counted exactly once</li>
 * </ol>
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    StreamingExamples.setStreamingLogLevels();

    SparkConf conf = new SparkConf();
    conf.setAppName("JavaKafkaDirectWordCount");
    conf.setMaster("local[1]");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(6));

    // Topic(s) to consume.
    Set<String> topics = new HashSet<>();
    topics.add("2017-7-26");

    // Kafka connection parameters; "metadata.broker.list" tells the stream where the brokers are.
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092");

    // Direct stream of (key, value) string pairs from the brokers and topics above.
    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);

    // Keep only the message value, then split it into words.
    JavaDStream<String> lines = messages.map(pair -> pair._2());
    JavaDStream<String> words = lines.flatMap(line -> Lists.newArrayList(SPACE.split(line)).iterator());

    // (word, 1) pairs summed per word within each batch.
    JavaPairDStream<String, Integer> ones = words.mapToPair(word -> new Tuple2<String, Integer>(word, 1));
    JavaPairDStream<String, Integer> wordCounts = ones.reduceByKey((left, right) -> left + right);

    wordCounts.print();
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (Exception e) {
        e.printStackTrace();
    }
}

Source File: AppMain.java From SparkToParquet with Apache License 2.0

4 votes

/**
 * Streams Apache access-log lines from Kafka, parses them into {@code ApacheAccessLog} objects and
 * appends every batch to a Parquet data set partitioned by IP address, method and response code.
 *
 * @param args command-line arguments, handed to {@code Flags.setFromCommandLineArgs}
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
	Flags.setFromCommandLineArgs(THE_OPTIONS, args);

	// Initialise the Spark configuration and the contexts built on it.
	SparkConf conf = new SparkConf();
	conf.setAppName("A SECTONG Application: Apache Log Analysis with Spark");
	JavaSparkContext sc = new JavaSparkContext(conf);
	JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
	SQLContext sqlContext = new SQLContext(sc);

	// Initialise the Kafka parameters: comma-separated topic list and broker list.
	String[] topicNames = Flags.getInstance().getKafka_topic().split(",");
	HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topicNames));
	HashMap<String, String> kafkaParams = new HashMap<String, String>();
	kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

	// Read the data from the Kafka stream.
	JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
			jssc,
			String.class,
			String.class,
			StringDecoder.class,
			StringDecoder.class,
			kafkaParams,
			topicsSet);

	// Only the message value carries the log line.
	JavaDStream<String> lines = messages.map(record -> record._2());

	// Map every line to zero or one parsed entries; a line whose parsing throws a
	// RuntimeException yields an empty list and is thereby skipped.
	JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
		List<ApacheAccessLog> parsed = new ArrayList<>();
		try {
			parsed.add(ApacheAccessLog.parseFromLogLine(line));
		} catch (RuntimeException e) {
			// Deliberately ignored: fall through and return the (empty) list.
		}
		return parsed;
	}).cache();

	accessLogsDStream.foreachRDD(rdd -> {
		// Convert the RDD to a DataFrame and append it to the Parquet output.
		DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
		df.write()
				.partitionBy("ipAddress", "method", "responseCode")
				.mode(SaveMode.Append)
				.parquet(Flags.getInstance().getParquetFile());

		return null;
	});

	// Start the streaming computation and block until it terminates.
	jssc.start();
	jssc.awaitTermination();
}

Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0

4 votes

/**
 * Reads the data from the configured input format, optionally filters it, and returns a DStream of MapWritable data elements.
 * <p>
 * The input format class is named by the {@code pir.baseInputFormat} property. Depending on {@code useQueueStream}, the data is either loaded once
 * as an RDD and served through a queue stream, or read through a file stream on {@code inputData}. When the query schema defines a filter, the
 * stream is passed through {@code FilterData}.
 *
 * @return the stream of data elements, filtered if the query schema has a filter
 * @throws IOException
 *           declared for the Hadoop job set-up calls
 * @throws PIRException
 *           if the configured input format class cannot be loaded or does not extend BaseInputFormat
 */
@SuppressWarnings("unchecked")
public JavaDStream<MapWritable> readData() throws IOException, PIRException
{
  logger.info("Reading data ");

  Job job = Job.getInstance();
  String baseQuery = SystemConfiguration.getProperty("pir.baseQuery");
  String jobName = "pirSpark_base_" + baseQuery + "_" + System.currentTimeMillis();
  job.setJobName(jobName);
  job.getConfiguration().setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
  job.getConfiguration().set("query", baseQuery);

  job.getConfiguration().set("dataSchemaName", qSchema.getDataSchemaName());
  job.getConfiguration().set("data.schemas", SystemConfiguration.getProperty("data.schemas"));

  // Set the inputFormatClass based upon the baseInputFormat property
  String classString = SystemConfiguration.getProperty("pir.baseInputFormat");
  Class<? extends BaseInputFormat<Text,MapWritable>> inputClass;
  try
  {
    Class<?> candidate = Class.forName(classString);

    // The generic cast below is unchecked and can never fail at runtime, so the supertype has to be verified explicitly;
    // otherwise a class that is not a BaseInputFormat would slip through to the job configuration.
    if (!BaseInputFormat.class.isAssignableFrom(candidate))
    {
      throw new ClassCastException(classString + " is not a subclass of " + BaseInputFormat.class.getName());
    }
    inputClass = (Class<? extends BaseInputFormat<Text,MapWritable>>) candidate;
  } catch (ClassNotFoundException | ClassCastException e)
  {
    throw new PIRException(classString + " cannot be instantiated or does not extend BaseInputFormat", e);
  }
  job.setInputFormatClass(inputClass);

  FileInputFormat.setInputPaths(job, inputData);

  // Build the stream, either from a one-element RDD queue or from a file stream on inputData
  logger.info("useQueueStream = " + useQueueStream);
  JavaDStream<MapWritable> mwStream;
  if (useQueueStream)
  {
    Queue<JavaRDD<MapWritable>> rddQueue = new LinkedList<>();
    JavaRDD<MapWritable> rddIn = jssc.sparkContext().newAPIHadoopRDD(job.getConfiguration(), inputClass, Text.class, MapWritable.class).values()
        .coalesce(numDataPartitions);

    rddQueue.add(rddIn);
    mwStream = jssc.queueStream(rddQueue);
  }
  else
  {
    JavaPairInputDStream<Text,MapWritable> inputRDD = jssc.fileStream(inputData, Text.class, MapWritable.class, inputClass);
    // Drop the Text keys; only the MapWritable values are processed downstream
    mwStream = inputRDD.transform(new Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>>()
    {
      private static final long serialVersionUID = 1L;

      @Override
      public JavaRDD<MapWritable> call(JavaPairRDD<Text,MapWritable> pair) throws Exception
      {
        return pair.values();
      }
    }).repartition(numDataPartitions);
  }

  // Filter out by the provided stopListFile entries
  if (qSchema.getFilter() != null)
  {
    return mwStream.filter(new FilterData(accum, bVars));
  }

  return mwStream;
}

Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0

4 votes

/**
 * Builds the input stream for an Elasticsearch-configured query and returns a DStream of MapWritable data elements, filtered when the query
 * schema defines a filter.
 * <p>
 * The job configuration is populated with the {@code es.nodes}, {@code es.port}, {@code es.resource} and {@code es.query} settings. Depending on
 * {@code useQueueStream}, the data is either loaded once as an RDD and served through a queue stream, or read through a file stream on
 * {@code inputData} using {@code EsInputFormat}.
 *
 * @return the stream of data elements, filtered if the query schema has a filter
 * @throws IOException
 *           declared for the Hadoop job set-up calls
 */
@SuppressWarnings("unchecked")
public JavaDStream<MapWritable> readDataES() throws IOException
{
  logger.info("Reading data ");

  Job job = Job.getInstance();
  job.setJobName("pirSpark_ES_" + esQuery + "_" + System.currentTimeMillis());

  // Elasticsearch connection and query settings
  job.getConfiguration().set("es.nodes", SystemConfiguration.getProperty("es.nodes"));
  job.getConfiguration().set("es.port", SystemConfiguration.getProperty("es.port"));
  job.getConfiguration().set("es.resource", esResource);
  job.getConfiguration().set("es.query", esQuery);

  // Build the stream, either from a one-element RDD queue or from a file stream on inputData
  JavaDStream<MapWritable> mwStream;
  if (useQueueStream)
  {
    JavaRDD<MapWritable> esValues = jssc.sparkContext().newAPIHadoopRDD(job.getConfiguration(), EsInputFormat.class, Text.class, MapWritable.class)
        .values().coalesce(numDataPartitions);

    Queue<JavaRDD<MapWritable>> pending = new LinkedList<>();
    pending.add(esValues);
    mwStream = jssc.queueStream(pending);
  }
  else
  {
    JavaPairInputDStream<Text,MapWritable> keyedInput = jssc.fileStream(inputData, Text.class, MapWritable.class, EsInputFormat.class);

    // Drops the Text keys; only the MapWritable values are processed downstream
    Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>> valuesOnly = new Function<JavaPairRDD<Text,MapWritable>,JavaRDD<MapWritable>>()
    {
      private static final long serialVersionUID = 1L;

      @Override
      public JavaRDD<MapWritable> call(JavaPairRDD<Text,MapWritable> pair) throws Exception
      {
        return pair.values();
      }
    };

    JavaDStream<MapWritable> unkeyed = keyedInput.transform(valuesOnly);
    mwStream = unkeyed.repartition(numDataPartitions);
  }

  // Without a filter in the query schema the stream is returned untouched
  if (qSchema.getFilter() == null)
  {
    return mwStream;
  }

  // Filter out by the provided stopListFile entries
  return mwStream.filter(new FilterData(accum, bVars));
}

Source File: IoTDataProcessor.java From iot-traffic-monitor with Apache License 2.0

4 votes

/**
 * Entry point of the IoT traffic processor: consumes {@code IoTData} records from Kafka through a
 * direct stream, keeps the records whose state flag is {@code FALSE}, and hands the resulting
 * streams to {@code IoTTrafficDataProcessor} for total, window and point-of-interest processing.
 *
 * @param args command-line arguments (unused)
 * @throws Exception declared by the original signature
 */
public static void main(String[] args) throws Exception {
    // Load the properties that drive the Spark, Cassandra and Kafka settings.
    Properties prop = PropertyFileReader.readPropertyFile();

    SparkConf conf = new SparkConf();
    conf.setAppName(prop.getProperty("com.iot.app.spark.app.name"));
    conf.setMaster(prop.getProperty("com.iot.app.spark.master"));
    conf.set("spark.cassandra.connection.host", prop.getProperty("com.iot.app.cassandra.host"));
    conf.set("spark.cassandra.connection.port", prop.getProperty("com.iot.app.cassandra.port"));
    conf.set("spark.cassandra.connection.keep_alive_ms", prop.getProperty("com.iot.app.cassandra.keep_alive"));

    // Batch interval of 5 seconds for the incoming stream, with a checkpoint directory.
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    jssc.checkpoint(prop.getProperty("com.iot.app.spark.checkpoint.dir"));

    // Kafka connection parameters and the single topic to consume.
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", prop.getProperty("com.iot.app.kafka.zookeeper"));
    kafkaParams.put("metadata.broker.list", prop.getProperty("com.iot.app.kafka.brokerlist"));
    Set<String> topics = new HashSet<>();
    topics.add(prop.getProperty("com.iot.app.kafka.topic"));

    // Direct Kafka stream of (String key, IoTData value) pairs.
    JavaPairInputDStream<String, IoTData> directKafkaStream = KafkaUtils.createDirectStream(
            jssc, String.class, IoTData.class, StringDecoder.class, IoTDataDecoder.class, kafkaParams, topics);
    logger.info("Starting Stream Processing");

    // The unfiltered stream is needed for the POI traffic calculation.
    JavaDStream<IoTData> allReadings = directKafkaStream.map(tuple -> tuple._2());

    // For the total and window calculations: key by vehicle id and keep one record per key per batch.
    JavaPairDStream<String, IoTData> keyedByVehicle =
            allReadings.mapToPair(iot -> new Tuple2<String, IoTData>(iot.getVehicleId(), iot));
    JavaPairDStream<String, IoTData> onePerVehicle = keyedByVehicle.reduceByKey((a, b) -> a);

    // Track per-vehicle state (processedVehicleFunc is defined elsewhere); state times out after one hour.
    JavaMapWithStateDStream<String, IoTData, Boolean, Tuple2<IoTData, Boolean>> withState =
            onePerVehicle.mapWithState(StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600)));

    // Keep only the pairs whose flag equals FALSE, i.e. vehicles not yet processed.
    JavaDStream<Tuple2<IoTData, Boolean>> statePairs = withState.map(tuple2 -> tuple2);
    JavaDStream<Tuple2<IoTData, Boolean>> unprocessedPairs =
            statePairs.filter(tuple -> tuple._2.equals(Boolean.FALSE));

    // Back to a plain IoTData stream; cached because both computations below consume it.
    JavaDStream<IoTData> unprocessedReadings = unprocessedPairs.map(tuple -> tuple._1);
    unprocessedReadings.cache();

    IoTTrafficDataProcessor trafficProcessor = new IoTTrafficDataProcessor();
    trafficProcessor.processTotalTrafficData(unprocessedReadings);
    trafficProcessor.processWindowTrafficData(unprocessedReadings);

    // Point of interest: centre coordinates and a radius of 30 km.
    POIData poi = new POIData();
    poi.setLatitude(33.877495);
    poi.setLongitude(-95.50238);
    poi.setRadius(30);

    // Broadcast the POI together with the route ("Route-37") and vehicle type ("Truck") to monitor.
    Broadcast<Tuple3<POIData, String, String>> poiBroadcast =
            jssc.sparkContext().broadcast(new Tuple3<>(poi, "Route-37", "Truck"));
    trafficProcessor.processPOIData(allReadings, poiBroadcast);

    // Start the streaming context and block until it terminates.
    jssc.start();
    jssc.awaitTermination();
}

Source File: SparkUnboundedSource.java From beam with Apache License 2.0

4 votes

/**
 * Builds the Spark streaming read of an {@code UnboundedSource}.
 *
 * <p>Wraps the source in a {@code SourceDStream}, reads it through {@code mapWithState}, registers
 * a {@code ReadReportDStream} for the read's metadata, and returns the decoded elements as an
 * {@code UnboundedDataset} tagged with the input stream's id.
 *
 * @param jssc streaming context the source stream is attached to
 * @param rc serializable pipeline options, viewed as {@code SparkPipelineOptions}
 * @param source the unbounded source to read from
 * @param stepName step name passed to the state function and the read report
 * @return the dataset wrapping the stream of windowed values
 */
public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(
    JavaStreamingContext jssc,
    SerializablePipelineOptions rc,
    UnboundedSource<T, CheckpointMarkT> source,
    String stepName) {

  SparkPipelineOptions sparkOptions = rc.get().as(SparkPipelineOptions.class);
  SourceDStream<T, CheckpointMarkT> microbatchSource =
      new SourceDStream<>(jssc.ssc(), source, rc, sparkOptions.getMaxRecordsPerBatch());

  JavaPairInputDStream<Source<T>, CheckpointMarkT> pairedInput =
      JavaPairInputDStream$.MODULE$.fromInputDStream(
          microbatchSource,
          JavaSparkContext$.MODULE$.fakeClassTag(),
          JavaSparkContext$.MODULE$.fakeClassTag());

  // id of this InputDStream, used for the read report and for the returned dataset.
  int inputStreamId = pairedInput.inputDStream().id();

  // mapWithState performs the read from the checkpointable source.
  JavaMapWithStateDStream<
          Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>, Tuple2<Iterable<byte[]>, Metadata>>
      statefulRead =
          pairedInput.mapWithState(
              StateSpec.function(
                      StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName))
                  .numPartitions(microbatchSource.getNumPartitions()));

  // apply the checkpoint duration for the read stream, if one is set.
  checkpointStream(statefulRead, sparkOptions);

  // register a ReadReportDStream that reports the information related to this read.
  JavaDStream<Metadata> readMetadata = statefulRead.map(new Tuple2MetadataFunction());
  new ReadReportDStream(
          readMetadata.dstream(), inputStreamId, getSourceName(source, inputStreamId), stepName)
      .register();

  // emit the actual (deserialized) stream.
  WindowedValue.FullWindowedValueCoder<T> windowedCoder =
      WindowedValue.FullWindowedValueCoder.of(
          source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);
  JavaDStream<WindowedValue<T>> decoded =
      statefulRead
          .flatMap(new Tuple2byteFlatMapFunction())
          .map(CoderHelpers.fromByteFunction(windowedCoder));

  return new UnboundedDataset<>(decoded, Collections.singletonList(inputStreamId));
}

org.apache.spark.streaming.api.java.JavaPairInputDStream Java Examples