org.apache.spark.streaming.api.java.JavaDStream Java Examples
The following examples show how to use
org.apache.spark.streaming.api.java.JavaDStream.
The original project, source file, and license are noted above each example.
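Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the core JavaDStream flow: build a stream from a socket source, transform it, and count words per batch. The class name, host, port, and batch interval are placeholder values chosen for illustration.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

public class MinimalJavaDStreamWordCount {
    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("MinimalJavaDStream");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // JavaDStream<String>: one RDD of text lines per 5-second batch (placeholder host/port)
        JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999);
        JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());

        // Count each word within the batch
        JavaPairDStream<String, Integer> counts = words
            .mapToPair(w -> new Tuple2<>(w, 1))
            .reduceByKey((a, b) -> a + b);

        counts.print();
        jssc.start();
        jssc.awaitTermination();
    }
}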
Example #1
Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0 | 8 votes |
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // Set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 ->
        Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // Count the words
    JavaPairDStream<String, Integer> result = words
        .mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1))
        .reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #2
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 6 votes |
/**
 * Method to read in data from an allowed input source/format and perform the query
 */
public void performQuery() throws IOException, PIRException {
    logger.info("Performing query: ");

    JavaDStream<MapWritable> inputRDD = null;
    if (dataInputFormat.equals(InputFormatConst.BASE_FORMAT)) {
        inputRDD = readData();
    } else if (dataInputFormat.equals(InputFormatConst.ES)) {
        inputRDD = readDataES();
    } else {
        throw new PIRException("Unknown data input format " + dataInputFormat);
    }

    performQuery(inputRDD);
}
Example #3
Source File: DStreamUtil.java From sylph with Apache License 2.0 | 6 votes |
public static void dstreamAction(JavaDStream<Row> stream, Sink<JavaRDD<Row>> sink) {
    DStream<?> fristDStream = getFristDStream(stream.dstream());
    logger.info("Source driver: {}", fristDStream.getClass().getName());
    if ("DirectKafkaInputDStream".equals(fristDStream.getClass().getSimpleName())) {
        logger.info("Job source is Kafka; enabling empty-job optimization and automatic offset commits");
        stream.foreachRDD(rdd -> {
            RDD<?> kafkaRdd = getFristRdd(rdd.rdd()); // rdd.dependencies(0).rdd
            OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
            if (kafkaRdd.count() > 0) {
                sink.run(rdd); // run the business logic
            }
            ((CanCommitOffsets) fristDStream).commitAsync(offsetRanges);
        });
    } else {
        // Non-Kafka sources: no optimization is currently possible
        stream.foreachRDD(sink::run);
    }
}
Example #4
Source File: SparkStreamingJob.java From zipkin-sparkstreaming with Apache License 2.0 | 6 votes |
static void streamSpansToStorage(
    JavaDStream<byte[]> stream,
    ReadSpans readSpans,
    AdjustAndConsumeSpansSharingTraceId adjustAndConsumeSpansSharingTraceId
) {
    JavaDStream<Span> spans = stream.flatMap(readSpans);

    // TODO: plug in some filter to drop spans regardless of trace ID
    // spans = spans.filter(spanFilter);

    JavaPairDStream<String, Iterable<Span>> tracesById = spans
        .mapToPair(s -> new Tuple2<>(Util.toLowerHex(s.traceIdHigh, s.traceId), s))
        .groupByKey();

    tracesById.foreachRDD(rdd -> {
        rdd.values().foreachPartition(adjustAndConsumeSpansSharingTraceId);
    });
}
Example #5
Source File: StreamingIngestionFileSystemTextFileToDataframeMultipleClassesApp.java From net.jgp.labs.spark with Apache License 2.0 | 6 votes |
private void start() {
    // Create a local StreamingContext with two working threads and a batch interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
        "Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();
    // Create JavaRDD<Row>
    msgDataStream.foreachRDD(new RowProcessor());

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #6
Source File: SparkStreamServiceImpl.java From searchanalytics-bigdata with MIT License | 6 votes |
@Override
public void startFlumeStream() {
    JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(
        jssc, "localhost", 41111, StorageLevels.MEMORY_AND_DISK);

    QueryStringJDStreams queryStringJDStreams = new QueryStringJDStreams();

    // Run the top search query strings stream
    queryStringJDStreams.topQueryStringsCountInLastOneHourUsingSparkFlumeEvent(flumeStream);

    // Run the top product views stream
    // TODO: uncomment to get both stats.
    // queryStringJDStreams
    //     .topProductViewsCountInLastOneHourUsingSparkFlumeEvent(flumeStream);

    jssc.start();
}
Example #7
Source File: StreamingRsvpsDStreamCountWindow.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 6 votes |
public static void main(String[] args) throws InterruptedException {
    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    final SparkConf conf = new SparkConf()
        .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
        .setAppName(APPLICATION_NAME)
        .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI)
        .set("spark.streaming.kafka.consumer.cache.enabled", "false");

    final JavaStreamingContext streamingContext =
        new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS));

    streamingContext.checkpoint(CHECKPOINT_FOLDER);

    final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
        KafkaUtils.createDirectStream(
            streamingContext,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
        );

    // transformations, streaming algorithms, etc
    JavaDStream<Long> countStream = meetupStream.countByWindow(
        new Duration(WINDOW_LENGTH_MS), new Duration(SLIDING_INTERVAL_MS));

    countStream.foreachRDD((JavaRDD<Long> countRDD) -> {
        MongoSpark.save(
            countRDD.map(
                r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}")
            )
        );
    });

    // some time later, after outputs have completed
    meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
        OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

        ((CanCommitOffsets) meetupStream.inputDStream())
            .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}
Example #8
Source File: ReduceByKeyAndWindow.java From sparkResearch with Apache License 2.0 | 6 votes |
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10)); //检查点设置 streamingContext.checkpoint("hdfs://localhost:9300"); //数据源 JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080); JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp()); JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(), new SubtractLongs(), Durations.seconds(30), Durations.seconds(10)); try { streamingContext.start(); streamingContext.awaitTermination(); } catch (InterruptedException e) { e.printStackTrace(); } }
Example #9
Source File: StreamingService.java From cxf with Apache License 2.0 | 6 votes |
private void processStreamOneWay(List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Connect OneWay " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new PrintOutputFunction(jssc));

        jssc.start();
    } catch (Exception ex) {
        // ignore
    }
}
Example #10
Source File: JavaCustomReceiver.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaCustomReceiver <hostname> <port>");
        System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

    // Create an input stream with the custom receiver on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    JavaReceiverInputDStream<String> lines = ssc.receiverStream(
        new JavaCustomReceiver(args[0], Integer.parseInt(args[1])));
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
        new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) {
                return new Tuple2<>(s, 1);
            }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) {
                return i1 + i2;
            }
        });

    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}
Example #11
Source File: SparkStreamingFromNetworkExample.java From SparkOnALog with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
    if (args.length < 3) {
        System.err.println("Usage: NetworkWordCount <master> <hostname> <port>\n" +
            "In local mode, <master> should be 'local[n]' with n > 1");
        System.exit(1);
    }

    // Create the context with a 5 second batch size
    JavaStreamingContext ssc = new JavaStreamingContext(args[0], "NetworkWordCount",
        new Duration(5000), System.getenv("SPARK_HOME"), System.getenv("SPARK_EXAMPLES_JAR"));

    // Create a NetworkInputDStream on target ip:port and count the
    // words in the input stream of \n delimited text (e.g. generated by 'nc')
    JavaDStream<String> lines = ssc.socketTextStream(args[1], Integer.parseInt(args[2]));

    lines.map(new Function<String, String>() {
        @Override
        public String call(String arg0) throws Exception {
            System.out.println("arg0" + arg0);
            return arg0;
        }
    }).print();

    lines.print();

    ssc.start();
}
Example #12
Source File: RealtimeTrafficDataProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Method to get window traffic counts of different type of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // reduce by key and window (30 sec window and 10 sec slide).
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
        .mapToPair(iot -> new Tuple2<>(
            new AggregateKey(iot.getRouteId(), iot.getVehicleType()),
            1L
        ))
        .reduceByKeyAndWindow((a, b) -> a + b,
            Durations.seconds(30),
            Durations.seconds(10)
        );

    // Transform to dstream of TrafficData
    JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

    // Map Cassandra table columns
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
        "traffickeyspace",
        "window_traffic",
        CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
Example #13
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test
public void testEsRDDWriteWithMappingExclude() throws Exception {
    Map<String, Object> trip1 = new HashMap<>();
    trip1.put("reason", "business");
    trip1.put("airport", "SFO");

    Map<String, Object> trip2 = new HashMap<>();
    trip2.put("participants", 5);
    trip2.put("airport", "OTP");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(trip1);
    docs.add(trip2);

    String target = wrapIndex(resource("spark-test-scala-write-exclude", "data", version));

    Map<String, String> localConf = new HashMap<>(cfg);
    localConf.put(ES_MAPPING_EXCLUDE, "airport");

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);
    JavaEsSparkStreaming.saveToEs(dstream, target, localConf);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertTrue(RestUtils.exists(target));
    assertThat(RestUtils.get(target + "/_search?"), containsString("business"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("participants"));
    assertThat(RestUtils.get(target + "/_search?"), not(containsString("airport")));
}
Example #14
Source File: WordCountSocketStateful.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); streamingContext.checkpoint("E:\\hadoop\\checkpoint"); // Initial state RDD input to mapWithState @SuppressWarnings("unchecked") List<Tuple2<String, Integer>> tuples =Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); // Update the cumulative count function Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() { @Override public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) { int sum = one.orElse(0) + (state.exists() ? state.get() : 0); Tuple2<String, Integer> output = new Tuple2<>(word, sum); state.update(sum); return output; } }; // DStream made of get cumulative counts that get updated in every batch JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD)); stateDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #15
Source File: FileStreamingEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) {
    // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
    String inputDirectory = "E:\\hadoop\\streamFolder\\";

    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(1));
    // streamingContext.checkpoint("E:\\hadoop\\checkpoint");

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    JavaDStream<String> streamfile = streamingContext.textFileStream(inputDirectory);
    streamfile.print();
    streamfile.foreachRDD(rdd -> rdd.foreach(x -> System.out.println(x)));

    JavaPairDStream<LongWritable, Text> streamedFile = streamingContext.fileStream(
        inputDirectory, LongWritable.class, Text.class, TextInputFormat.class);
    streamedFile.print();

    streamingContext.start();

    try {
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #16
Source File: StreamingTransformTranslator.java From beam with Apache License 2.0 | 5 votes |
private static <K, V, W extends BoundedWindow> TransformEvaluator<Reshuffle<K, V>> reshuffle() {
    return new TransformEvaluator<Reshuffle<K, V>>() {
        @Override
        public void evaluate(Reshuffle<K, V> transform, EvaluationContext context) {
            @SuppressWarnings("unchecked")
            UnboundedDataset<KV<K, V>> inputDataset =
                (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
            List<Integer> streamSources = inputDataset.getStreamSources();
            JavaDStream<WindowedValue<KV<K, V>>> dStream = inputDataset.getDStream();
            final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
            @SuppressWarnings("unchecked")
            final WindowingStrategy<?, W> windowingStrategy =
                (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked")
            final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

            final WindowedValue.WindowedValueCoder<KV<K, V>> wvCoder =
                WindowedValue.FullWindowedValueCoder.of(coder, windowFn.windowCoder());

            JavaDStream<WindowedValue<KV<K, V>>> reshuffledStream =
                dStream.transform(rdd -> GroupCombineFunctions.reshuffle(rdd, wvCoder));

            context.putDataset(transform, new UnboundedDataset<>(reshuffledStream, streamSources));
        }

        @Override
        public String toNativeString() {
            return "repartition(...)";
        }
    };
}
Example #17
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test
public void testMultiIndexRDDWrite() throws Exception {
    Map<String, Object> trip1 = new HashMap<>();
    trip1.put("reason", "business");
    trip1.put("airport", "sfo");

    Map<String, Object> trip2 = new HashMap<>();
    trip2.put("participants", 5);
    trip2.put("airport", "otp");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(trip1);
    docs.add(trip2);

    String target = wrapIndex(resource("spark-test-trip-{airport}", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);
    JavaEsSparkStreaming.saveToEs(dstream, target, cfg);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertTrue(RestUtils.exists(wrapIndex(resource("spark-test-trip-otp", "data", version))));
    assertTrue(RestUtils.exists(wrapIndex(resource("spark-test-trip-sfo", "data", version))));

    assertThat(RestUtils.get(wrapIndex(resource("spark-test-trip-sfo", "data", version) + "/_search?")),
        containsString("business"));
    assertThat(RestUtils.get(wrapIndex(resource("spark-test-trip-otp", "data", version) + "/_search?")),
        containsString("participants"));
}
Example #18
Source File: WordCountSocketJava8Ex.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); wordCounts.print(); JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair( new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() { @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception { rdd.join(initialRDD).mapToPair(new PairFunction<Tuple2<String,Tuple2<Integer,Integer>>, String, Integer>() { @Override public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception { // TODO Auto-generated method stub return new Tuple2<>( joinedTuple._1(), (joinedTuple._2()._1()+joinedTuple._2()._2()) ); } }); return rdd; } }); joinedDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #19
Source File: WordCountRecoverableEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
    SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
    streamingContext.checkpoint(checkpointDirectory);

    // Initial state RDD input to mapWithState
    @SuppressWarnings("unchecked")
    List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
    JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

    JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip, port,
        StorageLevels.MEMORY_AND_DISK_SER);

    JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

    JavaPairDStream<String, Integer> wordCounts = words
        .mapToPair(str -> new Tuple2<>(str, 1))
        .reduceByKey((count1, count2) -> count1 + count2);

    // Update the cumulative count function
    Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
        new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
                int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
                Tuple2<String, Integer> output = new Tuple2<>(word, sum);
                state.update(sum);
                return output;
            }
        };

    // DStream made of cumulative counts that get updated in every batch
    JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
        .mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

    stateDstream.print();
    return streamingContext;
}
Example #20
Source File: SparkGroupAlsoByWindowViaWindowSet.java From beam with Apache License 2.0 | 5 votes |
public static <K, InputT, W extends BoundedWindow>
    JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> groupByKeyAndWindow(
        final JavaDStream<WindowedValue<KV<K, InputT>>> inputDStream,
        final Coder<K> keyCoder,
        final Coder<WindowedValue<InputT>> wvCoder,
        final WindowingStrategy<?, W> windowingStrategy,
        final SerializablePipelineOptions options,
        final List<Integer> sourceIds,
        final String transformFullName) {

    final PairDStreamFunctions<ByteArray, byte[]> pairDStream =
        buildPairDStream(inputDStream, keyCoder, wvCoder);

    // use updateStateByKey to scan through the state and update elements and timers.
    final UpdateStateByKeyFunction<K, InputT, W> updateFunc =
        new UpdateStateByKeyFunction<>(
            sourceIds,
            windowingStrategy,
            (FullWindowedValueCoder<InputT>) wvCoder,
            keyCoder,
            options,
            transformFullName);

    final DStream<Tuple2</*K*/ ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/ List<byte[]>>>>
        firedStream =
            pairDStream.updateStateByKey(
                updateFunc,
                pairDStream.defaultPartitioner(pairDStream.defaultPartitioner$default$1()),
                true,
                JavaSparkContext$.MODULE$.fakeClassTag());

    checkpointIfNeeded(firedStream, options);

    // filter state-only output (nothing to fire) and remove the state from the output.
    return stripStateValues(firedStream, keyCoder, (FullWindowedValueCoder<InputT>) wvCoder);
}
Example #21
Source File: TestStreamingStep.java From envelope with Apache License 2.0 | 5 votes |
@Override
public JavaDStream<?> getDStream() throws Exception {
    Queue<JavaRDD<String>> queue = new LinkedList<>();
    queue.add(generateRDD());
    JavaDStream<String> dstream = Contexts.getJavaStreamingContext().queueStream(queue);
    return dstream;
}
Example #22
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test
public void testEsRDDWriteWithMappingExclude() throws Exception {
    Map<String, Object> trip1 = new HashMap<>();
    trip1.put("reason", "business");
    trip1.put("airport", "SFO");

    Map<String, Object> trip2 = new HashMap<>();
    trip2.put("participants", 5);
    trip2.put("airport", "OTP");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(trip1);
    docs.add(trip2);

    String target = wrapIndex(resource("spark-streaming-test-scala-write-exclude", "data", version));

    Map<String, String> localConf = new HashMap<>(cfg);
    localConf.put(ES_MAPPING_EXCLUDE, "airport");

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);
    JavaEsSparkStreaming.saveToEs(dstream, target, localConf);
    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertTrue(RestUtils.exists(target));
    assertThat(RestUtils.get(target + "/_search?"), containsString("business"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("participants"));
    assertThat(RestUtils.get(target + "/_search?"), not(containsString("airport")));
}
Example #23
Source File: SparkGroupAlsoByWindowViaWindowSet.java From beam with Apache License 2.0 | 5 votes |
private static <K, InputT> PairDStreamFunctions<ByteArray, byte[]> buildPairDStream(
    final JavaDStream<WindowedValue<KV<K, InputT>>> inputDStream,
    final Coder<K> keyCoder,
    final Coder<WindowedValue<InputT>> wvCoder) {

    // we have to switch to Scala API to avoid Optional in the Java API, see: SPARK-4819.
    // we also have a broader API for Scala (access to the actual key and entire iterator).
    // we use coders to convert objects in the PCollection to byte arrays, so they
    // can be transferred over the network for the shuffle and be in serialized form
    // for checkpointing.
    // for readability, we add comments with actual type next to byte[].
    // to shorten line length, we use:
    // ---- WV: WindowedValue
    // ---- Iterable: Itr
    // ---- AccumT: A
    // ---- InputT: I
    final DStream<Tuple2<ByteArray, byte[]>> tupleDStream =
        inputDStream
            .map(new ReifyTimestampsAndWindowsFunction<>())
            .mapToPair(TranslationUtils.toPairFunction())
            .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder))
            .dstream();

    return DStream.toPairDStreamFunctions(
        tupleDStream,
        JavaSparkContext$.MODULE$.fakeClassTag(),
        JavaSparkContext$.MODULE$.fakeClassTag(),
        null);
}
Example #24
Source File: KafkaStreamFactory.java From zipkin-sparkstreaming with Apache License 2.0 | 5 votes |
@Override
public JavaDStream<byte[]> create(JavaStreamingContext jsc) {
    return KafkaUtils.createDirectStream(
        jsc,
        byte[].class,
        byte[].class,
        DefaultDecoder.class,
        DefaultDecoder.class,
        kafkaParams(),
        Collections.singleton(topic()))
        .map(m -> m._2); // get value
}
Example #25
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
/**
 * Method to perform the query given an input JavaDStream of JSON
 */
public void performQuery(JavaDStream<MapWritable> input) {
    logger.info("Performing query: ");

    // Process non-overlapping windows of data of duration windowLength seconds
    // If we are using queue streams, there is no need to window
    if (!useQueueStream) {
        input = input.window(Durations.seconds(windowLength), Durations.seconds(windowLength));
    }

    // Extract the selectors for each dataElement based upon the query type
    // and perform a keyed hash of the selectors
    JavaPairDStream<Integer, List<BigInteger>> selectorHashToDocRDD =
        input.mapToPair(new HashSelectorsAndPartitionData(bVars));

    // Group by hashed selector (row) -- can combine with the line above, separating for testing and benchmarking...
    JavaPairDStream<Integer, Iterable<List<BigInteger>>> selectorGroupRDD = selectorHashToDocRDD.groupByKey();

    // Calculate the encrypted row values for each row, emit <colNum, colVal> for each row
    JavaPairDStream<Long, BigInteger> encRowRDD = selectorGroupRDD.flatMapToPair(new EncRowCalc(accum, bVars));

    // Multiply the column values by colNum: emit <colNum, finalColVal> and write the final result object
    encryptedColumnCalc(encRowRDD);

    // Start the streaming computation
    start();
}
Example #26
Source File: KafkaInput.java From envelope with Apache License 2.0 | 5 votes |
@Override
public JavaDStream<?> getDStream() throws Exception {
    if (dStream == null) {
        JavaStreamingContext jssc = Contexts.getJavaStreamingContext();
        Map<TopicPartition, Long> lastOffsets = null;
        if (doesRecordProgress(config) && !usingKafkaManagedOffsets(config)) {
            lastOffsets = getLastOffsets();
        }

        if (lastOffsets != null) {
            dStream = KafkaUtils.createDirectStream(jssc, LocationStrategies.PreferConsistent(),
                ConsumerStrategies.Subscribe(topics, kafkaParams, lastOffsets));
        } else {
            dStream = KafkaUtils.createDirectStream(jssc, LocationStrategies.PreferConsistent(),
                ConsumerStrategies.Subscribe(topics, kafkaParams));
        }

        if (ConfigUtils.getOrElse(config, WINDOW_ENABLED_CONFIG, false)) {
            int windowDuration = config.getInt(WINDOW_MILLISECONDS_CONFIG);
            if (config.hasPath(WINDOW_SLIDE_MILLISECONDS_CONFIG)) {
                int slideDuration = config.getInt(WINDOW_SLIDE_MILLISECONDS_CONFIG);
                dStream = dStream.window(new Duration(windowDuration), new Duration(slideDuration));
            } else {
                dStream = dStream.window(new Duration(windowDuration));
            }
        }
    }

    return dStream;
}
Example #27
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation")
public static void persists(DStream<Tuple2<String, Iterable<Long>>> partitonOffset, Properties props) {
    ClassTag<Tuple2<String, Iterable<Long>>> tuple2ClassTag =
        ScalaUtil.<String, Iterable<Long>>getTuple2ClassTag();
    JavaDStream<Tuple2<String, Iterable<Long>>> jpartitonOffset =
        new JavaDStream<Tuple2<String, Iterable<Long>>>(partitonOffset, tuple2ClassTag);
    jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Iterable<Long>>>>() {
        @Override
        public void call(JavaRDD<Tuple2<String, Iterable<Long>>> po) throws Exception {
            List<Tuple2<String, Iterable<Long>>> poList = po.collect();
            doPersists(poList, props);
        }
    });
}
Example #28
Source File: DummyStreamInput.java From envelope with Apache License 2.0 | 5 votes |
@Override
public JavaDStream<Long> getDStream() throws Exception {
    List<Long> list = Lists.newArrayList();
    for (int i = 0; i < rowsPerBatch; i++) {
        list.add(counter++);
    }
    JavaRDD<Long> longs = Contexts.getJavaStreamingContext().sparkContext().parallelize(list);
    Queue<JavaRDD<Long>> queue = Queues.newLinkedBlockingQueue();
    queue.add(longs);
    LOG.info("Created stream queue with {} rows", list.size());
    return Contexts.getJavaStreamingContext().queueStream(queue, true);
}
Example #29
Source File: SparkUtils.java From cxf with Apache License 2.0 | 5 votes |
public static JavaPairDStream<String, Integer> createOutputDStream(
    JavaDStream<String> receiverStream, boolean withId) {
    final JavaDStream<String> words =
        receiverStream.flatMap(x -> withId ? splitInputStringWithId(x) : splitInputString(x));

    final JavaPairDStream<String, Integer> pairs = words.mapToPair(s -> {
        return new Tuple2<String, Integer>(s, 1);
    });

    return pairs.reduceByKey((i1, i2) -> {
        return i1 + i2;
    });
}
Example #30
Source File: IoTTrafficDataProcessor.java From iot-traffic-monitor with Apache License 2.0 | 5 votes |
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues       variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(JavaDStream<IoTData> nonFilteredIotDataStream,
    Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues) {

    // Filter by routeId, vehicleType and in POI range
    JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
        .filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
            && iot.getVehicleType().contains(broadcastPOIValues.value()._3())
            && GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
                Double.valueOf(iot.getLongitude()),
                broadcastPOIValues.value()._1().getLatitude(),
                broadcastPOIValues.value()._1().getLongitude(),
                broadcastPOIValues.value()._1().getRadius())));

    // Pair with POI
    JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered
        .mapToPair(iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1()));

    // Transform to dstream of POITrafficData
    JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    // Map Cassandra table columns
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("vehicleId", "vehicleid");
    columnNameMappings.put("distance", "distance");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("timeStamp", "timestamp");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream)
        .writerBuilder("traffickeyspace", "poi_traffic",
            CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings))
        .withConstantTTL(120) // keeping data for 2 minutes
        .saveToCassandra();
}