org.apache.spark.streaming.api.java.JavaPairDStream Java Examples
The following examples show how to use
org.apache.spark.streaming.api.java.JavaPairDStream.
Each example is taken from an open-source project; the project, source file, and license are listed in the header above the code.
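Before the project examples, here is a minimal, self-contained sketch of the most common JavaPairDStream pattern: pair up records with mapToPair, aggregate with reduceByKey, and print each batch. The application name, host, port, and batch interval below are illustrative assumptions, not taken from any of the projects that follow.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.StorageLevels;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

public class JavaPairDStreamSketch {
    public static void main(String[] args) throws InterruptedException {
        // Local two-thread context with a 1-second batch interval (illustrative values).
        SparkConf conf = new SparkConf().setAppName("JavaPairDStreamSketch").setMaster("local[2]");
        JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(1));

        // Read lines from a socket; host and port are placeholders.
        JavaReceiverInputDStream<String> lines =
                ssc.socketTextStream("localhost", 9999, StorageLevels.MEMORY_AND_DISK_SER);

        // Split lines into words, pair each word with a count of 1, then sum per key.
        JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        JavaPairDStream<String, Integer> counts = words
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((a, b) -> a + b);

        counts.print();
        ssc.start();
        ssc.awaitTermination();
    }
}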
Example #1
Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0 | 8 votes |
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000)); //设置检查点 streamingContext.checkpoint("HDFS URL"); Map<String, Integer> topicThread = new HashMap<>(1); topicThread.put(TOPIC, THREAD); JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread); JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 -> Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator()); //统计 JavaPairDStream<String, Integer> result = words.mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1)).reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2); try { result.print(); streamingContext.start(); streamingContext.awaitTermination(); } catch (InterruptedException e) { e.printStackTrace(); } }
Example #2
Source File: StreamingEngine.java From spark-streaming-direct-kafka with Apache License 2.0 | 6 votes |
public void start() {
    SparkConf sparkConf = getSparkConf();
    streamingContext = new JavaStreamingContext(sparkConf,
            Durations.seconds(Long.parseLong(config.getStreamingBatchIntervalInSec())));

    JavaInputDStream<MessageAndMetadata<String, byte[]>> dStream = buildInputDStream(streamingContext);
    JavaPairDStream<String, byte[]> pairDStream = dStream.mapToPair(km -> new Tuple2<>(km.key(), km.message()));

    pairDStream.foreachRDD(new ProcessStreamingData<>(config)); // process data
    dStream.foreachRDD(new UpdateOffsetsFn<>(config.getKafkaGroupId(), config.getZkOffsetManager()));
    streamingContext.start();
}
Example #3
Source File: ReduceByKeyAndWindow.java From sparkResearch with Apache License 2.0 | 6 votes |
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10)); //检查点设置 streamingContext.checkpoint("hdfs://localhost:9300"); //数据源 JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080); JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp()); JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(new AddLongs(), new SubtractLongs(), Durations.seconds(30), Durations.seconds(10)); try { streamingContext.start(); streamingContext.awaitTermination(); } catch (InterruptedException e) { e.printStackTrace(); } }
Example #4
Source File: BlurLoadSparkProcessor.java From incubator-retired-blur with Apache License 2.0 | 6 votes |
public void run() throws IOException {
    SparkConf conf = new SparkConf();
    conf.setAppName(getAppName());
    conf.set(SPARK_SERIALIZER, ORG_APACHE_SPARK_SERIALIZER_KRYO_SERIALIZER);
    JavaSparkUtil.packProjectJars(conf);
    setupSparkConf(conf);

    JavaStreamingContext ssc = new JavaStreamingContext(conf, getDuration());
    List<JavaDStream<T>> streamsList = getStreamsList(ssc);

    // Union all the streams if there is more than 1 stream
    JavaDStream<T> streams = unionStreams(ssc, streamsList);

    JavaPairDStream<String, RowMutation> pairDStream = streams.mapToPair(new PairFunction<T, String, RowMutation>() {
        public Tuple2<String, RowMutation> call(T t) {
            RowMutation rowMutation = convert(t);
            return new Tuple2<String, RowMutation>(rowMutation.getRowId(), rowMutation);
        }
    });

    pairDStream.foreachRDD(getFunction());
    ssc.start();
    ssc.awaitTermination();
}
Example #5
Source File: IoTTrafficDataProcessor.java From iot-traffic-monitor with Apache License 2.0 | 6 votes |
/**
 * Method to get window traffic counts of different type of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // reduce by key and window (30 sec window and 10 sec slide).
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
            .reduceByKeyAndWindow((a, b) -> a + b, Durations.seconds(30), Durations.seconds(10));

    // Transform to dstream of TrafficData
    JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "window_traffic",
            CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)).saveToCassandra();
}
Example #6
Source File: StreamingService.java From cxf with Apache License 2.0 | 6 votes |
private void processStreamOneWay(List<String> inputStrings) { try { SparkConf sparkConf = new SparkConf().setMaster("local[*]") .setAppName("JAX-RS Spark Connect OneWay " + SparkUtils.getRandomId()); JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); JavaDStream<String> receiverStream = null; if ("queue".equals(receiverType)) { Queue<JavaRDD<String>> rddQueue = new LinkedList<>(); for (int i = 0; i < 30; i++) { rddQueue.add(jssc.sparkContext().parallelize(inputStrings)); } receiverStream = jssc.queueStream(rddQueue); } else { receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings)); } JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false); wordCounts.foreachRDD(new PrintOutputFunction(jssc)); jssc.start(); } catch (Exception ex) { // ignore } }
Example #7
Source File: SparkStreamingJob.java From zipkin-sparkstreaming with Apache License 2.0 | 6 votes |
static void streamSpansToStorage(
        JavaDStream<byte[]> stream,
        ReadSpans readSpans,
        AdjustAndConsumeSpansSharingTraceId adjustAndConsumeSpansSharingTraceId
) {
    JavaDStream<Span> spans = stream.flatMap(readSpans);

    // TODO: plug in some filter to drop spans regardless of trace ID
    // spans = spans.filter(spanFilter);

    JavaPairDStream<String, Iterable<Span>> tracesById = spans
            .mapToPair(s -> new Tuple2<>(Util.toLowerHex(s.traceIdHigh, s.traceId), s))
            .groupByKey();

    tracesById.foreachRDD(rdd -> {
        rdd.values().foreachPartition(adjustAndConsumeSpansSharingTraceId);
    });
}
Example #8
Source File: IoTTrafficDataProcessor.java From iot-traffic-monitor with Apache License 2.0 | 5 votes |
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(JavaDStream<IoTData> nonFilteredIotDataStream, Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues) {
    // Filter by routeId, vehicleType and in POI range
    JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
            .filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
                    && iot.getVehicleType().contains(broadcastPOIValues.value()._3())
                    && GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
                            Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
                            broadcastPOIValues.value()._1().getLongitude(),
                            broadcastPOIValues.value()._1().getRadius())));

    // pair with poi
    JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered
            .mapToPair(iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1()));

    // Transform to dstream of POITrafficData
    JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("vehicleId", "vehicleid");
    columnNameMappings.put("distance", "distance");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("timeStamp", "timestamp");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream)
            .writerBuilder("traffickeyspace", "poi_traffic", CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings))
            .withConstantTTL(120) // keeping data for 2 minutes
            .saveToCassandra();
}
Example #9
Source File: RealtimeTrafficDataProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Method to get window traffic counts of different type of vehicles for each route.
 * Window duration = 30 seconds and Slide interval = 10 seconds
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processWindowTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // reduce by key and window (30 sec window and 10 sec slide).
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(
                    new AggregateKey(iot.getRouteId(), iot.getVehicleType()),
                    1L
            ))
            .reduceByKeyAndWindow((a, b) -> a + b,
                    Durations.seconds(30),
                    Durations.seconds(10)
            );

    // Transform to dstream of TrafficData
    JavaDStream<WindowTrafficData> trafficDStream = countDStreamPair.map(windowTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
            "traffickeyspace",
            "window_traffic",
            CassandraJavaUtil.mapToRow(WindowTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
Example #10
Source File: RealTimeHeatMapProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
public void processHeatMap(JavaDStream<IoTData> filteredIotDataStream) throws IOException { JavaDStream<Measurement> measurements = iotDataToMeasurements(filteredIotDataStream); JavaDStream<Measurement> measurementsWithRoundedCoordinates = roundCoordinates(measurements); JavaPairDStream<Coordinate, Integer> counts = countPerGridBox(measurementsWithRoundedCoordinates); JavaDStream<HeatMapData> heatMapStream = getHeatMap(counts); save(heatMapStream); }
Example #11
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-streaming-test-scala-dyn-id-write", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-streaming-test-scala-dyn-id-write", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());
    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);

    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/3"));
    assertTrue(RestUtils.exists(docEndpoint + "/4"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
Example #12
Source File: RealTimeHeatMapProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Reduces the dataset by counting the number of measurements for a specific grid box (rounded coordinate)
 *
 * @param measurements | The dataset of measurements
 * @return A set of tuples linking rounded coordinates to their number of occurrences
 */
private JavaPairDStream<Coordinate, Integer> countPerGridBox(JavaDStream<Measurement> measurements) {
    // reduce by key and window (30 sec window and 10 sec slide).
    return measurements.mapToPair(
            measurement -> new Tuple2<>(
                    measurement.getRoundedCoordinate(),
                    1
            )
    ).reduceByKeyAndWindow((a, b) -> a + b,
            Durations.seconds(30),
            Durations.seconds(10)
    );
}
Example #13
Source File: RealtimeTrafficDataProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Method to get total traffic counts of different type of vehicles for each route.
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processTotalTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // We need to get count of vehicle group by routeId and vehicleType
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
            .reduceByKey((a, b) -> a + b);

    // Need to keep state for total count
    StateSpec<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> stateSpec =
            StateSpec.function(totalSumFunc).timeout(Durations.seconds(3600));

    JavaMapWithStateDStream<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> countDStreamWithStatePair =
            countDStreamPair.mapWithState(stateSpec); // maintain state for one hour

    // Transform to dstream of TrafficData
    JavaDStream<Tuple2<AggregateKey, Long>> countDStream = countDStreamWithStatePair.map(tuple2 -> tuple2);
    JavaDStream<TotalTrafficData> trafficDStream = countDStream.map(totalTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder(
            "traffickeyspace",
            "total_traffic",
            CassandraJavaUtil.mapToRow(TotalTrafficData.class, columnNameMappings)
    ).saveToCassandra();
}
Example #14
Source File: RealtimeTrafficDataProcessor.java From lambda-arch with Apache License 2.0 | 5 votes |
/**
 * Method to get the vehicles which are in radius of POI and their distance from POI.
 *
 * @param nonFilteredIotDataStream original IoT data stream
 * @param broadcastPOIValues variable containing POI coordinates, route and vehicle types to monitor.
 */
public void processPOIData(
        JavaDStream<IoTData> nonFilteredIotDataStream,
        Broadcast<Tuple3<POIData, String, String>> broadcastPOIValues
) {
    // Filter by routeId, vehicleType and in POI range
    JavaDStream<IoTData> iotDataStreamFiltered = nonFilteredIotDataStream
            .filter(iot -> (iot.getRouteId().equals(broadcastPOIValues.value()._2())
                    && iot.getVehicleType().contains(broadcastPOIValues.value()._3())
                    && GeoDistanceCalculator.isInPOIRadius(Double.valueOf(iot.getLatitude()),
                            Double.valueOf(iot.getLongitude()), broadcastPOIValues.value()._1().getLatitude(),
                            broadcastPOIValues.value()._1().getLongitude(),
                            broadcastPOIValues.value()._1().getRadius())));

    // pair with poi
    JavaPairDStream<IoTData, POIData> poiDStreamPair = iotDataStreamFiltered.mapToPair(
            iot -> new Tuple2<>(iot, broadcastPOIValues.value()._1())
    );

    // Transform to dstream of POITrafficData
    JavaDStream<POITrafficData> trafficDStream = poiDStreamPair.map(poiTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("vehicleId", "vehicleid");
    columnNameMappings.put("distance", "distance");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("timeStamp", "timestamp");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream)
            .writerBuilder(
                    "traffickeyspace",
                    "poi_traffic",
                    CassandraJavaUtil.mapToRow(POITrafficData.class, columnNameMappings)
            )
            .withConstantTTL(120) // keeping data for 2 minutes
            .saveToCassandra();
}
Example #15
Source File: FileStreamingEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) {
    // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    //Logger rootLogger = LogManager.getRootLogger();
    //rootLogger.setLevel(Level.WARN);

    SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
    String inputDirectory = "E:\\hadoop\\streamFolder\\";

    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(1));
    // streamingContext.checkpoint("E:\\hadoop\\checkpoint");

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    JavaDStream<String> streamfile = streamingContext.textFileStream(inputDirectory);
    streamfile.print();
    streamfile.foreachRDD(rdd -> rdd.foreach(x -> System.out.println(x)));

    JavaPairDStream<LongWritable, Text> streamedFile = streamingContext.fileStream(inputDirectory,
            LongWritable.class, Text.class, TextInputFormat.class);
    streamedFile.print();

    streamingContext.start();

    try {
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #16
Source File: WordCountTransformOpEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); Logger rootLogger = LogManager.getRootLogger(); rootLogger.setLevel(Level.WARN); List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); wordCounts.print(); JavaPairDStream<String, Integer> joinedDstream = wordCounts .transformToPair(new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() { @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception { JavaPairRDD<String, Integer> modRDD = rdd.join(initialRDD).mapToPair( new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() { @Override public Tuple2<String, Integer> call( Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception { return new Tuple2<>(joinedTuple._1(),(joinedTuple._2()._1() + joinedTuple._2()._2())); } }); return modRDD; } }); joinedDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #17
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation") public static void persists(JavaPairDStream<String, Iterable<Long>> partitonOffset, Properties props) { partitonOffset.foreachRDD(new VoidFunction<JavaPairRDD<String,Iterable<Long>>>() { @Override public void call(JavaPairRDD<String, Iterable<Long>> po) throws Exception { List<Tuple2<String, Iterable<Long>>> poList = po.collect(); doPersists(poList, props); } }); }
Example #18
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
public static <T> DStream<Tuple2<String, Iterable<Long>>> getPartitionOffset( DStream<MessageAndMetadata<T>> unionStreams, Properties props) { ClassTag<MessageAndMetadata<T>> messageMetaClassTag = ScalaUtil.<T>getMessageAndMetadataClassTag(); JavaDStream<MessageAndMetadata<T>> javaDStream = new JavaDStream<MessageAndMetadata<T>>(unionStreams, messageMetaClassTag); JavaPairDStream<String, Iterable<Long>> partitonOffset = getPartitionOffset(javaDStream, props); return partitonOffset.dstream(); }
Example #19
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
/**
 * Method to perform the query given an input JavaDStream of JSON
 */
public void performQuery(JavaDStream<MapWritable> input) {
    logger.info("Performing query: ");

    // Process non-overlapping windows of data of duration windowLength seconds
    // If we are using queue streams, there is no need to window
    if (!useQueueStream) {
        input = input.window(Durations.seconds(windowLength), Durations.seconds(windowLength));
    }

    // Extract the selectors for each dataElement based upon the query type
    // and perform a keyed hash of the selectors
    JavaPairDStream<Integer, List<BigInteger>> selectorHashToDocRDD = input.mapToPair(new HashSelectorsAndPartitionData(bVars));

    // Group by hashed selector (row) -- can combine with the line above, separating for testing and benchmarking...
    JavaPairDStream<Integer, Iterable<List<BigInteger>>> selectorGroupRDD = selectorHashToDocRDD.groupByKey();

    // Calculate the encrypted row values for each row, emit <colNum, colVal> for each row
    JavaPairDStream<Long, BigInteger> encRowRDD = selectorGroupRDD.flatMapToPair(new EncRowCalc(accum, bVars));

    // Multiply the column values by colNum: emit <colNum, finalColVal> and write the final result object
    encryptedColumnCalc(encRowRDD);

    // Start the streaming computation
    start();
}
Example #20
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test
public void testEsRDDWriteWithDynamicMapping() throws Exception {
    Map<String, Object> doc1 = new HashMap<>();
    doc1.put("number", 3);
    doc1.put("one", null);
    Set<String> values = new HashSet<>();
    values.add("2");
    doc1.put("two", values);
    doc1.put("three", ".");

    Map<String, Object> doc2 = new HashMap<>();
    doc2.put("number", 4);
    doc2.put("OTP", "Otopeni");
    doc2.put("SFO", "San Fran");

    List<Map<String, Object>> docs = new ArrayList<>();
    docs.add(doc1);
    docs.add(doc2);

    String target = wrapIndex(resource("spark-test-scala-dyn-id-write", "data", version));
    String docEndpoint = wrapIndex(docEndpoint("spark-test-scala-dyn-id-write", "data", version));

    JavaRDD<Map<String, Object>> batch = sc.parallelize(docs);
    Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>();
    rddQueue.add(batch);
    JavaDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue);

    JavaPairDStream<Integer, Map<String, Object>> metaDstream = dstream.mapToPair(new ExtractIDFunction());
    JavaEsSparkStreaming.saveToEsWithMeta(metaDstream, target, cfg);

    ssc.start();
    TimeUnit.SECONDS.sleep(2);
    ssc.stop(false, true);

    assertEquals(2, JavaEsSpark.esRDD(sc, target).count());
    assertTrue(RestUtils.exists(docEndpoint + "/3"));
    assertTrue(RestUtils.exists(docEndpoint + "/4"));
    assertThat(RestUtils.get(target + "/_search?"), containsString("SFO"));
}
Example #21
Source File: WordCountSocketStateful.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); streamingContext.checkpoint("E:\\hadoop\\checkpoint"); // Initial state RDD input to mapWithState @SuppressWarnings("unchecked") List<Tuple2<String, Integer>> tuples =Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); // Update the cumulative count function Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc = new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() { @Override public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) { int sum = one.orElse(0) + (state.exists() ? state.get() : 0); Tuple2<String, Integer> output = new Tuple2<>(word, sum); state.update(sum); return output; } }; // DStream made of get cumulative counts that get updated in every batch JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts.mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD)); stateDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #22
Source File: JavaDirectTalosStreamSuite.java From galaxy-sdk-java with Apache License 2.0 | 5 votes |
@Test
public void testDirectStream() throws InterruptedException {
    HashSet<String> sentMessages = new HashSet<String>();
    for (int i = 1; i <= 100; i++)
        sentMessages.add(i + "");
    talosTestUtils.sendMessagesAndWaitForReceive(topic, sentMessages);

    JavaPairDStream<String, String> stream = TalosUtils.createDirectStream(
            ssc, talosTestUtils.javaTalosParams(), talosTestUtils.credential(),
            new HashSet<String>() {{ add(topic); }}
    );

    final Set<String> result = Collections.synchronizedSet(new HashSet<String>());
    stream.foreachRDD(
            new Function<JavaPairRDD<String, String>, Void>() {
                public Void call(JavaPairRDD<String, String> rdd) throws Exception {
                    Iterator<Tuple2<String, String>> iterator = rdd.collect().iterator();
                    while (iterator.hasNext()) {
                        result.add(iterator.next()._2);
                    }
                    return null;
                }
            }
    );

    ssc.start();
    long startTime = System.currentTimeMillis();
    boolean matches = false;
    while (!matches && System.currentTimeMillis() - startTime < 20000) {
        matches = sentMessages.size() == result.size();
        Thread.sleep(50);
    }
    Assert.assertEquals(sentMessages, result);
    ssc.stop();
}
Example #23
Source File: SparkUtils.java From cxf with Apache License 2.0 | 5 votes |
public static JavaPairDStream<String, Integer> createOutputDStream( JavaDStream<String> receiverStream, boolean withId) { final JavaDStream<String> words = receiverStream.flatMap(x -> withId ? splitInputStringWithId(x) : splitInputString(x)); final JavaPairDStream<String, Integer> pairs = words.mapToPair(s -> { return new Tuple2<String, Integer>(s, 1); }); return pairs.reduceByKey((i1, i2) -> { return i1 + i2; }); }
Example #24
Source File: WordCountSocketJava8Ex.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) throws Exception { System.setProperty("hadoop.home.dir", "E:\\hadoop"); SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]"); JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1)); List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10)); JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples); JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream( "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER); JavaDStream<String> words = StreamingLines.flatMap( str -> Arrays.asList(str.split(" ")).iterator() ); JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str-> new Tuple2<>(str, 1)).reduceByKey((count1,count2) ->count1+count2 ); wordCounts.print(); JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair( new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() { @Override public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception { rdd.join(initialRDD).mapToPair(new PairFunction<Tuple2<String,Tuple2<Integer,Integer>>, String, Integer>() { @Override public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception { // TODO Auto-generated method stub return new Tuple2<>( joinedTuple._1(), (joinedTuple._2()._1()+joinedTuple._2()._2()) ); } }); return rdd; } }); joinedDstream.print(); streamingContext.start(); streamingContext.awaitTermination(); }
Example #25
Source File: WordCountRecoverableEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
    SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
    streamingContext.checkpoint(checkpointDirectory);

    // Initial state RDD input to mapWithState
    @SuppressWarnings("unchecked")
    List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
    JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

    JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip, port,
            StorageLevels.MEMORY_AND_DISK_SER);

    JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
            .reduceByKey((count1, count2) -> count1 + count2);

    // Update the cumulative count function
    Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
            new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
                @Override
                public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
                    int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
                    Tuple2<String, Integer> output = new Tuple2<>(word, sum);
                    state.update(sum);
                    return output;
                }
            };

    // DStream made of get cumulative counts that get updated in every batch
    JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
            .mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

    stateDstream.print();
    return streamingContext;
}
Example #26
Source File: JavaCustomReceiver.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaCustomReceiver <hostname> <port>"); System.exit(1); } StreamingExamples.setStreamingLogLevels(); // Create the context with a 1 second batch size SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver"); JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000)); // Create an input stream with the custom receiver on target ip:port and count the // words in input stream of \n delimited text (eg. generated by 'nc') JavaReceiverInputDStream<String> lines = ssc.receiverStream( new JavaCustomReceiver(args[0], Integer.parseInt(args[1]))); JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() { @Override public Iterator<String> call(String x) { return Arrays.asList(SPACE.split(x)).iterator(); } }); JavaPairDStream<String, Integer> wordCounts = words.mapToPair( new PairFunction<String, String, Integer>() { @Override public Tuple2<String, Integer> call(String s) { return new Tuple2<>(s, 1); } }).reduceByKey(new Function2<Integer, Integer, Integer>() { @Override public Integer call(Integer i1, Integer i2) { return i1 + i2; } }); wordCounts.print(); ssc.start(); ssc.awaitTermination(); }
Example #27
Source File: StreamingService.java From cxf with Apache License 2.0 | 5 votes |
private void processStream(AsyncResponse async, List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
            .setAppName("JAX-RS Spark Connect " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        SparkStreamingOutput streamOut = new SparkStreamingOutput(jssc);
        SparkStreamingListener sparkListener = new SparkStreamingListener(streamOut);
        jssc.addStreamingListener(sparkListener);

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new OutputFunction(streamOut));
        jssc.start();

        executor.execute(new SparkJob(async, sparkListener));
    } catch (Exception ex) {
        // the compiler does not allow to catch SparkException directly
        if (ex instanceof SparkException) {
            async.cancel(60);
        } else {
            async.resume(new WebApplicationException(ex));
        }
    }
}
Example #28
Source File: IoTTrafficDataProcessor.java From iot-traffic-monitor with Apache License 2.0 | 5 votes |
/**
 * Method to get total traffic counts of different type of vehicles for each route.
 *
 * @param filteredIotDataStream IoT data stream
 */
public void processTotalTrafficData(JavaDStream<IoTData> filteredIotDataStream) {
    // We need to get count of vehicle group by routeId and vehicleType
    JavaPairDStream<AggregateKey, Long> countDStreamPair = filteredIotDataStream
            .mapToPair(iot -> new Tuple2<>(new AggregateKey(iot.getRouteId(), iot.getVehicleType()), 1L))
            .reduceByKey((a, b) -> a + b);

    // Need to keep state for total count
    JavaMapWithStateDStream<AggregateKey, Long, Long, Tuple2<AggregateKey, Long>> countDStreamWithStatePair = countDStreamPair
            .mapWithState(StateSpec.function(totalSumFunc).timeout(Durations.seconds(3600))); // maintain state for one hour

    // Transform to dstream of TrafficData
    JavaDStream<Tuple2<AggregateKey, Long>> countDStream = countDStreamWithStatePair.map(tuple2 -> tuple2);
    JavaDStream<TotalTrafficData> trafficDStream = countDStream.map(totalTrafficDataFunc);

    // Map Cassandra table column
    Map<String, String> columnNameMappings = new HashMap<String, String>();
    columnNameMappings.put("routeId", "routeid");
    columnNameMappings.put("vehicleType", "vehicletype");
    columnNameMappings.put("totalCount", "totalcount");
    columnNameMappings.put("timeStamp", "timestamp");
    columnNameMappings.put("recordDate", "recorddate");

    // call CassandraStreamingJavaUtil function to save in DB
    javaFunctions(trafficDStream).writerBuilder("traffickeyspace", "total_traffic",
            CassandraJavaUtil.mapToRow(TotalTrafficData.class, columnNameMappings)).saveToCassandra();
}
Example #29
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0 | 5 votes |
private void configureDataContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();

    configurationContext.getDataTopics().forEach(dataTopic -> baseTopicMap.put(dataTopic, 1));

    kafkaTopicService.createTopicsIfNotExist(configurationContext.getDataTopics(),
            configurationContext.getKafkaReplicationFactor(), configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());
    /*
     groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and one only, consumer of
     the group.
     Decision topics have only one partition (by default), so if we have two or more decision instances (consumers)
     reading the same topic with the same groupId, only one instance will be able to read from the topic.
     */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams, baseTopicMap,
            StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
    JavaDStream<StratioStreamingMessage> insertRequests = messages
            .filter(new FilterAvroMessagesByOperationFunction(STREAM_OPERATIONS.MANIPULATION.INSERT))
            .map(avroDeserializeMessageFunction);

    InsertIntoStreamFunction insertIntoStreamFunction = new InsertIntoStreamFunction(streamOperationService,
            configurationContext.getZookeeperHostsQuorum());
    insertRequests.foreachRDD(insertIntoStreamFunction);
}
Example #30
Source File: ComputeStreamingResponse.java From incubator-retired-pirk with Apache License 2.0 | 5 votes |
private void encryptedColumnCalc(JavaPairDStream<Long,BigInteger> encRowRDD) { // Multiply the column values by colNum: emit <colNum, finalColVal> JavaPairDStream<Long,BigInteger> encColRDD; if (colMultReduceByKey) { encColRDD = encRowRDD.reduceByKey(new EncColMultReducer(bVars), numColMultPartitions); } else { encColRDD = encRowRDD.groupByKey(numColMultPartitions).mapToPair(new EncColMultGroupedMapper(bVars)); } // Update the output name, by batch number bVars.setOutput(outputFile + "_" + accum.numBatchesGetValue()); // Form and write the response object encColRDD.repartition(1).foreachRDD((VoidFunction<JavaPairRDD<Long,BigInteger>>) rdd -> { rdd.foreachPartition(new FinalResponseFunction(accum, bVars)); int maxBatchesVar = bVars.getMaxBatches(); if (maxBatchesVar != -1 && accum.numBatchesGetValue() == maxBatchesVar) { logger.info("num batches = maxBatches = " + maxBatchesVar + "; shutting down"); System.exit(0); } }); }