org.apache.spark.streaming.api.java.JavaStreamingContext Java Examples
The following examples show how to use
org.apache.spark.streaming.api.java.JavaStreamingContext.
The examples are drawn from open source projects; the originating project and license are noted above each example.
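Before the project-specific examples, here is a minimal, self-contained sketch of the typical JavaStreamingContext lifecycle: configure, create the context with a batch interval, declare an input DStream, register at least one output operation, then start and await termination. The class name, the socket host localhost, and port 9999 are placeholder assumptions for illustration, not taken from any example below.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;

public class MinimalStreamingSketch {
    public static void main(String[] args) throws InterruptedException {
        // 1. Configure Spark and create the streaming context with a batch interval.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("MinimalStreamingSketch");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // 2. Define an input DStream (here: a socket source on a placeholder host/port).
        JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);

        // 3. Declare transformations and at least one output operation.
        JavaPairDStream<String, Integer> counts = lines
                .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey(Integer::sum);
        counts.print();

        // 4. Start the computation and block until it terminates.
        jssc.start();
        jssc.awaitTermination();
    }
}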
Example #1
Source File: KafkaStreaming.java From sparkResearch with Apache License 2.0

public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10000));
    // set the checkpoint directory
    streamingContext.checkpoint("HDFS URL");
    Map<String, Integer> topicThread = new HashMap<>(1);
    topicThread.put(TOPIC, THREAD);
    JavaPairInputDStream<String, String> dStream = KafkaUtils.createStream(streamingContext, HOST, GROP, topicThread);

    JavaDStream<String> words = dStream.flatMap((FlatMapFunction<Tuple2<String, String>, String>) stringStringTuple2 ->
            Arrays.asList(SPACE.split(stringStringTuple2._2)).iterator());

    // count the words
    JavaPairDStream<String, Integer> result = words
            .mapToPair((PairFunction<String, String, Integer>) s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);

    try {
        result.print();
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #2
Source File: Window.java From sparkResearch with Apache License 2.0

public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("window").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    // set the checkpoint directory
    streamingContext.checkpoint("hdfs://localhost:9300");
    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaDStream<String> winDstream = dStream.window(Durations.seconds(30), Durations.seconds(20));

    JavaDStream<Long> result = winDstream.count();

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #3
Source File: StreamingRsvpsDStreamCountWindow.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

public static void main(String[] args) throws InterruptedException {
    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME)
            .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI)
            .set("spark.streaming.kafka.consumer.cache.enabled", "false");

    final JavaStreamingContext streamingContext =
            new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS));

    streamingContext.checkpoint(CHECKPOINT_FOLDER);

    final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
            KafkaUtils.createDirectStream(
                    streamingContext,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
            );

    // transformations, streaming algorithms, etc
    JavaDStream<Long> countStream = meetupStream.countByWindow(
            new Duration(WINDOW_LENGTH_MS), new Duration(SLIDING_INTERVAL_MS));

    countStream.foreachRDD((JavaRDD<Long> countRDD) -> {
        MongoSpark.save(
                countRDD.map(
                        r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}")
                )
        );
    });

    // some time later, after outputs have completed
    meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
        OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

        ((CanCommitOffsets) meetupStream.inputDStream())
                .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}
Example #4
Source File: SparkRunnerStreamingContextFactory.java From beam with Apache License 2.0

private void checkpoint(JavaStreamingContext jssc, CheckpointDir checkpointDir) {
    Path rootCheckpointPath = checkpointDir.getRootCheckpointDir();
    Path sparkCheckpointPath = checkpointDir.getSparkCheckpointDir();
    Path beamCheckpointPath = checkpointDir.getBeamCheckpointDir();

    try {
        FileSystem fileSystem =
                rootCheckpointPath.getFileSystem(jssc.sparkContext().hadoopConfiguration());
        if (!fileSystem.exists(rootCheckpointPath)) {
            fileSystem.mkdirs(rootCheckpointPath);
        }
        if (!fileSystem.exists(sparkCheckpointPath)) {
            fileSystem.mkdirs(sparkCheckpointPath);
        }
        if (!fileSystem.exists(beamCheckpointPath)) {
            fileSystem.mkdirs(beamCheckpointPath);
        }
    } catch (IOException e) {
        throw new RuntimeException("Failed to create checkpoint dir", e);
    }

    jssc.checkpoint(sparkCheckpointPath.toString());
}
Example #5
Source File: SparkStreamingBinding.java From datacollector with Apache License 2.0

@Override
@SuppressWarnings("unchecked")
public JavaStreamingContext create() {
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", String.valueOf(maxRatePerPartition));
    // Use our classpath first, since we ship a newer version of Jackson and possibly other deps in the future.
    sparkConf.set("spark.driver.userClassPathFirst", "true");
    sparkConf.set("spark.executor.userClassPathFirst", "true");

    session = SparkSession.builder().config(sparkConf).getOrCreate();
    JavaStreamingContext result =
            new JavaStreamingContext(new JavaSparkContext(session.sparkContext()), new Duration(duration));

    Map<String, Object> props = new HashMap<>();
    props.put("group.id", groupId);
    props.put("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    for (Map.Entry<String, Object> map : props.entrySet()) {
        logMessage(Utils.format("Adding extra kafka config, {}:{}", map.getKey(), map.getValue()), isRunningInMesos);
    }

    logMessage("Meta data broker list " + metaDataBrokerList, isRunningInMesos);
    logMessage("Topic is " + topic, isRunningInMesos);
    logMessage("Auto offset reset is set to " + autoOffsetValue, isRunningInMesos);
    return createDStream(result, props);
}
Example #6
Source File: ReduceByKeyAndWindow.java From sparkResearch with Apache License 2.0

public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("reduceByKeyAndWindow").setMaster("local[2]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    // set the checkpoint directory
    streamingContext.checkpoint("hdfs://localhost:9300");
    // data source
    JavaDStream<String> dStream = streamingContext.socketTextStream("localhost", 8080);

    JavaPairDStream<String, Long> ipPairDstream = dStream.mapToPair(new GetIp());

    JavaPairDStream<String, Long> result = ipPairDstream.reduceByKeyAndWindow(
            new AddLongs(), new SubtractLongs(), Durations.seconds(30), Durations.seconds(10));

    try {
        streamingContext.start();
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #7
Source File: SparkStreamDemo.java From sparkResearch with Apache License 2.0

public static void main(String[] args) {
    // two local worker threads, with a batch interval of 1 second
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamIng");
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(conf, Durations.seconds(1));

    // create a DStream connected to localhost:8080
    JavaReceiverInputDStream<String> dStream = javaStreamingContext.socketTextStream("localhost", 8080);

    JavaDStream<String> errorLine = dStream.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String v1) throws Exception {
            return v1.contains("error");
        }
    });

    // print the lines that contain "error"
    errorLine.print();

    try {
        // start the computation
        javaStreamingContext.start();
        // wait for the computation to terminate
        javaStreamingContext.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Example #8
Source File: StreamingIngestionFileSystemTextFileToDataframeMultipleClassesApp.java From net.jgp.labs.spark with Apache License 2.0

private void start() {
    // Create a local StreamingContext with two working threads and a batch interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
            "Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();
    // Create JavaRDD<Row>
    msgDataStream.foreachRDD(new RowProcessor());

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Example #9
Source File: StreamingIngestionFileSystemTextFileApp.java From net.jgp.labs.spark with Apache License 2.0

private void start() {
    // Create a local StreamingContext with two working threads and a batch interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Example #10
Source File: WordCountRecoverableEx.java From Apache-Spark-2x-for-Java-Developers with MIT License

public static void main(String[] args) throws Exception {
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    final String ip = "10.0.75.1";
    final int port = Integer.parseInt("9000");
    final String checkpointDirectory = "E:\\hadoop\\checkpoint";

    // Function to create JavaStreamingContext without any output operations
    // (used to detect the new context)
    Function0<JavaStreamingContext> createContextFunc = new Function0<JavaStreamingContext>() {
        @Override
        public JavaStreamingContext call() {
            return createContext(ip, port, checkpointDirectory);
        }
    };

    JavaStreamingContext ssc = JavaStreamingContext.getOrCreate(checkpointDirectory, createContextFunc);
    ssc.start();
    ssc.awaitTermination();
}
Example #11
Source File: StreamingService.java From cxf with Apache License 2.0

private void processStreamOneWay(List<String> inputStrings) {
    try {
        SparkConf sparkConf = new SparkConf().setMaster("local[*]")
                .setAppName("JAX-RS Spark Connect OneWay " + SparkUtils.getRandomId());
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        JavaDStream<String> receiverStream = null;
        if ("queue".equals(receiverType)) {
            Queue<JavaRDD<String>> rddQueue = new LinkedList<>();
            for (int i = 0; i < 30; i++) {
                rddQueue.add(jssc.sparkContext().parallelize(inputStrings));
            }
            receiverStream = jssc.queueStream(rddQueue);
        } else {
            receiverStream = jssc.receiverStream(new StringListReceiver(inputStrings));
        }

        JavaPairDStream<String, Integer> wordCounts = SparkUtils.createOutputDStream(receiverStream, false);
        wordCounts.foreachRDD(new PrintOutputFunction(jssc));

        jssc.start();
    } catch (Exception ex) {
        // ignore
    }
}
Example #12
Source File: SparkStreamServiceImpl.java From searchanalytics-bigdata with MIT License

@Override
public void setup() {
    // Create a StreamingContext with a SparkConf configuration
    SparkConf sparkConf = new SparkConf(false)
            .setAppName("JaiSpark")
            .setSparkHome("target/sparkhome")
            .setMaster("local")
            .set("spark.executor.memory", "128m")
            .set("spark.local.dir", new File("target/sparkhome/tmp").getAbsolutePath())
            .set("spark.cores.max", "2")
            .set("spark.akka.threads", "2")
            .set("spark.akka.timeout", "60")
            .set("spark.logConf", "true")
            .set("spark.cleaner.delay", "3700")
            .set("spark.cleaner.ttl", "86400")
            .set("spark.shuffle.spill", "false")
            .set("spark.driver.host", "localhost")
            .set("spark.driver.port", "43214");
    jssc = new JavaStreamingContext(sparkConf, new Duration(5000));

    String checkpointDir = hadoopClusterService.getHDFSUri() + "/sparkcheckpoint";
    jssc.checkpoint(checkpointDir);

    startFlumeStream();
}
Example #13
Source File: BatchUpdateFunction.java From oryx with Apache License 2.0

BatchUpdateFunction(Config config,
                    Class<K> keyClass,
                    Class<M> messageClass,
                    Class<? extends Writable> keyWritableClass,
                    Class<? extends Writable> messageWritableClass,
                    String dataDirString,
                    String modelDirString,
                    BatchLayerUpdate<K,M,U> updateInstance,
                    JavaStreamingContext streamingContext) {
    this.keyClass = keyClass;
    this.messageClass = messageClass;
    this.keyWritableClass = keyWritableClass;
    this.messageWritableClass = messageWritableClass;
    this.dataDirString = dataDirString;
    this.modelDirString = modelDirString;
    this.updateBroker = ConfigUtils.getOptionalString(config, "oryx.update-topic.broker");
    this.updateTopic = ConfigUtils.getOptionalString(config, "oryx.update-topic.message.topic");
    this.updateInstance = updateInstance;
    this.sparkContext = streamingContext.sparkContext();
}
Example #14
Source File: SparkStreaming.java From kafka-spark-avro-example with Apache License 2.0

private static void processStream(JavaStreamingContext ssc, JavaSparkContext sc) {
    System.out.println("--> Processing stream");

    Map<String, String> props = new HashMap<>();
    props.put("bootstrap.servers", "localhost:9092");
    props.put("schema.registry.url", "http://localhost:8081");
    props.put("group.id", "spark");
    props.put("specific.avro.reader", "true");
    props.put("value.deserializer", "io.confluent.kafka.serializers.KafkaAvroDeserializer");
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

    Set<String> topicsSet = new HashSet<>(Collections.singletonList("test"));

    JavaPairInputDStream<String, Object> stream = KafkaUtils.createDirectStream(ssc, String.class, Object.class,
            StringDecoder.class, KafkaAvroDecoder.class, props, topicsSet);

    stream.foreachRDD(rdd -> {
        rdd.foreachPartition(iterator -> {
                    while (iterator.hasNext()) {
                        Tuple2<String, Object> next = iterator.next();
                        Model model = (Model) next._2();
                        System.out.println(next._1() + " --> " + model);
                    }
                }
        );
    });
}
Example #15
Source File: StreamingEngine.java From spark-streaming-direct-kafka with Apache License 2.0

public void start() {
    SparkConf sparkConf = getSparkConf();
    streamingContext = new JavaStreamingContext(sparkConf,
            Durations.seconds(Long.parseLong(config.getStreamingBatchIntervalInSec())));
    JavaInputDStream<MessageAndMetadata<String, byte[]>> dStream = buildInputDStream(streamingContext);
    JavaPairDStream<String, byte[]> pairDStream = dStream.mapToPair(km -> new Tuple2<>(km.key(), km.message()));

    pairDStream.foreachRDD(new ProcessStreamingData<>(config)); // process data
    dStream.foreachRDD(new UpdateOffsetsFn<>(config.getKafkaGroupId(), config.getZkOffsetManager()));
    streamingContext.start();
}
Example #16
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0

private void configureDataContext(JavaStreamingContext context) {
    Map<String, Integer> baseTopicMap = new HashMap<>();

    configurationContext.getDataTopics().forEach(dataTopic -> baseTopicMap.put(dataTopic, 1));

    kafkaTopicService.createTopicsIfNotExist(configurationContext.getDataTopics(), configurationContext
            .getKafkaReplicationFactor(), configurationContext.getKafkaPartitions());

    HashMap<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
    kafkaParams.put("group.id", configurationContext.getGroupId());
    /*
     groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and only one,
     consumer of the group. Decision topics have only one partition (by default), so if we have two or more
     Decision instances (consumers) reading the same topic with the same groupId, only one instance will be
     able to read from the topic.
     */
    JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class, byte[].class,
            kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class, kafkaParams,
            baseTopicMap, StorageLevel.MEMORY_AND_DISK_SER());

    AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();

    JavaDStream<StratioStreamingMessage> insertRequests = messages.filter(
            new FilterAvroMessagesByOperationFunction(STREAM_OPERATIONS.MANIPULATION.INSERT))
            .map(avroDeserializeMessageFunction);

    InsertIntoStreamFunction insertIntoStreamFunction = new InsertIntoStreamFunction(streamOperationService,
            configurationContext.getZookeeperHostsQuorum());
    insertRequests.foreachRDD(insertIntoStreamFunction);
}
Example #17
Source File: AbstractSparkLayer.java From oryx with Apache License 2.0

protected final JavaInputDStream<ConsumerRecord<K,M>> buildInputDStream(
        JavaStreamingContext streamingContext) {

    Preconditions.checkArgument(
            KafkaUtils.topicExists(inputTopicLockMaster, inputTopic),
            "Topic %s does not exist; did you create it?", inputTopic);
    if (updateTopic != null && updateTopicLockMaster != null) {
        Preconditions.checkArgument(
                KafkaUtils.topicExists(updateTopicLockMaster, updateTopic),
                "Topic %s does not exist; did you create it?", updateTopic);
    }

    String groupID = getGroupID();
    Map<String,Object> kafkaParams = new HashMap<>();
    kafkaParams.put("group.id", groupID);
    // Don't re-consume old messages from input by default
    kafkaParams.put("auto.offset.reset", "latest"); // Ignored by Kafka 0.10 Spark integration
    kafkaParams.put("bootstrap.servers", inputBroker);
    kafkaParams.put("key.deserializer", keyDecoderClass.getName());
    kafkaParams.put("value.deserializer", messageDecoderClass.getName());

    LocationStrategy locationStrategy = LocationStrategies.PreferConsistent();
    ConsumerStrategy<K,M> consumerStrategy = ConsumerStrategies.Subscribe(
            Collections.singleton(inputTopic), kafkaParams, Collections.emptyMap());
    return org.apache.spark.streaming.kafka010.KafkaUtils.createDirectStream(
            streamingContext,
            locationStrategy,
            consumerStrategy);
}
Example #18
Source File: StreamingContextConfigurationTests.java From Decision with Apache License 2.0

@Test
public void testActionBaseFunctionCall() throws Exception {
    // sc.sparkContext().emptyRDD().rdd().first();
    // ssc.sparkContext().emptyRDD().rdd().first();
    assertEquals(sc instanceof JavaStreamingContext, false);
    assertEquals(ssc.sparkContext().appName(), "magic");
}
Example #19
Source File: ReceiverLauncher.java From kafka-spark-consumer with Apache License 2.0

private static <E> void assignReceiversToPartitions(int numberOfReceivers,
        int numberOfPartition, List<JavaDStream<MessageAndMetadata<E>>> streamsList,
        KafkaConfig config, StorageLevel storageLevel, KafkaMessageHandler<E> messageHandler,
        JavaStreamingContext jsc) {

    // Create as many Receivers as Partitions
    if (numberOfReceivers >= numberOfPartition) {
        for (int i = 0; i < numberOfPartition; i++) {
            streamsList.add(jsc.receiverStream(new KafkaReceiver(config, i, storageLevel, messageHandler)));
        }
    } else {
        // create Range Receivers..
        Map<Integer, Set<Integer>> rMap = new HashMap<Integer, Set<Integer>>();

        for (int i = 0; i < numberOfPartition; i++) {
            int j = i % numberOfReceivers;
            Set<Integer> pSet = rMap.get(j);
            if (pSet == null) {
                pSet = new HashSet<Integer>();
                pSet.add(i);
            } else {
                pSet.add(i);
            }
            rMap.put(j, pSet);
        }
        for (int i = 0; i < numberOfReceivers; i++) {
            streamsList.add(jsc.receiverStream(new KafkaRangeReceiver(config, rMap.get(i),
                    storageLevel, messageHandler)));
        }
    }
}
Example #20
Source File: BatchLayer.java From oryx with Apache License 2.0

public void await() throws InterruptedException {
    JavaStreamingContext theStreamingContext;
    synchronized (this) {
        theStreamingContext = streamingContext;
        Preconditions.checkState(theStreamingContext != null);
    }
    log.info("Spark Streaming is running");
    theStreamingContext.awaitTermination(); // Can't do this with lock
}
Example #21
Source File: TrackStreamingSourcesTest.java From beam with Apache License 2.0

private StreamingSourceTracker(
        JavaStreamingContext jssc,
        Pipeline pipeline,
        Class<? extends PTransform> transformClassToAssert,
        Integer... expected) {
    this.ctxt = new EvaluationContext(jssc.sparkContext(), pipeline, options, jssc);
    this.evaluator =
            new SparkRunner.Evaluator(
                    new StreamingTransformTranslator.Translator(new TransformTranslator.Translator()), ctxt);
    this.transformClassToAssert = transformClassToAssert;
    this.expected = expected;
}
Example #22
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0

@Bean(name = "streamingContext", destroyMethod = "stop")
public JavaStreamingContext streamingContext() {
    JavaStreamingContext context = this.create("stratio-streaming-context", 4040,
            configurationContext.getInternalStreamingBatchTime(), configurationContext.getInternalSparkHost());

    configureRequestContext(context);
    configureActionContext(context);
    configureDataContext(context);

    return context;
}
Example #23
Source File: WordCountRecoverableEx.java From Apache-Spark-2x-for-Java-Developers with MIT License

protected static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
    SparkConf sparkConf = new SparkConf().setAppName("WordCountRecoverableEx").setMaster("local[*]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));
    streamingContext.checkpoint(checkpointDirectory);

    // Initial state RDD input to mapWithState
    @SuppressWarnings("unchecked")
    List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("world", 1));
    JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

    JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(ip, port,
            StorageLevels.MEMORY_AND_DISK_SER);

    JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
            .reduceByKey((count1, count2) -> count1 + count2);

    // Update function for the cumulative count
    Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
            new Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>>() {
                @Override
                public Tuple2<String, Integer> call(String word, Optional<Integer> one, State<Integer> state) {
                    int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
                    Tuple2<String, Integer> output = new Tuple2<>(word, sum);
                    state.update(sum);
                    return output;
                }
            };

    // DStream of cumulative counts that get updated in every batch
    JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream = wordCounts
            .mapWithState(StateSpec.function(mappingFunc).initialState(initialRDD));

    stateDstream.print();
    return streamingContext;
}
Example #24
Source File: Kafka010SparkStreamingBinding.java From datacollector with Apache License 2.0

@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
    props.put("bootstrap.servers", metaDataBrokerList);
    if (!autoOffsetValue.isEmpty()) {
        autoOffsetValue = getConfigurableAutoOffsetResetIfNonEmpty(autoOffsetValue);
        props.put(AUTO_OFFSET_RESET, autoOffsetValue);
    }
    props.putAll(extraKafkaConfigs);
    List<String> topics = ImmutableList.of(topic);

    JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
    if (offsetHelper.isSDCCheckPointing()) {
        Map<TopicPartition, Long> fromOffsets =
                KafkaOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
        stream = KafkaUtils.createDirectStream(
                result,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<byte[], byte[]>Assign(new ArrayList<TopicPartition>(fromOffsets.keySet()), props, fromOffsets)
        );
    } else {
        stream = KafkaUtils.createDirectStream(
                result,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props)
        );
    }
    Driver$.MODULE$.foreach(stream.dstream(), KafkaOffsetManagerImpl.get());
    return result;
}
Example #25
Source File: SparkStreamingJob.java From zipkin-sparkstreaming with Apache License 2.0

@Memoized
JavaStreamingContext jsc() {
    SparkConf conf = new SparkConf(true)
            .setMaster(master())
            .setAppName(getClass().getName());
    if (!jars().isEmpty()) conf.setJars(jars().toArray(new String[0]));
    for (Map.Entry<String, String> entry : conf().entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    return new JavaStreamingContext(conf, new Duration(batchDuration()));
}
Example #26
Source File: MapRStreamingBinding.java From datacollector with Apache License 2.0

@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
    List<String> topics = ImmutableList.of(topic);
    if (!autoOffsetValue.isEmpty()) {
        props.put(SparkStreamingBinding.AUTO_OFFSET_RESET, autoOffsetValue);
    }
    props.putAll(extraKafkaConfigs);

    JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
    if (offsetHelper.isSDCCheckPointing()) {
        Map<TopicPartition, Long> fromOffsets =
                MaprStreamsOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
        stream = KafkaUtils.createDirectStream(
                result,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<byte[], byte[]>Assign(new ArrayList<TopicPartition>(fromOffsets.keySet()), props, fromOffsets)
        );
    } else {
        stream = KafkaUtils.createDirectStream(
                result,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props)
        );
    }
    Driver$.MODULE$.foreach(stream.dstream(), MaprStreamsOffsetManagerImpl.get());
    return result;
}
Example #27
Source File: ReaderWriterExample.java From spliceengine with GNU Affero General Public License v3.0

public static void main(String[] args) throws Exception {
    final String dbUrl = args[0];
    final String hostname = args[1];
    final String port = args[2];
    final String inTargetSchema = args[3];
    final String inTargetTable = args[4];

    SparkConf conf = new SparkConf();

    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));
    JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

    SparkSession spark = SparkSession.builder().getOrCreate();

    // Create a SplicemachineContext based on the provided DB connection
    SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

    // Set target tablename and schemaname
    final String table = inTargetSchema + "." + inTargetTable;

    stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
        JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));

        Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));

        splicemachineContext.insert(df, table);
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #28
Source File: SparkStreaming.java From kafka-spark-avro-example with Apache License 2.0

public static void main(String... args) {
    SparkConf conf = new SparkConf();
    conf.setMaster("local[2]");
    conf.setAppName("Spark Streaming Test Java");

    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(10));

    processStream(ssc, sc);

    ssc.start();
    ssc.awaitTermination();
}
Example #29
Source File: SparkStreamingSqlAnalyse.java From sylph with Apache License 2.0

public SparkStreamingSqlAnalyse(StreamingContext ssc,
                                ConnectorStore connectorStore,
                                boolean isCompile) {
    this.ssc = ssc;
    this.connectorStore = connectorStore;
    this.sparkBean = binder -> {
        binder.bind(StreamingContext.class, ssc);
        binder.bind(JavaStreamingContext.class, new JavaStreamingContext(ssc));
    };
    this.isCompile = isCompile;
}
Example #30
Source File: WordCountSocketJava8Ex.java From Apache-Spark-2x-for-Java-Developers with MIT License

public static void main(String[] args) throws Exception {
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    SparkConf sparkConf = new SparkConf().setAppName("WordCountSocketEx").setMaster("local[*]");
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<>("hello", 10), new Tuple2<>("world", 10));
    JavaPairRDD<String, Integer> initialRDD = streamingContext.sparkContext().parallelizePairs(tuples);

    JavaReceiverInputDStream<String> StreamingLines = streamingContext.socketTextStream(
            "10.0.75.1", Integer.parseInt("9000"), StorageLevels.MEMORY_AND_DISK_SER);

    JavaDStream<String> words = StreamingLines.flatMap(str -> Arrays.asList(str.split(" ")).iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(str -> new Tuple2<>(str, 1))
            .reduceByKey((count1, count2) -> count1 + count2);

    wordCounts.print();

    JavaPairDStream<String, Integer> joinedDstream = wordCounts.transformToPair(
            new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Integer>>() {
                @Override
                public JavaPairRDD<String, Integer> call(JavaPairRDD<String, Integer> rdd) throws Exception {
                    rdd.join(initialRDD).mapToPair(new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> joinedTuple) throws Exception {
                            // TODO Auto-generated method stub
                            return new Tuple2<>(joinedTuple._1(), (joinedTuple._2()._1() + joinedTuple._2()._2()));
                        }
                    });
                    return rdd;
                }
            });

    joinedDstream.print();

    streamingContext.start();
    streamingContext.awaitTermination();
}