org.apache.spark.streaming.api.java.JavaInputDStream Java Examples

Source File: StreamingRsvpsDStreamCountWindow.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

6 votes

/**
 * Counts Kafka records over a sliding window, stores each count in MongoDB and
 * commits the consumed Kafka offsets asynchronously after every batch.
 *
 * @param args command-line arguments (not used)
 * @throws InterruptedException if the thread is interrupted while awaiting termination
 */
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        // Assemble the Spark configuration one setting at a time.
        final SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES);
        sparkConf.setAppName(APPLICATION_NAME);
        sparkConf.set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI);
        sparkConf.set("spark.streaming.kafka.consumer.cache.enabled", "false");

        final Duration batchInterval = new Duration(BATCH_DURATION_INTERVAL_MS);
        final JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, batchInterval);
        streamingContext.checkpoint(CHECKPOINT_FOLDER);

        final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
                KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES));

        // Number of records seen in each window.
        final Duration windowLength = new Duration(WINDOW_LENGTH_MS);
        final Duration slideInterval = new Duration(SLIDING_INTERVAL_MS);
        final JavaDStream<Long> countStream = meetupStream.countByWindow(windowLength, slideInterval);

        // Each window count becomes one document with the count stored as a string value.
        countStream.foreachRDD((JavaRDD<Long> countRDD) -> {
            final JavaRDD<Document> countDocuments =
                    countRDD.map(count -> Document.parse("{\"rsvps_count\":\"" + count + "\"}"));
            MongoSpark.save(countDocuments);
        });

        // Registered after the output operation above: commit the offsets of the batch's Kafka RDD.
        meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
            final HasOffsetRanges rangeSource = (HasOffsetRanges) meetupRDD.rdd();
            final OffsetRange[] processedRanges = rangeSource.offsetRanges();

            final CanCommitOffsets committer = (CanCommitOffsets) meetupStream.inputDStream();
            committer.commitAsync(processedRanges, new MeetupOffsetCommitCallback());
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }

Source File: StreamingEngine.java From spark-streaming-direct-kafka with Apache License 2.0

6 votes

/**
 * Creates the streaming context, wires the Kafka input stream to the data
 * processing and offset-update functions, and starts streaming.
 */
public void start() {
    SparkConf sparkConf = getSparkConf();
    long batchIntervalSeconds = Long.parseLong(config.getStreamingBatchIntervalInSec());
    streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(batchIntervalSeconds));

    JavaInputDStream<MessageAndMetadata<String, byte[]>> kafkaStream = buildInputDStream(streamingContext);
    JavaPairDStream<String, byte[]> keyedMessages =
            kafkaStream.mapToPair(record -> new Tuple2<>(record.key(), record.message()));

    // Process the key/message pairs first ...
    keyedMessages.foreachRDD(new ProcessStreamingData<>(config));
    // ... then run the offset-update function on the raw Kafka stream.
    kafkaStream.foreachRDD(new UpdateOffsetsFn<>(config.getKafkaGroupId(), config.getZkOffsetManager()));

    streamingContext.start();
}

Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0

5 votes

/**
 * Saves a one-batch queue stream to a target with index auto-creation set to
 * "no", then checks that the target does not exist and that an
 * {@code EsHadoopIllegalArgumentException} was observed.
 */
@Test
public void testEsRDDWriteIndexCreationDisabled() throws Exception {
    ExpectingToThrow expecting = expectingToThrow(EsHadoopIllegalArgumentException.class).from(ssc);

    // First document: a null value, a set value and a plain string.
    Map<String, Object> firstDoc = new HashMap<>();
    firstDoc.put("one", null);
    Set<String> twoValues = new HashSet<>();
    twoValues.add("2");
    firstDoc.put("two", twoValues);
    firstDoc.put("three", ".");

    // Second document: two plain string fields.
    Map<String, Object> secondDoc = new HashMap<>();
    secondDoc.put("OTP", "Otopeni");
    secondDoc.put("SFO", "San Fran");

    List<Map<String, Object>> documents = new ArrayList<>();
    documents.add(firstDoc);
    documents.add(secondDoc);

    String target = wrapIndex(resource("spark-test-nonexisting-scala-basic-write", "data", version));

    // Copy the shared settings and switch index auto-creation off for this test only.
    Map<String, String> settings = new HashMap<>(cfg);
    settings.put(ES_INDEX_AUTO_CREATE, "no");

    Queue<JavaRDD<Map<String, Object>>> pendingBatches = new LinkedList<>();
    pendingBatches.add(sc.parallelize(documents));
    JavaInputDStream<Map<String, Object>> documentStream = ssc.queueStream(pendingBatches, true);

    JavaEsSparkStreaming.saveToEs(documentStream, target, settings);
    ssc.start();
    TimeUnit.SECONDS.sleep(2); // Let the processing happen
    ssc.stop(false, true);

    boolean targetExists = RestUtils.exists(target);
    assertTrue(!targetExists);
    expecting.assertExceptionFound();
}

Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0

5 votes

/**
 * Writes a single queued batch to a target whose index auto-creation is
 * disabled, and asserts both that the target is absent afterwards and that the
 * expected {@code EsHadoopIllegalArgumentException} was seen.
 */
@Test
public void testEsRDDWriteIndexCreationDisabled() throws Exception {
    ExpectingToThrow expecting = expectingToThrow(EsHadoopIllegalArgumentException.class).from(ssc);

    // Document mixing a null, a set and a string value.
    Map<String, Object> mixedDoc = new HashMap<>();
    mixedDoc.put("one", null);
    Set<String> setValue = new HashSet<>();
    setValue.add("2");
    mixedDoc.put("two", setValue);
    mixedDoc.put("three", ".");

    // Document with string values only.
    Map<String, Object> airportDoc = new HashMap<>();
    airportDoc.put("OTP", "Otopeni");
    airportDoc.put("SFO", "San Fran");

    List<Map<String, Object>> payload = new ArrayList<>();
    payload.add(mixedDoc);
    payload.add(airportDoc);

    String target = wrapIndex(resource("spark-test-nonexisting-scala-basic-write", "data", version));

    // Per-test configuration: shared settings plus auto-create turned off.
    Map<String, String> testConf = new HashMap<>(cfg);
    testConf.put(ES_INDEX_AUTO_CREATE, "no");

    JavaRDD<Map<String, Object>> singleBatch = sc.parallelize(payload);
    Queue<JavaRDD<Map<String, Object>>> batchQueue = new LinkedList<>();
    batchQueue.add(singleBatch);
    JavaInputDStream<Map<String, Object>> inputStream = ssc.queueStream(batchQueue, true);

    JavaEsSparkStreaming.saveToEs(inputStream, target, testConf);
    ssc.start();
    TimeUnit.SECONDS.sleep(2); // Let the processing happen
    ssc.stop(false, true);

    boolean indexPresent = RestUtils.exists(target);
    assertTrue(!indexPresent);
    expecting.assertExceptionFound();
}

Source File: AbstractSparkLayer.java From oryx with Apache License 2.0

5 votes

/**
 * Builds the Kafka direct stream for the input topic.
 *
 * @param streamingContext streaming context the stream is created on
 * @return direct stream of consumer records read from {@code inputTopic}
 * @throws IllegalArgumentException if the input topic (or the update topic, when both the
 *     update topic and its lock master are configured) is reported as not existing
 */
protected final JavaInputDStream<ConsumerRecord<K,M>> buildInputDStream(
    JavaStreamingContext streamingContext) {

  boolean inputTopicExists = KafkaUtils.topicExists(inputTopicLockMaster, inputTopic);
  Preconditions.checkArgument(
      inputTopicExists, "Topic %s does not exist; did you create it?", inputTopic);

  boolean updateTopicConfigured = updateTopic != null && updateTopicLockMaster != null;
  if (updateTopicConfigured) {
    boolean updateTopicExists = KafkaUtils.topicExists(updateTopicLockMaster, updateTopic);
    Preconditions.checkArgument(
        updateTopicExists, "Topic %s does not exist; did you create it?", updateTopic);
  }

  Map<String,Object> consumerParams = new HashMap<>();
  consumerParams.put("group.id", getGroupID());
  // Don't re-consume old messages from input by default
  consumerParams.put("auto.offset.reset", "latest"); // Ignored by Kafka 0.10 Spark integration
  consumerParams.put("bootstrap.servers", inputBroker);
  consumerParams.put("key.deserializer", keyDecoderClass.getName());
  consumerParams.put("value.deserializer", messageDecoderClass.getName());

  // Subscribe to the single input topic; the third argument is an empty map.
  ConsumerStrategy<K,M> subscription = ConsumerStrategies.Subscribe(
      Collections.singleton(inputTopic), consumerParams, Collections.emptyMap());

  return org.apache.spark.streaming.kafka010.KafkaUtils.createDirectStream(
      streamingContext,
      LocationStrategies.PreferConsistent(),
      subscription);
}

Source File: MapRStreaming22Binding.java From datacollector with Apache License 2.0

5 votes

/**
 * Creates the direct stream for {@code topic} on the given context and passes its
 * underlying DStream, together with the MapR Streams offset manager, to {@code Driver.foreach}.
 *
 * @param result streaming context to create the stream on; returned unchanged
 * @param props consumer properties; this map is modified in place
 * @return the same {@code result} context
 */
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  List<String> topics = ImmutableList.of(topic);
  if (!autoOffsetValue.isEmpty()) {
    props.put(SparkStreamingBinding.AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  final JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
  if (!offsetHelper.isSDCCheckPointing()) {
    // No stored offsets are used: subscribe to the topic.
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props));
  } else {
    // Assign the partitions explicitly, starting from the offsets the offset manager returns.
    Map<TopicPartition, Long> fromOffsets =
        MaprStreamsOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    List<TopicPartition> partitions = new ArrayList<>(fromOffsets.keySet());
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Assign(partitions, props, fromOffsets));
  }

  Driver$.MODULE$.foreach(stream.dstream(), MaprStreamsOffsetManagerImpl.get());
  return result;
}

Source File: Kafka010SparkStreamingBinding.java From datacollector with Apache License 2.0

5 votes

/**
 * Creates the Kafka direct stream for {@code topic} on the given context and passes its
 * underlying DStream, together with the Kafka offset manager, to {@code Driver.foreach}.
 *
 * @param result streaming context to create the stream on; returned unchanged
 * @param props consumer properties; this map is modified in place
 * @return the same {@code result} context
 */
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  props.put("bootstrap.servers", metaDataBrokerList);
  if (!autoOffsetValue.isEmpty()) {
    // The field itself is replaced by the value the helper returns.
    autoOffsetValue = getConfigurableAutoOffsetResetIfNonEmpty(autoOffsetValue);
    props.put(AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  List<String> topics = ImmutableList.of(topic);

  final JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
  if (!offsetHelper.isSDCCheckPointing()) {
    // No stored offsets are used: subscribe to the topic.
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props));
  } else {
    // Assign the partitions explicitly, starting from the offsets the offset manager returns.
    Map<TopicPartition, Long> fromOffsets =
        KafkaOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    List<TopicPartition> partitions = new ArrayList<>(fromOffsets.keySet());
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Assign(partitions, props, fromOffsets));
  }

  Driver$.MODULE$.foreach(stream.dstream(), KafkaOffsetManagerImpl.get());
  return result;
}

Source File: AbstractSparkLayer.java From spark-streaming-direct-kafka with Apache License 2.0

5 votes

/**
 * Builds a Kafka direct stream of {@code MessageAndMetadata<String, byte[]>} that starts
 * from the offsets returned by {@code KafkaManager.getOffsets}.
 *
 * @param streamingContext streaming context the stream is created on
 * @return the direct input stream
 * @throws RuntimeException if {@code KafkaManager.topicExists} reports the configured topic as missing
 */
public JavaInputDStream<MessageAndMetadata<String,byte[]>> buildInputDStream(
        JavaStreamingContext streamingContext) {

    HashMap<String, String> kafkaParams = config.getKafkaParams();

    // Double cast through Class<?> to obtain a Class token carrying the generic parameters.
    @SuppressWarnings("unchecked")
    Class<MessageAndMetadata<String, byte[]>> recordClass =
            (Class<MessageAndMetadata<String, byte[]>>) (Class<?>) MessageAndMetadata.class;

    boolean topicPresent = KafkaManager.topicExists(config.getZkKafka(), config.getTopic());
    if (!topicPresent) {
        throw new RuntimeException("Topic does not exist on server");
    }

    Map<TopicAndPartition, Long> startingOffsets = KafkaManager.getOffsets(
            config.getZkKafka(),
            config.getZkOffsetManager(),
            config.getKafkaGroupId(),
            config.getTopic(),
            config.getKafkaParams());

    // TODO: try generics, instead of hardcoded values
    return org.apache.spark.streaming.kafka.KafkaUtils.createDirectStream(
            streamingContext,
            String.class,  // key type; change as necessary
            byte[].class,  // message type; change as necessary
            StringDecoder.class,
            DefaultDecoder.class,
            recordClass,
            kafkaParams,
            startingOffsets,
            Functions.<MessageAndMetadata<String, byte[]>>identity());
}

Source File: MapRStreamingBinding.java From datacollector with Apache License 2.0

5 votes

/**
 * Creates the direct stream for {@code topic} on the given context and passes its
 * underlying DStream, together with the MapR Streams offset manager, to {@code Driver.foreach}.
 *
 * @param result streaming context to create the stream on; returned unchanged
 * @param props consumer properties; this map is modified in place
 * @return the same {@code result} context
 */
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  List<String> topics = ImmutableList.of(topic);
  if (!autoOffsetValue.isEmpty()) {
    props.put(SparkStreamingBinding.AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  final JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
  if (!offsetHelper.isSDCCheckPointing()) {
    // No stored offsets are used: subscribe to the topic.
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props));
  } else {
    // Assign the partitions explicitly, starting from the offsets the offset manager returns.
    Map<TopicPartition, Long> fromOffsets =
        MaprStreamsOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    List<TopicPartition> partitions = new ArrayList<>(fromOffsets.keySet());
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Assign(partitions, props, fromOffsets));
  }

  Driver$.MODULE$.foreach(stream.dstream(), MaprStreamsOffsetManagerImpl.get());
  return result;
}

Source File: WordCountingAppWithCheckpoint.java From tutorials with MIT License

4 votes

/**
 * Reads lines from the Kafka topic "messages", keeps a cumulative per-word count with
 * {@code mapWithState} (checkpointed to "./.checkpoint") and writes the updated counts
 * to the Cassandra table {@code vocabulary.words}.
 *
 * @param args command-line arguments (not used)
 * @throws InterruptedException if the thread is interrupted while awaiting termination
 */
public static void main(String[] args) throws InterruptedException {

        Logger.getLogger("org")
            .setLevel(Level.OFF);
        Logger.getLogger("akka")
            .setLevel(Level.OFF);

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("messages");

        SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster("local[2]");
        sparkConf.setAppName("WordCountingAppWithCheckpoint");
        sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

        // Still assigned because the field is declared outside this method and may be read elsewhere.
        sparkContext = streamingContext.sparkContext();

        streamingContext.checkpoint("./.checkpoint");

        JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(
            streamingContext,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

        JavaPairDStream<String, String> results = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

        JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());

        JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
            .iterator());

        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);

        // Add this batch's count for a word to the running total held in state.
        JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts = wordCounts.mapWithState(StateSpec.function((word, one, state) -> {
            int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        }));

        // Convert to Word on the executors and save the whole batch in one write. The previous
        // code collected the batch to the driver and launched a separate save job per word.
        cumulativeWordCounts.foreachRDD(javaRdd -> {
            JavaRDD<Word> wordRdd = javaRdd.map(tuple -> new Word(tuple._1, tuple._2));
            javaFunctions(wordRdd).writerBuilder("vocabulary", "words", mapToRow(Word.class))
                .saveToCassandra();
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }

Source File: SparkRunner.java From jaeger-analytics-java with Apache License 2.0

4 votes

/**
 * Consumes spans from Kafka, groups them into traces by trace id and runs every
 * configured {@code ModelRunner} on the graph built from each trace.
 *
 * @param args command-line arguments (not used)
 * @throws InterruptedException if the thread is interrupted while awaiting termination
 * @throws IOException declared by the original signature; NOTE(review): presumably raised
 *     by the {@code HTTPServer} constructor — confirm
 */
public static void main(String []args) throws InterruptedException, IOException {
  // Constructed first and never referenced again; the instance is kept in a local only.
  int metricsPort = Integer.parseInt(getPropOrEnv("PROMETHEUS_PORT", "9111"));
  HTTPServer metricsServer = new HTTPServer(metricsPort);

  SparkConf sparkConf = new SparkConf();
  sparkConf.setAppName("Trace DSL");
  sparkConf.setMaster(getPropOrEnv("SPARK_MASTER","local[*]"));

  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  int batchDurationMs = Integer.parseInt(getPropOrEnv("SPARK_STREAMING_BATCH_DURATION", "5000"));
  JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(batchDurationMs));

  Set<String> topics = Collections.singleton(getPropOrEnv("KAFKA_JAEGER_TOPIC", "jaeger-spans"));
  Map<String, Object> kafkaParams = new HashMap<>();
  kafkaParams.put("bootstrap.servers", getPropOrEnv("KAFKA_BOOTSTRAP_SERVER", "localhost:9092"));
  kafkaParams.put("key.deserializer", StringDeserializer.class);
  kafkaParams.put("value.deserializer", ProtoSpanDeserializer.class);
  // hack to start always from beginning: a fresh consumer group id on every run
  kafkaParams.put("group.id", "jaeger-trace-aggregation-" + System.currentTimeMillis());

  boolean startFromBeginning = Boolean.parseBoolean(getPropOrEnv("KAFKA_START_FROM_BEGINNING", "true"));
  if (startFromBeginning) {
    kafkaParams.put("auto.offset.reset", "earliest");
    kafkaParams.put("enable.auto.commit", false);
    kafkaParams.put("startingOffsets", "earliest");
  }

  JavaInputDStream<ConsumerRecord<String, Span>> messages =
      KafkaUtils.createDirectStream(
          ssc,
          LocationStrategies.PreferConsistent(),
          ConsumerStrategies.Subscribe(topics, kafkaParams));

  // Key every span by its trace id so spans of one trace can be grouped together.
  JavaPairDStream<String, Span> spansByTraceId =
      messages.mapToPair(record -> new Tuple2<>(record.value().traceId, record.value()));

  JavaDStream<Trace> tracesStream = spansByTraceId.groupByKey().map(grouped -> {
    System.out.printf("traceID: %s\n", grouped._1);
    Iterable<Span> groupedSpans = grouped._2();
    Trace assembled = new Trace();
    assembled.traceId = grouped._1();
    assembled.spans = StreamSupport.stream(groupedSpans.spliterator(), false)
        .collect(Collectors.toList());
    return assembled;
  });

  MinimumClientVersion minimumClientVersion = MinimumClientVersion.builder()
      .withJavaVersion(getPropOrEnv("TRACE_QUALITY_JAVA_VERSION", "1.0.0"))
      .withGoVersion(getPropOrEnv("TRACE_QUALITY_GO_VERSION", "2.22.0"))
      .withNodeVersion(getPropOrEnv("TRACE_QUALITY_NODE_VERSION", "3.17.1"))
      .withPythonVersion(getPropOrEnv("TRACE_QUALITY_PYTHON_VERSION", "4.0.0"))
      .build();

  List<ModelRunner> modelRunner = Arrays.asList(
      new TraceHeight(),
      new ServiceDepth(),
      new ServiceHeight(),
      new NetworkLatency(),
      new NumberOfErrors(),
      new DirectDependencies(),
      // trace quality
      minimumClientVersion,
      new HasClientServerSpans(),
      new UniqueSpanId());

  // Build a graph per trace and run each model on it, in list order.
  tracesStream.foreachRDD((traceRDD, time) -> {
    traceRDD.foreach(trace -> {
      Graph traceGraph = GraphCreator.create(trace);
      for (ModelRunner runner : modelRunner) {
        runner.runWithMetrics(traceGraph);
      }
    });
  });

  ssc.start();
  ssc.awaitTermination();
}

Source File: BatchLayer.java From oryx with Apache License 2.0

4 votes

/**
 * Builds the streaming context, creates the Kafka input stream and registers the
 * per-batch functions (batch update, save to HDFS, offset update and, when
 * configured, deletion of old data and models) before starting streaming.
 */
public synchronized void start() {
  String layerId = getID();
  if (layerId != null) {
    log.info("Starting Batch Layer {}", layerId);
  }

  streamingContext = buildStreamingContext();
  JavaSparkContext sparkContext = streamingContext.sparkContext();
  Configuration hadoopConf = sparkContext.hadoopConfiguration();

  // Checkpoints go to a ".checkpoint" child of the model directory.
  Path modelDir = new Path(modelDirString);
  Path checkpointPath = new Path(modelDir, ".checkpoint");
  log.info("Setting checkpoint dir to {}", checkpointPath);
  sparkContext.setCheckpointDir(checkpointPath.toString());

  log.info("Creating message stream from topic");
  JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext);
  JavaPairDStream<K,M> pairDStream =
      kafkaDStream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

  Class<K> keyClass = getKeyClass();
  Class<M> messageClass = getMessageClass();

  // 1. Batch update over the key/message pairs.
  pairDStream.foreachRDD(new BatchUpdateFunction<>(
      getConfig(),
      keyClass,
      messageClass,
      keyWritableClass,
      messageWritableClass,
      dataDirString,
      modelDirString,
      loadUpdateInstance(),
      streamingContext));

  // 2. "Inline" saveAsNewAPIHadoopFiles to be able to skip saving empty RDDs
  pairDStream.foreachRDD(new SaveToHDFSFunction<>(
      dataDirString + "/oryx",
      "data",
      keyClass,
      messageClass,
      keyWritableClass,
      messageWritableClass,
      hadoopConf));

  // 3. Must use the raw Kafka stream to get offsets
  kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));

  // 4. Optional clean-up functions, registered only when a maximum age is configured.
  boolean expireData = maxDataAgeHours != NO_MAX_AGE;
  if (expireData) {
    pairDStream.foreachRDD(new DeleteOldDataFn<>(
        hadoopConf, dataDirString, Pattern.compile("-(\\d+)\\."), maxDataAgeHours));
  }
  boolean expireModels = maxModelAgeHours != NO_MAX_AGE;
  if (expireModels) {
    pairDStream.foreachRDD(new DeleteOldDataFn<>(
        hadoopConf, modelDirString, Pattern.compile("(\\d+)"), maxModelAgeHours));
  }

  log.info("Starting Spark Streaming");

  streamingContext.start();
}

Source File: SpeedLayer.java From oryx with Apache License 2.0

4 votes

/**
 * Builds the streaming context and Kafka input stream, starts a background thread
 * that feeds update-topic messages to the model manager, registers the per-batch
 * speed-layer update and offset-update functions, and starts streaming.
 */
public synchronized void start() {
  String layerId = getID();
  if (layerId != null) {
    log.info("Starting Speed Layer {}", layerId);
  }

  streamingContext = buildStreamingContext();
  log.info("Creating message stream from topic");
  JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext);
  JavaPairDStream<K,M> pairDStream =
      kafkaDStream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

  // A random suffix gives this consumer its own group id on every start.
  String updateGroupId = "OryxGroup-" + getLayerName() + '-' + UUID.randomUUID();
  KafkaConsumer<String,U> consumer = new KafkaConsumer<>(
      ConfigUtils.keyValueToProperties(
          "group.id", updateGroupId,
          "bootstrap.servers", updateBroker,
          "max.partition.fetch.bytes", maxMessageSize,
          "key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer",
          "value.deserializer", updateDecoderClass.getName(),
          // Do start from the beginning of the update queue
          "auto.offset.reset", "earliest"
      ));
  consumer.subscribe(Collections.singletonList(updateTopic));
  consumerIterator = new ConsumeDataIterator<>(consumer);

  modelManager = loadManagerInstance();
  Configuration hadoopConf = streamingContext.sparkContext().hadoopConfiguration();

  // Consume updates on a dedicated thread; any failure is logged and closes this layer.
  Thread updateConsumerThread = new Thread(LoggingCallable.log(() -> {
    try {
      modelManager.consume(consumerIterator, hadoopConf);
    } catch (Throwable t) {
      log.error("Error while consuming updates", t);
      close();
    }
  }).asRunnable(), "OryxSpeedLayerUpdateConsumerThread");
  updateConsumerThread.start();

  pairDStream.foreachRDD(new SpeedLayerUpdate<>(modelManager, updateBroker, updateTopic));

  // Must use the raw Kafka stream to get offsets
  kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));

  log.info("Starting Spark Streaming");

  streamingContext.start();
}

Source File: SimpleSparkStructuredKafkaStreamingCounter.java From jMetalSP with MIT License

4 votes

/**
 * Subscribes to the Kafka topic and, for every non-empty batch, notifies the
 * observers with the value the batch reduces to.
 *
 * <p>The reduce function always keeps its second argument, so a batch reduces to one
 * of its values. Empty batches are skipped, because {@code JavaRDD.reduce} throws
 * {@code UnsupportedOperationException} on an empty RDD.
 */
@Override
public void run() {

    ConsumerStrategy<Integer,Integer> consumerStrategy = ConsumerStrategies.Subscribe(topic, kafkaParams);
    LocationStrategy locationStrategy = LocationStrategies.PreferConsistent();

    JavaInputDStream<ConsumerRecord<Integer,Integer>> stream =
            KafkaUtils.createDirectStream(streamingContext, locationStrategy, consumerStrategy);

    // Only the record values are of interest; keys are dropped.
    JavaDStream<Integer> values = stream.map(value -> value.value());

    values.foreachRDD(numbers -> {
        // Nothing arrived in this batch interval: reduce() would throw, so skip it.
        if (numbers.isEmpty()) {
            return;
        }
        Integer cont = numbers.reduce((key, value) -> value);
        observable.setChanged();
        observable.notifyObservers(new ObservedValue<Integer>(cont));
    });

}

Source File: SimpleSparkStructuredKafkaStreamingCounterAVRO.java From jMetalSP with MIT License

4 votes

/**
 * Subscribes to the Kafka topic, deserializes every record value into a
 * {@code Counter} using the "avsc/Counter.avsc" schema and, for every non-empty
 * batch, notifies the observers with the value the batch reduces to.
 *
 * <p>The reduce function always keeps its second argument, so a batch reduces to one
 * of its values. Empty batches are skipped, because {@code JavaRDD.reduce} throws
 * {@code UnsupportedOperationException} on an empty RDD.
 */
@Override
public void run() {

    ConsumerStrategy<Integer,byte[]> consumerStrategy = ConsumerStrategies.Subscribe(topic, kafkaParams);
    LocationStrategy locationStrategy = LocationStrategies.PreferConsistent();

    JavaInputDStream<ConsumerRecord<Integer,byte[]>> stream =
            KafkaUtils.createDirectStream(streamingContext, locationStrategy, consumerStrategy);

    // Decode each payload and keep the counter's first field as an Integer.
    JavaDStream<Integer> values = stream.map(value -> {
        DataDeserializer<Counter> dataDeserializer = new DataDeserializer<>();
        Counter counter = dataDeserializer.deserialize(value.value(),"avsc/Counter.avsc");
        return (Integer) counter.get(0);
    });

    values.foreachRDD(numbers -> {
        // Nothing arrived in this batch interval: reduce() would throw, so skip it.
        if (numbers.isEmpty()) {
            return;
        }
        Integer cont = numbers.reduce((key, value) -> value);
        System.out.println("Pruebas----> " + cont);
        observable.setChanged();
        observable.notifyObservers(new ObservedValue<Integer>(cont));
    });

}

Source File: KafkaExample.java From Apache-Spark-2x-for-Java-Developers with MIT License

4 votes

/**
 * Reads messages from two Kafka topics, extracts tweet text and hashtags, and prints
 * per-batch and windowed hashtag counts to standard output.
 *
 * <p>If the thread is interrupted while awaiting termination, the interrupt flag is
 * restored and the event is logged, so the interruption is not lost.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    // Windows-specific property, needed if Hadoop is not installed or HADOOP_HOME is not set.
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
    streamingContext.checkpoint("E:\\hadoop\\checkpoint");
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "10.0.75.1:9092");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea");
    kafkaParams.put("auto.offset.reset", "latest");

    Collection<String> topics = Arrays.asList("mytopic", "anothertopic");

    final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(
        streamingContext,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

    JavaPairDStream<String, String> pairRDD = stream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

    pairRDD.foreachRDD(pRDD -> { pRDD.foreach(tuple -> System.out.println(new Date()+" :: Kafka msg key ::"+tuple._1() +" the val is ::"+tuple._2())); });

    JavaDStream<String> tweetRDD = pairRDD.map(x -> x._2()).map(new TweetText());

    tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date()+" :: "+x)));

    // Keep only the space-separated tokens that contain '#'.
    JavaDStream<String> hashtagRDD = tweetRDD.flatMap(twt -> Arrays.stream(twt.split(" ")).filter(str -> str.contains("#")).collect(Collectors.toList()).iterator());

    hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(x)));

    JavaPairDStream<String, Long> cntByVal = hashtagRDD.countByValue();

    cntByVal.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date()+" ::The count tag is ::"+x._1() +" and the val is ::"+x._2())));

    // Windowed counts; every window length and slide is a multiple of the 2-minute batch interval.
    hashtagRDD.window(Durations.minutes(8)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    hashtagRDD.window(Durations.minutes(8), Durations.minutes(2)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    hashtagRDD.window(Durations.minutes(12), Durations.minutes(8)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    hashtagRDD.window(Durations.minutes(2), Durations.minutes(2)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));
    hashtagRDD.window(Durations.minutes(12), Durations.minutes(12)).countByValue()
        .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));

    streamingContext.start();
    try {
        streamingContext.awaitTermination();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so code further up the stack can still observe it.
        Thread.currentThread().interrupt();
        rootLogger.warn("Interrupted while awaiting termination of the streaming context", e);
    }
}

Source File: KafkaSource.java From sylph with Apache License 2.0

4 votes

/**
 * Creates a direct Kafka stream for the configured topics and maps every record to a {@link Row}.
 *
 * <p>Keys and values are consumed as raw byte arrays. When the configured value type is
 * {@code "json"} (case-insensitive) each record is handed to a {@link JsonSchema} built from the
 * context schema; otherwise one value per schema field is produced, where the well-known field
 * names {@code _topic}, {@code _message}, {@code _key}, {@code _partition}, {@code _offset},
 * {@code _timestamp} and {@code _timestampType} are filled from the record metadata and every
 * other field is {@code null}.
 *
 * @param ssc streaming context the direct stream is attached to
 * @param config supplies topics (comma separated), brokers, group id, offset mode and extra consumer settings
 * @param context supplies the schema that determines the produced row layout
 * @return a stream of rows built from the consumed Kafka records
 */
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig config, SourceContext context)
{
    String topics = config.getTopics();
    String brokers = config.getBrokers(); // the cluster hosts must be resolvable from the machine running this program
    String groupId = config.getGroupid(); // consumer group name
    String offsetMode = config.getOffsetMode();

    // Start from the user supplied settings, then apply the settings this source sets itself.
    Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig());
    kafkaParams.put("bootstrap.servers", brokers);
    kafkaParams.put("key.deserializer", ByteArrayDeserializer.class); //StringDeserializer
    kafkaParams.put("value.deserializer", ByteArrayDeserializer.class); //StringDeserializer
    kafkaParams.put("enable.auto.commit", false); // do not auto-commit offsets
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", // session timeout defaults to 30 seconds
    //      "heartbeat.interval.ms" -> "5000", // heartbeat period (original note: commit every 10 seconds)
    // Note: different streams must use different group.id values, otherwise offset commits fail.
    kafkaParams.put("group.id", groupId);
    kafkaParams.put("auto.offset.reset", offsetMode); //latest   earliest

    List<String> topicSets = Arrays.asList(topics.split(","));
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(
            ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topicSets, kafkaParams));

    // Wrap the input stream so that the offset ranges of each Kafka RDD are committed asynchronously.
    DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream())
    {
        @Override
        public void commitOffsets(RDD<?> kafkaRdd)
        {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
            log().info("commitKafkaOffsets {}", (Object) offsetRanges);
            DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream());
            ((CanCommitOffsets) firstDStream).commitAsync(offsetRanges);
        }
    };

    JavaDStream<ConsumerRecord<byte[], byte[]>> javaDStream = new JavaDStream<>(sylphKafkaOffset, ClassTag$.MODULE$.apply(ConsumerRecord.class));
    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return javaDStream
                .map(record -> jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset()));
    }
    else {
        List<String> names = context.getSchema().getFieldNames();
        return javaDStream
                .map(record -> {
                    Object[] values = new Object[names.size()];
                    for (int i = 0; i < names.size(); i++) {
                        // Every case ends with break: without it the value would fall through
                        // to the default branch and be overwritten with null.
                        switch (names.get(i)) {
                            case "_topic":
                                values[i] = record.topic();
                                break;
                            case "_message":
                                // A record may carry a null value; keep it null instead of throwing.
                                values[i] = record.value() == null ? null : new String(record.value(), UTF_8);
                                break;
                            case "_key":
                                values[i] = record.key() == null ? null : new String(record.key(), UTF_8);
                                break;
                            case "_partition":
                                values[i] = record.partition();
                                break;
                            case "_offset":
                                values[i] = record.offset();
                                break;
                            case "_timestamp":
                                values[i] = record.timestamp();
                                break;
                            case "_timestampType":
                                values[i] = record.timestampType().id;
                                break;
                            default:
                                values[i] = null;
                                break;
                        }
                    }
                    return new GenericRow(values);  //GenericRowWithSchema
                });  //.window(Duration(10 * 1000))
    }
}

Source File: KafkaSource08.java From sylph with Apache License 2.0

4 votes

/**
 * Optionally wraps the given input stream so that consumed offset ranges are handed to a
 * {@link KafkaOffsetCommitter}.
 *
 * <p>When {@code auto.commit.enable} is explicitly {@code "false"} the input stream is returned
 * untouched. Otherwise the commit interval is read from
 * {@link ConsumerConfig#AUTO_COMMIT_INTERVAL_MS_CONFIG} (default {@code "90000"}) and a
 * {@link SylphKafkaOffset} wrapper is returned that forwards the offset ranges of each Kafka RDD
 * to the committer.
 *
 * @param inputStream the direct Kafka stream to wrap
 * @param kafkaParams consumer settings consulted for the auto-commit flag and interval
 * @param kafkaCluster cluster handle passed to the offset committer
 * @param groupId consumer group passed to the offset committer
 * @return the original stream, or a wrapping stream that collects offsets for committing
 */
private static JavaDStream<ConsumerRecord<byte[], byte[]>> settingCommit(
            JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream,
            Map<String, String> kafkaParams,
            KafkaCluster kafkaCluster,
            String groupId)
    {
        String autoCommitFlag = kafkaParams.getOrDefault("auto.commit.enable", "true");
        if (autoCommitFlag.equals("false")) {
            // Committing is switched off: hand back the stream as it is.
            return inputStream;
        }

        String intervalText = kafkaParams.getOrDefault(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "90000");
        int intervalMs = Integer.parseInt(intervalText);

        DStream<ConsumerRecord<byte[], byte[]>> committingStream = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream())
        {
            private final KafkaOffsetCommitter committer =
                    new KafkaOffsetCommitter(kafkaCluster, groupId, intervalMs);

            /** Initializes the wrapped stream, then names and starts the committer. */
            @Override
            public void initialize(Time time)
            {
                super.initialize(time);
                committer.setName("Kafka_Offset_Committer");
                committer.start();
            }

            /** Hands the offset ranges of the given Kafka RDD to the committer. */
            @Override
            public void commitOffsets(RDD<?> kafkaRdd)
            {
                OffsetRange[] ranges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
//                Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
//                        .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
                //log().info("commit Kafka Offsets {}", internalOffsets);
                committer.addAll(ranges);
            }
        };

        return new JavaDStream<>(
                committingStream,
                ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class));
//        inputStream = inputStream.transform(rdd -> {
//            OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
//            Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
//                    .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
//            commitKafkaOffsets(kafkaCluster, groupId, internalOffsets);
//            return rdd;
//        });
    }

Source File: KafkaSource08.java From sylph with Apache License 2.0

4 votes

/**
 * Creates a direct Kafka (0.8 style API) stream for the configured topics and maps every
 * record to a {@link Row}.
 *
 * <p>Topics, brokers, group id and offset mode are validated with {@code requireNonNull}.
 * Starting offsets are obtained through {@code getFromOffset}, and the resulting stream is passed
 * through {@code settingCommit} before mapping. When the configured value type is {@code "json"}
 * (case-insensitive) each record is handed to a {@link JsonSchema}; otherwise one value per schema
 * field is produced, where {@code _topic}, {@code _message}, {@code _key}, {@code _partition}
 * and {@code _offset} are filled from the record and every other field is {@code null}.
 *
 * @param ssc streaming context the direct stream is attached to
 * @param config supplies topics, brokers, group id, offset mode and extra consumer settings
 * @param context supplies the schema that determines the produced row layout
 * @return a stream of rows built from the consumed Kafka records
 */
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig08 config, SourceContext context)
{
    String topics = requireNonNull(config.getTopics(), "topics not setting");
    String brokers = requireNonNull(config.getBrokers(), "brokers not setting"); // the cluster hosts must be resolvable from the machine running this program
    String groupId = requireNonNull(config.getGroupid(), "group.id not setting"); // consumer group name
    String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting");

    // Drop entries without a value and stringify the rest, since this API takes String settings.
    Map<String, String> otherConfig = config.getOtherConfig().entrySet()
            .stream()
            .filter(x -> x.getValue() != null)
            .collect(Collectors.toMap(Map.Entry::getKey, v -> v.getValue().toString()));

    Map<String, String> kafkaParams = new HashMap<>(otherConfig);
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    //kafkaParams.put("auto.commit.enable", true); // do not auto-commit offsets
    //      "fetch.message.max.bytes" ->
    //      "session.timeout.ms" -> "30000", // session timeout defaults to 30 seconds
    //      "heartbeat.interval.ms" -> "5000", // heartbeat period (original note: commit every 10 seconds)
    // Note: different streams must use different group.id values, otherwise offset commits fail.
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
    kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode); //largest   smallest

    //----get fromOffsets
    @SuppressWarnings("unchecked")
    scala.collection.immutable.Map<String, String> map = (scala.collection.immutable.Map<String, String>) Map$.MODULE$.apply(JavaConverters.mapAsScalaMapConverter(kafkaParams).asScala().toSeq());
    final KafkaCluster kafkaCluster = new KafkaCluster(map);
    Map<TopicAndPartition, Long> fromOffsets = getFromOffset(kafkaCluster, topics, groupId);

    //--- createDirectStream  DirectKafkaInputDStream.class
    org.apache.spark.api.java.function.Function<MessageAndMetadata<byte[], byte[]>, ConsumerRecord<byte[], byte[]>> messageHandler =
            mmd -> new ConsumerRecord<>(mmd.topic(), mmd.partition(), mmd.key(), mmd.message(), mmd.offset());
    @SuppressWarnings("unchecked")
    Class<ConsumerRecord<byte[], byte[]>> recordClass = (Class<ConsumerRecord<byte[], byte[]>>) ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class).runtimeClass();
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(ssc,
            byte[].class, byte[].class, DefaultDecoder.class, DefaultDecoder.class, recordClass,
            kafkaParams, fromOffsets,
            messageHandler
    );
    JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = settingCommit(inputStream, kafkaParams, kafkaCluster, groupId);

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return dStream
                .map(record -> {
                    return jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset());
                });
    }
    else {
        StructType structType = schemaToSparkType(context.getSchema());
        // The field names do not change between records, so resolve them once up front.
        String[] names = structType.names();
        return dStream
                .map(record -> {
                    Object[] values = new Object[names.length];
                    for (int i = 0; i < names.length; i++) {
                        // Every case ends with break: without it the value would fall through
                        // to the default branch and be overwritten with null.
                        switch (names[i]) {
                            case "_topic":
                                values[i] = record.topic();
                                break;
                            case "_message":
                                // A message may carry no payload; keep it null instead of throwing.
                                values[i] = record.value() == null ? null : new String(record.value(), UTF_8);
                                break;
                            case "_key":
                                // Keys are optional; keep a missing key as null instead of throwing.
                                values[i] = record.key() == null ? null : new String(record.key(), UTF_8);
                                break;
                            case "_partition":
                                values[i] = record.partition();
                                break;
                            case "_offset":
                                values[i] = record.offset();
                                break;
                            default:
                                values[i] = null;
                                break;
                        }
                    }
                    return (Row) new GenericRowWithSchema(values, structType);
                });  //.window(Duration(10 * 1000))
    }
}

Source File: StreamingRsvpsDStream.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

4 votes

/**
 * Entry point: consumes meetup RSVP records from Kafka, stores those whose payload does not
 * contain {@code "guests":0} in MongoDB, and commits the consumed offsets back to Kafka
 * asynchronously for every batch.
 *
 * @param args command line arguments (not used)
 * @throws InterruptedException if waiting for the streaming context to terminate is interrupted
 */
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES);
        sparkConf.setAppName(APPLICATION_NAME);
        sparkConf.set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI);

        final Duration batchInterval = new Duration(BATCH_DURATION_INTERVAL_MS);
        final JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, batchInterval);

        final JavaInputDStream<ConsumerRecord<String, String>> rsvpStream =
                KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES));

        // Keep only the RSVPs whose raw payload does not contain the zero-guests marker.
        JavaDStream<ConsumerRecord<String, String>> withGuests =
                rsvpStream.filter(rsvp -> !rsvp.value().contains("\"guests\":0"));

        // Persist every remaining record of each batch as a parsed document.
        withGuests.foreachRDD(batch -> {
            JavaRDD<Document> documents = batch.map(rsvp -> Document.parse(rsvp.value()));
            MongoSpark.save(documents);
        });

        // Commit the offset ranges of each batch of the original input stream.
        rsvpStream.foreachRDD(batch -> {
            HasOffsetRanges withRanges = (HasOffsetRanges) batch.rdd();
            OffsetRange[] ranges = withRanges.offsetRanges();

            CanCommitOffsets committer = (CanCommitOffsets) rsvpStream.inputDStream();
            committer.commitAsync(ranges, new MeetupOffsetCommitCallback());
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }

Source File: SparkMLTrainingAndScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

4 votes

/**
 * Entry point: consumes meetup RSVP records from Kafka, converts each one into a labeled
 * point string, trains a streaming logistic regression model on the resulting stream, prints
 * predictions for the same stream, and commits the consumed offsets back to Kafka.
 *
 * @param args command line arguments (not used)
 * @throws InterruptedException if waiting for the streaming context to terminate is interrupted
 */
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES);
        sparkConf.setAppName(APPLICATION_NAME);
        sparkConf.set("spark.sql.caseSensitive", CASE_SENSITIVE);

        Duration batchInterval = new Duration(BATCH_DURATION_INTERVAL_MS);
        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, batchInterval);

        JavaInputDStream<ConsumerRecord<String, String>> rsvpStream =
                KafkaUtils.createDirectStream(
                        streamingContext,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES));

        JavaDStream<String> rsvpPayloads = rsvpStream.map(rsvp -> rsvp.value());

        // Build the training data as strings of the form (y,[x1,x2,...,xn]),
        // where y is a binary label and n, the number of features, must be the
        // same for training and testing, e.g. "(response, [group_lat, group_long])".
        JavaDStream<String> trainData = rsvpPayloads.map(payload -> {
            JSONObject rsvp = (JSONObject) new JSONParser().parse(payload);

            String label = String.valueOf(rsvp.get("response")).equals("yes") ? "1.0" : "0.0";
            JSONObject group = (JSONObject) rsvp.get("group");

            return "(" + label + ",[" + group.get("group_lat") + "," + group.get("group_lon") + "])";
        });

        trainData.print();

        JavaDStream<LabeledPoint> labeledPoints = trainData.map(text -> LabeledPoint.parse(text));

        // Two features (latitude, longitude), so the model starts from two zero weights.
        StreamingLogisticRegressionWithSGD model = new StreamingLogisticRegressionWithSGD();
        model.setInitialWeights(Vectors.zeros(2));
        model.trainOn(labeledPoints);

        JavaPairDStream<Double, Vector> labelsWithFeatures =
                labeledPoints.mapToPair(point -> new Tuple2<>(point.label(), point.features()));

        model.predictOnValues(labelsWithFeatures).print();

        // Commit the offset ranges of each batch of the original input stream.
        rsvpStream.foreachRDD(batch -> {
            HasOffsetRanges withRanges = (HasOffsetRanges) batch.rdd();
            OffsetRange[] ranges = withRanges.offsetRanges();

            CanCommitOffsets committer = (CanCommitOffsets) rsvpStream.inputDStream();
            committer.commitAsync(ranges, new MeetupOffsetCommitCallback());
        });

        streamingContext.start();
        streamingContext.awaitTermination();
    }

org.apache.spark.streaming.api.java.JavaInputDStream Java Examples