org.apache.spark.streaming.api.java.JavaInputDStream Java Examples
The following examples show how to use
org.apache.spark.streaming.api.java.JavaInputDStream.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: StreamingRsvpsDStreamCountWindow.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 6 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI) .set("spark.streaming.kafka.consumer.cache.enabled", "false"); final JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); streamingContext.checkpoint(CHECKPOINT_FOLDER); final JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); // transformations, streaming algorithms, etc JavaDStream<Long> countStream = meetupStream.countByWindow( new Duration(WINDOW_LENGTH_MS), new Duration(SLIDING_INTERVAL_MS)); countStream.foreachRDD((JavaRDD<Long> countRDD) -> { MongoSpark.save( countRDD.map( r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}") ) ); }); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }
Example #2
Source File: StreamingEngine.java From spark-streaming-direct-kafka with Apache License 2.0 | 6 votes |
public void start() { SparkConf sparkConf = getSparkConf(); streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(Long.parseLong(config.getStreamingBatchIntervalInSec()))); JavaInputDStream<MessageAndMetadata<String, byte[]>> dStream = buildInputDStream(streamingContext); JavaPairDStream<String, byte[]> pairDStream = dStream.mapToPair(km -> new Tuple2<>(km.key(), km.message())); pairDStream.foreachRDD(new ProcessStreamingData<>(config)); // process data dStream.foreachRDD(new UpdateOffsetsFn<>(config.getKafkaGroupId(), config.getZkOffsetManager())); streamingContext.start(); }
Example #3
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test public void testEsRDDWriteIndexCreationDisabled() throws Exception { ExpectingToThrow expecting = expectingToThrow(EsHadoopIllegalArgumentException.class).from(ssc); Map<String, Object> doc1 = new HashMap<>(); doc1.put("one", null); Set<String> values = new HashSet<>(); values.add("2"); doc1.put("two", values); doc1.put("three", "."); Map<String, Object> doc2 = new HashMap<>(); doc2.put("OTP", "Otopeni"); doc2.put("SFO", "San Fran"); List<Map<String, Object>> docs = new ArrayList<>(); docs.add(doc1); docs.add(doc2); String target = wrapIndex(resource("spark-test-nonexisting-scala-basic-write", "data", version)); Map<String, String> localConf = new HashMap<>(cfg); localConf.put(ES_INDEX_AUTO_CREATE, "no"); JavaRDD<Map<String, Object>> batch = sc.parallelize(docs); Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>(); rddQueue.add(batch); JavaInputDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue, true); // apply closure JavaEsSparkStreaming.saveToEs(dstream, target, localConf); ssc.start(); TimeUnit.SECONDS.sleep(2); // Let the processing happen ssc.stop(false, true); assertTrue(!RestUtils.exists(target)); expecting.assertExceptionFound(); }
Example #4
Source File: AbstractJavaEsSparkStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test public void testEsRDDWriteIndexCreationDisabled() throws Exception { ExpectingToThrow expecting = expectingToThrow(EsHadoopIllegalArgumentException.class).from(ssc); Map<String, Object> doc1 = new HashMap<>(); doc1.put("one", null); Set<String> values = new HashSet<>(); values.add("2"); doc1.put("two", values); doc1.put("three", "."); Map<String, Object> doc2 = new HashMap<>(); doc2.put("OTP", "Otopeni"); doc2.put("SFO", "San Fran"); List<Map<String, Object>> docs = new ArrayList<>(); docs.add(doc1); docs.add(doc2); String target = wrapIndex(resource("spark-test-nonexisting-scala-basic-write", "data", version)); Map<String, String> localConf = new HashMap<>(cfg); localConf.put(ES_INDEX_AUTO_CREATE, "no"); JavaRDD<Map<String, Object>> batch = sc.parallelize(docs); Queue<JavaRDD<Map<String, Object>>> rddQueue = new LinkedList<>(); rddQueue.add(batch); JavaInputDStream<Map<String, Object>> dstream = ssc.queueStream(rddQueue, true); // apply closure JavaEsSparkStreaming.saveToEs(dstream, target, localConf); ssc.start(); TimeUnit.SECONDS.sleep(2); // Let the processing happen ssc.stop(false, true); assertTrue(!RestUtils.exists(target)); expecting.assertExceptionFound(); }
Example #5
Source File: AbstractSparkLayer.java From oryx with Apache License 2.0 | 5 votes |
protected final JavaInputDStream<ConsumerRecord<K,M>> buildInputDStream( JavaStreamingContext streamingContext) { Preconditions.checkArgument( KafkaUtils.topicExists(inputTopicLockMaster, inputTopic), "Topic %s does not exist; did you create it?", inputTopic); if (updateTopic != null && updateTopicLockMaster != null) { Preconditions.checkArgument( KafkaUtils.topicExists(updateTopicLockMaster, updateTopic), "Topic %s does not exist; did you create it?", updateTopic); } String groupID = getGroupID(); Map<String,Object> kafkaParams = new HashMap<>(); kafkaParams.put("group.id", groupID); // Don't re-consume old messages from input by default kafkaParams.put("auto.offset.reset", "latest"); // Ignored by Kafka 0.10 Spark integration kafkaParams.put("bootstrap.servers", inputBroker); kafkaParams.put("key.deserializer", keyDecoderClass.getName()); kafkaParams.put("value.deserializer", messageDecoderClass.getName()); LocationStrategy locationStrategy = LocationStrategies.PreferConsistent(); ConsumerStrategy<K,M> consumerStrategy = ConsumerStrategies.Subscribe( Collections.singleton(inputTopic), kafkaParams, Collections.emptyMap()); return org.apache.spark.streaming.kafka010.KafkaUtils.createDirectStream( streamingContext, locationStrategy, consumerStrategy); }
Example #6
Source File: MapRStreaming22Binding.java From datacollector with Apache License 2.0 | 5 votes |
/**
 * Creates the direct stream for the configured topic and registers it with the driver.
 * With SDC checkpointing the stream is assigned explicit partitions and starting offsets;
 * otherwise it subscribes to the topic.
 *
 * @param result streaming context the stream is created in
 * @param props consumer properties; this method adds entries to the map
 * @return the same {@code result} context
 */
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  List<String> topics = ImmutableList.of(topic);

  boolean hasAutoOffset = !autoOffsetValue.isEmpty();
  if (hasAutoOffset) {
    props.put(SparkStreamingBinding.AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
  if (!offsetHelper.isSDCCheckPointing()) {
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props));
  } else {
    // Resume from the offsets recorded by the offset manager.
    Map<TopicPartition, Long> startOffsets =
        MaprStreamsOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    List<TopicPartition> partitions = new ArrayList<TopicPartition>(startOffsets.keySet());
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Assign(partitions, props, startOffsets));
  }

  Driver$.MODULE$.foreach(stream.dstream(), MaprStreamsOffsetManagerImpl.get());
  return result;
}
Example #7
Source File: Kafka010SparkStreamingBinding.java From datacollector with Apache License 2.0 | 5 votes |
/**
 * Creates the Kafka direct stream for the configured topic and registers it with the driver.
 * With SDC checkpointing the stream is assigned explicit partitions and starting offsets;
 * otherwise it subscribes to the topic.
 *
 * @param result streaming context the stream is created in
 * @param props consumer properties; this method adds entries to the map
 * @return the same {@code result} context
 */
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  props.put("bootstrap.servers", metaDataBrokerList);

  boolean hasAutoOffset = !autoOffsetValue.isEmpty();
  if (hasAutoOffset) {
    // The resolved value is stored back into the field before it is added to the properties.
    autoOffsetValue = getConfigurableAutoOffsetResetIfNonEmpty(autoOffsetValue);
    props.put(AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  List<String> topics = ImmutableList.of(topic);

  JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream;
  if (!offsetHelper.isSDCCheckPointing()) {
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topics, props));
  } else {
    // Resume from the offsets recorded by the offset manager.
    Map<TopicPartition, Long> startOffsets =
        KafkaOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    List<TopicPartition> partitions = new ArrayList<TopicPartition>(startOffsets.keySet());
    stream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Assign(partitions, props, startOffsets));
  }

  Driver$.MODULE$.foreach(stream.dstream(), KafkaOffsetManagerImpl.get());
  return result;
}
Example #8
Source File: AbstractSparkLayer.java From spark-streaming-direct-kafka with Apache License 2.0 | 5 votes |
public JavaInputDStream<MessageAndMetadata<String,byte[]>> buildInputDStream( JavaStreamingContext streamingContext) { HashMap<String, String> kafkaParams = config.getKafkaParams(); // Ugly compiler-pleasing acrobatics: @SuppressWarnings("unchecked") Class<MessageAndMetadata<String, byte[]>> streamClass = (Class<MessageAndMetadata<String, byte[]>>) (Class<?>) MessageAndMetadata.class; if (!KafkaManager.topicExists(config.getZkKafka(), config.getTopic())) { throw new RuntimeException("Topic does not exist on server"); } Map<TopicAndPartition, Long> seedOffsetsMap = KafkaManager.getOffsets(config.getZkKafka(), config.getZkOffsetManager(), config.getKafkaGroupId(), config.getTopic(), config.getKafkaParams()); // TODO: try generics, instead of hardcoded values JavaInputDStream<MessageAndMetadata<String, byte[]>> dStream = org.apache.spark.streaming.kafka.KafkaUtils.createDirectStream( streamingContext, String.class, // change as necessary byte[].class, // change as necessary StringDecoder.class, DefaultDecoder.class, streamClass, kafkaParams, seedOffsetsMap, Functions.<MessageAndMetadata<String, byte[]>>identity()); return dStream; }
Example #9
Source File: MapRStreamingBinding.java From datacollector with Apache License 2.0 | 5 votes |
/**
 * Builds the direct stream for the configured topic and hands it to the driver together
 * with the offset manager. With SDC checkpointing, partitions and starting offsets are
 * assigned explicitly; otherwise the stream subscribes to the topic.
 *
 * @param result streaming context the stream is created in
 * @param props consumer properties; this method adds entries to the map
 * @return the same {@code result} context
 */
@Override
public JavaStreamingContext createDStream(JavaStreamingContext result, Map<String, Object> props) {
  List<String> topicList = ImmutableList.of(topic);

  if (autoOffsetValue.isEmpty()) {
    // No auto-offset-reset override configured; leave props untouched here.
  } else {
    props.put(SparkStreamingBinding.AUTO_OFFSET_RESET, autoOffsetValue);
  }
  props.putAll(extraKafkaConfigs);

  JavaInputDStream<ConsumerRecord<byte[], byte[]>> directStream;
  if (offsetHelper.isSDCCheckPointing()) {
    Map<TopicPartition, Long> offsetsByPartition =
        MaprStreamsOffsetManagerImpl.get().getOffsetForDStream(topic, numberOfPartitions);
    List<TopicPartition> assignedPartitions =
        new ArrayList<TopicPartition>(offsetsByPartition.keySet());
    directStream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Assign(assignedPartitions, props, offsetsByPartition));
  } else {
    directStream = KafkaUtils.createDirectStream(
        result,
        LocationStrategies.PreferConsistent(),
        ConsumerStrategies.<byte[], byte[]>Subscribe(topicList, props));
  }

  Driver$.MODULE$.foreach(directStream.dstream(), MaprStreamsOffsetManagerImpl.get());
  return result;
}
Example #10
Source File: WordCountingAppWithCheckpoint.java From tutorials with MIT License | 4 votes |
/**
 * Entry point: reads lines from the "messages" Kafka topic, keeps a running count per word
 * using checkpointed state, and writes every updated count to Cassandra.
 *
 * @param args command-line arguments; not read by this method
 * @throws InterruptedException if waiting for the streaming context to terminate is interrupted
 */
public static void main(String[] args) throws InterruptedException {
    Logger.getLogger("org").setLevel(Level.OFF);
    Logger.getLogger("akka").setLevel(Level.OFF);

    Map<String, Object> consumerConfig = new HashMap<>();
    consumerConfig.put("bootstrap.servers", "localhost:9092");
    consumerConfig.put("key.deserializer", StringDeserializer.class);
    consumerConfig.put("value.deserializer", StringDeserializer.class);
    consumerConfig.put("group.id", "use_a_separate_group_id_for_each_stream");
    consumerConfig.put("auto.offset.reset", "latest");
    consumerConfig.put("enable.auto.commit", false);

    Collection<String> topics = Arrays.asList("messages");

    SparkConf sparkConf = new SparkConf();
    sparkConf.setMaster("local[2]");
    sparkConf.setAppName("WordCountingAppWithCheckpoint");
    sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

    JavaStreamingContext streamingContext =
        new JavaStreamingContext(sparkConf, Durations.seconds(1));
    sparkContext = streamingContext.sparkContext();
    streamingContext.checkpoint("./.checkpoint");

    JavaInputDStream<ConsumerRecord<String, String>> messages =
        KafkaUtils.createDirectStream(
            streamingContext,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<String, String> Subscribe(topics, consumerConfig));

    // record -> (key, value) -> value -> individual words
    JavaPairDStream<String, String> keyedLines =
        messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));
    JavaDStream<String> lines = keyedLines.map(pair -> pair._2());
    JavaDStream<String> words =
        lines.flatMap(line -> Arrays.asList(line.split("\\s+")).iterator());

    // Per-batch counts.
    JavaPairDStream<String, Integer> ones = words.mapToPair(word -> new Tuple2<>(word, 1));
    JavaPairDStream<String, Integer> wordCounts =
        ones.reduceByKey((Function2<Integer, Integer, Integer>) (left, right) -> left + right);

    // Running totals: add this batch's count to the total kept in state.
    JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts =
        wordCounts.mapWithState(StateSpec.function((word, one, state) -> {
            int previousTotal = 0;
            if (state.exists()) {
                previousTotal = state.get();
            }
            int sum = one.orElse(0) + previousTotal;
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        }));

    // Each updated (word, total) pair is collected and saved as its own one-element RDD.
    cumulativeWordCounts.foreachRDD(batch -> {
        List<Tuple2<String, Integer>> totals = batch.collect();
        for (Tuple2<String, Integer> total : totals) {
            List<Word> singleWord = Arrays.asList(new Word(total._1, total._2));
            JavaRDD<Word> wordRdd = sparkContext.parallelize(singleWord);
            javaFunctions(wordRdd)
                .writerBuilder("vocabulary", "words", mapToRow(Word.class))
                .saveToCassandra();
        }
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}
Example #11
Source File: SparkRunner.java From jaeger-analytics-java with Apache License 2.0 | 4 votes |
public static void main(String []args) throws InterruptedException, IOException { HTTPServer server = new HTTPServer(Integer.valueOf(getPropOrEnv("PROMETHEUS_PORT", "9111"))); SparkConf sparkConf = new SparkConf() .setAppName("Trace DSL") .setMaster(getPropOrEnv("SPARK_MASTER","local[*]")); JavaSparkContext sc = new JavaSparkContext(sparkConf); JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(Integer.parseInt(getPropOrEnv("SPARK_STREAMING_BATCH_DURATION", "5000")))); Set<String> topics = Collections.singleton(getPropOrEnv("KAFKA_JAEGER_TOPIC", "jaeger-spans")); Map<String, Object> kafkaParams = new HashMap<>(); kafkaParams.put("bootstrap.servers", getPropOrEnv("KAFKA_BOOTSTRAP_SERVER", "localhost:9092")); kafkaParams.put("key.deserializer", StringDeserializer.class); kafkaParams.put("value.deserializer", ProtoSpanDeserializer.class); // hack to start always from beginning kafkaParams.put("group.id", "jaeger-trace-aggregation-" + System.currentTimeMillis()); if (Boolean.parseBoolean(getPropOrEnv("KAFKA_START_FROM_BEGINNING", "true"))) { kafkaParams.put("auto.offset.reset", "earliest"); kafkaParams.put("enable.auto.commit", false); kafkaParams.put("startingOffsets", "earliest"); } JavaInputDStream<ConsumerRecord<String, Span>> messages = KafkaUtils.createDirectStream( ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topics, kafkaParams)); JavaPairDStream<String, Span> traceIdSpanTuple = messages.mapToPair(record -> { return new Tuple2<>(record.value().traceId, record.value()); }); JavaDStream<Trace> tracesStream = traceIdSpanTuple.groupByKey().map(traceIdSpans -> { System.out.printf("traceID: %s\n", traceIdSpans._1); Iterable<Span> spans = traceIdSpans._2(); Trace trace = new Trace(); trace.traceId = traceIdSpans._1(); trace.spans = StreamSupport.stream(spans.spliterator(), false) .collect(Collectors.toList()); return trace; }); MinimumClientVersion minimumClientVersion = MinimumClientVersion.builder() 
.withJavaVersion(getPropOrEnv("TRACE_QUALITY_JAVA_VERSION", "1.0.0")) .withGoVersion(getPropOrEnv("TRACE_QUALITY_GO_VERSION", "2.22.0")) .withNodeVersion(getPropOrEnv("TRACE_QUALITY_NODE_VERSION", "3.17.1")) .withPythonVersion(getPropOrEnv("TRACE_QUALITY_PYTHON_VERSION", "4.0.0")) .build(); List<ModelRunner> modelRunner = Arrays.asList( new TraceHeight(), new ServiceDepth(), new ServiceHeight(), new NetworkLatency(), new NumberOfErrors(), new DirectDependencies(), // trace quality minimumClientVersion, new HasClientServerSpans(), new UniqueSpanId()); tracesStream.foreachRDD((traceRDD, time) -> { traceRDD.foreach(trace -> { Graph graph = GraphCreator.create(trace); for (ModelRunner model: modelRunner) { model.runWithMetrics(graph); } }); }); ssc.start(); ssc.awaitTermination(); }
Example #12
Source File: BatchLayer.java From oryx with Apache License 2.0 | 4 votes |
public synchronized void start() { String id = getID(); if (id != null) { log.info("Starting Batch Layer {}", id); } streamingContext = buildStreamingContext(); JavaSparkContext sparkContext = streamingContext.sparkContext(); Configuration hadoopConf = sparkContext.hadoopConfiguration(); Path checkpointPath = new Path(new Path(modelDirString), ".checkpoint"); log.info("Setting checkpoint dir to {}", checkpointPath); sparkContext.setCheckpointDir(checkpointPath.toString()); log.info("Creating message stream from topic"); JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext); JavaPairDStream<K,M> pairDStream = kafkaDStream.mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.value())); Class<K> keyClass = getKeyClass(); Class<M> messageClass = getMessageClass(); pairDStream.foreachRDD( new BatchUpdateFunction<>(getConfig(), keyClass, messageClass, keyWritableClass, messageWritableClass, dataDirString, modelDirString, loadUpdateInstance(), streamingContext)); // "Inline" saveAsNewAPIHadoopFiles to be able to skip saving empty RDDs pairDStream.foreachRDD(new SaveToHDFSFunction<>( dataDirString + "/oryx", "data", keyClass, messageClass, keyWritableClass, messageWritableClass, hadoopConf)); // Must use the raw Kafka stream to get offsets kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster())); if (maxDataAgeHours != NO_MAX_AGE) { pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf, dataDirString, Pattern.compile("-(\\d+)\\."), maxDataAgeHours)); } if (maxModelAgeHours != NO_MAX_AGE) { pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf, modelDirString, Pattern.compile("(\\d+)"), maxModelAgeHours)); } log.info("Starting Spark Streaming"); streamingContext.start(); }
Example #13
Source File: SpeedLayer.java From oryx with Apache License 2.0 | 4 votes |
public synchronized void start() { String id = getID(); if (id != null) { log.info("Starting Speed Layer {}", id); } streamingContext = buildStreamingContext(); log.info("Creating message stream from topic"); JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext); JavaPairDStream<K,M> pairDStream = kafkaDStream.mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.value())); KafkaConsumer<String,U> consumer = new KafkaConsumer<>( ConfigUtils.keyValueToProperties( "group.id", "OryxGroup-" + getLayerName() + '-' + UUID.randomUUID(), "bootstrap.servers", updateBroker, "max.partition.fetch.bytes", maxMessageSize, "key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer", "value.deserializer", updateDecoderClass.getName(), // Do start from the beginning of the update queue "auto.offset.reset", "earliest" )); consumer.subscribe(Collections.singletonList(updateTopic)); consumerIterator = new ConsumeDataIterator<>(consumer); modelManager = loadManagerInstance(); Configuration hadoopConf = streamingContext.sparkContext().hadoopConfiguration(); new Thread(LoggingCallable.log(() -> { try { modelManager.consume(consumerIterator, hadoopConf); } catch (Throwable t) { log.error("Error while consuming updates", t); close(); } }).asRunnable(), "OryxSpeedLayerUpdateConsumerThread").start(); pairDStream.foreachRDD(new SpeedLayerUpdate<>(modelManager, updateBroker, updateTopic)); // Must use the raw Kafka stream to get offsets kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster())); log.info("Starting Spark Streaming"); streamingContext.start(); }
Example #14
Source File: SimpleSparkStructuredKafkaStreamingCounter.java From jMetalSP with MIT License | 4 votes |
@Override public void run() { ConsumerStrategy<Integer,Integer> consumerStrategy =ConsumerStrategies.Subscribe(topic,kafkaParams); LocationStrategy locationStrategy = LocationStrategies.PreferConsistent(); JavaInputDStream<ConsumerRecord<Integer,Integer>> stream= (JavaInputDStream<ConsumerRecord<Integer,Integer>>) KafkaUtils.createDirectStream(streamingContext, locationStrategy, consumerStrategy); JavaDStream<Integer> time=stream.map(value -> value.value() ); /*time.foreachRDD(numbers-> { numbers.foreach(value-> { System.out.println("Pruebas----> " + value); observable.setChanged(); observable.notifyObservers(new SingleObservedData<Integer>(value)); }); } );*/ time.foreachRDD(numbers -> { Integer cont = numbers.reduce((key, value) -> value); //System.out.println("Pruebas----> " + cont); observable.setChanged(); observable.notifyObservers(new ObservedValue<Integer>(cont)); }); // stream.foreachRDD((consumerRecordJavaRDD, time) -> consumerRecordJavaRDD.foreach(integer -> { //observable.setChanged(); //observable.notifyObservers(new SingleObservedData<Integer>(integer.value())); // System.out.println("Pruebas----> "+integer.value()); // })); }
Example #15
Source File: SimpleSparkStructuredKafkaStreamingCounterAVRO.java From jMetalSP with MIT License | 4 votes |
@Override public void run() { ConsumerStrategy<Integer,byte[]> consumerStrategy =ConsumerStrategies.Subscribe(topic,kafkaParams); LocationStrategy locationStrategy = LocationStrategies.PreferConsistent(); JavaInputDStream<ConsumerRecord<Integer,byte []>> stream= (JavaInputDStream<ConsumerRecord<Integer,byte[]>>) KafkaUtils.createDirectStream(streamingContext, locationStrategy, consumerStrategy); JavaDStream<Integer> time=stream.map(value -> { DataDeserializer<Counter> dataDeserializer = new DataDeserializer<>(); //Object o =dataDeserializer.deserialize(value.value(),"avsc/Counter.avsc"); //GenericData.Record rc=(GenericData.Record)o; Counter counter = dataDeserializer.deserialize(value.value(),"avsc/Counter.avsc"); //Counter counter = (Counter) dataDeserializer.deserialize(value.value(),"avsc/Counter.avsc"); //return (Integer) rc.get(0); return (Integer) counter.get(0); }); /*time.foreachRDD(numbers-> { numbers.foreach(value-> { System.out.println("Pruebas----> " + value); observable.setChanged(); observable.notifyObservers(new SingleObservedData<Integer>(value)); }); } );*/ time.foreachRDD(numbers -> { Integer cont = numbers.reduce((key, value) -> value); System.out.println("Pruebas----> " + cont); observable.setChanged(); observable.notifyObservers(new ObservedValue<Integer>(cont)); }); // stream.foreachRDD((consumerRecordJavaRDD, time) -> consumerRecordJavaRDD.foreach(integer -> { //observable.setChanged(); //observable.notifyObservers(new SingleObservedData<Integer>(integer.value())); // System.out.println("Pruebas----> "+integer.value()); // })); }
Example #16
Source File: KafkaExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) { //Window Specific property if Hadoop is not instaalled or HADOOP_HOME is not set System.setProperty("hadoop.home.dir", "E:\\hadoop"); //Logger rootLogger = LogManager.getRootLogger(); //rootLogger.setLevel(Level.WARN); SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]"); JavaSparkContext sc = new JavaSparkContext(conf); JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2)); streamingContext.checkpoint("E:\\hadoop\\checkpoint"); Logger rootLogger = LogManager.getRootLogger(); rootLogger.setLevel(Level.WARN); Map<String, Object> kafkaParams = new HashMap<>(); kafkaParams.put("bootstrap.servers", "10.0.75.1:9092"); kafkaParams.put("key.deserializer", StringDeserializer.class); kafkaParams.put("value.deserializer", StringDeserializer.class); kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea"); kafkaParams.put("auto.offset.reset", "latest"); // kafkaParams.put("enable.auto.commit", false); Collection<String> topics = Arrays.asList("mytopic", "anothertopic"); final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(streamingContext,LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)); JavaPairDStream<String, String> pairRDD = stream.mapToPair(record-> new Tuple2<>(record.key(), record.value())); pairRDD.foreachRDD(pRDD-> { pRDD.foreach(tuple-> System.out.println(new Date()+" :: Kafka msg key ::"+tuple._1() +" the val is ::"+tuple._2()));}); JavaDStream<String> tweetRDD = pairRDD.map(x-> x._2()).map(new TweetText()); tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" :: "+x))); JavaDStream<String> hashtagRDD = tweetRDD.flatMap(twt-> Arrays.stream(twt.split(" ")).filter(str-> str.contains("#")).collect(Collectors.toList()).iterator() ); hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(x))); JavaPairDStream<String, Long> 
cntByVal = hashtagRDD.countByValue(); cntByVal.foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The count tag is ::"+x._1() +" and the val is ::"+x._2()))); /* hashtagRDD.window(Durations.seconds(60), Durations.seconds(30)) .countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); hashtagRDD.countByValueAndWindow(Durations.seconds(60), Durations.seconds(30)) .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println("The window&count tag is ::"+x._1() +" and the val is ::"+x._2()))); */ hashtagRDD.window(Durations.minutes(8)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); hashtagRDD.window(Durations.minutes(8),Durations.minutes(2)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); hashtagRDD.window(Durations.minutes(12),Durations.minutes(8)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); hashtagRDD.window(Durations.minutes(2),Durations.minutes(2)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); hashtagRDD.window(Durations.minutes(12),Durations.minutes(12)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2()))); /*hashtagRDD.window(Durations.minutes(5),Durations.minutes(2)).countByValue() .foreachRDD(tRDD -> tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/ /* hashtagRDD.window(Durations.minutes(10),Durations.minutes(1)).countByValue() .foreachRDD(tRDD -> 
tRDD.foreach(x->System.out.println(new Date()+" ::The window count tag is ::"+x._1() +" and the val is ::"+x._2())));*/ streamingContext.start(); try { streamingContext.awaitTermination(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
Example #17
Source File: KafkaSource.java From sylph with Apache License 2.0 | 4 votes |
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig config, SourceContext context) { String topics = config.getTopics(); String brokers = config.getBrokers(); //需要把集群的host 配置到程序所在机器 String groupId = config.getGroupid(); //消费者的名字 String offsetMode = config.getOffsetMode(); Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig()); kafkaParams.put("bootstrap.servers", brokers); kafkaParams.put("key.deserializer", ByteArrayDeserializer.class); //StringDeserializer kafkaParams.put("value.deserializer", ByteArrayDeserializer.class); //StringDeserializer kafkaParams.put("enable.auto.commit", false); //不自动提交偏移量 // "fetch.message.max.bytes" -> // "session.timeout.ms" -> "30000", //session默认是30秒 // "heartbeat.interval.ms" -> "5000", //10秒提交一次 心跳周期 kafkaParams.put("group.id", groupId); //注意不同的流 group.id必须要不同 否则会出现offect commit提交失败的错误 kafkaParams.put("auto.offset.reset", offsetMode); //latest earliest List<String> topicSets = Arrays.asList(topics.split(",")); JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream( ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topicSets, kafkaParams)); DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream()) { @Override public void commitOffsets(RDD<?> kafkaRdd) { OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges(); log().info("commitKafkaOffsets {}", (Object) offsetRanges); DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream()); ((CanCommitOffsets) firstDStream).commitAsync(offsetRanges); } }; JavaDStream<ConsumerRecord<byte[], byte[]>> javaDStream = new JavaDStream<>(sylphKafkaOffset, ClassTag$.MODULE$.apply(ConsumerRecord.class)); if ("json".equalsIgnoreCase(config.getValueType())) { JsonSchema jsonParser = new JsonSchema(context.getSchema()); return javaDStream .map(record -> 
jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset())); } else { List<String> names = context.getSchema().getFieldNames(); return javaDStream .map(record -> { Object[] values = new Object[names.size()]; for (int i = 0; i < names.size(); i++) { switch (names.get(i)) { case "_topic": values[i] = record.topic(); continue; case "_message": values[i] = new String(record.value(), UTF_8); continue; case "_key": values[i] = record.key() == null ? null : new String(record.key(), UTF_8); continue; case "_partition": values[i] = record.partition(); continue; case "_offset": values[i] = record.offset(); case "_timestamp": values[i] = record.timestamp(); case "_timestampType": values[i] = record.timestampType().id; default: values[i] = null; } } return new GenericRow(values); //GenericRowWithSchema }); //.window(Duration(10 * 1000)) } }
Example #18
Source File: KafkaSource08.java From sylph with Apache License 2.0 | 4 votes |
private static JavaDStream<ConsumerRecord<byte[], byte[]>> settingCommit( JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream, Map<String, String> kafkaParams, KafkaCluster kafkaCluster, String groupId) { if (kafkaParams.getOrDefault("auto.commit.enable", "true").equals("false")) { return inputStream; } int commitInterval = Integer.parseInt(kafkaParams.getOrDefault(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "90000")); DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset = new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream()) { private final KafkaOffsetCommitter kafkaOffsetCommitter = new KafkaOffsetCommitter( kafkaCluster, groupId, commitInterval); @Override public void initialize(Time time) { super.initialize(time); kafkaOffsetCommitter.setName("Kafka_Offset_Committer"); kafkaOffsetCommitter.start(); } @Override public void commitOffsets(RDD<?> kafkaRdd) { OffsetRange[] offsets = ((HasOffsetRanges) kafkaRdd).offsetRanges(); // Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets) // .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset())); //log().info("commit Kafka Offsets {}", internalOffsets); kafkaOffsetCommitter.addAll(offsets); } }; JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = new JavaDStream<>( sylphKafkaOffset, ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class)); return dStream; // inputStream = inputStream.transform(rdd -> { // OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); // Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets) // .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset())); // commitKafkaOffsets(kafkaCluster, groupId, internalOffsets); // return rdd; // }); }
Example #19
Source File: KafkaSource08.java From sylph with Apache License 2.0 | 4 votes |
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig08 config, SourceContext context) { String topics = requireNonNull(config.getTopics(), "topics not setting"); String brokers = requireNonNull(config.getBrokers(), "brokers not setting"); //需要把集群的host 配置到程序所在机器 String groupId = requireNonNull(config.getGroupid(), "group.id not setting"); //消费者的名字 String offsetMode = requireNonNull(config.getOffsetMode(), "offsetMode not setting"); Map<String, String> otherConfig = config.getOtherConfig().entrySet() .stream() .filter(x -> x.getValue() != null) .collect(Collectors.toMap(Map.Entry::getKey, v -> v.getValue().toString())); Map<String, String> kafkaParams = new HashMap<>(otherConfig); kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers); //kafkaParams.put("auto.commit.enable", true); //不自动提交偏移量 // "fetch.message.max.bytes" -> // "session.timeout.ms" -> "30000", //session默认是30秒 // "heartbeat.interval.ms" -> "5000", //10秒提交一次 心跳周期 kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); //注意不同的流 group.id必须要不同 否则会出现offect commit提交失败的错误 kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, offsetMode); //largest smallest //----get fromOffsets @SuppressWarnings("unchecked") scala.collection.immutable.Map<String, String> map = (scala.collection.immutable.Map<String, String>) Map$.MODULE$.apply(JavaConverters.mapAsScalaMapConverter(kafkaParams).asScala().toSeq()); final KafkaCluster kafkaCluster = new KafkaCluster(map); Map<TopicAndPartition, Long> fromOffsets = getFromOffset(kafkaCluster, topics, groupId); //--- createDirectStream DirectKafkaInputDStream.class org.apache.spark.api.java.function.Function<MessageAndMetadata<byte[], byte[]>, ConsumerRecord<byte[], byte[]>> messageHandler = mmd -> new ConsumerRecord<>(mmd.topic(), mmd.partition(), mmd.key(), mmd.message(), mmd.offset()); @SuppressWarnings("unchecked") Class<ConsumerRecord<byte[], byte[]>> recordClass = (Class<ConsumerRecord<byte[], byte[]>>) 
ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class).runtimeClass(); JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(ssc, byte[].class, byte[].class, DefaultDecoder.class, DefaultDecoder.class, recordClass, kafkaParams, fromOffsets, messageHandler ); JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = settingCommit(inputStream, kafkaParams, kafkaCluster, groupId); if ("json".equalsIgnoreCase(config.getValueType())) { JsonSchema jsonParser = new JsonSchema(context.getSchema()); return dStream .map(record -> { return jsonParser.deserialize(record.key(), record.value(), record.topic(), record.partition(), record.offset()); }); } else { StructType structType = schemaToSparkType(context.getSchema()); return dStream .map(record -> { String[] names = structType.names(); Object[] values = new Object[names.length]; for (int i = 0; i < names.length; i++) { switch (names[i]) { case "_topic": values[i] = record.topic(); continue; case "_message": values[i] = new String(record.value(), UTF_8); continue; case "_key": values[i] = new String(record.key(), UTF_8); continue; case "_partition": values[i] = record.partition(); continue; case "_offset": values[i] = record.offset(); default: values[i] = null; } } return (Row) new GenericRowWithSchema(values, structType); }); //.window(Duration(10 * 1000)) } }
Example #20
Source File: StreamingRsvpsDStream.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI); final JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); final JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); // transformations, streaming algorithms, etc JavaDStream<ConsumerRecord<String, String>> rsvpsWithGuestsStream = meetupStream.filter(f -> !f.value().contains("\"guests\":0")); rsvpsWithGuestsStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> r) -> { MongoSpark.save( r.map( e -> Document.parse(e.value()) ) ); }); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }
Example #21
Source File: SparkMLTrainingAndScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException { System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE); final SparkConf conf = new SparkConf() .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES) .setAppName(APPLICATION_NAME) .set("spark.sql.caseSensitive", CASE_SENSITIVE); JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS)); JavaInputDStream<ConsumerRecord<String, String>> meetupStream = KafkaUtils.createDirectStream( streamingContext, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES) ); JavaDStream<String> meetupStreamValues = meetupStream.map(v -> { return v.value(); }); // Prepare the training data as strings of type: (y,[x1,x2,x3,...,xn]) // Where n is the number of features, y is a binary label, // and n must be the same for train and test. // e.g. "(response, [group_lat, group_long])"; JavaDStream<String> trainData = meetupStreamValues.map(e -> { JSONParser jsonParser = new JSONParser(); JSONObject json = (JSONObject)jsonParser.parse(e); String result = "(" + (String.valueOf(json.get("response")).equals("yes") ? 
"1.0,[":"0.0,[") + ((JSONObject)json.get("group")).get("group_lat") + "," + ((JSONObject)json.get("group")).get("group_lon") + "])"; return result; }); trainData.print(); JavaDStream<LabeledPoint> labeledPoints = trainData.map(LabeledPoint::parse); StreamingLogisticRegressionWithSGD streamingLogisticRegressionWithSGD = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(2)); streamingLogisticRegressionWithSGD.trainOn(labeledPoints); JavaPairDStream<Double, Vector> values = labeledPoints.mapToPair(f -> new Tuple2<>(f.label(), f.features())); streamingLogisticRegressionWithSGD.predictOnValues(values).print(); // some time later, after outputs have completed meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> { OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges(); ((CanCommitOffsets) meetupStream.inputDStream()) .commitAsync(offsetRanges, new MeetupOffsetCommitCallback()); }); streamingContext.start(); streamingContext.awaitTermination(); }