Java Code Examples for org.apache.spark.streaming.api.java.JavaDStream#foreachRDD()
The following examples show how to use
org.apache.spark.streaming.api.java.JavaDStream#foreachRDD().
You can go to the original project or source file by following the links above each example.
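Before the project examples, here is a minimal sketch of the foreachRDD pattern: register an output operation that receives each micro-batch as a JavaRDD and runs ordinary driver-side RDD code against it. This sketch is not taken from any of the projects below; the socket host, port, and batch interval are placeholder values.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class ForeachRddSketch {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ForeachRddSketch");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

    // Placeholder source: lines of text from a local socket (e.g. started with `nc -lk 9999`).
    JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999);
    JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());

    // foreachRDD is an output operation: the function runs on the driver once per batch,
    // and the JavaRDD it receives can be used with any regular RDD action.
    words.foreachRDD((JavaRDD<String> rdd) -> {
      long count = rdd.count();
      System.out.println("Words in this batch: " + count);
    });

    jssc.start();
    jssc.awaitTermination();
  }
}

The examples that follow apply the same pattern to real sinks (MongoDB, Parquet, SQL views) and to Kafka offset management.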
Example 1
Source File: StreamingRsvpsDStreamCountWindow.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 6 votes |
public static void main(String[] args) throws InterruptedException {
  System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

  final SparkConf conf = new SparkConf()
      .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
      .setAppName(APPLICATION_NAME)
      .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI)
      .set("spark.streaming.kafka.consumer.cache.enabled", "false");

  final JavaStreamingContext streamingContext =
      new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS));

  streamingContext.checkpoint(CHECKPOINT_FOLDER);

  final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
      KafkaUtils.createDirectStream(
          streamingContext,
          LocationStrategies.PreferConsistent(),
          ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
      );

  // transformations, streaming algorithms, etc
  JavaDStream<Long> countStream = meetupStream.countByWindow(
      new Duration(WINDOW_LENGTH_MS), new Duration(SLIDING_INTERVAL_MS));

  countStream.foreachRDD((JavaRDD<Long> countRDD) -> {
    MongoSpark.save(
        countRDD.map(
            r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}")
        )
    );
  });

  // some time later, after outputs have completed
  meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
    OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

    ((CanCommitOffsets) meetupStream.inputDStream())
        .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
  });

  streamingContext.start();
  streamingContext.awaitTermination();
}
Example 2
Source File: StreamingIngestionFileSystemTextFileToDataframeMultipleClassesApp.java From net.jgp.labs.spark with Apache License 2.0 | 6 votes |
private void start() {
  // Create a local StreamingContext with two working threads and a batch interval of 5 seconds
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new RowProcessor());

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
Example 3
Source File: SimpleSparkStreamingCounterDataSource.java From jMetalSP with MIT License | 6 votes |
@Override
public void run() {
  JMetalLogger.logger.info("Run method in the streaming data source invoked");
  JMetalLogger.logger.info("Directory: " + directoryName);

  JavaDStream<Integer> time = streamingContext
      .textFileStream(directoryName)
      .map(line -> Integer.parseInt(line));

  time.foreachRDD(numbers -> {
    if (numbers != null && numbers.rdd().count() > 0) {
      Integer cont = numbers.reduce((key, value) -> value);
      observable.setChanged();
      observable.notifyObservers(new ObservedValue<Integer>(cont));
    }
  });

  /*time.foreachRDD(numbers -> {
    List<Integer> numberList = numbers.collect();
    for (Integer number : numberList) {
      System.out.println(number);
      observable.setChanged();
      observable.notifyObservers(new ObservedValue<Integer>(number));
    }
  });*/
}
Example 4
Source File: DStreamUtil.java From sylph with Apache License 2.0 | 6 votes |
public static void dstreamAction(JavaDStream<Row> stream, Sink<JavaRDD<Row>> sink) {
  DStream<?> fristDStream = getFristDStream(stream.dstream());
  logger.info("Source driver: {}", fristDStream.getClass().getName());

  if ("DirectKafkaInputDStream".equals(fristDStream.getClass().getSimpleName())) {
    logger.info("The job source is Kafka; enabling empty-job optimization and automatic offset commits");
    stream.foreachRDD(rdd -> {
      RDD<?> kafkaRdd = getFristRdd(rdd.rdd()); //rdd.dependencies(0).rdd
      OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
      if (kafkaRdd.count() > 0) {
        sink.run(rdd); // run the business logic
      }
      ((CanCommitOffsets) fristDStream).commitAsync(offsetRanges);
    });
  }
  else { // non-Kafka source: no optimization is possible for now
    stream.foreachRDD(sink::run);
  }
}
Example 5
Source File: SparkStreamingSqlAnalyse.java From sylph with Apache License 2.0 | 5 votes |
public void build() {
  JavaDStream<Row> inputStream = source.apply(null);
  SparkSession spark = SparkSession.builder()
      .config(inputStream.context().sparkContext().getConf())
      .getOrCreate();

  if (isCompile) {
    logger.info("isCompile mode will checkDStream()");
    checkDStream(spark, sourceTableName, schema, handlers);
  }

  DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream(), SylphKafkaOffset.class);
  logger.info("source table {}, firstDStream is {}", sourceTableName, firstDStream);

  inputStream.foreachRDD(rdd -> {
    Dataset<Row> df = spark.createDataFrame(rdd, schema);
    df.createOrReplaceTempView(sourceTableName);
    //df.show()
    //if kafka0.10+ if("DirectKafkaInputDStream".equals(firstDStream.getClass().getSimpleName())) {}
    if (firstDStream instanceof SylphKafkaOffset) {
      RDD<?> kafkaRdd = DStreamUtil.getFirstRdd(rdd.rdd()); //rdd.dependencies(0).rdd
      if (kafkaRdd.count() > 0) {
        handlers.forEach(x -> x.accept(spark)); // run the business logic
      }
      //val offsetRanges = kafkaRdd.asInstanceOf[HasOffsetRanges].offsetRanges
      //firstDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
      ((SylphKafkaOffset<?>) firstDStream).commitOffsets(kafkaRdd);
    }
    else {
      handlers.forEach(x -> x.accept(spark));
    }
  });
}
Example 6
Source File: FileStreamingEx.java From Apache-Spark-2x-for-Java-Developers with MIT License | 5 votes |
public static void main(String[] args) {
  // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
  System.setProperty("hadoop.home.dir", "E:\\hadoop");
  //Logger rootLogger = LogManager.getRootLogger();
  //rootLogger.setLevel(Level.WARN);
  SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");
  String inputDirectory = "E:\\hadoop\\streamFolder\\";

  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.seconds(1));
  // streamingContext.checkpoint("E:\\hadoop\\checkpoint");

  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);

  JavaDStream<String> streamfile = streamingContext.textFileStream(inputDirectory);
  streamfile.print();
  streamfile.foreachRDD(rdd -> rdd.foreach(x -> System.out.println(x)));

  JavaPairDStream<LongWritable, Text> streamedFile = streamingContext.fileStream(
      inputDirectory, LongWritable.class, Text.class, TextInputFormat.class);
  streamedFile.print();

  streamingContext.start();

  try {
    streamingContext.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
Example 7
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0 | 5 votes |
private void configureDataContext(JavaStreamingContext context) {
  Map<String, Integer> baseTopicMap = new HashMap<>();

  configurationContext.getDataTopics().forEach(dataTopic -> baseTopicMap.put(dataTopic, 1));

  kafkaTopicService.createTopicsIfNotExist(configurationContext.getDataTopics(),
      configurationContext.getKafkaReplicationFactor(), configurationContext.getKafkaPartitions());

  HashMap<String, String> kafkaParams = new HashMap<>();
  kafkaParams.put("zookeeper.connect", configurationContext.getZookeeperHostsQuorumWithPath());
  kafkaParams.put("group.id", configurationContext.getGroupId());

  /*
   groupId must be the cluster groupId. Kafka assigns each partition of a topic to one, and only one,
   consumer of the group.
   Decision topics have only one partition (by default), so if we have two or more Decision instances
   (consumers) reading the same topic with the same groupId, only one instance will be able to read
   from the topic.
  */
  JavaPairDStream<String, byte[]> messages = KafkaUtils.createStream(context, String.class,
      byte[].class, kafka.serializer.StringDecoder.class, kafka.serializer.DefaultDecoder.class,
      kafkaParams, baseTopicMap, StorageLevel.MEMORY_AND_DISK_SER());

  AvroDeserializeMessageFunction avroDeserializeMessageFunction = new AvroDeserializeMessageFunction();
  JavaDStream<StratioStreamingMessage> insertRequests = messages.filter(
      new FilterAvroMessagesByOperationFunction(STREAM_OPERATIONS.MANIPULATION.INSERT))
      .map(avroDeserializeMessageFunction);

  InsertIntoStreamFunction insertIntoStreamFunction = new InsertIntoStreamFunction(
      streamOperationService, configurationContext.getZookeeperHostsQuorum());
  insertRequests.foreachRDD(insertIntoStreamFunction);
}
Example 8
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation")
public static void persists(DStream<Tuple2<String, Iterable<Long>>> partitonOffset, Properties props) {
  ClassTag<Tuple2<String, Iterable<Long>>> tuple2ClassTag =
      ScalaUtil.<String, Iterable<Long>>getTuple2ClassTag();
  JavaDStream<Tuple2<String, Iterable<Long>>> jpartitonOffset =
      new JavaDStream<Tuple2<String, Iterable<Long>>>(partitonOffset, tuple2ClassTag);
  jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Iterable<Long>>>>() {
    @Override
    public void call(JavaRDD<Tuple2<String, Iterable<Long>>> po) throws Exception {
      List<Tuple2<String, Iterable<Long>>> poList = po.collect();
      doPersists(poList, props);
    }
  });
}
Example 9
Source File: KafkaExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) {
  // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
  System.setProperty("hadoop.home.dir", "E:\\hadoop");
  //Logger rootLogger = LogManager.getRootLogger();
  //rootLogger.setLevel(Level.WARN);
  SparkConf conf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]");

  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext streamingContext = new JavaStreamingContext(sc, Durations.minutes(2));
  streamingContext.checkpoint("E:\\hadoop\\checkpoint");
  Logger rootLogger = LogManager.getRootLogger();
  rootLogger.setLevel(Level.WARN);

  Map<String, Object> kafkaParams = new HashMap<>();
  kafkaParams.put("bootstrap.servers", "10.0.75.1:9092");
  kafkaParams.put("key.deserializer", StringDeserializer.class);
  kafkaParams.put("value.deserializer", StringDeserializer.class);
  kafkaParams.put("group.id", "use_a_separate_group_id_for_each_strea");
  kafkaParams.put("auto.offset.reset", "latest");
  // kafkaParams.put("enable.auto.commit", false);

  Collection<String> topics = Arrays.asList("mytopic", "anothertopic");

  final JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils.createDirectStream(
      streamingContext, LocationStrategies.PreferConsistent(),
      ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));

  JavaPairDStream<String, String> pairRDD = stream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

  pairRDD.foreachRDD(pRDD -> {
    pRDD.foreach(tuple -> System.out.println(new Date() + " :: Kafka msg key ::" + tuple._1() + " the val is ::" + tuple._2()));
  });

  JavaDStream<String> tweetRDD = pairRDD.map(x -> x._2()).map(new TweetText());
  tweetRDD.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " :: " + x)));

  JavaDStream<String> hashtagRDD = tweetRDD.flatMap(twt ->
      Arrays.stream(twt.split(" ")).filter(str -> str.contains("#")).collect(Collectors.toList()).iterator());

  hashtagRDD.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(x)));

  JavaPairDStream<String, Long> cntByVal = hashtagRDD.countByValue();

  cntByVal.foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The count tag is ::" + x._1() + " and the val is ::" + x._2())));

  /*
  hashtagRDD.window(Durations.seconds(60), Durations.seconds(30))
      .countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));

  hashtagRDD.countByValueAndWindow(Durations.seconds(60), Durations.seconds(30))
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println("The window&count tag is ::" + x._1() + " and the val is ::" + x._2())));
  */

  hashtagRDD.window(Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));

  hashtagRDD.window(Durations.minutes(8), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));

  hashtagRDD.window(Durations.minutes(12), Durations.minutes(8)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));

  hashtagRDD.window(Durations.minutes(2), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));

  hashtagRDD.window(Durations.minutes(12), Durations.minutes(12)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));

  /*hashtagRDD.window(Durations.minutes(5), Durations.minutes(2)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));*/

  /*hashtagRDD.window(Durations.minutes(10), Durations.minutes(1)).countByValue()
      .foreachRDD(tRDD -> tRDD.foreach(x -> System.out.println(new Date() + " ::The window count tag is ::" + x._1() + " and the val is ::" + x._2())));*/

  streamingContext.start();
  try {
    streamingContext.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
Example 10
Source File: JavaStreamingTestExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) throws Exception {
  if (args.length != 3) {
    System.err.println("Usage: JavaStreamingTestExample " +
        "<dataDir> <batchDuration> <numBatchesTimeout>");
    System.exit(1);
  }

  String dataDir = args[0];
  Duration batchDuration = Seconds.apply(Long.parseLong(args[1]));
  int numBatchesTimeout = Integer.parseInt(args[2]);

  SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample");
  JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration);

  ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString());

  // $example on$
  JavaDStream<BinarySample> data = ssc.textFileStream(dataDir).map(
      new Function<String, BinarySample>() {
        @Override
        public BinarySample call(String line) {
          String[] ts = line.split(",");
          boolean label = Boolean.parseBoolean(ts[0]);
          double value = Double.parseDouble(ts[1]);
          return new BinarySample(label, value);
        }
      });

  StreamingTest streamingTest = new StreamingTest()
      .setPeacePeriod(0)
      .setWindowSize(0)
      .setTestMethod("welch");

  JavaDStream<StreamingTestResult> out = streamingTest.registerStream(data);
  out.print();
  // $example off$

  // Stop processing if test becomes significant or we time out
  timeoutCounter = numBatchesTimeout;

  out.foreachRDD(new VoidFunction<JavaRDD<StreamingTestResult>>() {
    @Override
    public void call(JavaRDD<StreamingTestResult> rdd) {
      timeoutCounter -= 1;

      boolean anySignificant = !rdd.filter(new Function<StreamingTestResult, Boolean>() {
        @Override
        public Boolean call(StreamingTestResult v) {
          return v.pValue() < 0.05;
        }
      }).isEmpty();

      if (timeoutCounter <= 0 || anySignificant) {
        rdd.context().stop();
      }
    }
  });

  ssc.start();
  ssc.awaitTermination();
}
Example 11
Source File: JavaSqlNetworkWordCount.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

  // Create a JavaReceiverInputDStream on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  // Note that no duplication in storage level only for running locally.
  // Replication necessary in distributed scenario for fault tolerance.
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(
      args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });

  // Convert RDDs of the words DStream to DataFrame and run SQL query
  words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
    @Override
    public void call(JavaRDD<String> rdd, Time time) {
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

      // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame
      JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() {
        @Override
        public JavaRecord call(String word) {
          JavaRecord record = new JavaRecord();
          record.setWord(word);
          return record;
        }
      });
      Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class);

      // Creates a temporary view using the DataFrame
      wordsDataFrame.createOrReplaceTempView("words");

      // Do word count on table using SQL and print it
      Dataset<Row> wordCountsDataFrame =
          spark.sql("select word, count(*) as total from words group by word");
      System.out.println("========= " + time + "=========");
      wordCountsDataFrame.show();
    }
  });

  ssc.start();
  ssc.awaitTermination();
}
Example 12
Source File: StreamingRsvpsDStream.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException {
  System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

  final SparkConf conf = new SparkConf()
      .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
      .setAppName(APPLICATION_NAME)
      .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI);

  final JavaStreamingContext streamingContext =
      new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS));

  final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
      KafkaUtils.createDirectStream(
          streamingContext,
          LocationStrategies.PreferConsistent(),
          ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
      );

  // transformations, streaming algorithms, etc
  JavaDStream<ConsumerRecord<String, String>> rsvpsWithGuestsStream =
      meetupStream.filter(f -> !f.value().contains("\"guests\":0"));

  rsvpsWithGuestsStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> r) -> {
    MongoSpark.save(
        r.map(
            e -> Document.parse(e.value())
        )
    );
  });

  // some time later, after outputs have completed
  meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
    OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

    ((CanCommitOffsets) meetupStream.inputDStream())
        .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
  });

  streamingContext.start();
  streamingContext.awaitTermination();
}
Example 13
Source File: SimpleSparkStructuredKafkaStreamingCounterAVRO.java From jMetalSP with MIT License | 4 votes |
@Override
public void run() {
  ConsumerStrategy<Integer, byte[]> consumerStrategy = ConsumerStrategies.Subscribe(topic, kafkaParams);
  LocationStrategy locationStrategy = LocationStrategies.PreferConsistent();
  JavaInputDStream<ConsumerRecord<Integer, byte[]>> stream =
      (JavaInputDStream<ConsumerRecord<Integer, byte[]>>)
          KafkaUtils.createDirectStream(streamingContext, locationStrategy, consumerStrategy);

  JavaDStream<Integer> time = stream.map(value -> {
    DataDeserializer<Counter> dataDeserializer = new DataDeserializer<>();
    //Object o = dataDeserializer.deserialize(value.value(), "avsc/Counter.avsc");
    //GenericData.Record rc = (GenericData.Record) o;
    Counter counter = dataDeserializer.deserialize(value.value(), "avsc/Counter.avsc");
    //Counter counter = (Counter) dataDeserializer.deserialize(value.value(), "avsc/Counter.avsc");
    //return (Integer) rc.get(0);
    return (Integer) counter.get(0);
  });

  /*time.foreachRDD(numbers -> {
    numbers.foreach(value -> {
      System.out.println("Pruebas----> " + value);
      observable.setChanged();
      observable.notifyObservers(new SingleObservedData<Integer>(value));
    });
  });*/

  time.foreachRDD(numbers -> {
    Integer cont = numbers.reduce((key, value) -> value);
    System.out.println("Pruebas----> " + cont);
    observable.setChanged();
    observable.notifyObservers(new ObservedValue<Integer>(cont));
  });

  //stream.foreachRDD((consumerRecordJavaRDD, time) -> consumerRecordJavaRDD.foreach(integer -> {
  //  observable.setChanged();
  //  observable.notifyObservers(new SingleObservedData<Integer>(integer.value()));
  //  System.out.println("Pruebas----> " + integer.value());
  //}));
}
Example 14
Source File: SimpleSparkStructuredKafkaStreamingCounter.java From jMetalSP with MIT License | 4 votes |
@Override
public void run() {
  ConsumerStrategy<Integer, Integer> consumerStrategy = ConsumerStrategies.Subscribe(topic, kafkaParams);
  LocationStrategy locationStrategy = LocationStrategies.PreferConsistent();
  JavaInputDStream<ConsumerRecord<Integer, Integer>> stream =
      (JavaInputDStream<ConsumerRecord<Integer, Integer>>)
          KafkaUtils.createDirectStream(streamingContext, locationStrategy, consumerStrategy);

  JavaDStream<Integer> time = stream.map(value -> value.value());

  /*time.foreachRDD(numbers -> {
    numbers.foreach(value -> {
      System.out.println("Pruebas----> " + value);
      observable.setChanged();
      observable.notifyObservers(new SingleObservedData<Integer>(value));
    });
  });*/

  time.foreachRDD(numbers -> {
    Integer cont = numbers.reduce((key, value) -> value);
    //System.out.println("Pruebas----> " + cont);
    observable.setChanged();
    observable.notifyObservers(new ObservedValue<Integer>(cont));
  });

  //stream.foreachRDD((consumerRecordJavaRDD, time) -> consumerRecordJavaRDD.foreach(integer -> {
  //  observable.setChanged();
  //  observable.notifyObservers(new SingleObservedData<Integer>(integer.value()));
  //  System.out.println("Pruebas----> " + integer.value());
  //}));
}
Example 15
Source File: AppMain.java From SparkToParquet with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws IOException {
  Flags.setFromCommandLineArgs(THE_OPTIONS, args);

  // Initialize the Spark conf
  SparkConf conf = new SparkConf().setAppName("A SECTONG Application: Apache Log Analysis with Spark");
  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext jssc = new JavaStreamingContext(sc, Flags.getInstance().getSlideInterval());
  SQLContext sqlContext = new SQLContext(sc);

  // Initialize parameters
  HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(Flags.getInstance().getKafka_topic().split(",")));
  HashMap<String, String> kafkaParams = new HashMap<String, String>();
  kafkaParams.put("metadata.broker.list", Flags.getInstance().getKafka_broker());

  // Read data from the Kafka stream
  JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class,
      String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topicsSet);

  JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
    private static final long serialVersionUID = 5266880065425088203L;

    public String call(Tuple2<String, String> tuple2) {
      return tuple2._2();
    }
  });

  JavaDStream<ApacheAccessLog> accessLogsDStream = lines.flatMap(line -> {
    List<ApacheAccessLog> list = new ArrayList<>();
    try {
      // Parse each line
      list.add(ApacheAccessLog.parseFromLogLine(line));
      return list;
    } catch (RuntimeException e) {
      return list;
    }
  }).cache();

  accessLogsDStream.foreachRDD(rdd -> {
    // rdd to DataFrame
    DataFrame df = sqlContext.createDataFrame(rdd, ApacheAccessLog.class);
    // Write out as Parquet files
    df.write().partitionBy("ipAddress", "method", "responseCode").mode(SaveMode.Append)
        .parquet(Flags.getInstance().getParquetFile());
    return null;
  });

  // Start the streaming job
  jssc.start(); // start the computation
  jssc.awaitTermination(); // wait for termination
}
Example 16
Source File: Runner.java From envelope with Apache License 2.0 | 4 votes |
/**
 * Run the Envelope pipeline as a Spark Streaming job.
 * @param steps The full configuration of the Envelope pipeline
 */
@SuppressWarnings("unchecked")
private void runStreaming(final Set<Step> steps) throws Exception {
  final Set<Step> independentNonStreamingSteps = StepUtils.getIndependentNonStreamingSteps(steps);
  runBatch(independentNonStreamingSteps);

  Set<StreamingStep> streamingSteps = StepUtils.getStreamingSteps(steps);
  for (final StreamingStep streamingStep : streamingSteps) {
    LOG.debug("Setting up streaming step: " + streamingStep.getName());

    JavaDStream stream = streamingStep.getStream();

    stream.foreachRDD(new VoidFunction<JavaRDD<?>>() {
      @Override
      public void call(JavaRDD<?> raw) throws Exception {
        // Some independent steps might be repeating steps that have been flagged for reload
        StepUtils.resetRepeatingSteps(steps);
        // This will run any batch steps (and dependents) that are not submitted
        runBatch(independentNonStreamingSteps);

        streamingStep.setData(streamingStep.translate(raw));
        streamingStep.writeData();
        streamingStep.setState(StepState.FINISHED);

        Set<Step> batchSteps = StepUtils.mergeLoadedSteps(steps, streamingStep, baseConfig);
        Set<Step> dependentSteps = StepUtils.getAllDependentSteps(streamingStep, batchSteps);
        batchSteps.add(streamingStep);
        batchSteps.addAll(streamingStep.loadNewBatchSteps());
        batchSteps.addAll(independentNonStreamingSteps);
        runBatch(batchSteps);

        StepUtils.resetSteps(dependentSteps);

        streamingStep.recordProgress(raw);
      }
    });

    LOG.debug("Finished setting up streaming step: " + streamingStep.getName());
  }

  JavaStreamingContext jsc = Contexts.getJavaStreamingContext();
  jsc.start();
  LOG.debug("Streaming context started");
  jsc.awaitTermination();
  LOG.debug("Streaming context terminated");
}
Example 17
Source File: StreamingIngestionFileSystemTextFileToDataframeApp.java From net.jgp.labs.spark with Apache License 2.0 | 4 votes |
private void start() {
  // Create a local StreamingContext with two working threads and a batch interval of 5 seconds
  SparkConf conf = new SparkConf().setMaster("local[2]").setAppName(
      "Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
    private static final long serialVersionUID = -590010339928376829L;

    @Override
    public void call(JavaRDD<String> rdd) {
      JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
        private static final long serialVersionUID = 5167089361335095997L;

        @Override
        public Row call(String msg) {
          Row row = RowFactory.create(msg);
          return row;
        }
      });
      // Create Schema
      StructType schema = DataTypes.createStructType(
          new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });

      // Get Spark 2.0 session
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
      Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
      msgDataFrame.show();
    }
  });

  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
}
Example 18
Source File: SparkRunner.java From jaeger-analytics-java with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws InterruptedException, IOException {
  HTTPServer server = new HTTPServer(Integer.valueOf(getPropOrEnv("PROMETHEUS_PORT", "9111")));

  SparkConf sparkConf = new SparkConf()
      .setAppName("Trace DSL")
      .setMaster(getPropOrEnv("SPARK_MASTER", "local[*]"));

  JavaSparkContext sc = new JavaSparkContext(sparkConf);
  JavaStreamingContext ssc = new JavaStreamingContext(sc,
      new Duration(Integer.parseInt(getPropOrEnv("SPARK_STREAMING_BATCH_DURATION", "5000"))));

  Set<String> topics = Collections.singleton(getPropOrEnv("KAFKA_JAEGER_TOPIC", "jaeger-spans"));
  Map<String, Object> kafkaParams = new HashMap<>();
  kafkaParams.put("bootstrap.servers", getPropOrEnv("KAFKA_BOOTSTRAP_SERVER", "localhost:9092"));
  kafkaParams.put("key.deserializer", StringDeserializer.class);
  kafkaParams.put("value.deserializer", ProtoSpanDeserializer.class);
  // hack to start always from beginning
  kafkaParams.put("group.id", "jaeger-trace-aggregation-" + System.currentTimeMillis());

  if (Boolean.parseBoolean(getPropOrEnv("KAFKA_START_FROM_BEGINNING", "true"))) {
    kafkaParams.put("auto.offset.reset", "earliest");
    kafkaParams.put("enable.auto.commit", false);
    kafkaParams.put("startingOffsets", "earliest");
  }

  JavaInputDStream<ConsumerRecord<String, Span>> messages = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent(),
      ConsumerStrategies.Subscribe(topics, kafkaParams));

  JavaPairDStream<String, Span> traceIdSpanTuple = messages.mapToPair(record -> {
    return new Tuple2<>(record.value().traceId, record.value());
  });

  JavaDStream<Trace> tracesStream = traceIdSpanTuple.groupByKey().map(traceIdSpans -> {
    System.out.printf("traceID: %s\n", traceIdSpans._1);
    Iterable<Span> spans = traceIdSpans._2();
    Trace trace = new Trace();
    trace.traceId = traceIdSpans._1();
    trace.spans = StreamSupport.stream(spans.spliterator(), false)
        .collect(Collectors.toList());
    return trace;
  });

  MinimumClientVersion minimumClientVersion = MinimumClientVersion.builder()
      .withJavaVersion(getPropOrEnv("TRACE_QUALITY_JAVA_VERSION", "1.0.0"))
      .withGoVersion(getPropOrEnv("TRACE_QUALITY_GO_VERSION", "2.22.0"))
      .withNodeVersion(getPropOrEnv("TRACE_QUALITY_NODE_VERSION", "3.17.1"))
      .withPythonVersion(getPropOrEnv("TRACE_QUALITY_PYTHON_VERSION", "4.0.0"))
      .build();

  List<ModelRunner> modelRunner = Arrays.asList(
      new TraceHeight(),
      new ServiceDepth(),
      new ServiceHeight(),
      new NetworkLatency(),
      new NumberOfErrors(),
      new DirectDependencies(),
      // trace quality
      minimumClientVersion,
      new HasClientServerSpans(),
      new UniqueSpanId());

  tracesStream.foreachRDD((traceRDD, time) -> {
    traceRDD.foreach(trace -> {
      Graph graph = GraphCreator.create(trace);
      for (ModelRunner model : modelRunner) {
        model.runWithMetrics(graph);
      }
    });
  });

  ssc.start();
  ssc.awaitTermination();
}
Example 19
Source File: SampleConsumer.java From kafka-spark-consumer with Apache License 2.0 | 4 votes |
@SuppressWarnings("deprecation")
private void run() {
  Properties props = new Properties();
  props.put("zookeeper.hosts", "zkhost");
  props.put("zookeeper.port", "2181");
  props.put("kafka.topic", "topicA,topicB,topicC");
  props.put("kafka.consumer.id", "kafka-consumer");
  // Optional Properties
  props.put("zookeeper.broker.path", "/brokers");
  props.put("zookeeper.consumer.path", "/consumers");
  props.put("consumer.forcefromstart", "false");
  props.put("max.poll.records", "10");
  props.put("consumer.fillfreqms", "500");
  props.put("consumer.backpressure.enabled", "true");
  //Kafka properties
  props.put("bootstrap.servers", "kafkahost-1:6667,"
      + "kafkahost-2:6667,"
      + "kafkahost-3:6667,"
      + "kafkahost-4:6667");
  props.put("security.protocol", "SSL");
  props.put("ssl.truststore.location", "~/kafka-securitykafka.server.truststore.jks");
  props.put("ssl.truststore.password", "test1234");

  SparkConf _sparkConf = new SparkConf();
  JavaStreamingContext jsc = new JavaStreamingContext(_sparkConf, Durations.seconds(30));
  // Specify number of Receivers you need.
  int numberOfReceivers = 6;

  JavaDStream<MessageAndMetadata<byte[]>> unionStreams = ReceiverLauncher.launch(
      jsc, props, numberOfReceivers, StorageLevel.MEMORY_ONLY());

  unionStreams.foreachRDD(new VoidFunction<JavaRDD<MessageAndMetadata<byte[]>>>() {
    @Override
    public void call(JavaRDD<MessageAndMetadata<byte[]>> rdd) throws Exception {
      //Start Application Logic
      rdd.foreachPartition(new VoidFunction<Iterator<MessageAndMetadata<byte[]>>>() {
        @Override
        public void call(Iterator<MessageAndMetadata<byte[]>> mmItr) throws Exception {
          int countTopicA = 0;
          int countTopicB = 0;
          int countTopicC = 0;
          while (mmItr.hasNext()) {
            MessageAndMetadata<byte[]> mm = mmItr.next();
            if (mm.getTopic().equals("topicA")) {
              countTopicA++;
            } else if (mm.getTopic().equals("topicB")) {
              countTopicB++;
            } else if (mm.getTopic().equals("topicC")) {
              countTopicC++;
            }
          }
          System.out.println("topicA count " + countTopicA);
          System.out.println("topicB count " + countTopicB);
          System.out.println("topicC count " + countTopicC);
        }
      });
      System.out.println("RDD count " + rdd.count());
      //End Application Logic
      //commit offset
      System.out.println("Commiting Offset");
      ProcessedOffsetManager.persistsPartition(rdd, props);
    }
  });

  try {
    jsc.start();
    jsc.awaitTermination();
  } catch (Exception ex) {
    jsc.ssc().sc().cancelAllJobs();
    jsc.stop(true, false);
    System.exit(-1);
  }
}