org.apache.spark.streaming.Time Java Examples
The following examples show how to use org.apache.spark.streaming.Time. Time represents the batch time of a Spark Streaming micro-batch as an absolute instant; its milliseconds() method returns that instant as epoch milliseconds, and it can be offset by a Duration. Each example below comes from an open-source project, with the source file and license noted.
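Before the project-specific examples, here is a minimal, self-contained sketch of the most common way Time appears in application code: as the second argument of a foreachRDD callback, identifying the micro-batch being processed. This sketch is not taken from any of the projects below; the local master, socket source, host, and port are illustrative assumptions.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction2;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class TimeUsageSketch {
  public static void main(String[] args) throws InterruptedException {
    // Local two-thread master and a 1-second batch interval (illustrative values).
    SparkConf conf = new SparkConf().setAppName("TimeUsageSketch").setMaster("local[2]");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(1));

    // Assumed input: newline-delimited text on localhost:9999 (e.g. fed by 'nc -lk 9999').
    JavaDStream<String> lines = ssc.socketTextStream("localhost", 9999);

    // Spark Streaming passes the batch's Time as the second callback argument.
    lines.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
      @Override
      public void call(JavaRDD<String> rdd, Time time) {
        System.out.println("Batch at " + time.milliseconds() + " ms contains "
            + rdd.count() + " record(s)");
      }
    });

    ssc.start();
    ssc.awaitTermination();
  }
}

Run nc -lk 9999 locally to feed it lines; each batch prints its Time in epoch milliseconds along with the record count.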
Example #1
Source File: SaveToHDFSFunction.java From oryx with Apache License 2.0
@Override
public void call(JavaPairRDD<K,M> rdd, Time time) throws IOException {
  if (rdd.isEmpty()) {
    log.info("RDD was empty, not saving to HDFS");
  } else {
    String file = prefix + '-' + time.milliseconds() + '.' + suffix;
    Path path = new Path(file);
    FileSystem fs = FileSystem.get(path.toUri(), hadoopConf);
    if (fs.exists(path)) {
      log.warn("Saved data already existed, possibly from a failed job. Deleting {}", path);
      fs.delete(path, true);
    }
    log.info("Saving RDD to HDFS at {}", file);
    rdd.mapToPair(
        new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass)
    ).saveAsNewAPIHadoopFile(
        file,
        keyWritableClass,
        messageWritableClass,
        SequenceFileOutputFormat.class,
        hadoopConf);
  }
}
Example #2
Source File: BlurBulkLoadSparkProcessor.java From incubator-retired-blur with Apache License 2.0
@Override
protected Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction() {
  return new Function2<JavaPairRDD<String, RowMutation>, Time, Void>() {
    // Blur Thrift Client
    @Override
    public Void call(JavaPairRDD<String, RowMutation> rdd, Time time) throws Exception {
      Iface client = getBlurClient();
      for (Tuple2<String, RowMutation> tuple : rdd.collect()) {
        if (tuple != null) {
          try {
            RowMutation rm = tuple._2;
            // Index using enqueue mutate call
            client.enqueueMutate(rm);
          } catch (Exception ex) {
            LOG.error("Unknown error while trying to call enqueueMutate.", ex);
            throw ex;
          }
        }
      }
      return null;
    }
  };
}
Example #3
Source File: WatermarkSyncedDStream.java From beam with Apache License 2.0
@Override
public scala.Option<RDD<WindowedValue<T>>> compute(final Time validTime) {
  final long batchTime = validTime.milliseconds();

  LOG.trace(
      "BEFORE waiting for watermark sync, "
          + "LastWatermarkedBatchTime: {}, current batch time: {}",
      GlobalWatermarkHolder.getLastWatermarkedBatchTime(),
      batchTime);

  final Stopwatch stopwatch = Stopwatch.createStarted();
  awaitWatermarkSyncWith(batchTime);
  stopwatch.stop();

  LOG.info(
      "Waited {} millis for watermarks to sync up with the current batch ({})",
      stopwatch.elapsed(TimeUnit.MILLISECONDS),
      batchTime);

  LOG.info("Watermarks are now: {}", GlobalWatermarkHolder.get(batchDuration));

  LOG.trace(
      "AFTER waiting for watermark sync, "
          + "LastWatermarkedBatchTime: {}, current batch time: {}",
      GlobalWatermarkHolder.getLastWatermarkedBatchTime(),
      batchTime);

  final RDD<WindowedValue<T>> rdd = generateRdd();
  isFirst = false;
  return scala.Option.apply(rdd);
}
Example #4
Source File: SourceDStream.java From beam with Apache License 2.0
@Override
public scala.Option<RDD<Tuple2<Source<T>, CheckpointMarkT>>> compute(Time validTime) {
  RDD<Tuple2<Source<T>, CheckpointMarkT>> rdd =
      new SourceRDD.Unbounded<>(
          ssc().sparkContext(), options, createMicrobatchSource(), numPartitions);
  return scala.Option.apply(rdd);
}
Example #5
Source File: SparkUnboundedSource.java From beam with Apache License 2.0
private void report(Time batchTime, long count, SparkWatermarks sparkWatermark) {
  // metadata - #records read and a description.
  scala.collection.immutable.Map<String, Object> metadata =
      new scala.collection.immutable.Map.Map1<>(
          StreamInputInfo.METADATA_KEY_DESCRIPTION(),
          String.format(
              "Read %d records with observed watermarks %s, from %s for batch time: %s",
              count, sparkWatermark == null ? "N/A" : sparkWatermark, sourceName, batchTime));
  StreamInputInfo streamInputInfo = new StreamInputInfo(inputDStreamId, count, metadata);
  ssc().scheduler().inputInfoTracker().reportInfo(batchTime, streamInputInfo);
}
Example #6
Source File: SylphKafkaOffset.java From sylph with Apache License 2.0
@Override
public Option<RDD<T>> compute(Time validTime) {
  return parent.getOrCompute(validTime);
}
Example #7
Source File: KafkaSource08.java From sylph with Apache License 2.0
private static JavaDStream<ConsumerRecord<byte[], byte[]>> settingCommit(
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream,
    Map<String, String> kafkaParams,
    KafkaCluster kafkaCluster,
    String groupId) {
  if (kafkaParams.getOrDefault("auto.commit.enable", "true").equals("false")) {
    return inputStream;
  }
  int commitInterval = Integer.parseInt(
      kafkaParams.getOrDefault(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "90000"));

  DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset =
      new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream()) {
        private final KafkaOffsetCommitter kafkaOffsetCommitter = new KafkaOffsetCommitter(
            kafkaCluster,
            groupId,
            commitInterval);

        @Override
        public void initialize(Time time) {
          super.initialize(time);
          kafkaOffsetCommitter.setName("Kafka_Offset_Committer");
          kafkaOffsetCommitter.start();
        }

        @Override
        public void commitOffsets(RDD<?> kafkaRdd) {
          OffsetRange[] offsets = ((HasOffsetRanges) kafkaRdd).offsetRanges();
          // Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
          //     .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
          // log().info("commit Kafka Offsets {}", internalOffsets);
          kafkaOffsetCommitter.addAll(offsets);
        }
      };

  JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = new JavaDStream<>(
      sylphKafkaOffset, ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class));
  return dStream;
  // inputStream = inputStream.transform(rdd -> {
  //     OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
  //     Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
  //         .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
  //     commitKafkaOffsets(kafkaCluster, groupId, internalOffsets);
  //     return rdd;
  // });
}
Example #8
Source File: JavaRecoverableNetworkWordCount.java From SparkDemo with MIT License
private static JavaStreamingContext createContext(String ip,
                                                  int port,
                                                  String checkpointDirectory,
                                                  String outputPath) {
  // If you do not see this printed, that means the StreamingContext has been loaded
  // from the new checkpoint
  System.out.println("Creating new context");
  final File outputFile = new File(outputPath);
  if (outputFile.exists()) {
    outputFile.delete();
  }
  SparkConf sparkConf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount");
  // Create the context with a 1 second batch size
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));
  ssc.checkpoint(checkpointDirectory);

  // Create a socket stream on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(ip, port);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });
  JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
      new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
          return new Tuple2<>(s, 1);
        }
      }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
          return i1 + i2;
        }
      });

  wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() {
    @Override
    public void call(JavaPairRDD<String, Integer> rdd, Time time) throws IOException {
      // Get or register the blacklist Broadcast
      final Broadcast<List<String>> blacklist =
          JavaWordBlacklist.getInstance(new JavaSparkContext(rdd.context()));
      // Get or register the droppedWordsCounter Accumulator
      final LongAccumulator droppedWordsCounter =
          JavaDroppedWordsCounter.getInstance(new JavaSparkContext(rdd.context()));
      // Use blacklist to drop words and use droppedWordsCounter to count them
      String counts = rdd.filter(new Function<Tuple2<String, Integer>, Boolean>() {
        @Override
        public Boolean call(Tuple2<String, Integer> wordCount) {
          if (blacklist.value().contains(wordCount._1())) {
            droppedWordsCounter.add(wordCount._2());
            return false;
          } else {
            return true;
          }
        }
      }).collect().toString();
      String output = "Counts at time " + time + " " + counts;
      System.out.println(output);
      System.out.println("Dropped " + droppedWordsCounter.value() + " word(s) totally");
      System.out.println("Appending to " + outputFile.getAbsolutePath());
      Files.append(output + "\n", outputFile, Charset.defaultCharset());
    }
  });
  return ssc;
}
Example #9
Source File: JavaSqlNetworkWordCount.java From SparkDemo with MIT License
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: JavaSqlNetworkWordCount <hostname> <port>");
    System.exit(1);
  }

  StreamingExamples.setStreamingLogLevels();

  // Create the context with a 1 second batch size
  SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

  // Create a JavaReceiverInputDStream on target ip:port and count the
  // words in input stream of \n delimited text (eg. generated by 'nc')
  // Note that no duplication in storage level only for running locally.
  // Replication necessary in distributed scenario for fault tolerance.
  JavaReceiverInputDStream<String> lines = ssc.socketTextStream(
      args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER);
  JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterator<String> call(String x) {
      return Arrays.asList(SPACE.split(x)).iterator();
    }
  });

  // Convert RDDs of the words DStream to DataFrame and run SQL query
  words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
    @Override
    public void call(JavaRDD<String> rdd, Time time) {
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

      // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame
      JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() {
        @Override
        public JavaRecord call(String word) {
          JavaRecord record = new JavaRecord();
          record.setWord(word);
          return record;
        }
      });
      Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class);

      // Creates a temporary view using the DataFrame
      wordsDataFrame.createOrReplaceTempView("words");

      // Do word count on table using SQL and print it
      Dataset<Row> wordCountsDataFrame =
          spark.sql("select word, count(*) as total from words group by word");
      System.out.println("========= " + time + " =========");
      wordCountsDataFrame.show();
    }
  });

  ssc.start();
  ssc.awaitTermination();
}
Example #10
Source File: SparkUnboundedSource.java From beam with Apache License 2.0
@Override
public scala.Option<RDD<BoxedUnit>> compute(Time validTime) {
  // compute parent.
  scala.Option<RDD<Metadata>> parentRDDOpt = parent.getOrCompute(validTime);
  final MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
  long count = 0;
  SparkWatermarks sparkWatermark = null;
  Instant globalLowWatermarkForBatch = BoundedWindow.TIMESTAMP_MIN_VALUE;
  Instant globalHighWatermarkForBatch = BoundedWindow.TIMESTAMP_MIN_VALUE;
  long maxReadDuration = 0;
  if (parentRDDOpt.isDefined()) {
    JavaRDD<Metadata> parentRDD = parentRDDOpt.get().toJavaRDD();
    for (Metadata metadata : parentRDD.collect()) {
      count += metadata.getNumRecords();
      // compute the global input watermark - advance to latest of all partitions.
      Instant partitionLowWatermark = metadata.getLowWatermark();
      globalLowWatermarkForBatch =
          globalLowWatermarkForBatch.isBefore(partitionLowWatermark)
              ? partitionLowWatermark
              : globalLowWatermarkForBatch;
      Instant partitionHighWatermark = metadata.getHighWatermark();
      globalHighWatermarkForBatch =
          globalHighWatermarkForBatch.isBefore(partitionHighWatermark)
              ? partitionHighWatermark
              : globalHighWatermarkForBatch;
      // Update metrics reported in the read
      final Gauge gauge = Metrics.gauge(NAMESPACE, READ_DURATION_MILLIS);
      final MetricsContainer container = metadata.getMetricsContainers().getContainer(stepName);
      try (Closeable ignored = MetricsEnvironment.scopedMetricsContainer(container)) {
        final long readDurationMillis = metadata.getReadDurationMillis();
        if (readDurationMillis > maxReadDuration) {
          gauge.set(readDurationMillis);
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      metricsAccum.value().updateAll(metadata.getMetricsContainers());
    }

    sparkWatermark =
        new SparkWatermarks(
            globalLowWatermarkForBatch,
            globalHighWatermarkForBatch,
            new Instant(validTime.milliseconds()));
    // add to watermark queue.
    GlobalWatermarkHolder.add(inputDStreamId, sparkWatermark);
  }
  // report - for RateEstimator and visibility.
  report(validTime, count, sparkWatermark);
  return scala.Option.empty();
}
Example #11
Source File: SparkStreamingFromFlumeToHBaseWindowingExample.java From SparkOnALog with Apache License 2.0
public static void main(String[] args) {
  if (args.length == 0) {
    System.err.println("Usage: SparkStreamingFromFlumeToHBaseWindowingExample {master} {host} {port} {table} {columnFamily} {windowInSeconds} {slideInSeconds}");
    System.exit(1);
  }

  String master = args[0];
  String host = args[1];
  int port = Integer.parseInt(args[2]);
  String tableName = args[3];
  String columnFamily = args[4];
  int windowInSeconds = Integer.parseInt(args[5]);
  int slideInSeconds = Integer.parseInt(args[6]);

  Duration batchInterval = new Duration(2000);
  Duration windowInterval = new Duration(windowInSeconds * 1000);
  Duration slideInterval = new Duration(slideInSeconds * 1000);

  JavaStreamingContext sc = new JavaStreamingContext(master,
      "FlumeEventCount", batchInterval,
      System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");

  final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
  final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);

  //JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
  JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);

  JavaPairDStream<String, Integer> lastCounts = flumeStream
      .flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
        @Override
        public Iterable<String> call(SparkFlumeEvent event) throws Exception {
          String bodyString = new String(event.event().getBody().array(), "UTF-8");
          return Arrays.asList(bodyString.split(" "));
        }
      }).map(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String str) throws Exception {
          return new Tuple2(str, 1);
        }
      }).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer x, Integer y) throws Exception {
          return x.intValue() + y.intValue();
        }
      }, windowInterval, slideInterval);

  lastCounts.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
    @Override
    public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
      values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        @Override
        public void call(Tuple2<String, Integer> tuple) throws Exception {
          HBaseCounterIncrementor incrementor =
              HBaseCounterIncrementor.getInstance(broadcastTableName.value(),
                  broadcastColumnFamily.value());
          incrementor.incerment("Counter", tuple._1(), tuple._2());
          System.out.println("Counter:" + tuple._1() + "," + tuple._2());
        }
      });
      return null;
    }
  });

  sc.start();
}
Example #12
Source File: SparkStreamingFromFlumeToHBaseExample.java From SparkOnALog with Apache License 2.0
public static void main(String[] args) {
  if (args.length == 0) {
    System.err.println("Usage: SparkStreamingFromFlumeToHBaseExample {master} {host} {port} {table} {columnFamily}");
    System.exit(1);
  }

  String master = args[0];
  String host = args[1];
  int port = Integer.parseInt(args[2]);
  String tableName = args[3];
  String columnFamily = args[4];

  Duration batchInterval = new Duration(2000);

  JavaStreamingContext sc = new JavaStreamingContext(master,
      "FlumeEventCount", batchInterval,
      System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");

  final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
  final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);

  //JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
  JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);

  JavaPairDStream<String, Integer> lastCounts = flumeStream
      .flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
        @Override
        public Iterable<String> call(SparkFlumeEvent event) throws Exception {
          String bodyString = new String(event.event().getBody().array(), "UTF-8");
          return Arrays.asList(bodyString.split(" "));
        }
      }).map(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String str) throws Exception {
          return new Tuple2(str, 1);
        }
      }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer x, Integer y) throws Exception {
          return x.intValue() + y.intValue();
        }
      });

  lastCounts.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
    @Override
    public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
      values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        @Override
        public void call(Tuple2<String, Integer> tuple) throws Exception {
          HBaseCounterIncrementor incrementor =
              HBaseCounterIncrementor.getInstance(broadcastTableName.value(),
                  broadcastColumnFamily.value());
          incrementor.incerment("Counter", tuple._1(), tuple._2());
          System.out.println("Counter:" + tuple._1() + "," + tuple._2());
        }
      });
      return null;
    }
  });

  sc.start();
}
Example #13
Source File: BatchUpdateFunction.java From oryx with Apache License 2.0
@Override
public void call(JavaPairRDD<K,M> newData, Time timestamp)
    throws IOException, InterruptedException {

  if (newData.isEmpty()) {
    log.info("No data in current generation's RDD; nothing to do");
    return;
  }

  log.info("Beginning update at {}", timestamp);

  Configuration hadoopConf = sparkContext.hadoopConfiguration();
  if (hadoopConf.getResource("core-site.xml") == null) {
    log.warn("Hadoop config like core-site.xml was not found; " +
        "is the Hadoop config directory on the classpath?");
  }

  JavaPairRDD<K,M> pastData;
  Path inputPathPattern = new Path(dataDirString + "/*/part-*");
  FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
  FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
  if (inputPathStatuses == null || inputPathStatuses.length == 0) {

    log.info("No past data at path(s) {}", inputPathPattern);
    pastData = null;

  } else {

    log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
    Configuration updatedConf = new Configuration(hadoopConf);
    updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));

    @SuppressWarnings("unchecked")
    JavaPairRDD<Writable,Writable> pastWritableData = (JavaPairRDD<Writable,Writable>)
        sparkContext.newAPIHadoopRDD(updatedConf,
                                     SequenceFileInputFormat.class,
                                     keyWritableClass,
                                     messageWritableClass);

    pastData = pastWritableData.mapToPair(
        new WritableToValueFunction<>(keyClass,
                                      messageClass,
                                      keyWritableClass,
                                      messageWritableClass));
  }

  if (updateTopic == null || updateBroker == null) {
    log.info("Not producing updates to update topic since none was configured");
    updateInstance.runUpdate(sparkContext,
                             timestamp.milliseconds(),
                             newData,
                             pastData,
                             modelDirString,
                             null);
  } else {
    // This TopicProducer should not be async; sends one big model generally and
    // needs to occur before other updates reliably rather than be buffered
    try (TopicProducer<String,U> producer =
             new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
      updateInstance.runUpdate(sparkContext,
                               timestamp.milliseconds(),
                               newData,
                               pastData,
                               modelDirString,
                               producer);
    }
  }
}
Example #14
Source File: BlurLoadSparkProcessor.java From incubator-retired-blur with Apache License 2.0
protected abstract Function2<JavaPairRDD<String, RowMutation>, Time, Void> getFunction();