org.apache.spark.streaming.dstream.DStream Java Examples
The following examples show how to use org.apache.spark.streaming.dstream.DStream.
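DStream is the Scala-side abstraction that backs Spark Streaming's Java API; most of the examples below cross between the two with JavaDStream.dstream(). Before the examples, here is a minimal round-trip sketch. It is not taken from any of the projects below; the application name, local master, socket source, host, and port are placeholder choices.

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.dstream.DStream;
import scala.reflect.ClassTag$;

public class DStreamConversionSketch {
    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setAppName("dstream-sketch").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));

        // Any input stream works here; a socket text stream keeps the sketch short.
        JavaDStream<String> lines = jssc.socketTextStream("localhost", 9999);

        // Unwrap the Scala DStream that backs the JavaDStream ...
        DStream<String> scalaStream = lines.dstream();

        // ... and wrap it back into the Java API when a Scala-side DStream is all you have.
        JavaDStream<String> rewrapped =
                new JavaDStream<String>(scalaStream, ClassTag$.MODULE$.apply(String.class));
        rewrapped.print();

        jssc.start();
        jssc.awaitTermination();
    }
}

The same ClassTag-based wrapping appears in Examples #16 and #17 below.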
Example #1
Source File: DStreamUtil.java From sylph with Apache License 2.0
public static void dstreamAction(JavaDStream<Row> stream, Sink<JavaRDD<Row>> sink) {
    DStream<?> fristDStream = getFristDStream(stream.dstream());
    logger.info("data source driver: {}", fristDStream.getClass().getName());

    if ("DirectKafkaInputDStream".equals(fristDStream.getClass().getSimpleName())) {
        logger.info("the job's data source is Kafka; enabling empty-job optimization and automatic offset commits");
        stream.foreachRDD(rdd -> {
            RDD<?> kafkaRdd = getFristRdd(rdd.rdd());  //rdd.dependencies(0).rdd
            OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
            if (kafkaRdd.count() > 0) {
                sink.run(rdd);  //run the business logic
            }
            ((CanCommitOffsets) fristDStream).commitAsync(offsetRanges);
        });
    } else {
        //non-Kafka data source: no optimization is possible for now
        stream.foreachRDD(sink::run);
    }
}
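The helper above specializes the standard pattern from the spark-streaming-kafka-0-10 integration: read each batch's offset ranges through HasOffsetRanges and commit them back through CanCommitOffsets once the batch's work has succeeded. A hedged sketch of that baseline pattern follows; the stream is assumed to come from KafkaUtils.createDirectStream (as in Example #16), and the processing body is a placeholder.

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.kafka010.CanCommitOffsets;
import org.apache.spark.streaming.kafka010.HasOffsetRanges;
import org.apache.spark.streaming.kafka010.OffsetRange;

public final class OffsetCommitSketch {
    // "stream" is assumed to be the result of KafkaUtils.createDirectStream(...).
    static void processAndCommit(JavaInputDStream<ConsumerRecord<byte[], byte[]>> stream) {
        stream.foreachRDD(rdd -> {
            OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
            // ... run the batch's business logic on rdd here ...
            // commit asynchronously only after the batch's outputs have completed
            ((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges);
        });
    }

    private OffsetCommitSketch() {}
}

The sylph helper adds two refinements on top of this: it skips the sink entirely when the batch is empty, and it still commits the offsets so the consumer group keeps advancing.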
Example #2
Source File: SparkStreamingSqlAnalyse.java From sylph with Apache License 2.0
public void build() {
    JavaDStream<Row> inputStream = source.apply(null);
    SparkSession spark = SparkSession.builder()
            .config(inputStream.context().sparkContext().getConf())
            .getOrCreate();

    if (isCompile) {
        logger.info("isCompile mode will checkDStream()");
        checkDStream(spark, sourceTableName, schema, handlers);
    }

    DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream(), SylphKafkaOffset.class);
    logger.info("source table {}, firstDStream is {}", sourceTableName, firstDStream);
    inputStream.foreachRDD(rdd -> {
        Dataset<Row> df = spark.createDataFrame(rdd, schema);
        df.createOrReplaceTempView(sourceTableName);
        //df.show()
        //if kafka0.10+ if("DirectKafkaInputDStream".equals(firstDStream.getClass().getSimpleName())) {}
        if (firstDStream instanceof SylphKafkaOffset) {
            RDD<?> kafkaRdd = DStreamUtil.getFirstRdd(rdd.rdd());  //rdd.dependencies(0).rdd
            if (kafkaRdd.count() > 0) {
                handlers.forEach(x -> x.accept(spark));  //run the business logic
            }
            //val offsetRanges = kafkaRdd.asInstanceOf[HasOffsetRanges].offsetRanges
            //firstDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
            ((SylphKafkaOffset<?>) firstDStream).commitOffsets(kafkaRdd);
        } else {
            handlers.forEach(x -> x.accept(spark));
        }
    });
}
Example #3
Source File: DStreamUtil.java From sylph with Apache License 2.0
public static DStream<?> getFirstDStream(DStream<?> stream, Class<? extends DStream> first) {
    if (first != null && first.isInstance(stream)) {
        return stream;
    }
    if (stream.dependencies().isEmpty()) {
        return stream;
    } else {
        return getFirstDStream(stream.dependencies().head(), first);
    }
}
Example #4
Source File: SylphKafkaOffset.java From sylph with Apache License 2.0
@Override
public List<DStream<?>> dependencies() {
    return List$.MODULE$.<DStream<?>>newBuilder()
            .$plus$eq(parent)
            .result();
}
Example #5
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0
@SuppressWarnings("deprecation") public static void persists(DStream<Tuple2<String, Iterable<Long>>> partitonOffset, Properties props) { ClassTag<Tuple2<String, Iterable<Long>>> tuple2ClassTag = ScalaUtil.<String, Iterable<Long>>getTuple2ClassTag(); JavaDStream<Tuple2<String, Iterable<Long>>> jpartitonOffset = new JavaDStream<Tuple2<String, Iterable<Long>>>(partitonOffset, tuple2ClassTag); jpartitonOffset.foreachRDD(new VoidFunction<JavaRDD<Tuple2<String, Iterable<Long>>>>() { @Override public void call(JavaRDD<Tuple2<String, Iterable<Long>>> po) throws Exception { List<Tuple2<String, Iterable<Long>>> poList = po.collect(); doPersists(poList, props); } }); }
Example #6
Source File: DStreamUtil.java From sylph with Apache License 2.0
public static DStream<?> getFristDStream(DStream<?> stream) {
    if (stream.dependencies().isEmpty()) {
        return stream;
    } else {
        return getFristDStream(stream.dependencies().head());
    }
}
Example #7
Source File: ProcessedOffsetManager.java From kafka-spark-consumer with Apache License 2.0
public static <T> DStream<Tuple2<String, Iterable<Long>>> getPartitionOffset(
        DStream<MessageAndMetadata<T>> unionStreams, Properties props) {
    ClassTag<MessageAndMetadata<T>> messageMetaClassTag =
            ScalaUtil.<T>getMessageAndMetadataClassTag();
    JavaDStream<MessageAndMetadata<T>> javaDStream =
            new JavaDStream<MessageAndMetadata<T>>(unionStreams, messageMetaClassTag);
    JavaPairDStream<String, Iterable<Long>> partitonOffset = getPartitionOffset(javaDStream, props);
    return partitonOffset.dstream();
}
Example #8
Source File: SparkGroupAlsoByWindowViaWindowSet.java From beam with Apache License 2.0
private static void checkpointIfNeeded(
        final DStream<Tuple2<ByteArray, Tuple2<StateAndTimers, List<byte[]>>>> firedStream,
        final SerializablePipelineOptions options) {

    final Long checkpointDurationMillis = getBatchDuration(options);

    if (checkpointDurationMillis > 0) {
        firedStream.checkpoint(new Duration(checkpointDurationMillis));
    }
}
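Note that DStream.checkpoint(interval), as used above, only works when the streaming context also has a checkpoint directory configured; otherwise the job fails when it starts. A minimal, generic sketch of that setup, independent of the Beam code; the application name, master, batch interval, and directory are placeholders.

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class CheckpointSetupSketch {
    public static JavaStreamingContext create() {
        SparkConf conf = new SparkConf().setAppName("checkpoint-sketch").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(2));
        // In production this is usually an HDFS or S3 URI rather than a local path.
        jssc.checkpoint("/tmp/spark-checkpoints");
        return jssc;
    }
}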
Example #9
Source File: SparkGroupAlsoByWindowViaWindowSet.java From beam with Apache License 2.0
private static <K, InputT> PairDStreamFunctions<ByteArray, byte[]> buildPairDStream(
        final JavaDStream<WindowedValue<KV<K, InputT>>> inputDStream,
        final Coder<K> keyCoder,
        final Coder<WindowedValue<InputT>> wvCoder) {

    // we have to switch to Scala API to avoid Optional in the Java API, see: SPARK-4819.
    // we also have a broader API for Scala (access to the actual key and entire iterator).
    // we use coders to convert objects in the PCollection to byte arrays, so they
    // can be transferred over the network for the shuffle and be in serialized form
    // for checkpointing.
    // for readability, we add comments with actual type next to byte[].
    // to shorten line length, we use:
    // ---- WV: WindowedValue
    // ---- Iterable: Itr
    // ---- AccumT: A
    // ---- InputT: I
    final DStream<Tuple2<ByteArray, byte[]>> tupleDStream =
        inputDStream
            .map(new ReifyTimestampsAndWindowsFunction<>())
            .mapToPair(TranslationUtils.toPairFunction())
            .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder))
            .dstream();

    return DStream.toPairDStreamFunctions(
        tupleDStream,
        JavaSparkContext$.MODULE$.fakeClassTag(),
        JavaSparkContext$.MODULE$.fakeClassTag(),
        null);
}
Example #10
Source File: SparkGroupAlsoByWindowViaWindowSet.java From beam with Apache License 2.0
public static <K, InputT, W extends BoundedWindow>
    JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> groupByKeyAndWindow(
        final JavaDStream<WindowedValue<KV<K, InputT>>> inputDStream,
        final Coder<K> keyCoder,
        final Coder<WindowedValue<InputT>> wvCoder,
        final WindowingStrategy<?, W> windowingStrategy,
        final SerializablePipelineOptions options,
        final List<Integer> sourceIds,
        final String transformFullName) {

    final PairDStreamFunctions<ByteArray, byte[]> pairDStream =
        buildPairDStream(inputDStream, keyCoder, wvCoder);

    // use updateStateByKey to scan through the state and update elements and timers.
    final UpdateStateByKeyFunction<K, InputT, W> updateFunc =
        new UpdateStateByKeyFunction<>(
            sourceIds,
            windowingStrategy,
            (FullWindowedValueCoder<InputT>) wvCoder,
            keyCoder,
            options,
            transformFullName);

    final DStream<
            Tuple2</*K*/ ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/ List<byte[]>>>>
        firedStream =
            pairDStream.updateStateByKey(
                updateFunc,
                pairDStream.defaultPartitioner(pairDStream.defaultPartitioner$default$1()),
                true,
                JavaSparkContext$.MODULE$.fakeClassTag());

    checkpointIfNeeded(firedStream, options);

    // filter state-only output (nothing to fire) and remove the state from the output.
    return stripStateValues(firedStream, keyCoder, (FullWindowedValueCoder<InputT>) wvCoder);
}
Example #11
Source File: SparkUnboundedSource.java From beam with Apache License 2.0
ReadReportDStream(
        DStream<Metadata> parent, int inputDStreamId, String sourceName, String stepName) {
    super(parent.ssc(), JavaSparkContext$.MODULE$.fakeClassTag());
    this.parent = parent;
    this.inputDStreamId = inputDStreamId;
    this.sourceName = sourceName;
    this.stepName = stepName;
}
Example #12
Source File: ReceiverLauncher.java From kafka-spark-consumer with Apache License 2.0
public static DStream<MessageAndMetadata<byte[]>> launch(
        StreamingContext ssc, Properties pros, int numberOfReceivers, StorageLevel storageLevel) {
    JavaStreamingContext jsc = new JavaStreamingContext(ssc);
    return createStream(jsc, pros, numberOfReceivers, storageLevel,
            new IdentityMessageHandler()).dstream();
}
Example #13
Source File: ReceiverLauncher.java From kafka-spark-consumer with Apache License 2.0
public static <E> DStream<MessageAndMetadata<E>> launch(
        StreamingContext ssc, Properties pros, int numberOfReceivers,
        StorageLevel storageLevel, KafkaMessageHandler<E> messageHandler) {
    JavaStreamingContext jsc = new JavaStreamingContext(ssc);
    return createStream(jsc, pros, numberOfReceivers, storageLevel, messageHandler).dstream();
}
Example #14
Source File: SparkUnboundedSource.java From beam with Apache License 2.0
@Override
public scala.collection.immutable.List<DStream<?>> dependencies() {
    return scala.collection.JavaConversions.asScalaBuffer(
            Collections.<DStream<?>>singletonList(parent))
        .toList();
}
Example #15
Source File: FraudDetectionApp.java From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License
public static void main(String[] args) throws Exception {
    String brokers = "localhost:9092";
    String topics = "iplog";
    CacheIPLookup cacheIPLookup = new CacheIPLookup();

    SparkConf sparkConf = new SparkConf().setAppName("IP_FRAUD");
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(2));

    Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
    Map<String, String> kafkaConfiguration = new HashMap<>();
    kafkaConfiguration.put("metadata.broker.list", brokers);
    kafkaConfiguration.put("group.id", "ipfraud");
    kafkaConfiguration.put("auto.offset.reset", "smallest");

    JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(
            javaStreamingContext,
            String.class,
            String.class,
            StringDecoder.class,
            StringDecoder.class,
            kafkaConfiguration,
            topicsSet);

    JavaDStream<String> lines = messages.map(Tuple2::_2);

    JavaDStream<String> fraudIPs = lines.filter(new Function<String, Boolean>() {
        @Override
        public Boolean call(String s) throws Exception {
            String IP = s.split(" ")[0];
            String[] ranges = IP.split("\\.");
            String range = null;
            try {
                range = ranges[0] + "." + ranges[1];
            } catch (ArrayIndexOutOfBoundsException ex) {
            }
            return cacheIPLookup.isFraudIP(range);
        }
    });

    DStream<String> fraudDstream = fraudIPs.dstream();
    fraudDstream.saveAsTextFiles("FraudRecord", "");

    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}
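The drop to fraudIPs.dstream() at the end is only there to reach the Scala-side saveAsTextFiles(prefix, suffix), which writes one output directory per batch, named from the prefix and the batch time. A hedged alternative that stays in the Java API is sketched below; it reuses fraudIPs from the example above, assumes a Spark version where the two-argument foreachRDD overload (taking the batch Time) is available, and the output base path is a placeholder.

// Write each micro-batch with JavaRDD.saveAsTextFile, keyed by batch time.
fraudIPs.foreachRDD((rdd, time) -> {
    if (!rdd.isEmpty()) {
        rdd.saveAsTextFile("/tmp/fraud-records/" + time.milliseconds());
    }
});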
Example #16
Source File: KafkaSource.java From sylph with Apache License 2.0
public JavaDStream<Row> createSource(JavaStreamingContext ssc, KafkaSourceConfig config, SourceContext context) {
    String topics = config.getTopics();
    String brokers = config.getBrokers();  //the Kafka hosts must be resolvable from the machine running this job
    String groupId = config.getGroupid();  //consumer group name
    String offsetMode = config.getOffsetMode();

    Map<String, Object> kafkaParams = new HashMap<>(config.getOtherConfig());
    kafkaParams.put("bootstrap.servers", brokers);
    kafkaParams.put("key.deserializer", ByteArrayDeserializer.class);  //StringDeserializer
    kafkaParams.put("value.deserializer", ByteArrayDeserializer.class);  //StringDeserializer
    kafkaParams.put("enable.auto.commit", false);  //do not auto-commit offsets
    // "fetch.message.max.bytes" ->
    // "session.timeout.ms" -> "30000",     //session timeout, 30 seconds by default
    // "heartbeat.interval.ms" -> "5000",   //heartbeat interval
    kafkaParams.put("group.id", groupId);  //each stream must use a distinct group.id, otherwise offset commits will fail
    kafkaParams.put("auto.offset.reset", offsetMode);  //latest or earliest

    List<String> topicSets = Arrays.asList(topics.split(","));
    JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream = KafkaUtils.createDirectStream(
            ssc,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.Subscribe(topicSets, kafkaParams));

    DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset =
            new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream()) {
                @Override
                public void commitOffsets(RDD<?> kafkaRdd) {
                    OffsetRange[] offsetRanges = ((HasOffsetRanges) kafkaRdd).offsetRanges();
                    log().info("commitKafkaOffsets {}", (Object) offsetRanges);
                    DStream<?> firstDStream = DStreamUtil.getFirstDStream(inputStream.dstream());
                    ((CanCommitOffsets) firstDStream).commitAsync(offsetRanges);
                }
            };
    JavaDStream<ConsumerRecord<byte[], byte[]>> javaDStream =
            new JavaDStream<>(sylphKafkaOffset, ClassTag$.MODULE$.apply(ConsumerRecord.class));

    if ("json".equalsIgnoreCase(config.getValueType())) {
        JsonSchema jsonParser = new JsonSchema(context.getSchema());
        return javaDStream
                .map(record -> jsonParser.deserialize(record.key(), record.value(),
                        record.topic(), record.partition(), record.offset()));
    } else {
        List<String> names = context.getSchema().getFieldNames();
        return javaDStream
                .map(record -> {
                    Object[] values = new Object[names.size()];
                    for (int i = 0; i < names.size(); i++) {
                        switch (names.get(i)) {
                            case "_topic":
                                values[i] = record.topic();
                                continue;
                            case "_message":
                                values[i] = new String(record.value(), UTF_8);
                                continue;
                            case "_key":
                                values[i] = record.key() == null ? null : new String(record.key(), UTF_8);
                                continue;
                            case "_partition":
                                values[i] = record.partition();
                                continue;
                            case "_offset":
                                values[i] = record.offset();
                                continue;
                            case "_timestamp":
                                values[i] = record.timestamp();
                                continue;
                            case "_timestampType":
                                values[i] = record.timestampType().id;
                                continue;
                            default:
                                values[i] = null;
                        }
                    }
                    return new GenericRow(values);  //GenericRowWithSchema
                });  //.window(Duration(10 * 1000))
    }
}
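A hedged sketch of how the JavaDStream<Row> returned above might be consumed, mirroring Example #2: register each micro-batch as a temporary view and query it with Spark SQL. Here rows and schema stand for the returned stream and its StructType, the view name and query are illustrative only, and the _topic column assumes the schema includes that field from the mapping above.

SparkSession spark = SparkSession.builder()
        .config(rows.context().sparkContext().getConf())
        .getOrCreate();
rows.foreachRDD(rdd -> {
    // Each batch becomes a temporary view that downstream SQL can query.
    Dataset<Row> df = spark.createDataFrame(rdd, schema);
    df.createOrReplaceTempView("kafka_source");
    spark.sql("SELECT _topic, count(*) AS cnt FROM kafka_source GROUP BY _topic").show();
});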
Example #17
Source File: KafkaSource08.java From sylph with Apache License 2.0
private static JavaDStream<ConsumerRecord<byte[], byte[]>> settingCommit(
        JavaInputDStream<ConsumerRecord<byte[], byte[]>> inputStream,
        Map<String, String> kafkaParams,
        KafkaCluster kafkaCluster,
        String groupId) {
    if (kafkaParams.getOrDefault("auto.commit.enable", "true").equals("false")) {
        return inputStream;
    }

    int commitInterval = Integer.parseInt(
            kafkaParams.getOrDefault(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "90000"));

    DStream<ConsumerRecord<byte[], byte[]>> sylphKafkaOffset =
            new SylphKafkaOffset<ConsumerRecord<byte[], byte[]>>(inputStream.inputDStream()) {
                private final KafkaOffsetCommitter kafkaOffsetCommitter = new KafkaOffsetCommitter(
                        kafkaCluster,
                        groupId,
                        commitInterval);

                @Override
                public void initialize(Time time) {
                    super.initialize(time);
                    kafkaOffsetCommitter.setName("Kafka_Offset_Committer");
                    kafkaOffsetCommitter.start();
                }

                @Override
                public void commitOffsets(RDD<?> kafkaRdd) {
                    OffsetRange[] offsets = ((HasOffsetRanges) kafkaRdd).offsetRanges();
                    // Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
                    //         .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
                    //log().info("commit Kafka Offsets {}", internalOffsets);
                    kafkaOffsetCommitter.addAll(offsets);
                }
            };

    JavaDStream<ConsumerRecord<byte[], byte[]>> dStream = new JavaDStream<>(
            sylphKafkaOffset, ClassTag$.MODULE$.<ConsumerRecord<byte[], byte[]>>apply(ConsumerRecord.class));
    return dStream;

    // inputStream = inputStream.transform(rdd -> {
    //     OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
    //     Map<TopicAndPartition, Long> internalOffsets = Arrays.stream(offsets)
    //             .collect(Collectors.toMap(k -> k.topicAndPartition(), v -> v.fromOffset()));
    //     commitKafkaOffsets(kafkaCluster, groupId, internalOffsets);
    //     return rdd;
    // });
}
Example #18
Source File: DStreamUtil.java From sylph with Apache License 2.0
public static DStream<?> getFirstDStream(DStream<?> stream) {
    return getFirstDStream(stream, null);
}