org.apache.spark.streaming.Duration Java Examples
The following examples show how to use
org.apache.spark.streaming.Duration.
Each example is taken from an open-source project; the project and source file are noted above the code.
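Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the two common ways a Duration is constructed — directly in milliseconds, or via the Durations helper — and how it is passed to a JavaStreamingContext as the batch interval. The application name, host, and port are placeholders.

    import org.apache.spark.SparkConf;
    import org.apache.spark.streaming.Duration;
    import org.apache.spark.streaming.Durations;
    import org.apache.spark.streaming.api.java.JavaDStream;
    import org.apache.spark.streaming.api.java.JavaStreamingContext;

    public class DurationSketch {
        public static void main(String[] args) throws InterruptedException {
            // A Duration is a length of time expressed in milliseconds.
            Duration batchInterval = new Duration(2000);     // 2000 ms
            Duration windowLength  = Durations.seconds(30);  // helper: 30 s
            Duration slideInterval = Durations.seconds(10);  // helper: 10 s

            SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("DurationSketch");
            // The Duration passed here is the batch interval: how often a micro-batch is produced.
            JavaStreamingContext ssc = new JavaStreamingContext(conf, batchInterval);

            // Window and slide durations must be multiples of the batch interval.
            JavaDStream<String> lines = ssc.socketTextStream("localhost", 9999);
            JavaDStream<String> windowed = lines.window(windowLength, slideInterval);
            windowed.count().print();

            ssc.start();
            ssc.awaitTermination();
        }
    }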
Example #1
Source File: StreamingRsvpsDStreamCountWindow.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

public static void main(String[] args) throws InterruptedException {
    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME)
            .set("spark.mongodb.output.uri", MONGODB_OUTPUT_URI)
            .set("spark.streaming.kafka.consumer.cache.enabled", "false");

    final JavaStreamingContext streamingContext =
            new JavaStreamingContext(conf, new Duration(BATCH_DURATION_INTERVAL_MS));

    streamingContext.checkpoint(CHECKPOINT_FOLDER);

    final JavaInputDStream<ConsumerRecord<String, String>> meetupStream =
            KafkaUtils.createDirectStream(
                    streamingContext,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.<String, String>Subscribe(TOPICS, KAFKA_CONSUMER_PROPERTIES)
            );

    // transformations, streaming algorithms, etc
    JavaDStream<Long> countStream = meetupStream.countByWindow(
            new Duration(WINDOW_LENGTH_MS), new Duration(SLIDING_INTERVAL_MS));

    countStream.foreachRDD((JavaRDD<Long> countRDD) -> {
        MongoSpark.save(
                countRDD.map(
                        r -> Document.parse("{\"rsvps_count\":\"" + String.valueOf(r) + "\"}")
                )
        );
    });

    // some time later, after outputs have completed
    meetupStream.foreachRDD((JavaRDD<ConsumerRecord<String, String>> meetupRDD) -> {
        OffsetRange[] offsetRanges = ((HasOffsetRanges) meetupRDD.rdd()).offsetRanges();

        ((CanCommitOffsets) meetupStream.inputDStream())
                .commitAsync(offsetRanges, new MeetupOffsetCommitCallback());
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}
Example #2
Source File: SparkStreamingBinding.java From datacollector with Apache License 2.0

@Override
@SuppressWarnings("unchecked")
public JavaStreamingContext create() {
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", String.valueOf(maxRatePerPartition));
    // Use our classpath first, since we ship a newer version of Jackson and possibly other deps in the future.
    sparkConf.set("spark.driver.userClassPathFirst", "true");
    sparkConf.set("spark.executor.userClassPathFirst", "true");

    session = SparkSession.builder().config(sparkConf).getOrCreate();
    JavaStreamingContext result =
            new JavaStreamingContext(new JavaSparkContext(session.sparkContext()), new Duration(duration));

    Map<String, Object> props = new HashMap<>();
    props.put("group.id", groupId);
    props.put("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");

    for (Map.Entry<String, Object> map : props.entrySet()) {
        logMessage(Utils.format("Adding extra kafka config, {}:{}", map.getKey(), map.getValue()), isRunningInMesos);
    }
    logMessage("Meta data broker list " + metaDataBrokerList, isRunningInMesos);
    logMessage("Topic is " + topic, isRunningInMesos);
    logMessage("Auto offset reset is set to " + autoOffsetValue, isRunningInMesos);

    return createDStream(result, props);
}
Example #3
Source File: Flags.java From SparkToParquet with Apache License 2.0

public static void setFromCommandLineArgs(Options options, String[] args) {
    CommandLineParser parser = new PosixParser();
    try {
        CommandLine cl = parser.parse(options, args);
        // Default parameter values
        THE_INSTANCE.windowLength = new Duration(
                Integer.parseInt(cl.getOptionValue(AppMain.WINDOW_LENGTH, "30")) * 1000);
        THE_INSTANCE.slideInterval = new Duration(
                Integer.parseInt(cl.getOptionValue(AppMain.SLIDE_INTERVAL, "5")) * 1000);
        THE_INSTANCE.kafka_broker = cl.getOptionValue(AppMain.KAFKA_BROKER, "kafka:9092");
        THE_INSTANCE.kafka_topic = cl.getOptionValue(AppMain.KAFKA_TOPIC, "apache");
        THE_INSTANCE.parquet_file = cl.getOptionValue(AppMain.PARQUET_FILE, "/user/spark/");
        THE_INSTANCE.initialized = true;
    } catch (ParseException e) {
        THE_INSTANCE.initialized = false;
        System.err.println("Parsing failed. Reason: " + e.getMessage());
    }
}
Example #4
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0

private JavaStreamingContext create(String streamingContextName, int port, long streamingBatchTime, String sparkHost) {
    SparkConf conf = new SparkConf();
    conf.set("spark.ui.port", String.valueOf(port));
    conf.setAppName(streamingContextName);
    conf.setJars(JavaStreamingContext.jarOfClass(StreamingEngine.class));
    conf.setMaster(sparkHost);
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.registerKryoClasses(new Class[] { StratioStreamingMessage.class, InsertMessage.class,
            ColumnType.class, Action.class });

    HashMap<String, String> tuningProperties = configurationContext.getSparkTunningProperties();
    if (tuningProperties != null && tuningProperties.size() > 0) {
        tuningProperties.forEach((key, value) -> conf.set(key, value));
    }

    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(streamingBatchTime));

    return streamingContext;
}
Example #5
Source File: SparkStreamServiceImpl.java From searchanalytics-bigdata with MIT License

@Override
public void setup() {
    // Create a StreamingContext with a SparkConf configuration
    SparkConf sparkConf = new SparkConf(false)
            .setAppName("JaiSpark")
            .setSparkHome("target/sparkhome")
            .setMaster("local")
            .set("spark.executor.memory", "128m")
            .set("spark.local.dir", new File("target/sparkhome/tmp").getAbsolutePath())
            .set("spark.cores.max", "2").set("spark.akka.threads", "2")
            .set("spark.akka.timeout", "60").set("spark.logConf", "true")
            .set("spark.cleaner.delay", "3700")
            .set("spark.cleaner.ttl", "86400")
            .set("spark.shuffle.spill", "false")
            .set("spark.driver.host", "localhost")
            .set("spark.driver.port", "43214");
    jssc = new JavaStreamingContext(sparkConf, new Duration(5000));

    String checkpointDir = hadoopClusterService.getHDFSUri() + "/sparkcheckpoint";
    jssc.checkpoint(checkpointDir);

    startFlumeStream();
}
Example #6
Source File: AbstractSparkLayer.java From oryx with Apache License 2.0

protected final JavaStreamingContext buildStreamingContext() {
    log.info("Starting SparkContext with interval {} seconds", generationIntervalSec);

    SparkConf sparkConf = new SparkConf();

    // Only for tests, really
    if (sparkConf.getOption("spark.master").isEmpty()) {
        log.info("Overriding master to {} for tests", streamingMaster);
        sparkConf.setMaster(streamingMaster);
    }
    // Only for tests, really
    if (sparkConf.getOption("spark.app.name").isEmpty()) {
        String appName = "Oryx" + getLayerName();
        if (id != null) {
            appName = appName + '-' + id;
        }
        log.info("Overriding app name to {} for tests", appName);
        sparkConf.setAppName(appName);
    }
    extraSparkConfig.forEach((key, value) -> sparkConf.setIfMissing(key, value.toString()));

    // Turn this down to prevent long blocking at shutdown
    sparkConf.setIfMissing(
            "spark.streaming.gracefulStopTimeout",
            Long.toString(TimeUnit.MILLISECONDS.convert(generationIntervalSec, TimeUnit.SECONDS)));
    sparkConf.setIfMissing("spark.cleaner.ttl", Integer.toString(20 * generationIntervalSec));

    long generationIntervalMS = TimeUnit.MILLISECONDS.convert(generationIntervalSec, TimeUnit.SECONDS);

    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate(sparkConf));
    return new JavaStreamingContext(jsc, new Duration(generationIntervalMS));
}
Example #7
Source File: JavaHBaseStreamingBulkPutExample.java From learning-hadoop with Apache License 2.0

public static void main(String args[]) {
    if (args.length == 0) {
        System.out.println(
                "JavaHBaseStreamingBulkPutExample {master} {host} {port} {tableName} {columnFamily}");
        return;
    }

    String master = args[0];
    String host = args[1];
    String port = args[2];
    String tableName = args[3];
    String columnFamily = args[4];

    System.out.println("master:" + master);
    System.out.println("host:" + host);
    System.out.println("port:" + Integer.parseInt(port));
    System.out.println("tableName:" + tableName);
    System.out.println("columnFamily:" + columnFamily);

    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.cleaner.ttl", "120000");

    JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseBulkPutExample");
    jsc.addJar("SparkHBase.jar");

    JavaStreamingContext jssc = new JavaStreamingContext(jsc, new Duration(1000));

    JavaReceiverInputDStream<String> javaDstream =
            jssc.socketTextStream(host, Integer.parseInt(port));

    Configuration conf = HBaseConfiguration.create();
    conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
    conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

    JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

    hbaseContext.streamBulkPut(javaDstream, tableName, new PutFunction(), true);
}
Example #8
Source File: SparkRunnerStreamingContextFactory.java From beam with Apache License 2.0

@Override
public JavaStreamingContext call() throws Exception {
    LOG.info("Creating a new Spark Streaming Context");
    // validate unbounded read properties.
    checkArgument(
            options.getMinReadTimeMillis() < options.getBatchIntervalMillis(),
            "Minimum read time has to be less than batch time.");
    checkArgument(
            options.getReadTimePercentage() > 0 && options.getReadTimePercentage() < 1,
            "Read time percentage is bound to (0, 1).");

    SparkPipelineTranslator translator =
            new StreamingTransformTranslator.Translator(new TransformTranslator.Translator());
    Duration batchDuration = new Duration(options.getBatchIntervalMillis());
    LOG.info("Setting Spark streaming batchDuration to {} msec", batchDuration.milliseconds());

    JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
    JavaStreamingContext jssc = new JavaStreamingContext(jsc, batchDuration);

    // We must first init accumulators since translators expect them to be instantiated.
    SparkRunner.initAccumulators(options, jsc);
    // do not need to create a MetricsPusher instance here because it is called in SparkRunner.run()

    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
    // update cache candidates
    SparkRunner.updateCacheCandidates(pipeline, translator, ctxt);
    pipeline.traverseTopologically(new SparkRunner.Evaluator(translator, ctxt));
    ctxt.computeOutputs();

    checkpoint(jssc, checkpointDir);

    return jssc;
}
Example #9
Source File: SparkGroupAlsoByWindowViaWindowSet.java From beam with Apache License 2.0

private static void checkpointIfNeeded(
        final DStream<Tuple2<ByteArray, Tuple2<StateAndTimers, List<byte[]>>>> firedStream,
        final SerializablePipelineOptions options) {

    final Long checkpointDurationMillis = getBatchDuration(options);

    if (checkpointDurationMillis > 0) {
        firedStream.checkpoint(new Duration(checkpointDurationMillis));
    }
}
Example #10
Source File: KafkaInput.java From envelope with Apache License 2.0

@Override
public JavaDStream<?> getDStream() throws Exception {
    if (dStream == null) {
        JavaStreamingContext jssc = Contexts.getJavaStreamingContext();
        Map<TopicPartition, Long> lastOffsets = null;
        if (doesRecordProgress(config) && !usingKafkaManagedOffsets(config)) {
            lastOffsets = getLastOffsets();
        }

        if (lastOffsets != null) {
            dStream = KafkaUtils.createDirectStream(jssc, LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.Subscribe(topics, kafkaParams, lastOffsets));
        } else {
            dStream = KafkaUtils.createDirectStream(jssc, LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.Subscribe(topics, kafkaParams));
        }

        if (ConfigUtils.getOrElse(config, WINDOW_ENABLED_CONFIG, false)) {
            int windowDuration = config.getInt(WINDOW_MILLISECONDS_CONFIG);
            if (config.hasPath(WINDOW_SLIDE_MILLISECONDS_CONFIG)) {
                int slideDuration = config.getInt(WINDOW_SLIDE_MILLISECONDS_CONFIG);
                dStream = dStream.window(new Duration(windowDuration), new Duration(slideDuration));
            } else {
                dStream = dStream.window(new Duration(windowDuration));
            }
        }
    }

    return dStream;
}
Example #11
Source File: SparkStreamingFromNetworkExample.java From SparkOnALog with Apache License 2.0

public static void main(String[] args) {
    if (args.length < 3) {
        System.err.println("Usage: NetworkWordCount <master> <hostname> <port>\n" +
                "In local mode, <master> should be 'local[n]' with n > 1");
        System.exit(1);
    }

    // Create the context with a 5 second batch size
    JavaStreamingContext ssc = new JavaStreamingContext(args[0], "NetworkWordCount",
            new Duration(5000), System.getenv("SPARK_HOME"), System.getenv("SPARK_EXAMPLES_JAR"));

    // Create a NetworkInputDStream on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    JavaDStream<String> lines = ssc.socketTextStream(args[1], Integer.parseInt(args[2]));

    lines.map(new Function<String, String>() {
        @Override
        public String call(String arg0) throws Exception {
            System.out.println("arg0" + arg0);
            return arg0;
        }
    }).print();

    lines.print();

    ssc.start();
}
Example #12
Source File: JavaCustomReceiver.java From SparkDemo with MIT License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaCustomReceiver <hostname> <port>");
        System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();

    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

    // Create an input stream with the custom receiver on target ip:port and count the
    // words in input stream of \n delimited text (eg. generated by 'nc')
    JavaReceiverInputDStream<String> lines = ssc.receiverStream(
            new JavaCustomReceiver(args[0], Integer.parseInt(args[1])));
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<>(s, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}
Example #13
Source File: SparkStreamingJob.java From zipkin-sparkstreaming with Apache License 2.0

@Memoized
JavaStreamingContext jsc() {
    SparkConf conf = new SparkConf(true)
            .setMaster(master())
            .setAppName(getClass().getName());
    if (!jars().isEmpty()) conf.setJars(jars().toArray(new String[0]));
    for (Map.Entry<String, String> entry : conf().entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }
    return new JavaStreamingContext(conf, new Duration(batchDuration()));
}
Example #14
Source File: SparkStreamingSqlEngine.java From sylph with Apache License 2.0

private static Serializable compile(String jobId, SqlFlow sqlFlow, ConnectorStore connectorStore,
        SparkJobConfig sparkJobConfig, URLClassLoader jobClassLoader)
        throws JVMException {
    int batchDuration = sparkJobConfig.getSparkStreamingBatchDuration();
    final AtomicBoolean isCompile = new AtomicBoolean(true);
    final Supplier<StreamingContext> appGetter = (Supplier<StreamingContext> & Serializable) () -> {
        logger.info("========create spark StreamingContext mode isCompile = " + isCompile.get() + "============");
        SparkConf sparkConf = isCompile.get() ?
                new SparkConf().setMaster("local[*]").setAppName("sparkCompile")
                : new SparkConf();
        SparkSession sparkSession = SparkSession.builder().config(sparkConf).getOrCreate();
        StreamingContext ssc = new StreamingContext(sparkSession.sparkContext(), Duration.apply(batchDuration));

        //build sql
        SqlAnalyse analyse = new SparkStreamingSqlAnalyse(ssc, connectorStore, isCompile.get());
        try {
            buildSql(analyse, jobId, sqlFlow);
        }
        catch (Exception e) {
            throwsException(e);
        }
        return ssc;
    };

    JVMLauncher<Boolean> launcher = JVMLaunchers.<Boolean>newJvm()
            .setConsole((line) -> System.out.println(new Ansi().fg(YELLOW).a("[" + jobId + "] ").fg(GREEN).a(line).reset()))
            .setCallable(() -> {
                System.out.println("************ job start ***************");
                appGetter.get();
                return true;
            })
            .addUserURLClassLoader(jobClassLoader)
            .setClassLoader(jobClassLoader)
            .notDepThisJvmClassPath()
            .build();
    launcher.startAndGet();
    isCompile.set(false);
    return (Serializable) appGetter;
}
Example #15
Source File: SparkScheduler.java From oodt with Apache License 2.0

public SparkScheduler(JobQueue queue) {
    SparkConf conf = new SparkConf();
    conf.setMaster(System.getProperty("resource.runner.spark.host", "local"));
    conf.setAppName("OODT Spark Job");

    URL location = SparkScheduler.class.getResource('/' + SparkScheduler.class.getName().replace('.', '/') + ".class");
    conf.setJars(new String[] { "../lib/cas-resource-0.8-SNAPSHOT.jar" });
    sc = new SparkContext(conf);
    ssc = new StreamingContext(sc, new Duration(10000));
    this.queue = queue;
}
Example #16
Source File: ReaderWriterExample.java From spliceengine with GNU Affero General Public License v3.0

public static void main(String[] args) throws Exception {
    final String dbUrl = args[0];
    final String hostname = args[1];
    final String port = args[2];
    final String inTargetSchema = args[3];
    final String inTargetTable = args[4];

    SparkConf conf = new SparkConf();

    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));

    JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

    SparkSession spark = SparkSession.builder().getOrCreate();

    // Create a SplicemachineContext based on the provided DB connection
    SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

    // Set target tablename and schemaname
    final String table = inTargetSchema + "." + inTargetTable;

    stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
        JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));

        Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));

        splicemachineContext.insert(df, table);
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #17
Source File: ReaderWriterExample.java From spliceengine with GNU Affero General Public License v3.0

public static void main(String[] args) throws Exception {
    final String dbUrl = args[0];
    final String hostname = args[1];
    final String port = args[2];
    final String inTargetSchema = args[3];
    final String inTargetTable = args[4];

    SparkConf conf = new SparkConf();

    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(500));
    SpliceSpark.setContext(ssc.sparkContext());

    SparkSession spark = SpliceSpark.getSessionUnsafe();

    JavaReceiverInputDStream<String> stream = ssc.socketTextStream(hostname, Integer.parseInt(port));

    // Create a SplicemachineContext based on the provided DB connection
    SplicemachineContext splicemachineContext = new SplicemachineContext(dbUrl);

    // Set target tablename and schemaname
    final String table = inTargetSchema + "." + inTargetTable;

    stream.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> {
        JavaRDD<Row> rowRDD = rdd.map((Function<String, Row>) s -> RowFactory.create(s));

        Dataset<Row> df = spark.createDataFrame(rowRDD, splicemachineContext.getSchema(table));

        splicemachineContext.insert(df, table);
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #18
Source File: SparkUnboundedSource.java From beam with Apache License 2.0

private static void checkpointStream(JavaDStream<?> dStream, SparkPipelineOptions options) {
    long checkpointDurationMillis = options.getCheckpointDurationMillis();
    if (checkpointDurationMillis > 0) {
        dStream.checkpoint(new Duration(checkpointDurationMillis));
    }
}
Example #19
Source File: KafkaStreamRestHandler.java From elasticsearch-rest-command with The Unlicense

@Override
protected void handleRequest(RestRequest request, RestChannel channel, Client client) throws Exception {
    final String topic = request.param("topic", "");
    final boolean schema = request.paramAsBoolean("schema", false);
    final String master = request.param("masterAddress", "local");
    final String hdfs = request.param("hdfs", "hdfs://localhost:50070");
    final String memory = request.param("memory", "2g");
    final String appName = request.param("appName", "appName-" + topic);
    final int duration = request.paramAsInt("duration", 1000);

    Thread exec = new Thread(new Runnable() {
        @Override
        public void run() {
            SparkConf sparkConf = new SparkConf().setAppName(appName).setMaster(master)
                    .set("spark.executor.memory", memory);
            JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(duration));

            Map<String, Integer> topicMap = new HashMap<String, Integer>();
            topicMap.put(topic, 3);

            JavaPairReceiverInputDStream<String, byte[]> kafkaStream =
                    KafkaUtils.createStream(jssc, String.class, byte[].class,
                            kafka.serializer.DefaultDecoder.class, kafka.serializer.DefaultDecoder.class,
                            null, topicMap, StorageLevel.MEMORY_ONLY());

            //JobConf confHadoop = new JobConf();
            //confHadoop.set("mapred.output.compress", "true");
            //confHadoop.set("mapred.output.compression.codec", "com.hadoop.compression.lzo.LzopCodec");

            kafkaStream.saveAsHadoopFiles(hdfs, "seq", Text.class, BytesWritable.class,
                    KafkaStreamSeqOutputFormat.class);

            topicContextMap.put(topic, jssc);
            jssc.start();
            jssc.awaitTermination();
        }
    });

    exec.start();

    channel.sendResponse(new BytesRestResponse(RestStatus.OK,
            String.format("{\"topic\":\"%s\"}", topic)));
}
Example #20
Source File: Throughput.java From flink-perf with Apache License 2.0

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("throughput").setMaster("local[8]");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(2000));

    JavaReceiverInputDStream<Tuple4<Long, Integer, Long, byte[]>> source =
            ssc.receiverStream(new Source(StorageLevel.MEMORY_ONLY()));

    JavaPairDStream<Long, Tuple3<Integer, Long, byte[]>> kvsource = source.mapToPair(
            new PairFunction<Tuple4<Long, Integer, Long, byte[]>, Long, Tuple3<Integer, Long, byte[]>>() {
        @Override
        public Tuple2<Long, Tuple3<Integer, Long, byte[]>> call(
                Tuple4<Long, Integer, Long, byte[]> longIntegerLongTuple4) throws Exception {
            return new Tuple2<Long, Tuple3<Integer, Long, byte[]>>(longIntegerLongTuple4._1(),
                    new Tuple3<Integer, Long, byte[]>(longIntegerLongTuple4._2(),
                            longIntegerLongTuple4._3(), longIntegerLongTuple4._4()));
        }
    });

    JavaDStream<Long> res = kvsource.repartition(3).mapPartitions(
            new FlatMapFunction<Iterator<Tuple2<Long, Tuple3<Integer, Long, byte[]>>>, Long>() {
        @Override
        public Iterable<Long> call(Iterator<Tuple2<Long, Tuple3<Integer, Long, byte[]>>> tuple2Iterator)
                throws Exception {
            long start = System.currentTimeMillis();
            long received = 0;
            while (tuple2Iterator.hasNext()) {
                received++;
                Tuple2<Long, Tuple3<Integer, Long, byte[]>> el = tuple2Iterator.next();
                if (el._2()._2() != 0) {
                    long lat = System.currentTimeMillis() - el._2()._2();
                    System.out.println("Latency " + lat + " ms");
                }
            }
            long sinceMs = (System.currentTimeMillis() - start);
            System.out.println("Finished Batch. Processed " + received + " elements in " + sinceMs + " ms.");

            return new Iterable<Long>() {
                @Override
                public Iterator<Long> iterator() {
                    return new Iterator<Long>() {
                        @Override
                        public boolean hasNext() {
                            return false;
                        }

                        @Override
                        public Long next() {
                            return 1L;
                        }

                        @Override
                        public void remove() {
                        }
                    };
                }
            };
        }

        /*
        @Override
        public Long call(Tuple2<Long, Tuple3<Integer, Long, byte[]>> v1) throws Exception {
            // System.out.println("Recevied " + v1);
            if (start == 0) {
            }
            received++;
            if (received % logfreq == 0) {
                if (sinceSec == 0) {
                    System.out.println("received " + received + " elements since 0");
                    return 0L;
                }
                System.out.println("Received " + received + " elements since " + sinceSec + ". " +
                        "Elements per second " + received / sinceSec + ", GB received " +
                        ((received * (8 + 4 + 12)) / 1024 / 1024 / 1024));
            }
            if (v1._2()._2() != 0) {
                long lat = System.currentTimeMillis() - v1._2()._2();
                System.out.println("Latency " + lat + " ms");
            }
            return received;
        }
        */
    });

    //res.print();

    /*res.foreachRDD(new Function2<JavaRDD<Long>, Time, Void>() {
        @Override
        public Void call(JavaRDD<Long> integerJavaRDD, Time t) throws Exception {
            integerJavaRDD.saveAsTextFile("/home/robert/flink-workdir/flink-perf/out/" + t.toString());
            return null;
        }
    });
    */

    res.print();

    // res.print();

    ssc.start();
}
Example #21
Source File: SparkStreamingFromFlumeToHBaseExample.java From SparkOnALog with Apache License 2.0

public static void main(String[] args) {
    if (args.length == 0) {
        System.err.println(
                "Usage: SparkStreamingFromFlumeToHBaseExample {master} {host} {port} {table} {columnFamily}");
        System.exit(1);
    }

    String master = args[0];
    String host = args[1];
    int port = Integer.parseInt(args[2]);
    String tableName = args[3];
    String columnFamily = args[4];

    Duration batchInterval = new Duration(2000);

    JavaStreamingContext sc = new JavaStreamingContext(master, "FlumeEventCount",
            batchInterval, System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");

    final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
    final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);

    //JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
    JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);

    JavaPairDStream<String, Integer> lastCounts = flumeStream
            .flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
                @Override
                public Iterable<String> call(SparkFlumeEvent event) throws Exception {
                    String bodyString = new String(event.event().getBody().array(), "UTF-8");
                    return Arrays.asList(bodyString.split(" "));
                }
            }).map(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String str) throws Exception {
                    return new Tuple2(str, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer x, Integer y) throws Exception {
                    return x.intValue() + y.intValue();
                }
            });

    lastCounts.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
        @Override
        public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
            values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> tuple) throws Exception {
                    HBaseCounterIncrementor incrementor =
                            HBaseCounterIncrementor.getInstance(broadcastTableName.value(),
                                    broadcastColumnFamily.value());
                    incrementor.incerment("Counter", tuple._1(), tuple._2());
                    System.out.println("Counter:" + tuple._1() + "," + tuple._2());
                }
            });
            return null;
        }
    });

    sc.start();
}
Example #22
Source File: SparkStreamingFromFlumeToHBaseWindowingExample.java From SparkOnALog with Apache License 2.0

public static void main(String[] args) {
    if (args.length == 0) {
        System.err.println(
                "Usage: SparkStreamingFromFlumeToHBaseWindowingExample {master} {host} {port} {table} {columnFamily} {windowInSeconds} {slideInSeconds}");
        System.exit(1);
    }

    String master = args[0];
    String host = args[1];
    int port = Integer.parseInt(args[2]);
    String tableName = args[3];
    String columnFamily = args[4];
    int windowInSeconds = Integer.parseInt(args[5]);
    int slideInSeconds = Integer.parseInt(args[6]);

    Duration batchInterval = new Duration(2000);
    Duration windowInterval = new Duration(windowInSeconds * 1000);
    Duration slideInterval = new Duration(slideInSeconds * 1000);

    JavaStreamingContext sc = new JavaStreamingContext(master, "FlumeEventCount",
            batchInterval, System.getenv("SPARK_HOME"), "/home/cloudera/SparkOnALog.jar");

    final Broadcast<String> broadcastTableName = sc.sparkContext().broadcast(tableName);
    final Broadcast<String> broadcastColumnFamily = sc.sparkContext().broadcast(columnFamily);

    //JavaDStream<SparkFlumeEvent> flumeStream = sc.flumeStream(host, port);
    JavaDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(sc, host, port);

    JavaPairDStream<String, Integer> lastCounts = flumeStream
            .flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
                @Override
                public Iterable<String> call(SparkFlumeEvent event) throws Exception {
                    String bodyString = new String(event.event().getBody().array(), "UTF-8");
                    return Arrays.asList(bodyString.split(" "));
                }
            }).map(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String str) throws Exception {
                    return new Tuple2(str, 1);
                }
            }).reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer x, Integer y) throws Exception {
                    return x.intValue() + y.intValue();
                }
            }, windowInterval, slideInterval);

    lastCounts.foreach(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
        @Override
        public Void call(JavaPairRDD<String, Integer> values, Time time) throws Exception {
            values.foreach(new VoidFunction<Tuple2<String, Integer>>() {
                @Override
                public void call(Tuple2<String, Integer> tuple) throws Exception {
                    HBaseCounterIncrementor incrementor =
                            HBaseCounterIncrementor.getInstance(broadcastTableName.value(),
                                    broadcastColumnFamily.value());
                    incrementor.incerment("Counter", tuple._1(), tuple._2());
                    System.out.println("Counter:" + tuple._1() + "," + tuple._2());
                }
            });
            return null;
        }
    });

    sc.start();
}
Example #23
Source File: SparkUnboundedSource.java From beam with Apache License 2.0

@Override
public Duration slideDuration() {
    return parent.slideDuration();
}
Example #24
Source File: SparkRunner.java From jaeger-analytics-java with Apache License 2.0

public static void main(String[] args) throws InterruptedException, IOException {
    HTTPServer server = new HTTPServer(Integer.valueOf(getPropOrEnv("PROMETHEUS_PORT", "9111")));

    SparkConf sparkConf = new SparkConf()
            .setAppName("Trace DSL")
            .setMaster(getPropOrEnv("SPARK_MASTER", "local[*]"));

    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    JavaStreamingContext ssc = new JavaStreamingContext(sc,
            new Duration(Integer.parseInt(getPropOrEnv("SPARK_STREAMING_BATCH_DURATION", "5000"))));

    Set<String> topics = Collections.singleton(getPropOrEnv("KAFKA_JAEGER_TOPIC", "jaeger-spans"));
    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", getPropOrEnv("KAFKA_BOOTSTRAP_SERVER", "localhost:9092"));
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", ProtoSpanDeserializer.class);
    // hack to start always from beginning
    kafkaParams.put("group.id", "jaeger-trace-aggregation-" + System.currentTimeMillis());

    if (Boolean.parseBoolean(getPropOrEnv("KAFKA_START_FROM_BEGINNING", "true"))) {
        kafkaParams.put("auto.offset.reset", "earliest");
        kafkaParams.put("enable.auto.commit", false);
        kafkaParams.put("startingOffsets", "earliest");
    }

    JavaInputDStream<ConsumerRecord<String, Span>> messages =
            KafkaUtils.createDirectStream(
                    ssc,
                    LocationStrategies.PreferConsistent(),
                    ConsumerStrategies.Subscribe(topics, kafkaParams));

    JavaPairDStream<String, Span> traceIdSpanTuple = messages.mapToPair(record -> {
        return new Tuple2<>(record.value().traceId, record.value());
    });

    JavaDStream<Trace> tracesStream = traceIdSpanTuple.groupByKey().map(traceIdSpans -> {
        System.out.printf("traceID: %s\n", traceIdSpans._1);
        Iterable<Span> spans = traceIdSpans._2();
        Trace trace = new Trace();
        trace.traceId = traceIdSpans._1();
        trace.spans = StreamSupport.stream(spans.spliterator(), false)
                .collect(Collectors.toList());
        return trace;
    });

    MinimumClientVersion minimumClientVersion = MinimumClientVersion.builder()
            .withJavaVersion(getPropOrEnv("TRACE_QUALITY_JAVA_VERSION", "1.0.0"))
            .withGoVersion(getPropOrEnv("TRACE_QUALITY_GO_VERSION", "2.22.0"))
            .withNodeVersion(getPropOrEnv("TRACE_QUALITY_NODE_VERSION", "3.17.1"))
            .withPythonVersion(getPropOrEnv("TRACE_QUALITY_PYTHON_VERSION", "4.0.0"))
            .build();

    List<ModelRunner> modelRunner = Arrays.asList(
            new TraceHeight(),
            new ServiceDepth(),
            new ServiceHeight(),
            new NetworkLatency(),
            new NumberOfErrors(),
            new DirectDependencies(),
            // trace quality
            minimumClientVersion,
            new HasClientServerSpans(),
            new UniqueSpanId());

    tracesStream.foreachRDD((traceRDD, time) -> {
        traceRDD.foreach(trace -> {
            Graph graph = GraphCreator.create(trace);
            for (ModelRunner model : modelRunner) {
                model.runWithMetrics(graph);
            }
        });
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #25
Source File: Flags.java From SparkToParquet with Apache License 2.0

public Duration getSlideInterval() {
    return slideInterval;
}
Example #26
Source File: Flags.java From SparkToParquet with Apache License 2.0

public Duration getWindowLength() {
    return windowLength;
}
Example #27
Source File: KafkaReceiverWordCountJava.java From Building-Data-Streaming-Applications-with-Apache-Kafka with MIT License

public static void main(String[] args) throws Exception {
    String zkQuorum = "localhost:2181";
    String groupName = "stream";
    int numThreads = 3;
    String topicsName = "test1";
    SparkConf sparkConf = new SparkConf().setAppName("WordCountKafkaStream");

    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkConf, new Duration(5000));

    Map<String, Integer> topicToBeUsedBySpark = new HashMap<>();
    String[] topics = topicsName.split(",");
    for (String topic : topics) {
        topicToBeUsedBySpark.put(topic, numThreads);
    }

    JavaPairReceiverInputDStream<String, String> streamMessages =
            KafkaUtils.createStream(javaStreamingContext, zkQuorum, groupName, topicToBeUsedBySpark);

    JavaDStream<String> lines = streamMessages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(WORD_DELIMETER.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<>(s, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

    wordCounts.print();
    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}
Example #28
Source File: JavaKafkaWordCount.java From SparkDemo with MIT License

public static void main(String[] args) throws Exception {
    if (args.length < 4) {
        System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
        System.exit(1);
    }

    StreamingExamples.setStreamingLogLevels();
    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount");
    // Create the context with 2 seconds batch size
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

    int numThreads = Integer.parseInt(args[3]);
    Map<String, Integer> topicMap = new HashMap<>();
    String[] topics = args[2].split(",");
    for (String topic : topics) {
        topicMap.put(topic, numThreads);
    }

    JavaPairReceiverInputDStream<String, String> messages =
            KafkaUtils.createStream(jssc, args[0], args[1], topicMap);

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) {
                    return new Tuple2<>(s, 1);
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

    wordCounts.print();
    jssc.start();
    jssc.awaitTermination();
}
Example #29
Source File: JavaStreamingTestExample.java From SparkDemo with MIT License

public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: JavaStreamingTestExample " +
                "<dataDir> <batchDuration> <numBatchesTimeout>");
        System.exit(1);
    }

    String dataDir = args[0];
    Duration batchDuration = Seconds.apply(Long.parseLong(args[1]));
    int numBatchesTimeout = Integer.parseInt(args[2]);

    SparkConf conf = new SparkConf().setMaster("local").setAppName("StreamingTestExample");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, batchDuration);

    ssc.checkpoint(Utils.createTempDir(System.getProperty("java.io.tmpdir"), "spark").toString());

    // $example on$
    JavaDStream<BinarySample> data = ssc.textFileStream(dataDir).map(
            new Function<String, BinarySample>() {
                @Override
                public BinarySample call(String line) {
                    String[] ts = line.split(",");
                    boolean label = Boolean.parseBoolean(ts[0]);
                    double value = Double.parseDouble(ts[1]);
                    return new BinarySample(label, value);
                }
            });

    StreamingTest streamingTest = new StreamingTest()
            .setPeacePeriod(0)
            .setWindowSize(0)
            .setTestMethod("welch");

    JavaDStream<StreamingTestResult> out = streamingTest.registerStream(data);
    out.print();
    // $example off$

    // Stop processing if test becomes significant or we time out
    timeoutCounter = numBatchesTimeout;

    out.foreachRDD(new VoidFunction<JavaRDD<StreamingTestResult>>() {
        @Override
        public void call(JavaRDD<StreamingTestResult> rdd) {
            timeoutCounter -= 1;

            boolean anySignificant = !rdd.filter(new Function<StreamingTestResult, Boolean>() {
                @Override
                public Boolean call(StreamingTestResult v) {
                    return v.pValue() < 0.05;
                }
            }).isEmpty();

            if (timeoutCounter <= 0 || anySignificant) {
                rdd.context().stop();
            }
        }
    });

    ssc.start();
    ssc.awaitTermination();
}
Example #30
Source File: SylphKafkaOffset.java From sylph with Apache License 2.0

@Override
public Duration slideDuration() {
    return parent.slideDuration();
}