Java Code Examples for org.apache.flink.api.java.utils.ParameterTool#getProperties()
The following examples show how to use org.apache.flink.api.java.utils.ParameterTool#getProperties(). Each example is taken from an open-source project; the source file and project are noted above the code.
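Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: build a ParameterTool (from command-line arguments or a properties file) and hand the result of getProperties() to a connector as java.util.Properties. The class name, the topic parameter, and the universal FlinkKafkaConsumer used here are illustrative assumptions, not taken from any of the projects below.

import java.util.Properties;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

public class GetPropertiesSketch {

    public static void main(String[] args) throws Exception {
        // e.g. --bootstrap.servers localhost:9092 --group.id demo --topic demo-topic
        ParameterTool parameterTool = ParameterTool.fromArgs(args);

        // getProperties() exposes every parameter as a java.util.Properties object,
        // which is the configuration type most connectors expect.
        Properties kafkaProps = parameterTool.getProperties();

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.addSource(new FlinkKafkaConsumer<>(
                parameterTool.getRequired("topic"),
                new SimpleStringSchema(),
                kafkaProps))
           .print();

        env.execute("getProperties() sketch");
    }
}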
Example 1
Source File: KafkaConfigUtil.java, from flink-learning (Apache License 2.0)
/**
 * Set up the Kafka configuration.
 *
 * @param parameterTool
 * @return
 */
public static Properties buildKafkaProps(ParameterTool parameterTool) {
    Properties props = parameterTool.getProperties();
    props.put("bootstrap.servers", parameterTool.get(PropertiesConstants.KAFKA_BROKERS, DEFAULT_KAFKA_BROKERS));
    props.put("zookeeper.connect", parameterTool.get(PropertiesConstants.KAFKA_ZOOKEEPER_CONNECT, DEFAULT_KAFKA_ZOOKEEPER_CONNECT));
    props.put("group.id", parameterTool.get(PropertiesConstants.KAFKA_GROUP_ID, DEFAULT_KAFKA_GROUP_ID));
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    props.put("auto.offset.reset", "latest");
    return props;
}
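A hedged usage sketch for this helper: the returned Properties can be passed straight into a Kafka source. The topic parameter name, the FlinkKafkaConsumer011 connector class, and the assumption that KafkaConfigUtil from Example 1 is on the classpath are all illustrative, not taken from the flink-learning project.

import java.util.Properties;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

public class BuildKafkaPropsUsage {

    public static void main(String[] args) throws Exception {
        ParameterTool parameterTool = ParameterTool.fromArgs(args);
        // KafkaConfigUtil is the helper from Example 1 (assumed to be on the classpath).
        Properties props = KafkaConfigUtil.buildKafkaProps(parameterTool);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.addSource(new FlinkKafkaConsumer011<>(
                parameterTool.get("metrics.topic", "metrics"), // hypothetical topic parameter
                new SimpleStringSchema(),
                props))
           .print();

        env.execute("buildKafkaProps() usage sketch");
    }
}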
Example 2
Source File: KinesisExampleTest.java, from Flink-CEPplus (Apache License 2.0)
public static void main(String[] args) throws Exception {
    LOG.info("System properties: {}", System.getProperties());
    final ParameterTool parameterTool = ParameterTool.fromArgs(args);
    String inputStream = parameterTool.getRequired("input-stream");
    String outputStream = parameterTool.getRequired("output-stream");

    PubsubClient pubsub = new KinesisPubsubClient(parameterTool.getProperties());
    pubsub.createTopic(inputStream, 2, parameterTool.getProperties());
    pubsub.createTopic(outputStream, 2, parameterTool.getProperties());

    // The example job needs to start after streams are created and run in parallel to the validation logic.
    // The thread that runs the job won't terminate, we don't have a job reference to cancel it.
    // Once results are validated, the driver main thread will exit; job/cluster will be terminated from script.
    final AtomicReference<Exception> executeException = new AtomicReference<>();
    Thread executeThread =
        new Thread(
            () -> {
                try {
                    KinesisExample.main(args);
                    // this message won't appear in the log,
                    // job is terminated when shutting down cluster
                    LOG.info("executed program");
                } catch (Exception e) {
                    executeException.set(e);
                }
            });
    executeThread.start();

    // generate input
    String[] messages = {
        "elephant,5,45218",
        "squirrel,12,46213",
        "bee,3,51348",
        "squirrel,22,52444",
        "bee,10,53412",
        "elephant,9,54867"
    };
    for (String msg : messages) {
        pubsub.sendMessage(inputStream, msg);
    }
    LOG.info("generated records");

    Deadline deadline = Deadline.fromNow(Duration.ofSeconds(60));
    List<String> results = pubsub.readAllMessages(outputStream);
    while (deadline.hasTimeLeft() && executeException.get() == null && results.size() < messages.length) {
        LOG.info("waiting for results..");
        Thread.sleep(1000);
        results = pubsub.readAllMessages(outputStream);
    }

    if (executeException.get() != null) {
        throw executeException.get();
    }

    LOG.info("results: {}", results);
    Assert.assertEquals("Results received from '" + outputStream + "': " + results,
        messages.length, results.size());

    String[] expectedResults = {
        "elephant,5,45218",
        "elephant,14,54867",
        "squirrel,12,46213",
        "squirrel,34,52444",
        "bee,3,51348",
        "bee,13,53412"
    };
    for (String expectedResult : expectedResults) {
        Assert.assertTrue(expectedResult, results.contains(expectedResult));
    }

    // TODO: main thread needs to create job or CLI fails with:
    // "The program didn't contain a Flink job. Perhaps you forgot to call execute() on the execution environment."
    System.out.println("test finished");
    System.exit(0);
}
Example 3
Source File: KinesisExample.java, from Flink-CEPplus (Apache License 2.0)
public static void main(String[] args) throws Exception {
    // parse input arguments
    final ParameterTool parameterTool = ParameterTool.fromArgs(args);
    StreamExecutionEnvironment env = KafkaExampleUtil.prepareExecutionEnv(parameterTool);

    String inputStream = parameterTool.getRequired("input-stream");
    String outputStream = parameterTool.getRequired("output-stream");

    FlinkKinesisConsumer<KafkaEvent> consumer = new FlinkKinesisConsumer<>(
        inputStream,
        new KafkaEventSchema(),
        parameterTool.getProperties());
    consumer.setPeriodicWatermarkAssigner(new CustomWatermarkExtractor());

    Properties producerProperties = new Properties(parameterTool.getProperties());
    // producer needs region even when URL is specified
    producerProperties.putIfAbsent(ConsumerConfigConstants.AWS_REGION, "us-east-1");
    // test driver does not deaggregate
    producerProperties.putIfAbsent("AggregationEnabled", String.valueOf(false));

    // KPL does not recognize endpoint URL..
    String kinesisUrl = producerProperties.getProperty(ConsumerConfigConstants.AWS_ENDPOINT);
    if (kinesisUrl != null) {
        URL url = new URL(kinesisUrl);
        producerProperties.put("KinesisEndpoint", url.getHost());
        producerProperties.put("KinesisPort", Integer.toString(url.getPort()));
        producerProperties.put("VerifyCertificate", "false");
    }

    FlinkKinesisProducer<KafkaEvent> producer = new FlinkKinesisProducer<>(
        new KafkaEventSchema(),
        producerProperties);
    producer.setDefaultStream(outputStream);
    producer.setDefaultPartition("fakePartition");

    DataStream<KafkaEvent> input = env
        .addSource(consumer)
        .keyBy("word")
        .map(new RollingAdditionMapper());

    input.addSink(producer);
    env.execute();
}
Example 4
Source File: KinesisExampleTest.java, from flink (Apache License 2.0)
public static void main(String[] args) throws Exception {
    LOG.info("System properties: {}", System.getProperties());
    final ParameterTool parameterTool = ParameterTool.fromArgs(args);
    String inputStream = parameterTool.getRequired("input-stream");
    String outputStream = parameterTool.getRequired("output-stream");

    PubsubClient pubsub = new KinesisPubsubClient(parameterTool.getProperties());
    pubsub.createTopic(inputStream, 2, parameterTool.getProperties());
    pubsub.createTopic(outputStream, 2, parameterTool.getProperties());

    // The example job needs to start after streams are created and run in parallel to the validation logic.
    // The thread that runs the job won't terminate, we don't have a job reference to cancel it.
    // Once results are validated, the driver main thread will exit; job/cluster will be terminated from script.
    final AtomicReference<Exception> executeException = new AtomicReference<>();
    Thread executeThread =
        new Thread(
            () -> {
                try {
                    KinesisExample.main(args);
                    // this message won't appear in the log,
                    // job is terminated when shutting down cluster
                    LOG.info("executed program");
                } catch (Exception e) {
                    executeException.set(e);
                }
            });
    executeThread.start();

    // generate input
    String[] messages = {
        "elephant,5,45218",
        "squirrel,12,46213",
        "bee,3,51348",
        "squirrel,22,52444",
        "bee,10,53412",
        "elephant,9,54867"
    };
    for (String msg : messages) {
        pubsub.sendMessage(inputStream, msg);
    }
    LOG.info("generated records");

    Deadline deadline = Deadline.fromNow(Duration.ofSeconds(60));
    List<String> results = pubsub.readAllMessages(outputStream);
    while (deadline.hasTimeLeft() && executeException.get() == null && results.size() < messages.length) {
        LOG.info("waiting for results..");
        Thread.sleep(1000);
        results = pubsub.readAllMessages(outputStream);
    }

    if (executeException.get() != null) {
        throw executeException.get();
    }

    LOG.info("results: {}", results);
    Assert.assertEquals("Results received from '" + outputStream + "': " + results,
        messages.length, results.size());

    String[] expectedResults = {
        "elephant,5,45218",
        "elephant,14,54867",
        "squirrel,12,46213",
        "squirrel,34,52444",
        "bee,3,51348",
        "bee,13,53412"
    };
    for (String expectedResult : expectedResults) {
        Assert.assertTrue(expectedResult, results.contains(expectedResult));
    }

    // TODO: main thread needs to create job or CLI fails with:
    // "The program didn't contain a Flink job. Perhaps you forgot to call execute() on the execution environment."
    System.out.println("test finished");
    System.exit(0);
}
Example 5
Source File: KinesisExample.java, from flink (Apache License 2.0)
public static void main(String[] args) throws Exception {
    // parse input arguments
    final ParameterTool parameterTool = ParameterTool.fromArgs(args);
    StreamExecutionEnvironment env = KafkaExampleUtil.prepareExecutionEnv(parameterTool);

    String inputStream = parameterTool.getRequired("input-stream");
    String outputStream = parameterTool.getRequired("output-stream");

    FlinkKinesisConsumer<KafkaEvent> consumer = new FlinkKinesisConsumer<>(
        inputStream,
        new KafkaEventSchema(),
        parameterTool.getProperties());
    consumer.setPeriodicWatermarkAssigner(new CustomWatermarkExtractor());

    Properties producerProperties = new Properties(parameterTool.getProperties());
    // producer needs region even when URL is specified
    producerProperties.putIfAbsent(ConsumerConfigConstants.AWS_REGION, "us-east-1");
    // test driver does not deaggregate
    producerProperties.putIfAbsent("AggregationEnabled", String.valueOf(false));

    // KPL does not recognize endpoint URL..
    String kinesisUrl = producerProperties.getProperty(ConsumerConfigConstants.AWS_ENDPOINT);
    if (kinesisUrl != null) {
        URL url = new URL(kinesisUrl);
        producerProperties.put("KinesisEndpoint", url.getHost());
        producerProperties.put("KinesisPort", Integer.toString(url.getPort()));
        producerProperties.put("VerifyCertificate", "false");
    }

    FlinkKinesisProducer<KafkaEvent> producer = new FlinkKinesisProducer<>(
        new KafkaEventSchema(),
        producerProperties);
    producer.setDefaultStream(outputStream);
    producer.setDefaultPartition("fakePartition");

    DataStream<KafkaEvent> input = env
        .addSource(consumer)
        .keyBy("word")
        .map(new RollingAdditionMapper());

    input.addSink(producer);
    env.execute();
}
Example 6
Source File: ReadFromKafka.java, from flinkDemo (Apache License 2.0)
public static void main(String[] args) throws Exception {
    // create execution environment
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    Map properties = new HashMap();
    properties.put("bootstrap.servers", "192.168.10.63:6667,192.168.10.64:6667,192.168.10.65:6667");
    properties.put("group.id", "dec-esc-group-vib-calc");
    properties.put("enable.auto.commit", "true");
    properties.put("auto.commit.interval.ms", "1000");
    properties.put("auto.offset.reset", "earliest");
    properties.put("session.timeout.ms", "30000");
    properties.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    properties.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    properties.put("topic", "dec-vibration-test");
    //KafkaConsumer<String,String> kafkaConsumer = new KafkaConsumer<String, String>(properties);

    // parse user parameters
    //ParameterTool parameterTool = ParameterTool.fromArgs(args);
    ParameterTool parameterTool = ParameterTool.fromMap(properties);

    FlinkKafkaConsumer010 consumer010 = new FlinkKafkaConsumer010(
        parameterTool.getRequired("topic"),
        new SimpleStringSchema(),
        parameterTool.getProperties());
    // consumer010.setStartFromEarliest();

    DataStream<String> messageStream = env
        .addSource(consumer010);

    // print() will write the contents of the stream to the TaskManager's standard out stream.
    // The rebalance call causes a repartitioning of the data so that all machines
    // see the messages (for example in cases when "num kafka partitions" < "num flink operators").
    messageStream.rebalance().map(new MapFunction<String, String>() {
        private static final long serialVersionUID = 1L;

        @Override
        public String map(String value) throws Exception {
            return value;
        }
    });

    messageStream.print();

    env.execute();
}
Example 7
Source File: StreamingETL.java, from flink-streaming-etl (Apache License 2.0)
public static void main(String[] args) throws Exception {
    // parse arguments
    ParameterTool params = ParameterTool.fromPropertiesFile(args[0]);

    // create streaming environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    // enable event time processing
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

    // enable fault-tolerance
    env.enableCheckpointing(1000);

    // enable restarts
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(50, 500L));

    env.setStateBackend(new FsStateBackend("file:///home/robert/flink-workdir/flink-streaming-etl/state-backend"));

    // run each operator separately
    env.disableOperatorChaining();

    // get data from Kafka
    Properties kParams = params.getProperties();
    kParams.setProperty("group.id", UUID.randomUUID().toString());
    DataStream<ObjectNode> inputStream = env
        .addSource(new FlinkKafkaConsumer09<>(params.getRequired("topic"), new JSONDeserializationSchema(), kParams)).name("Kafka 0.9 Source")
        .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ObjectNode>(Time.minutes(1L)) {
            @Override
            public long extractTimestamp(ObjectNode jsonNodes) {
                return jsonNodes.get("timestamp_ms").asLong();
            }
        }).name("Timestamp extractor");

    // filter out records without lang field
    DataStream<ObjectNode> tweetsWithLang = inputStream
        .filter(jsonNode -> jsonNode.has("user") && jsonNode.get("user").has("lang")).name("Filter records without 'lang' field");

    // select only lang = "en" tweets
    DataStream<ObjectNode> englishTweets = tweetsWithLang
        .filter(jsonNode -> jsonNode.get("user").get("lang").asText().equals("en")).name("Select 'lang'=en tweets");

    // write to file system
    RollingSink<ObjectNode> rollingSink = new RollingSink<>(params.get("sinkPath", "/home/robert/flink-workdir/flink-streaming-etl/rolling-sink"));
    rollingSink.setBucketer(new DateTimeBucketer("yyyy-MM-dd-HH-mm")); // do a bucket for each minute
    englishTweets.addSink(rollingSink).name("Rolling FileSystem Sink");

    // build aggregates (count per language) using window (10 seconds tumbling):
    DataStream<Tuple3<Long, String, Long>> languageCounts = tweetsWithLang
        .keyBy(jsonNode -> jsonNode.get("user").get("lang").asText())
        .timeWindow(Time.seconds(10))
        .apply(new Tuple3<>(0L, "", 0L), new JsonFoldCounter(), new CountEmitter()).name("Count per Langauage (10 seconds tumbling)");

    // write window aggregate to ElasticSearch
    List<InetSocketAddress> transportNodes = ImmutableList.of(new InetSocketAddress(InetAddress.getByName("localhost"), 9300));
    ElasticsearchSink<Tuple3<Long, String, Long>> elasticsearchSink = new ElasticsearchSink<>(params.toMap(), transportNodes, new ESRequest());

    languageCounts.addSink(elasticsearchSink).name("ElasticSearch2 Sink");

    // word-count on the tweet stream
    DataStream<Tuple2<Date, List<Tuple2<String, Long>>>> topWordCount = tweetsWithLang
        // get text from tweets
        .map(tweet -> tweet.get("text").asText()).name("Get text from Tweets")
        // split text into (word, 1) tuples
        .flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Long>> collector) throws Exception {
                String[] splits = s.split(" ");
                for (String sp : splits) {
                    collector.collect(new Tuple2<>(sp, 1L));
                }
            }
        }).name("Tokenize words")
        // group by word
        .keyBy(0)
        // build 1 min windows, compute every 10 seconds --> count word frequency
        .timeWindow(Time.minutes(1L), Time.seconds(10L)).apply(new WordCountingWindow()).name("Count word frequency (1 min, 10 sec sliding window)")
        // build top n every 10 seconds
        .timeWindowAll(Time.seconds(10L)).apply(new TopNWords(10)).name("TopN Window (10s)");

    // write top Ns to Kafka topic
    topWordCount.addSink(new FlinkKafkaProducer09<>(params.getRequired("wc-topic"), new ListSerSchema(), params.getProperties())).name("Write topN to Kafka");

    env.execute("Streaming ETL");
}
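Example 7 overrides group.id on the Properties returned by getProperties() before wiring up the Kafka source. In the Flink versions these snippets target, getProperties() appears to copy the parameters into a fresh java.util.Properties on each call, so per-connector tweaks like this should not leak back into the ParameterTool itself. The following minimal sketch illustrates that assumption:

import java.util.Collections;
import java.util.Properties;

import org.apache.flink.api.java.utils.ParameterTool;

public class GetPropertiesCopySketch {

    public static void main(String[] args) {
        ParameterTool params = ParameterTool.fromMap(Collections.singletonMap("group.id", "original"));

        // Tweak one copy for a specific source, as the streaming-ETL example does.
        Properties kafkaProps = params.getProperties();
        kafkaProps.setProperty("group.id", "per-job-" + System.nanoTime());

        // A second call still reflects the original value, i.e. the tool itself was not modified
        // (assumption: getProperties() copies the parameters into a new Properties object).
        System.out.println(params.getProperties().getProperty("group.id")); // expected: "original"
    }
}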
Example 8
Source File: KinesisExampleTest.java, from flink (Apache License 2.0)
public static void main(String[] args) throws Exception {
    LOG.info("System properties: {}", System.getProperties());
    final ParameterTool parameterTool = ParameterTool.fromArgs(args);
    String inputStream = parameterTool.getRequired("input-stream");
    String outputStream = parameterTool.getRequired("output-stream");

    KinesisPubsubClient pubsub = new KinesisPubsubClient(parameterTool.getProperties());
    pubsub.createTopic(inputStream, 2, parameterTool.getProperties());
    pubsub.createTopic(outputStream, 2, parameterTool.getProperties());

    // The example job needs to start after streams are created and run in parallel to the validation logic.
    // The thread that runs the job won't terminate, we don't have a job reference to cancel it.
    // Once results are validated, the driver main thread will exit; job/cluster will be terminated from script.
    final AtomicReference<Exception> executeException = new AtomicReference<>();
    Thread executeThread =
        new Thread(
            () -> {
                try {
                    KinesisExample.main(args);
                    // this message won't appear in the log,
                    // job is terminated when shutting down cluster
                    LOG.info("executed program");
                } catch (Exception e) {
                    executeException.set(e);
                }
            });
    executeThread.start();

    // generate input
    String[] messages = {
        "elephant,5,45218",
        "squirrel,12,46213",
        "bee,3,51348",
        "squirrel,22,52444",
        "bee,10,53412",
        "elephant,9,54867"
    };
    for (String msg : messages) {
        pubsub.sendMessage(inputStream, msg);
    }
    LOG.info("generated records");

    Deadline deadline = Deadline.fromNow(Duration.ofSeconds(60));
    List<String> results = pubsub.readAllMessages(outputStream);
    while (deadline.hasTimeLeft() && executeException.get() == null && results.size() < messages.length) {
        LOG.info("waiting for results..");
        Thread.sleep(1000);
        results = pubsub.readAllMessages(outputStream);
    }

    if (executeException.get() != null) {
        throw executeException.get();
    }

    LOG.info("results: {}", results);
    Assert.assertEquals("Results received from '" + outputStream + "': " + results,
        messages.length, results.size());

    String[] expectedResults = {
        "elephant,5,45218",
        "elephant,14,54867",
        "squirrel,12,46213",
        "squirrel,34,52444",
        "bee,3,51348",
        "bee,13,53412"
    };
    for (String expectedResult : expectedResults) {
        Assert.assertTrue(expectedResult, results.contains(expectedResult));
    }

    // TODO: main thread needs to create job or CLI fails with:
    // "The program didn't contain a Flink job. Perhaps you forgot to call execute() on the execution environment."
    System.out.println("test finished");
    System.exit(0);
}