Java Code Examples for org.apache.flink.streaming.api.datastream.DataStream#flatMap()
The following examples show how to use org.apache.flink.streaming.api.datastream.DataStream#flatMap(). They are taken from open-source projects; each example notes its original project and source file so you can refer back to the full context.
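Before the project examples, here is a minimal, self-contained sketch of the basic flatMap pattern: each input record may produce zero or more output records through the Collector. The class name WordSplitJob and the sample sentences are illustrative placeholders, not taken from any of the projects listed below.

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class WordSplitJob {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // A small in-memory source; real jobs typically read from Kafka, files, sockets, etc.
        DataStream<String> lines = env.fromElements("to be or not to be", "that is the question");

        // flatMap may emit zero, one, or many records per input element via the Collector.
        DataStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> out) {
                for (String word : line.split("\\s+")) {
                    out.collect(word);
                }
            }
        });

        words.print();
        env.execute("flatMap word-splitting sketch");
    }
}

The project examples below follow the same shape, differing mainly in the source, the FlatMapFunction implementation, and the sink.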
Example 1
Source File: RescalingITCase.java From flink with Apache License 2.0

private static JobGraph createJobGraphWithKeyedState(
        int parallelism,
        int maxParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    if (0 < maxParallelism) {
        env.getConfig().setMaxParallelism(maxParallelism);
    }
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().setUseSnapshotCompression(true);

    DataStream<Integer> input = env.addSource(new SubtaskIndexSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 2
Source File: TwitterIntoKafka.java From flink-streaming-etl with Apache License 2.0

public static void main(String[] args) throws Exception {
    // set up the streaming execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    ParameterTool params = ParameterTool.fromPropertiesFile(args[0]);

    DataStream<String> twitterStreamString = env.addSource(new TwitterSource(params.getProperties()));
    DataStream<String> filteredStream = twitterStreamString.flatMap(new ParseJson());
    filteredStream.flatMap(new ThroughputLogger(5000L)).setParallelism(1);

    filteredStream.addSink(new FlinkKafkaProducer09<>("twitter", new SimpleStringSchema(), params.getProperties()));

    // execute program
    env.execute("Ingest data from Twitter to Kafka");
}
Example 3
Source File: SideStream.java From alchemy with Apache License 2.0

public static DataStream<Row> buildStream(StreamTableEnvironment env, SqlSelect sqlSelect, Alias leftAlias,
                                          Alias sideAlias, SourceDescriptor sideSource) throws Exception {
    SqlSelect leftSelect = SideParser.newSelect(sqlSelect, leftAlias.getTable(), leftAlias.getAlias(), true, false);
    // register leftTable
    Table leftTable = env.sqlQuery(leftSelect.toString());
    DataStream<Row> leftStream = env.toAppendStream(leftTable, Row.class);
    SqlSelect rightSelect = SideParser.newSelect(sqlSelect, sideAlias.getTable(), sideAlias.getAlias(), false, false);
    SqlJoin sqlJoin = (SqlJoin) sqlSelect.getFrom();
    List<String> equalFields = SideParser.findConditionFields(sqlJoin.getCondition(), leftAlias.getAlias());
    if (sideSource.getSide().isPartition()) {
        leftStream = leftStream.keyBy(equalFields.toArray(new String[equalFields.size()]));
    }
    RowTypeInfo sideType = createSideType(rightSelect.getSelectList(), sideSource.getSchema());
    RowTypeInfo returnType = createReturnType(leftTable.getSchema(), sideType);
    SideTable sideTable = createSideTable(leftTable.getSchema(), sideType, sqlJoin.getJoinType(), rightSelect,
        equalFields, sideAlias, sideSource.getSide());
    DataStream<Row> returnStream;
    if (sideSource.getSide().isAsync()) {
        AbstractAsyncSideFunction reqRow = sideSource.transform(sideTable);
        returnStream = AsyncDataStream.orderedWait(leftStream, reqRow, sideSource.getSide().getTimeout(),
            TimeUnit.MILLISECONDS, sideSource.getSide().getCapacity());
    } else {
        AbstractSyncSideFunction syncReqRow = sideSource.transform(sideTable);
        returnStream = leftStream.flatMap(syncReqRow);
    }
    returnStream.getTransformation().setOutputType(returnType);
    return returnStream;
}
Example 4
Source File: RescalingITCase.java From flink with Apache License 2.0

private static JobGraph createJobGraphWithKeyedState(
        int parallelism,
        int maxParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    if (0 < maxParallelism) {
        env.getConfig().setMaxParallelism(maxParallelism);
    }
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().setUseSnapshotCompression(true);

    DataStream<Integer> input = env.addSource(new SubtaskIndexSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 5
Source File: StreamingOperatorsITCase.java From flink with Apache License 2.0

@Test
public void testOperatorChainWithObjectReuseAndNoOutputOperators() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableObjectReuse();
    DataStream<Integer> input = env.fromElements(1, 2, 3);
    input.flatMap(new FlatMapFunction<Integer, Integer>() {
        @Override
        public void flatMap(Integer value, Collector<Integer> out) throws Exception {
            out.collect(value << 1);
        }
    });
    env.execute();
}
Example 6
Source File: RescalingITCase.java From flink with Apache License 2.0

private static JobGraph createJobGraphWithKeyedAndNonPartitionedOperatorState(
        int parallelism,
        int maxParallelism,
        int fixedParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.getConfig().setMaxParallelism(maxParallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());

    DataStream<Integer> input = env.addSource(new SubtaskIndexNonPartitionedStateSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .setParallelism(fixedParallelism)
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 7
Source File: RescalingITCase.java From flink with Apache License 2.0

private static JobGraph createJobGraphWithKeyedAndNonPartitionedOperatorState(
        int parallelism,
        int maxParallelism,
        int fixedParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.getConfig().setMaxParallelism(maxParallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());

    DataStream<Integer> input = env.addSource(new SubtaskIndexNonPartitionedStateSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .setParallelism(fixedParallelism)
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 8
Source File: KafkaConsumerTestBase.java From flink with Apache License 2.0

/**
 * Test that ensures that DeserializationSchema.isEndOfStream() is properly evaluated.
 *
 * @throws Exception
 */
public void runEndOfStreamTest() throws Exception {

    final int elementCount = 300;
    final String topic = writeSequence("testEndOfStream", elementCount, 1, 1);

    // read using custom schema
    final StreamExecutionEnvironment env1 = StreamExecutionEnvironment.getExecutionEnvironment();
    env1.setParallelism(1);
    env1.getConfig().setRestartStrategy(RestartStrategies.noRestart());
    env1.getConfig().disableSysoutLogging();

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    DataStream<Tuple2<Integer, Integer>> fromKafka =
        env1.addSource(kafkaServer.getConsumer(topic, new FixedNumberDeserializationSchema(elementCount), props));
    fromKafka.flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Void>() {
        @Override
        public void flatMap(Tuple2<Integer, Integer> value, Collector<Void> out) throws Exception {
            // noop ;)
        }
    });

    tryExecute(env1, "Consume " + elementCount + " elements from Kafka");

    deleteTestTopic(topic);
}
Example 9
Source File: StreamingOperatorsITCase.java From Flink-CEPplus with Apache License 2.0

@Test
public void testOperatorChainWithObjectReuseAndNoOutputOperators() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableObjectReuse();
    DataStream<Integer> input = env.fromElements(1, 2, 3);
    input.flatMap(new FlatMapFunction<Integer, Integer>() {
        @Override
        public void flatMap(Integer value, Collector<Integer> out) throws Exception {
            out.collect(value << 1);
        }
    });
    env.execute();
}
Example 10
Source File: RescalingITCase.java From Flink-CEPplus with Apache License 2.0

private static JobGraph createJobGraphWithKeyedAndNonPartitionedOperatorState(
        int parallelism,
        int maxParallelism,
        int fixedParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.getConfig().setMaxParallelism(maxParallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());

    DataStream<Integer> input = env.addSource(new SubtaskIndexNonPartitionedStateSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .setParallelism(fixedParallelism)
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 11
Source File: KafkaConsumerTestBase.java From Flink-CEPplus with Apache License 2.0

/**
 * Test that ensures that DeserializationSchema.isEndOfStream() is properly evaluated.
 *
 * @throws Exception
 */
public void runEndOfStreamTest() throws Exception {

    final int elementCount = 300;
    final String topic = writeSequence("testEndOfStream", elementCount, 1, 1);

    // read using custom schema
    final StreamExecutionEnvironment env1 = StreamExecutionEnvironment.getExecutionEnvironment();
    env1.setParallelism(1);
    env1.getConfig().setRestartStrategy(RestartStrategies.noRestart());
    env1.getConfig().disableSysoutLogging();

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    DataStream<Tuple2<Integer, Integer>> fromKafka =
        env1.addSource(kafkaServer.getConsumer(topic, new FixedNumberDeserializationSchema(elementCount), props));
    fromKafka.flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Void>() {
        @Override
        public void flatMap(Tuple2<Integer, Integer> value, Collector<Void> out) throws Exception {
            // noop ;)
        }
    });

    tryExecute(env1, "Consume " + elementCount + " elements from Kafka");

    deleteTestTopic(topic);
}
Example 12
Source File: StreamingOperatorsITCase.java From flink with Apache License 2.0

@Test
public void testOperatorChainWithObjectReuseAndNoOutputOperators() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableObjectReuse();
    DataStream<Integer> input = env.fromElements(1, 2, 3);
    input.flatMap(new FlatMapFunction<Integer, Integer>() {
        @Override
        public void flatMap(Integer value, Collector<Integer> out) throws Exception {
            out.collect(value << 1);
        }
    });
    env.execute();
}
Example 13
Source File: KafkaConsumerTestBase.java From flink with Apache License 2.0

public void runKeyValueTest() throws Exception {
    final String topic = "keyvaluetest";
    createTestTopic(topic, 1, 1);
    final int elementCount = 5000;

    // ----------- Write some data into Kafka -------------------

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());

    DataStream<Tuple2<Long, PojoValue>> kvStream = env.addSource(new SourceFunction<Tuple2<Long, PojoValue>>() {
        @Override
        public void run(SourceContext<Tuple2<Long, PojoValue>> ctx) throws Exception {
            Random rnd = new Random(1337);
            for (long i = 0; i < elementCount; i++) {
                PojoValue pojo = new PojoValue();
                pojo.when = new Date(rnd.nextLong());
                pojo.lon = rnd.nextLong();
                pojo.lat = i;
                // make every second key null to ensure proper "null" serialization
                Long key = (i % 2 == 0) ? null : i;
                ctx.collect(new Tuple2<>(key, pojo));
            }
        }

        @Override
        public void cancel() {
        }
    });

    KeyedSerializationSchema<Tuple2<Long, PojoValue>> schema =
        new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

    Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
    producerProperties.setProperty("retries", "3");
    kafkaServer.produceIntoKafka(kvStream, topic, schema, producerProperties, null);
    env.execute("Write KV to Kafka");

    // ----------- Read the data again -------------------

    env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());

    KafkaDeserializationSchema<Tuple2<Long, PojoValue>> readSchema =
        new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    DataStream<Tuple2<Long, PojoValue>> fromKafka = env.addSource(kafkaServer.getConsumer(topic, readSchema, props));

    fromKafka.flatMap(new RichFlatMapFunction<Tuple2<Long, PojoValue>, Object>() {
        long counter = 0;

        @Override
        public void flatMap(Tuple2<Long, PojoValue> value, Collector<Object> out) throws Exception {
            // the elements should be in order.
            Assert.assertTrue("Wrong value " + value.f1.lat, value.f1.lat == counter);
            if (value.f1.lat % 2 == 0) {
                assertNull("key was not null", value.f0);
            } else {
                Assert.assertTrue("Wrong value " + value.f0, value.f0 == counter);
            }
            counter++;
            if (counter == elementCount) {
                // we got the right number of elements
                throw new SuccessException();
            }
        }
    });

    tryExecute(env, "Read KV from Kafka");

    deleteTestTopic(topic);
}
Example 14
Source File: KafkaConsumerTestBase.java From Flink-CEPplus with Apache License 2.0

public void runKeyValueTest() throws Exception {
    final String topic = "keyvaluetest";
    createTestTopic(topic, 1, 1);
    final int elementCount = 5000;

    // ----------- Write some data into Kafka -------------------

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().disableSysoutLogging();

    DataStream<Tuple2<Long, PojoValue>> kvStream = env.addSource(new SourceFunction<Tuple2<Long, PojoValue>>() {
        @Override
        public void run(SourceContext<Tuple2<Long, PojoValue>> ctx) throws Exception {
            Random rnd = new Random(1337);
            for (long i = 0; i < elementCount; i++) {
                PojoValue pojo = new PojoValue();
                pojo.when = new Date(rnd.nextLong());
                pojo.lon = rnd.nextLong();
                pojo.lat = i;
                // make every second key null to ensure proper "null" serialization
                Long key = (i % 2 == 0) ? null : i;
                ctx.collect(new Tuple2<>(key, pojo));
            }
        }

        @Override
        public void cancel() {
        }
    });

    KeyedSerializationSchema<Tuple2<Long, PojoValue>> schema =
        new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

    Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
    producerProperties.setProperty("retries", "3");
    kafkaServer.produceIntoKafka(kvStream, topic, schema, producerProperties, null);
    env.execute("Write KV to Kafka");

    // ----------- Read the data again -------------------

    env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().disableSysoutLogging();

    KafkaDeserializationSchema<Tuple2<Long, PojoValue>> readSchema =
        new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    DataStream<Tuple2<Long, PojoValue>> fromKafka = env.addSource(kafkaServer.getConsumer(topic, readSchema, props));

    fromKafka.flatMap(new RichFlatMapFunction<Tuple2<Long, PojoValue>, Object>() {
        long counter = 0;

        @Override
        public void flatMap(Tuple2<Long, PojoValue> value, Collector<Object> out) throws Exception {
            // the elements should be in order.
            Assert.assertTrue("Wrong value " + value.f1.lat, value.f1.lat == counter);
            if (value.f1.lat % 2 == 0) {
                assertNull("key was not null", value.f0);
            } else {
                Assert.assertTrue("Wrong value " + value.f0, value.f0 == counter);
            }
            counter++;
            if (counter == elementCount) {
                // we got the right number of elements
                throw new SuccessException();
            }
        }
    });

    tryExecute(env, "Read KV from Kafka");

    deleteTestTopic(topic);
}
Example 15
Source File: Driver.java From OSTMap with Apache License 2.0

public void run(String pathToTwitterProperties, String pathToAccumuloProperties, ArrayList<String> tweet) throws Exception {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    // one watermark each ten second
    env.getConfig().setAutoWatermarkInterval(1000);

    // decide which stream source should be used
    DataStream<String> geoStream;
    if (tweet == null) {
        geoStream = env.addSource(new GeoTwitterSource(pathToTwitterProperties));
    } else {
        geoStream = env.fromCollection(tweet);
    }

    // decide which configuration should be used
    RawTwitterDataSink rtdSink = new RawTwitterDataSink();
    TermIndexSink tiSink = new TermIndexSink();
    LanguageFrequencySink frqSink = new LanguageFrequencySink();
    GeoTemporalIndexSink gtiSink = new GeoTemporalIndexSink();
    SinkConfiguration sc;
    if (runOnMAC) {
        sc = SinkConfiguration.createConfigForMinicluster(accumuloInstanceName, accumuloZookeeper);
    } else {
        sc = SinkConfiguration.createConfigFromFile(pathToAccumuloProperties);
    }
    rtdSink.configure(sc, TableIdentifier.RAW_TWITTER_DATA.get());
    tiSink.configure(sc, TableIdentifier.TERM_INDEX.get());
    frqSink.configure(sc, TableIdentifier.TWEET_FREQUENCY.get());
    gtiSink.configure(sc, TableIdentifier.GEO_TEMPORAL_INDEX.get());

    // stream of tuples containing timestamp and tweet's json-String
    DataStream<Tuple2<Long, String>> dateStream = geoStream.flatMap(new DateExtraction());

    dateStream
        .flatMap(new LanguageFrequencyRowExtraction())
        .flatMap(new LanguageTagExtraction())
        .assignTimestampsAndWatermarks(new TimestampExtractorForDateStream())
        .windowAll(TumblingEventTimeWindows.of(Time.minutes(1)))
        .apply(new AllWindowFunctionLangFreq())
        .addSink(frqSink);

    // stream of tuples containing RawTwitterDataKey and tweet's json-String
    DataStream<Tuple2<RawTwitterDataKey, String>> rtdStream = dateStream.flatMap(new CalculateRawTwitterDataKey());

    /** write into rawTwitterData-table */
    rtdStream.addSink(rtdSink);

    /** write into geoTemporalIndex-table */
    rtdStream
        .flatMap(new GeoTemporalKeyExtraction())
        .addSink(gtiSink);

    /** write into termIndex-table */
    // processing for user
    rtdStream
        .flatMap(new UserExtraction())
        .addSink(tiSink);
    // processing for terms
    rtdStream
        .flatMap(new TermExtraction())
        .addSink(tiSink);

    env.execute("twitter stream");
}
Example 16
Source File: CsvSourceStreamOp.java From Alink with Apache License 2.0

@Override
public Table initializeDataSource() {
    final String filePath = getFilePath();
    final String schemaStr = getSchemaStr();
    final String fieldDelim = getFieldDelimiter();
    final String rowDelim = getRowDelimiter();
    final Character quoteChar = getQuoteChar();
    final boolean skipBlankLine = getSkipBlankLine();

    final String[] colNames = CsvUtil.getColNames(schemaStr);
    final TypeInformation[] colTypes = CsvUtil.getColTypes(schemaStr);

    boolean ignoreFirstLine = getIgnoreFirstLine();
    String protocol = "";

    try {
        URL url = new URL(filePath);
        protocol = url.getProtocol();
    } catch (MalformedURLException ignored) {
    }

    DataStream<Row> rows;
    StreamExecutionEnvironment execEnv =
        MLEnvironmentFactory.get(getMLEnvironmentId()).getStreamExecutionEnvironment();
    TableSchema dummySchema = new TableSchema(new String[]{"f1"}, new TypeInformation[]{Types.STRING});

    if (protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https")) {
        HttpFileSplitReader reader = new HttpFileSplitReader(filePath);
        rows = execEnv
            .createInput(new GenericCsvInputFormat(reader, dummySchema.getFieldTypes(), rowDelim, rowDelim, ignoreFirstLine),
                new RowTypeInfo(dummySchema.getFieldTypes(), dummySchema.getFieldNames()))
            .name("http_csv_source");
    } else {
        RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            new Path(filePath), dummySchema.getFieldTypes(), rowDelim, rowDelim, new int[]{0}, true);
        inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
        rows = execEnv.createInput(inputFormat).name("csv_source");
    }

    rows = rows.flatMap(new CsvUtil.ParseCsvFunc(colTypes, fieldDelim, quoteChar, skipBlankLine));

    return DataStreamConversionUtil.toTable(getMLEnvironmentId(), rows, colNames, colTypes);
}
Example 17
Source File: ConsumerSample.java From aliyun-log-flink-connector with Apache License 2.0

public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    // final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // For local testing
    Configuration conf = new Configuration();
    conf.setString(CheckpointingOptions.CHECKPOINTS_DIRECTORY,
        "file:///Users/kel/Github/flink3/aliyun-log-flink-connector/flink2");
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1, conf);
    env.getConfig().setGlobalJobParameters(params);
    env.setParallelism(1);
    env.enableCheckpointing(5000);
    env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
    env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
    env.setStateBackend(new FsStateBackend("file:///Users/kel/Github/flink3/aliyun-log-flink-connector/flink"));

    Properties configProps = new Properties();
    configProps.put(ConfigConstants.LOG_ENDPOINT, SLS_ENDPOINT);
    configProps.put(ConfigConstants.LOG_ACCESSSKEYID, ACCESS_KEY_ID);
    configProps.put(ConfigConstants.LOG_ACCESSKEY, ACCESS_KEY_SECRET);
    configProps.put(ConfigConstants.LOG_MAX_NUMBER_PER_FETCH, "10");
    configProps.put(ConfigConstants.LOG_CONSUMER_BEGIN_POSITION, Consts.LOG_FROM_CHECKPOINT);
    configProps.put(ConfigConstants.LOG_CONSUMERGROUP, "23_ots_sla_etl_product1");
    configProps.put(ConfigConstants.LOG_CHECKPOINT_MODE, CheckpointMode.ON_CHECKPOINTS.name());
    configProps.put(ConfigConstants.LOG_COMMIT_INTERVAL_MILLIS, "10000");

    FastLogGroupDeserializer deserializer = new FastLogGroupDeserializer();

    DataStream<FastLogGroupList> stream = env.addSource(
        new FlinkLogConsumer<>(SLS_PROJECT, SLS_LOGSTORE, deserializer, configProps));

    stream.flatMap((FlatMapFunction<FastLogGroupList, String>) (value, out) -> {
        for (FastLogGroup logGroup : value.getLogGroups()) {
            int logCount = logGroup.getLogsCount();
            for (int i = 0; i < logCount; i++) {
                FastLog log = logGroup.getLogs(i);
                // processing log
            }
        }
    });
    stream.writeAsText("log-" + System.nanoTime());
    env.execute("Flink consumer");
}
Example 18
Source File: ExactlyOnceChecker.java From pravega-samples with Apache License 2.0

public static void main(String[] args) throws Exception {
    LOG.info("Starting ExactlyOnce checker ...");

    // initialize the parameter utility tool in order to retrieve input parameters
    ParameterTool params = ParameterTool.fromArgs(args);

    PravegaConfig pravegaConfig = PravegaConfig
        .fromParams(params)
        .withControllerURI(URI.create(params.get(Constants.Default_URI_PARAM, Constants.Default_URI)))
        .withDefaultScope(params.get(Constants.SCOPE_PARAM, Constants.DEFAULT_SCOPE));

    // create the Pravega input stream (if necessary)
    Stream stream = Utils.createStream(
        pravegaConfig,
        params.get(Constants.STREAM_PARAM, Constants.DEFAULT_STREAM));

    // initialize Flink execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment
        .getExecutionEnvironment()
        .setParallelism(1);

    // create the Pravega source to read a stream of text
    FlinkPravegaReader<IntegerEvent> reader = FlinkPravegaReader.<IntegerEvent>builder()
        .withPravegaConfig(pravegaConfig)
        .forStream(stream)
        .withDeserializationSchema(PravegaSerialization.deserializationFor(IntegerEvent.class))
        .build();

    DataStream<IntegerEvent> dataStream = env
        .addSource(reader)
        .setParallelism(1);

    // create output stream to data read from Pravega
    //dataStream.print();

    DataStream<DuplicateEvent> duplicateStream = dataStream.flatMap(new FlatMapFunction<IntegerEvent, DuplicateEvent>() {
        @Override
        public void flatMap(IntegerEvent event, Collector<DuplicateEvent> out) throws Exception {
            if (event.isStart()) {
                // clear checker when the beginning of stream marker arrives
                checker.clear();
                duplicates.clear();
                System.out.println("\n============== Checker starts ===============");
            }
            if (event.isEnd()) {
                if (duplicates.size() == 0) {
                    System.out.println("No duplicate found. EXACTLY_ONCE!");
                } else {
                    System.out.println("Found duplicates");
                }
                System.out.println("============== Checker ends ===============\n");
            }
            if (checker.contains(event)) {
                duplicates.add(event);
                DuplicateEvent dup = new DuplicateEvent(event.getValue());
                System.out.println(dup);
                out.collect(dup);
            } else {
                checker.add(event);
            }
        }
    });

    // create output sink to print duplicates
    //duplicateStream.print();

    // execute within the Flink environment
    env.execute("ExactlyOnceChecker");

    LOG.info("Ending ExactlyOnceChecker...");
}
Example 19
Source File: StreamExecutionEnvironment.java From Flink-CEPplus with Apache License 2.0

/**
 * Creates a data stream that contains the contents of file created while system watches the given path. The file
 * will be read with the system's default character set.
 *
 * @param filePath
 *        The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path/")
 * @param intervalMillis
 *        The interval of file watching in milliseconds
 * @param watchType
 *        The watch type of file stream. When watchType is
 *        {@link org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#ONLY_NEW_FILES},
 *        the system processes only new files.
 *        {@link org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#REPROCESS_WITH_APPENDED}
 *        means that the system re-processes all contents of appended file.
 *        {@link org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#PROCESS_ONLY_APPENDED}
 *        means that the system processes only appended contents of files.
 * @return The DataStream containing the given directory.
 *
 * @deprecated Use {@link #readFile(FileInputFormat, String, FileProcessingMode, long)} instead.
 */
@Deprecated
@SuppressWarnings("deprecation")
public DataStream<String> readFileStream(String filePath, long intervalMillis, FileMonitoringFunction.WatchType watchType) {
    DataStream<Tuple3<String, Long, Long>> source = addSource(new FileMonitoringFunction(
        filePath, intervalMillis, watchType), "Read File Stream source");

    return source.flatMap(new FileReadFunction());
}
Example 20
Source File: BenchmarkJob.java From scotty-window-processor with Apache License 2.0

public BenchmarkJob(List<Window> assigner, StreamExecutionEnvironment env, final long runtime,
                    final int throughput, final List<Tuple2<Long, Long>> gaps) {

    Map<String, String> configMap = new HashMap<>();
    ParameterTool parameters = ParameterTool.fromMap(configMap);

    env.getConfig().setGlobalJobParameters(parameters);
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    env.setParallelism(1);
    env.setMaxParallelism(1);

    KeyedScottyWindowOperator<Tuple, Tuple4<String, Integer, Long, Long>, Tuple4<String, Integer, Long, Long>> windowOperator =
        new KeyedScottyWindowOperator<>(new SumAggregation());

    for (Window w : assigner) {
        windowOperator.addWindow(w);
    }

    DataStream<Tuple4<String, Integer, Long, Long>> messageStream = env
        .addSource(new de.tub.dima.scotty.flinkBenchmark.LoadGeneratorSource(runtime, throughput, gaps));

    messageStream.flatMap(new de.tub.dima.scotty.flinkBenchmark.ThroughputLogger<>(200, throughput));

    final SingleOutputStreamOperator<Tuple4<String, Integer, Long, Long>> timestampsAndWatermarks = messageStream
        .assignTimestampsAndWatermarks(new TimestampsAndWatermarks());

    timestampsAndWatermarks
        .keyBy(0)
        .process(windowOperator)
        .addSink(new SinkFunction() {
            @Override
            public void invoke(final Object value) throws Exception {
                //System.out.println(value);
            }
        });

    try {
        env.execute();
    } catch (Exception e) {
        e.printStackTrace();
    }
}