Java Code Examples for org.apache.flink.streaming.api.datastream.DataStream#flatMap()
The following examples show how to use org.apache.flink.streaming.api.datastream.DataStream#flatMap(). They are taken from open-source projects; each example notes its original project and source file so you can refer back to the full context.
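Before the project examples, here is a minimal, self-contained sketch of the basic flatMap pattern: each input record may produce zero or more output records through the Collector. The class name WordSplitJob and the sample sentences are illustrative placeholders, not taken from any of the projects listed below.

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class WordSplitJob {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // A small in-memory source; real jobs typically read from Kafka, files, sockets, etc.
        DataStream<String> lines = env.fromElements("to be or not to be", "that is the question");

        // flatMap may emit zero, one, or many records per input element via the Collector.
        DataStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> out) {
                for (String word : line.split("\\s+")) {
                    out.collect(word);
                }
            }
        });

        words.print();
        env.execute("flatMap word-splitting sketch");
    }
}

The project examples below follow the same shape, differing mainly in the source, the FlatMapFunction implementation, and the sink.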
Example 1
Source File: RescalingITCase.java From flink with Apache License 2.0

private static JobGraph createJobGraphWithKeyedState(
        int parallelism,
        int maxParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    if (0 < maxParallelism) {
        env.getConfig().setMaxParallelism(maxParallelism);
    }
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().setUseSnapshotCompression(true);

    DataStream<Integer> input = env.addSource(new SubtaskIndexSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 2
Source File: TwitterIntoKafka.java From flink-streaming-etl with Apache License 2.0

public static void main(String[] args) throws Exception {
    // set up the streaming execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    ParameterTool params = ParameterTool.fromPropertiesFile(args[0]);

    DataStream<String> twitterStreamString = env.addSource(new TwitterSource(params.getProperties()));
    DataStream<String> filteredStream = twitterStreamString.flatMap(new ParseJson());
    filteredStream.flatMap(new ThroughputLogger(5000L)).setParallelism(1);

    filteredStream.addSink(new FlinkKafkaProducer09<>("twitter", new SimpleStringSchema(), params.getProperties()));

    // execute program
    env.execute("Ingest data from Twitter to Kafka");
}
Example 3
Source File: SideStream.java From alchemy with Apache License 2.0

public static DataStream<Row> buildStream(StreamTableEnvironment env, SqlSelect sqlSelect, Alias leftAlias,
                                          Alias sideAlias, SourceDescriptor sideSource) throws Exception {
    SqlSelect leftSelect = SideParser.newSelect(sqlSelect, leftAlias.getTable(), leftAlias.getAlias(), true, false);
    // register leftTable
    Table leftTable = env.sqlQuery(leftSelect.toString());
    DataStream<Row> leftStream = env.toAppendStream(leftTable, Row.class);
    SqlSelect rightSelect = SideParser.newSelect(sqlSelect, sideAlias.getTable(), sideAlias.getAlias(), false, false);
    SqlJoin sqlJoin = (SqlJoin) sqlSelect.getFrom();
    List<String> equalFields = SideParser.findConditionFields(sqlJoin.getCondition(), leftAlias.getAlias());
    if (sideSource.getSide().isPartition()) {
        leftStream = leftStream.keyBy(equalFields.toArray(new String[equalFields.size()]));
    }
    RowTypeInfo sideType = createSideType(rightSelect.getSelectList(), sideSource.getSchema());
    RowTypeInfo returnType = createReturnType(leftTable.getSchema(), sideType);
    SideTable sideTable = createSideTable(leftTable.getSchema(), sideType, sqlJoin.getJoinType(), rightSelect,
        equalFields, sideAlias, sideSource.getSide());
    DataStream<Row> returnStream;
    if (sideSource.getSide().isAsync()) {
        AbstractAsyncSideFunction reqRow = sideSource.transform(sideTable);
        returnStream = AsyncDataStream.orderedWait(leftStream, reqRow, sideSource.getSide().getTimeout(),
            TimeUnit.MILLISECONDS, sideSource.getSide().getCapacity());
    } else {
        AbstractSyncSideFunction syncReqRow = sideSource.transform(sideTable);
        returnStream = leftStream.flatMap(syncReqRow);
    }
    returnStream.getTransformation().setOutputType(returnType);
    return returnStream;
}
Example 4
Source File: RescalingITCase.java From flink with Apache License 2.0

private static JobGraph createJobGraphWithKeyedState(
        int parallelism,
        int maxParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    if (0 < maxParallelism) {
        env.getConfig().setMaxParallelism(maxParallelism);
    }
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().setUseSnapshotCompression(true);

    DataStream<Integer> input = env.addSource(new SubtaskIndexSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 5
Source File: StreamingOperatorsITCase.java From flink with Apache License 2.0

@Test
public void testOperatorChainWithObjectReuseAndNoOutputOperators() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableObjectReuse();
    DataStream<Integer> input = env.fromElements(1, 2, 3);
    input.flatMap(new FlatMapFunction<Integer, Integer>() {
        @Override
        public void flatMap(Integer value, Collector<Integer> out) throws Exception {
            out.collect(value << 1);
        }
    });
    env.execute();
}
Example 6
Source File: RescalingITCase.java From flink with Apache License 2.0

private static JobGraph createJobGraphWithKeyedAndNonPartitionedOperatorState(
        int parallelism,
        int maxParallelism,
        int fixedParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.getConfig().setMaxParallelism(maxParallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());

    DataStream<Integer> input = env.addSource(new SubtaskIndexNonPartitionedStateSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .setParallelism(fixedParallelism)
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 7
Source File: RescalingITCase.java From flink with Apache License 2.0

private static JobGraph createJobGraphWithKeyedAndNonPartitionedOperatorState(
        int parallelism,
        int maxParallelism,
        int fixedParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.getConfig().setMaxParallelism(maxParallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());

    DataStream<Integer> input = env.addSource(new SubtaskIndexNonPartitionedStateSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .setParallelism(fixedParallelism)
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 8
Source File: KafkaConsumerTestBase.java From flink with Apache License 2.0

/**
 * Test that ensures that DeserializationSchema.isEndOfStream() is properly evaluated.
 *
 * @throws Exception
 */
public void runEndOfStreamTest() throws Exception {

    final int elementCount = 300;
    final String topic = writeSequence("testEndOfStream", elementCount, 1, 1);

    // read using custom schema
    final StreamExecutionEnvironment env1 = StreamExecutionEnvironment.getExecutionEnvironment();
    env1.setParallelism(1);
    env1.getConfig().setRestartStrategy(RestartStrategies.noRestart());
    env1.getConfig().disableSysoutLogging();

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    DataStream<Tuple2<Integer, Integer>> fromKafka =
        env1.addSource(kafkaServer.getConsumer(topic, new FixedNumberDeserializationSchema(elementCount), props));
    fromKafka.flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Void>() {
        @Override
        public void flatMap(Tuple2<Integer, Integer> value, Collector<Void> out) throws Exception {
            // noop ;)
        }
    });

    tryExecute(env1, "Consume " + elementCount + " elements from Kafka");

    deleteTestTopic(topic);
}
Example 9
Source File: StreamingOperatorsITCase.java From Flink-CEPplus with Apache License 2.0

@Test
public void testOperatorChainWithObjectReuseAndNoOutputOperators() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableObjectReuse();
    DataStream<Integer> input = env.fromElements(1, 2, 3);
    input.flatMap(new FlatMapFunction<Integer, Integer>() {
        @Override
        public void flatMap(Integer value, Collector<Integer> out) throws Exception {
            out.collect(value << 1);
        }
    });
    env.execute();
}
Example 10
Source File: RescalingITCase.java From Flink-CEPplus with Apache License 2.0

private static JobGraph createJobGraphWithKeyedAndNonPartitionedOperatorState(
        int parallelism,
        int maxParallelism,
        int fixedParallelism,
        int numberKeys,
        int numberElements,
        boolean terminateAfterEmission,
        int checkpointingInterval) {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.getConfig().setMaxParallelism(maxParallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.setRestartStrategy(RestartStrategies.noRestart());

    DataStream<Integer> input = env.addSource(new SubtaskIndexNonPartitionedStateSource(
            numberKeys,
            numberElements,
            terminateAfterEmission))
        .setParallelism(fixedParallelism)
        .keyBy(new KeySelector<Integer, Integer>() {
            private static final long serialVersionUID = -7952298871120320940L;

            @Override
            public Integer getKey(Integer value) throws Exception {
                return value;
            }
        });

    SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

    DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

    result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

    return env.getStreamGraph().getJobGraph();
}
Example 11
Source File: KafkaConsumerTestBase.java From Flink-CEPplus with Apache License 2.0

/**
 * Test that ensures that DeserializationSchema.isEndOfStream() is properly evaluated.
 *
 * @throws Exception
 */
public void runEndOfStreamTest() throws Exception {

    final int elementCount = 300;
    final String topic = writeSequence("testEndOfStream", elementCount, 1, 1);

    // read using custom schema
    final StreamExecutionEnvironment env1 = StreamExecutionEnvironment.getExecutionEnvironment();
    env1.setParallelism(1);
    env1.getConfig().setRestartStrategy(RestartStrategies.noRestart());
    env1.getConfig().disableSysoutLogging();

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    DataStream<Tuple2<Integer, Integer>> fromKafka =
        env1.addSource(kafkaServer.getConsumer(topic, new FixedNumberDeserializationSchema(elementCount), props));
    fromKafka.flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Void>() {
        @Override
        public void flatMap(Tuple2<Integer, Integer> value, Collector<Void> out) throws Exception {
            // noop ;)
        }
    });

    tryExecute(env1, "Consume " + elementCount + " elements from Kafka");

    deleteTestTopic(topic);
}
Example 12
Source File: StreamingOperatorsITCase.java From flink with Apache License 2.0

@Test
public void testOperatorChainWithObjectReuseAndNoOutputOperators() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().enableObjectReuse();
    DataStream<Integer> input = env.fromElements(1, 2, 3);
    input.flatMap(new FlatMapFunction<Integer, Integer>() {
        @Override
        public void flatMap(Integer value, Collector<Integer> out) throws Exception {
            out.collect(value << 1);
        }
    });
    env.execute();
}
Example 13
Source File: KafkaConsumerTestBase.java From flink with Apache License 2.0

public void runKeyValueTest() throws Exception {
    final String topic = "keyvaluetest";
    createTestTopic(topic, 1, 1);
    final int elementCount = 5000;

    // ----------- Write some data into Kafka -------------------

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());

    DataStream<Tuple2<Long, PojoValue>> kvStream = env.addSource(new SourceFunction<Tuple2<Long, PojoValue>>() {
        @Override
        public void run(SourceContext<Tuple2<Long, PojoValue>> ctx) throws Exception {
            Random rnd = new Random(1337);
            for (long i = 0; i < elementCount; i++) {
                PojoValue pojo = new PojoValue();
                pojo.when = new Date(rnd.nextLong());
                pojo.lon = rnd.nextLong();
                pojo.lat = i;
                // make every second key null to ensure proper "null" serialization
                Long key = (i % 2 == 0) ? null : i;
                ctx.collect(new Tuple2<>(key, pojo));
            }
        }

        @Override
        public void cancel() {
        }
    });

    KeyedSerializationSchema<Tuple2<Long, PojoValue>> schema =
        new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

    Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
    producerProperties.setProperty("retries", "3");
    kafkaServer.produceIntoKafka(kvStream, topic, schema, producerProperties, null);
    env.execute("Write KV to Kafka");

    // ----------- Read the data again -------------------

    env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());

    KafkaDeserializationSchema<Tuple2<Long, PojoValue>> readSchema =
        new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    DataStream<Tuple2<Long, PojoValue>> fromKafka = env.addSource(kafkaServer.getConsumer(topic, readSchema, props));

    fromKafka.flatMap(new RichFlatMapFunction<Tuple2<Long, PojoValue>, Object>() {
        long counter = 0;

        @Override
        public void flatMap(Tuple2<Long, PojoValue> value, Collector<Object> out) throws Exception {
            // the elements should be in order.
            Assert.assertTrue("Wrong value " + value.f1.lat, value.f1.lat == counter);
            if (value.f1.lat % 2 == 0) {
                assertNull("key was not null", value.f0);
            } else {
                Assert.assertTrue("Wrong value " + value.f0, value.f0 == counter);
            }
            counter++;
            if (counter == elementCount) {
                // we got the right number of elements
                throw new SuccessException();
            }
        }
    });

    tryExecute(env, "Read KV from Kafka");

    deleteTestTopic(topic);
}
Example 14
Source File: KafkaConsumerTestBase.java From Flink-CEPplus with Apache License 2.0

public void runKeyValueTest() throws Exception {
    final String topic = "keyvaluetest";
    createTestTopic(topic, 1, 1);
    final int elementCount = 5000;

    // ----------- Write some data into Kafka -------------------

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().disableSysoutLogging();

    DataStream<Tuple2<Long, PojoValue>> kvStream = env.addSource(new SourceFunction<Tuple2<Long, PojoValue>>() {
        @Override
        public void run(SourceContext<Tuple2<Long, PojoValue>> ctx) throws Exception {
            Random rnd = new Random(1337);
            for (long i = 0; i < elementCount; i++) {
                PojoValue pojo = new PojoValue();
                pojo.when = new Date(rnd.nextLong());
                pojo.lon = rnd.nextLong();
                pojo.lat = i;
                // make every second key null to ensure proper "null" serialization
                Long key = (i % 2 == 0) ? null : i;
                ctx.collect(new Tuple2<>(key, pojo));
            }
        }

        @Override
        public void cancel() {
        }
    });

    KeyedSerializationSchema<Tuple2<Long, PojoValue>> schema =
        new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

    Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
    producerProperties.setProperty("retries", "3");
    kafkaServer.produceIntoKafka(kvStream, topic, schema, producerProperties, null);
    env.execute("Write KV to Kafka");

    // ----------- Read the data again -------------------

    env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().disableSysoutLogging();

    KafkaDeserializationSchema<Tuple2<Long, PojoValue>> readSchema =
        new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);
    DataStream<Tuple2<Long, PojoValue>> fromKafka = env.addSource(kafkaServer.getConsumer(topic, readSchema, props));

    fromKafka.flatMap(new RichFlatMapFunction<Tuple2<Long, PojoValue>, Object>() {
        long counter = 0;

        @Override
        public void flatMap(Tuple2<Long, PojoValue> value, Collector<Object> out) throws Exception {
            // the elements should be in order.
            Assert.assertTrue("Wrong value " + value.f1.lat, value.f1.lat == counter);
            if (value.f1.lat % 2 == 0) {
                assertNull("key was not null", value.f0);
            } else {
                Assert.assertTrue("Wrong value " + value.f0, value.f0 == counter);
            }
            counter++;
            if (counter == elementCount) {
                // we got the right number of elements
                throw new SuccessException();
            }
        }
    });

    tryExecute(env, "Read KV from Kafka");

    deleteTestTopic(topic);
}
Example 15
Source File: Driver.java From OSTMap with Apache License 2.0

public void run(String pathToTwitterProperties, String pathToAccumuloProperties, ArrayList<String> tweet) throws Exception {

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    // one watermark each ten second
    env.getConfig().setAutoWatermarkInterval(1000);

    // decide which stream source should be used
    DataStream<String> geoStream;
    if (tweet == null) {
        geoStream = env.addSource(new GeoTwitterSource(pathToTwitterProperties));
    } else {
        geoStream = env.fromCollection(tweet);
    }

    // decide which configuration should be used
    RawTwitterDataSink rtdSink = new RawTwitterDataSink();
    TermIndexSink tiSink = new TermIndexSink();
    LanguageFrequencySink frqSink = new LanguageFrequencySink();
    GeoTemporalIndexSink gtiSink = new GeoTemporalIndexSink();
    SinkConfiguration sc;
    if (runOnMAC) {
        sc = SinkConfiguration.createConfigForMinicluster(accumuloInstanceName, accumuloZookeeper);
    } else {
        sc = SinkConfiguration.createConfigFromFile(pathToAccumuloProperties);
    }
    rtdSink.configure(sc, TableIdentifier.RAW_TWITTER_DATA.get());
    tiSink.configure(sc, TableIdentifier.TERM_INDEX.get());
    frqSink.configure(sc, TableIdentifier.TWEET_FREQUENCY.get());
    gtiSink.configure(sc, TableIdentifier.GEO_TEMPORAL_INDEX.get());

    // stream of tuples containing timestamp and tweet's json-String
    DataStream<Tuple2<Long, String>> dateStream = geoStream.flatMap(new DateExtraction());

    dateStream
        .flatMap(new LanguageFrequencyRowExtraction())
        .flatMap(new LanguageTagExtraction())
        .assignTimestampsAndWatermarks(new TimestampExtractorForDateStream())
        .windowAll(TumblingEventTimeWindows.of(Time.minutes(1)))
        .apply(new AllWindowFunctionLangFreq())
        .addSink(frqSink);

    // stream of tuples containing RawTwitterDataKey and tweet's json-String
    DataStream<Tuple2<RawTwitterDataKey, String>> rtdStream = dateStream.flatMap(new CalculateRawTwitterDataKey());

    /** write into rawTwitterData-table */
    rtdStream.addSink(rtdSink);

    /** write into geoTemporalIndex-table */
    rtdStream
        .flatMap(new GeoTemporalKeyExtraction())
        .addSink(gtiSink);

    /** write into termIndex-table */
    // processing for user
    rtdStream
        .flatMap(new UserExtraction())
        .addSink(tiSink);
    // processing for terms
    rtdStream
        .flatMap(new TermExtraction())
        .addSink(tiSink);

    env.execute("twitter stream");
}
Example 16
Source File: CsvSourceStreamOp.java From Alink with Apache License 2.0

@Override
public Table initializeDataSource() {
    final String filePath = getFilePath();
    final String schemaStr = getSchemaStr();
    final String fieldDelim = getFieldDelimiter();
    final String rowDelim = getRowDelimiter();
    final Character quoteChar = getQuoteChar();
    final boolean skipBlankLine = getSkipBlankLine();

    final String[] colNames = CsvUtil.getColNames(schemaStr);
    final TypeInformation[] colTypes = CsvUtil.getColTypes(schemaStr);

    boolean ignoreFirstLine = getIgnoreFirstLine();
    String protocol = "";

    try {
        URL url = new URL(filePath);
        protocol = url.getProtocol();
    } catch (MalformedURLException ignored) {
    }

    DataStream<Row> rows;
    StreamExecutionEnvironment execEnv =
        MLEnvironmentFactory.get(getMLEnvironmentId()).getStreamExecutionEnvironment();
    TableSchema dummySchema = new TableSchema(new String[]{"f1"}, new TypeInformation[]{Types.STRING});

    if (protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https")) {
        HttpFileSplitReader reader = new HttpFileSplitReader(filePath);
        rows = execEnv
            .createInput(new GenericCsvInputFormat(reader, dummySchema.getFieldTypes(), rowDelim, rowDelim, ignoreFirstLine),
                new RowTypeInfo(dummySchema.getFieldTypes(), dummySchema.getFieldNames()))
            .name("http_csv_source");
    } else {
        RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            new Path(filePath), dummySchema.getFieldTypes(), rowDelim, rowDelim, new int[]{0}, true);
        inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
        rows = execEnv.createInput(inputFormat).name("csv_source");
    }

    rows = rows.flatMap(new CsvUtil.ParseCsvFunc(colTypes, fieldDelim, quoteChar, skipBlankLine));

    return DataStreamConversionUtil.toTable(getMLEnvironmentId(), rows, colNames, colTypes);
}
Example 17
Source File: ConsumerSample.java From aliyun-log-flink-connector with Apache License 2.0

public static void main(String[] args) throws Exception {
    final ParameterTool params = ParameterTool.fromArgs(args);
    // final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // For local testing
    Configuration conf = new Configuration();
    conf.setString(CheckpointingOptions.CHECKPOINTS_DIRECTORY,
        "file:///Users/kel/Github/flink3/aliyun-log-flink-connector/flink2");
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1, conf);
    env.getConfig().setGlobalJobParameters(params);
    env.setParallelism(1);
    env.enableCheckpointing(5000);
    env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
    env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
    env.setStateBackend(new FsStateBackend("file:///Users/kel/Github/flink3/aliyun-log-flink-connector/flink"));

    Properties configProps = new Properties();
    configProps.put(ConfigConstants.LOG_ENDPOINT, SLS_ENDPOINT);
    configProps.put(ConfigConstants.LOG_ACCESSSKEYID, ACCESS_KEY_ID);
    configProps.put(ConfigConstants.LOG_ACCESSKEY, ACCESS_KEY_SECRET);
    configProps.put(ConfigConstants.LOG_MAX_NUMBER_PER_FETCH, "10");
    configProps.put(ConfigConstants.LOG_CONSUMER_BEGIN_POSITION, Consts.LOG_FROM_CHECKPOINT);
    configProps.put(ConfigConstants.LOG_CONSUMERGROUP, "23_ots_sla_etl_product1");
    configProps.put(ConfigConstants.LOG_CHECKPOINT_MODE, CheckpointMode.ON_CHECKPOINTS.name());
    configProps.put(ConfigConstants.LOG_COMMIT_INTERVAL_MILLIS, "10000");

    FastLogGroupDeserializer deserializer = new FastLogGroupDeserializer();

    DataStream<FastLogGroupList> stream = env.addSource(
        new FlinkLogConsumer<>(SLS_PROJECT, SLS_LOGSTORE, deserializer, configProps));

    stream.flatMap((FlatMapFunction<FastLogGroupList, String>) (value, out) -> {
        for (FastLogGroup logGroup : value.getLogGroups()) {
            int logCount = logGroup.getLogsCount();
            for (int i = 0; i < logCount; i++) {
                FastLog log = logGroup.getLogs(i);
                // processing log
            }
        }
    });
    stream.writeAsText("log-" + System.nanoTime());
    env.execute("Flink consumer");
}
Example 18
Source File: ExactlyOnceChecker.java From pravega-samples with Apache License 2.0

public static void main(String[] args) throws Exception {
    LOG.info("Starting ExactlyOnce checker ...");

    // initialize the parameter utility tool in order to retrieve input parameters
    ParameterTool params = ParameterTool.fromArgs(args);

    PravegaConfig pravegaConfig = PravegaConfig
        .fromParams(params)
        .withControllerURI(URI.create(params.get(Constants.Default_URI_PARAM, Constants.Default_URI)))
        .withDefaultScope(params.get(Constants.SCOPE_PARAM, Constants.DEFAULT_SCOPE));

    // create the Pravega input stream (if necessary)
    Stream stream = Utils.createStream(
        pravegaConfig,
        params.get(Constants.STREAM_PARAM, Constants.DEFAULT_STREAM));

    // initialize Flink execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment
        .getExecutionEnvironment()
        .setParallelism(1);

    // create the Pravega source to read a stream of text
    FlinkPravegaReader<IntegerEvent> reader = FlinkPravegaReader.<IntegerEvent>builder()
        .withPravegaConfig(pravegaConfig)
        .forStream(stream)
        .withDeserializationSchema(PravegaSerialization.deserializationFor(IntegerEvent.class))
        .build();

    DataStream<IntegerEvent> dataStream = env
        .addSource(reader)
        .setParallelism(1);

    // create output stream to data read from Pravega
    //dataStream.print();

    DataStream<DuplicateEvent> duplicateStream = dataStream.flatMap(new FlatMapFunction<IntegerEvent, DuplicateEvent>() {
        @Override
        public void flatMap(IntegerEvent event, Collector<DuplicateEvent> out) throws Exception {
            if (event.isStart()) {
                // clear checker when the beginning of stream marker arrives
                checker.clear();
                duplicates.clear();
                System.out.println("\n============== Checker starts ===============");
            }
            if (event.isEnd()) {
                if (duplicates.size() == 0) {
                    System.out.println("No duplicate found. EXACTLY_ONCE!");
                } else {
                    System.out.println("Found duplicates");
                }
                System.out.println("============== Checker ends ===============\n");
            }
            if (checker.contains(event)) {
                duplicates.add(event);
                DuplicateEvent dup = new DuplicateEvent(event.getValue());
                System.out.println(dup);
                out.collect(dup);
            } else {
                checker.add(event);
            }
        }
    });

    // create output sink to print duplicates
    //duplicateStream.print();

    // execute within the Flink environment
    env.execute("ExactlyOnceChecker");

    LOG.info("Ending ExactlyOnceChecker...");
}
Example 19
Source File: StreamExecutionEnvironment.java From Flink-CEPplus with Apache License 2.0

/**
 * Creates a data stream that contains the contents of file created while system watches the given path. The file
 * will be read with the system's default character set.
 *
 * @param filePath
 *        The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path/")
 * @param intervalMillis
 *        The interval of file watching in milliseconds
 * @param watchType
 *        The watch type of file stream. When watchType is
 *        {@link org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#ONLY_NEW_FILES},
 *        the system processes only new files.
 *        {@link org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#REPROCESS_WITH_APPENDED}
 *        means that the system re-processes all contents of appended file.
 *        {@link org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#PROCESS_ONLY_APPENDED}
 *        means that the system processes only appended contents of files.
 * @return The DataStream containing the given directory.
 *
 * @deprecated Use {@link #readFile(FileInputFormat, String, FileProcessingMode, long)} instead.
 */
@Deprecated
@SuppressWarnings("deprecation")
public DataStream<String> readFileStream(String filePath, long intervalMillis, FileMonitoringFunction.WatchType watchType) {
    DataStream<Tuple3<String, Long, Long>> source = addSource(new FileMonitoringFunction(
        filePath, intervalMillis, watchType), "Read File Stream source");

    return source.flatMap(new FileReadFunction());
}
Example 20
Source File: BenchmarkJob.java From scotty-window-processor with Apache License 2.0

public BenchmarkJob(List<Window> assigner, StreamExecutionEnvironment env, final long runtime,
                    final int throughput, final List<Tuple2<Long, Long>> gaps) {

    Map<String, String> configMap = new HashMap<>();
    ParameterTool parameters = ParameterTool.fromMap(configMap);

    env.getConfig().setGlobalJobParameters(parameters);
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    env.setParallelism(1);
    env.setMaxParallelism(1);

    KeyedScottyWindowOperator<Tuple, Tuple4<String, Integer, Long, Long>, Tuple4<String, Integer, Long, Long>> windowOperator =
        new KeyedScottyWindowOperator<>(new SumAggregation());

    for (Window w : assigner) {
        windowOperator.addWindow(w);
    }

    DataStream<Tuple4<String, Integer, Long, Long>> messageStream = env
        .addSource(new de.tub.dima.scotty.flinkBenchmark.LoadGeneratorSource(runtime, throughput, gaps));

    messageStream.flatMap(new de.tub.dima.scotty.flinkBenchmark.ThroughputLogger<>(200, throughput));

    final SingleOutputStreamOperator<Tuple4<String, Integer, Long, Long>> timestampsAndWatermarks = messageStream
        .assignTimestampsAndWatermarks(new TimestampsAndWatermarks());

    timestampsAndWatermarks
        .keyBy(0)
        .process(windowOperator)
        .addSink(new SinkFunction() {
            @Override
            public void invoke(final Object value) throws Exception {
                //System.out.println(value);
            }
        });

    try {
        env.execute();
    } catch (Exception e) {
        e.printStackTrace();
    }
}