org.apache.flink.api.common.serialization.TypeInformationSerializationSchema Java Examples
The following examples show how to use
org.apache.flink.api.common.serialization.TypeInformationSerializationSchema.
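Before the project examples, here is a minimal sketch of what the class does: it implements both SerializationSchema and DeserializationSchema by delegating to the TypeSerializer created from a TypeInformation, so the same instance can serve a Kafka producer and a Kafka consumer. The snippet below is illustrative only and is not taken from the examples; the Integer payload and the sample value are placeholders.

static void roundTripExample() throws IOException {
	ExecutionConfig config = new ExecutionConfig();
	TypeInformationSerializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, config);

	// serialize() and deserialize() round-trip through Flink's own binary format,
	// so the reading side must be configured with the same type information.
	byte[] bytes = schema.serialize(42);
	Integer value = schema.deserialize(bytes);
	assert value == 42;
}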
Example #1
Source File: FlinkKafkaShuffleConsumer.java, from flink (Apache License 2.0)
FlinkKafkaShuffleConsumer(
		String topic,
		TypeInformationSerializationSchema<T> schema,
		TypeSerializer<T> typeSerializer,
		Properties props) {
	// The schema is needed to call the right FlinkKafkaConsumer constructor.
	// It is never used, can be `null`, but `null` confuses the compiler.
	super(topic, schema, props);
	this.typeSerializer = typeSerializer;

	Preconditions.checkArgument(
		props.getProperty(PRODUCER_PARALLELISM) != null,
		"Missing producer parallelism for Kafka Shuffle");
	producerParallelism = PropertiesUtil.getInt(props, PRODUCER_PARALLELISM, Integer.MAX_VALUE);
}
Example #2
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
/**
 * Tests the proper consumption when having more Flink sources than Kafka partitions, which means
 * that some Flink sources will read no partitions.
 */
public void runMultipleSourcesOnePartitionExactlyOnceTest() throws Exception {
	final String topic = "manyToOneTopic";
	final int numPartitions = 5;
	final int numElementsPerPartition = 1000;
	final int totalElements = numPartitions * numElementsPerPartition;
	final int failAfterElements = numElementsPerPartition / 3;

	final int parallelism = 8;

	createTestTopic(topic, numPartitions, 1);

	DataGenerators.generateRandomizedIntegerSequence(
		StreamExecutionEnvironment.getExecutionEnvironment(), kafkaServer, topic,
		numPartitions, numElementsPerPartition, true);

	// run the topology that fails and recovers

	DeserializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.enableCheckpointing(500);
	env.setParallelism(parallelism);
	// set the number of restarts to one. The failing mapper will fail once, then it's only success exceptions.
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
	env.getConfig().disableSysoutLogging();
	env.setBufferTimeout(0);

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

	env
		.addSource(kafkaSource)
		.map(new PartitionValidatingMapper(numPartitions, 1))
		.map(new FailingIdentityMapper<Integer>(failAfterElements))
		.addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

	FailingIdentityMapper.failedBefore = false;
	tryExecute(env, "multi-source-one-partitions exactly once test");

	deleteTestTopic(topic);
}
Example #3
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
protected void writeAppendSequence(
		String topicName,
		final int originalNumElements,
		final int numElementsToAppend,
		final int parallelism) throws Exception {

	LOG.info("\n===================================\n" +
		"== Appending sequence of " + numElementsToAppend + " into " + topicName +
		"===================================");

	final TypeInformation<Tuple2<Integer, Integer>> resultType =
		TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

	final SerializationSchema<Tuple2<Integer, Integer>> serSchema =
		new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());

	final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
		new KafkaDeserializationSchemaWrapper<>(
			new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

	// -------- Write the append sequence --------

	StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
	writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());

	DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

		private boolean running = true;

		@Override
		public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
			int cnt = originalNumElements;
			int partition = getRuntimeContext().getIndexOfThisSubtask();

			while (running && cnt < numElementsToAppend + originalNumElements) {
				ctx.collect(new Tuple2<>(partition, cnt));
				cnt++;
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	}).setParallelism(parallelism);

	// the producer must not produce duplicates
	Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
	producerProperties.setProperty("retries", "0");
	producerProperties.putAll(secureProps);

	kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2FlinkPartitioner(parallelism))
		.setParallelism(parallelism);

	try {
		writeEnv.execute("Write sequence");
	}
	catch (Exception e) {
		throw new Exception("Failed to append sequence to Kafka; append job failed.", e);
	}

	LOG.info("Finished writing append sequence");

	// we need to validate the sequence, because kafka's producers are not exactly once
	LOG.info("Validating sequence");

	while (!getRunningJobs(client).isEmpty()) {
		Thread.sleep(50);
	}

	if (!validateSequence(topicName, parallelism, deserSchema, originalNumElements + numElementsToAppend)) {
		throw new Exception("Could not append a valid sequence to Kafka.");
	}
}
Example #4
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
protected String writeSequence(
		String baseTopicName,
		final int numElements,
		final int parallelism,
		final int replicationFactor) throws Exception {

	LOG.info("\n===================================\n" +
		"== Writing sequence of " + numElements + " into " + baseTopicName + " with p=" + parallelism + "\n" +
		"===================================");

	final TypeInformation<Tuple2<Integer, Integer>> resultType =
		TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

	final SerializationSchema<Tuple2<Integer, Integer>> serSchema =
		new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig());

	final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
		new KafkaDeserializationSchemaWrapper<>(
			new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

	final int maxNumAttempts = 10;

	for (int attempt = 1; attempt <= maxNumAttempts; attempt++) {

		final String topicName = baseTopicName + '-' + attempt;

		LOG.info("Writing attempt #" + attempt);

		// -------- Write the Sequence --------

		createTestTopic(topicName, parallelism, replicationFactor);

		StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
		writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());

		DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

			private boolean running = true;

			@Override
			public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
				int cnt = 0;
				int partition = getRuntimeContext().getIndexOfThisSubtask();

				while (running && cnt < numElements) {
					ctx.collect(new Tuple2<>(partition, cnt));
					cnt++;
				}
			}

			@Override
			public void cancel() {
				running = false;
			}
		}).setParallelism(parallelism);

		// the producer must not produce duplicates
		Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
		producerProperties.setProperty("retries", "0");
		producerProperties.putAll(secureProps);

		kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2FlinkPartitioner(parallelism))
			.setParallelism(parallelism);

		try {
			writeEnv.execute("Write sequence");
		}
		catch (Exception e) {
			LOG.error("Write attempt failed, trying again", e);
			deleteTestTopic(topicName);
			waitUntilNoJobIsRunning(client);
			continue;
		}

		LOG.info("Finished writing sequence");

		// -------- Validate the Sequence --------

		// we need to validate the sequence, because kafka's producers are not exactly once
		LOG.info("Validating sequence");

		waitUntilNoJobIsRunning(client);

		if (validateSequence(topicName, parallelism, deserSchema, numElements)) {
			// everything is good!
			return topicName;
		}
		else {
			deleteTestTopic(topicName);
			// fall through the loop
		}
	}

	throw new Exception("Could not write a valid sequence to Kafka after " + maxNumAttempts + " attempts");
}
Example #5
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
public void runBrokerFailureTest() throws Exception {
	final String topic = "brokerFailureTestTopic";

	final int parallelism = 2;
	final int numElementsPerPartition = 1000;
	final int totalElements = parallelism * numElementsPerPartition;
	final int failAfterElements = numElementsPerPartition / 3;

	createTestTopic(topic, parallelism, 2);

	DataGenerators.generateRandomizedIntegerSequence(
		StreamExecutionEnvironment.getExecutionEnvironment(), kafkaServer, topic,
		parallelism, numElementsPerPartition, true);

	// find leader to shut down
	int leaderId = kafkaServer.getLeaderToShutDown(topic);

	LOG.info("Leader to shutdown {}", leaderId);

	// run the topology (the consumers must handle the failures)

	DeserializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(parallelism);
	env.enableCheckpointing(500);
	env.setRestartStrategy(RestartStrategies.noRestart());

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

	env
		.addSource(kafkaSource)
		.map(new PartitionValidatingMapper(parallelism, 1))
		.map(new BrokerKillingMapper<Integer>(leaderId, failAfterElements))
		.addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

	BrokerKillingMapper.killedLeaderBefore = false;
	tryExecute(env, "Broker failure once test");

	// start a new broker:
	kafkaServer.restartBroker(leaderId);
}
Example #6
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
/**
 * Tests the proper consumption when having more Flink sources than Kafka partitions, which means
 * that some Flink sources will read no partitions.
 */
public void runMultipleSourcesOnePartitionExactlyOnceTest() throws Exception {
	final String topic = "manyToOneTopic";
	final int numPartitions = 5;
	final int numElementsPerPartition = 1000;
	final int totalElements = numPartitions * numElementsPerPartition;
	final int failAfterElements = numElementsPerPartition / 3;

	final int parallelism = 8;

	createTestTopic(topic, numPartitions, 1);

	DataGenerators.generateRandomizedIntegerSequence(
		StreamExecutionEnvironment.getExecutionEnvironment(), kafkaServer, topic,
		numPartitions, numElementsPerPartition, true);

	// run the topology that fails and recovers

	DeserializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.enableCheckpointing(500);
	env.setParallelism(parallelism);
	// set the number of restarts to one. The failing mapper will fail once, then it's only success exceptions.
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
	env.setBufferTimeout(0);

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

	env
		.addSource(kafkaSource)
		.map(new PartitionValidatingMapper(numPartitions, 1))
		.map(new FailingIdentityMapper<Integer>(failAfterElements))
		.addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

	FailingIdentityMapper.failedBefore = false;
	tryExecute(env, "multi-source-one-partitions exactly once test");

	deleteTestTopic(topic);
}
Example #7
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
/**
 * Tests the proper consumption when having fewer Flink sources than Kafka partitions, so
 * one Flink source will read multiple Kafka partitions.
 */
public void runOneSourceMultiplePartitionsExactlyOnceTest() throws Exception {
	final String topic = "oneToManyTopic";
	final int numPartitions = 5;
	final int numElementsPerPartition = 1000;
	final int totalElements = numPartitions * numElementsPerPartition;
	final int failAfterElements = numElementsPerPartition / 3;

	final int parallelism = 2;

	createTestTopic(topic, numPartitions, 1);

	DataGenerators.generateRandomizedIntegerSequence(
		StreamExecutionEnvironment.getExecutionEnvironment(), kafkaServer, topic,
		numPartitions, numElementsPerPartition, true);

	// run the topology that fails and recovers

	DeserializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.enableCheckpointing(500);
	env.setParallelism(parallelism);
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

	env
		.addSource(kafkaSource)
		.map(new PartitionValidatingMapper(numPartitions, 3))
		.map(new FailingIdentityMapper<Integer>(failAfterElements))
		.addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

	FailingIdentityMapper.failedBefore = false;
	tryExecute(env, "One-source-multi-partitions exactly once test");

	deleteTestTopic(topic);
}
Example #8
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
/**
 * Tests the proper consumption when having a 1:1 correspondence between kafka partitions and
 * Flink sources.
 */
public void runOneToOneExactlyOnceTest() throws Exception {
	final String topic = "oneToOneTopic";
	final int parallelism = 5;
	final int numElementsPerPartition = 1000;
	final int totalElements = parallelism * numElementsPerPartition;
	final int failAfterElements = numElementsPerPartition / 3;

	createTestTopic(topic, parallelism, 1);

	DataGenerators.generateRandomizedIntegerSequence(
		StreamExecutionEnvironment.getExecutionEnvironment(), kafkaServer, topic,
		parallelism, numElementsPerPartition, true);

	// run the topology that fails and recovers

	DeserializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.enableCheckpointing(500);
	env.setParallelism(parallelism);
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

	env
		.addSource(kafkaSource)
		.map(new PartitionValidatingMapper(parallelism, 1))
		.map(new FailingIdentityMapper<Integer>(failAfterElements))
		.addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

	FailingIdentityMapper.failedBefore = false;
	tryExecute(env, "One-to-one exactly once test");

	deleteTestTopic(topic);
}
Example #9
Source File: KafkaProducerTestBase.java, from flink (Apache License 2.0)
/**
 * This test sets KafkaProducer so that it will automatically flush the data and
 * fails the broker to check whether flushed records since last checkpoint were not duplicated.
 */
protected void testExactlyOnce(boolean regularSink, int sinksCount) throws Exception {
	final String topic = (regularSink ? "exactlyOnceTopicRegularSink" : "exactlyTopicCustomOperator") + sinksCount;
	final int partition = 0;
	final int numElements = 1000;
	final int failAfterElements = 333;

	for (int i = 0; i < sinksCount; i++) {
		createTestTopic(topic + i, 1, 1);
	}

	TypeInformationSerializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.enableCheckpointing(500);
	env.setParallelism(1);
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));

	Properties properties = new Properties();
	properties.putAll(standardProps);
	properties.putAll(secureProps);

	// process exactly failAfterElements number of elements and then shutdown Kafka broker and fail application
	List<Integer> expectedElements = getIntegersSequence(numElements);

	DataStream<Integer> inputStream = env
		.addSource(new IntegerSource(numElements))
		.map(new FailingIdentityMapper<Integer>(failAfterElements));

	for (int i = 0; i < sinksCount; i++) {
		FlinkKafkaPartitioner<Integer> partitioner = new FlinkKafkaPartitioner<Integer>() {
			@Override
			public int partition(Integer record, byte[] key, byte[] value, String targetTopic, int[] partitions) {
				return partition;
			}
		};

		if (regularSink) {
			StreamSink<Integer> kafkaSink = kafkaServer.getProducerSink(topic + i, schema, properties, partitioner);
			inputStream.addSink(kafkaSink.getUserFunction());
		} else {
			kafkaServer.produceIntoKafka(inputStream, topic + i, schema, properties, partitioner);
		}
	}

	FailingIdentityMapper.failedBefore = false;
	TestUtils.tryExecute(env, "Exactly once test");

	for (int i = 0; i < sinksCount; i++) {
		// assert that before failure we successfully snapshot/flushed all expected elements
		assertExactlyOnceForTopic(
			properties,
			topic + i,
			partition,
			expectedElements,
			KAFKA_READ_TIMEOUT);
		deleteTestTopic(topic + i);
	}
}
Example #10
Source File: Kafka010ITCase.java, from flink (Apache License 2.0)
/**
 * Kafka 0.10 specific test, ensuring Timestamps are properly written to and read from Kafka.
 */
@Ignore("This test is disabled because of: https://issues.apache.org/jira/browse/FLINK-9217")
@Test(timeout = 60000)
public void testTimestamps() throws Exception {

	final String topic = "tstopic";
	createTestTopic(topic, 3, 1);

	// ---------- Produce an event time stream into Kafka -------------------

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	DataStream<Long> streamWithTimestamps = env.addSource(new SourceFunction<Long>() {
		private static final long serialVersionUID = -2255105836471289626L;

		boolean running = true;

		@Override
		public void run(SourceContext<Long> ctx) throws Exception {
			long i = 0;
			while (running) {
				ctx.collectWithTimestamp(i, i * 2);
				if (i++ == 1000L) {
					running = false;
				}
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	});

	final TypeInformationSerializationSchema<Long> longSer =
		new TypeInformationSerializationSchema<>(Types.LONG, env.getConfig());

	FlinkKafkaProducer010.FlinkKafkaProducer010Configuration prod = FlinkKafkaProducer010.writeToKafkaWithTimestamps(
		streamWithTimestamps, topic, new KeyedSerializationSchemaWrapper<>(longSer), standardProps,
		new FlinkKafkaPartitioner<Long>() {
			private static final long serialVersionUID = -6730989584364230617L;

			@Override
			public int partition(Long next, byte[] key, byte[] value, String targetTopic, int[] partitions) {
				return (int) (next % 3);
			}
		});
	prod.setParallelism(3);
	prod.setWriteTimestampToKafka(true);

	env.execute("Produce some");

	// ---------- Consume stream from Kafka -------------------

	env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	FlinkKafkaConsumer010<Long> kafkaSource = new FlinkKafkaConsumer010<>(topic, new LimitedLongDeserializer(), standardProps);
	kafkaSource.assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Long>() {
		private static final long serialVersionUID = -4834111073247835189L;

		@Nullable
		@Override
		public Watermark checkAndGetNextWatermark(Long lastElement, long extractedTimestamp) {
			if (lastElement % 10 == 0) {
				return new Watermark(lastElement);
			}
			return null;
		}

		@Override
		public long extractTimestamp(Long element, long previousElementTimestamp) {
			return previousElementTimestamp;
		}
	});

	DataStream<Long> stream = env.addSource(kafkaSource);
	GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
	stream.transform("timestamp validating operator", objectTypeInfo, new TimestampValidatingOperator()).setParallelism(1);

	env.execute("Consume again");

	deleteTestTopic(topic);
}
Example #11
Source File: Kafka011ITCase.java, from flink (Apache License 2.0)
/**
 * Kafka 0.11 specific test, ensuring Timestamps are properly written to and read from Kafka.
 */
@Test(timeout = 60000)
public void testTimestamps() throws Exception {

	final String topic = "tstopic";
	createTestTopic(topic, 3, 1);

	// ---------- Produce an event time stream into Kafka -------------------

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	DataStream<Long> streamWithTimestamps = env.addSource(new SourceFunction<Long>() {
		private static final long serialVersionUID = -2255115836471289626L;

		boolean running = true;

		@Override
		public void run(SourceContext<Long> ctx) throws Exception {
			long i = 0;
			while (running) {
				ctx.collectWithTimestamp(i, i * 2);
				if (i++ == 1110L) {
					running = false;
				}
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	});

	final TypeInformationSerializationSchema<Long> longSer =
		new TypeInformationSerializationSchema<>(Types.LONG, env.getConfig());

	FlinkKafkaProducer011<Long> prod = new FlinkKafkaProducer011<>(
		topic, new KeyedSerializationSchemaWrapper<>(longSer), standardProps,
		Optional.of(new FlinkKafkaPartitioner<Long>() {
			private static final long serialVersionUID = -6730989584364230617L;

			@Override
			public int partition(Long next, byte[] key, byte[] value, String targetTopic, int[] partitions) {
				return (int) (next % 3);
			}
		}));
	prod.setWriteTimestampToKafka(true);

	streamWithTimestamps.addSink(prod).setParallelism(3);

	env.execute("Produce some");

	// ---------- Consume stream from Kafka -------------------

	env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	FlinkKafkaConsumer011<Long> kafkaSource = new FlinkKafkaConsumer011<>(topic, new LimitedLongDeserializer(), standardProps);
	kafkaSource.assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Long>() {
		private static final long serialVersionUID = -4834111173247835189L;

		@Nullable
		@Override
		public Watermark checkAndGetNextWatermark(Long lastElement, long extractedTimestamp) {
			if (lastElement % 11 == 0) {
				return new Watermark(lastElement);
			}
			return null;
		}

		@Override
		public long extractTimestamp(Long element, long previousElementTimestamp) {
			return previousElementTimestamp;
		}
	});

	DataStream<Long> stream = env.addSource(kafkaSource);
	GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
	stream.transform("timestamp validating operator", objectTypeInfo, new TimestampValidatingOperator()).setParallelism(1);

	env.execute("Consume again");

	deleteTestTopic(topic);
}
Example #12
Source File: KafkaITCase.java, from flink (Apache License 2.0)
/**
 * Kafka 2.0 specific test, ensuring Timestamps are properly written to and read from Kafka.
 */
@Test(timeout = 60000)
public void testTimestamps() throws Exception {

	final String topic = "tstopic";
	createTestTopic(topic, 3, 1);

	// ---------- Produce an event time stream into Kafka -------------------

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	DataStream<Long> streamWithTimestamps = env.addSource(new SourceFunction<Long>() {
		private static final long serialVersionUID = -2255115836471289626L;

		boolean running = true;

		@Override
		public void run(SourceContext<Long> ctx) throws Exception {
			long i = 0;
			while (running) {
				ctx.collectWithTimestamp(i, i * 2);
				if (i++ == 1110L) {
					running = false;
				}
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	});

	final TypeInformationSerializationSchema<Long> longSer =
		new TypeInformationSerializationSchema<>(Types.LONG, env.getConfig());

	FlinkKafkaProducer<Long> prod = new FlinkKafkaProducer<>(
		topic, new KeyedSerializationSchemaWrapper<>(longSer), standardProps,
		Optional.of(new FlinkKafkaPartitioner<Long>() {
			private static final long serialVersionUID = -6730989584364230617L;

			@Override
			public int partition(Long next, byte[] key, byte[] value, String targetTopic, int[] partitions) {
				return (int) (next % 3);
			}
		}));
	prod.setWriteTimestampToKafka(true);

	streamWithTimestamps.addSink(prod).setParallelism(3);

	env.execute("Produce some");

	// ---------- Consume stream from Kafka -------------------

	env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	FlinkKafkaConsumer<Long> kafkaSource = new FlinkKafkaConsumer<>(topic, new KafkaITCase.LimitedLongDeserializer(), standardProps);
	kafkaSource.assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Long>() {
		private static final long serialVersionUID = -4834111173247835189L;

		@Nullable
		@Override
		public Watermark checkAndGetNextWatermark(Long lastElement, long extractedTimestamp) {
			if (lastElement % 11 == 0) {
				return new Watermark(lastElement);
			}
			return null;
		}

		@Override
		public long extractTimestamp(Long element, long previousElementTimestamp) {
			return previousElementTimestamp;
		}
	});

	DataStream<Long> stream = env.addSource(kafkaSource);
	GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
	stream.transform("timestamp validating operator", objectTypeInfo, new TimestampValidatingOperator()).setParallelism(1);

	env.execute("Consume again");

	deleteTestTopic(topic);
}
Example #13
Source File: FlinkKafkaShuffle.java, from flink (Apache License 2.0)
/**
 * The read side of {@link FlinkKafkaShuffle#persistentKeyBy}.
 *
 * <p>Each consumer task should read kafka partitions equal to the key group indices it is assigned.
 * The number of kafka partitions is the maximum parallelism of the consumer.
 * This version only supports numberOfPartitions = consumerParallelism.
 * In the case of using {@link TimeCharacteristic#EventTime}, a consumer task is responsible to emit
 * watermarks. Watermarks are read from the corresponding Kafka partitions. Notice that a consumer task only starts
 * to emit a watermark after receiving at least one watermark from each producer task to make sure watermarks
 * are monotonically increasing. Hence a consumer task needs to know `producerParallelism` as well.
 *
 * <p>Attention: make sure kafkaProperties include
 * {@link FlinkKafkaShuffle#PRODUCER_PARALLELISM} and {@link FlinkKafkaShuffle#PARTITION_NUMBER} explicitly.
 * {@link FlinkKafkaShuffle#PRODUCER_PARALLELISM} is the parallelism of the producer.
 * {@link FlinkKafkaShuffle#PARTITION_NUMBER} is the number of partitions.
 * They are not necessarily the same and allowed to be set independently.
 *
 * @see FlinkKafkaShuffle#persistentKeyBy
 * @see FlinkKafkaShuffle#writeKeyBy
 *
 * @param topic The topic of Kafka where data is persisted
 * @param env Execution environment. readKeyBy's environment can be different from writeKeyBy's
 * @param typeInformation Type information of the data persisted in Kafka
 * @param kafkaProperties kafka properties for Kafka Consumer
 * @param keySelector key selector to retrieve key
 * @param <T> Schema type
 * @param <K> Key type
 * @return Keyed data stream
 */
public static <T, K> KeyedStream<T, K> readKeyBy(
		String topic,
		StreamExecutionEnvironment env,
		TypeInformation<T> typeInformation,
		Properties kafkaProperties,
		KeySelector<T, K> keySelector) {

	TypeSerializer<T> typeSerializer = typeInformation.createSerializer(env.getConfig());
	TypeInformationSerializationSchema<T> schema =
		new TypeInformationSerializationSchema<>(typeInformation, typeSerializer);

	SourceFunction<T> kafkaConsumer =
		new FlinkKafkaShuffleConsumer<>(topic, schema, typeSerializer, kafkaProperties);

	// TODO: consider situations where numberOfPartitions != consumerParallelism
	Preconditions.checkArgument(
		kafkaProperties.getProperty(PARTITION_NUMBER) != null,
		"Missing partition number for Kafka Shuffle");
	int numberOfPartitions = PropertiesUtil.getInt(kafkaProperties, PARTITION_NUMBER, Integer.MIN_VALUE);
	DataStream<T> outputDataStream = env.addSource(kafkaConsumer).setParallelism(numberOfPartitions);

	return DataStreamUtils.reinterpretAsKeyedStream(outputDataStream, keySelector);
}
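The Javadoc above notes that kafkaProperties must explicitly contain FlinkKafkaShuffle#PRODUCER_PARALLELISM and FlinkKafkaShuffle#PARTITION_NUMBER. A hypothetical call site is sketched below; the broker address, group id, topic name, parallelism values, and the Tuple2 payload are all placeholders, and it assumes the two property-key constants are accessible from user code as the Javadoc implies. Treat it as a sketch, not the definitive API.

// Hypothetical readKeyBy call site; all concrete values are placeholders.
Properties kafkaProperties = new Properties();
kafkaProperties.setProperty("bootstrap.servers", "localhost:9092");
kafkaProperties.setProperty("group.id", "shuffle-reader");
// Both keys are required by readKeyBy; assumed accessible as constants here.
kafkaProperties.setProperty(FlinkKafkaShuffle.PRODUCER_PARALLELISM, "4");
kafkaProperties.setProperty(FlinkKafkaShuffle.PARTITION_NUMBER, "8");

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

// The consumer parallelism is set internally to PARTITION_NUMBER, matching the
// "numberOfPartitions = consumerParallelism" restriction described above.
KeyedStream<Tuple2<String, Long>, String> keyed = FlinkKafkaShuffle.readKeyBy(
	"shuffle-topic",
	env,
	TypeInformation.of(new TypeHint<Tuple2<String, Long>>() {}),
	kafkaProperties,
	new KeySelector<Tuple2<String, Long>, String>() {
		@Override
		public String getKey(Tuple2<String, Long> value) {
			return value.f0;
		}
	});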
Example #14
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
protected void writeAppendSequence(
		String topicName,
		final int originalNumElements,
		final int numElementsToAppend,
		final int parallelism) throws Exception {

	LOG.info("\n===================================\n" +
		"== Appending sequence of " + numElementsToAppend + " into " + topicName +
		"===================================");

	final TypeInformation<Tuple2<Integer, Integer>> resultType =
		TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

	final KeyedSerializationSchema<Tuple2<Integer, Integer>> serSchema =
		new KeyedSerializationSchemaWrapper<>(
			new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

	final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
		new KafkaDeserializationSchemaWrapper<>(
			new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

	// -------- Write the append sequence --------

	StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
	writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	writeEnv.getConfig().disableSysoutLogging();

	DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

		private boolean running = true;

		@Override
		public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
			int cnt = originalNumElements;
			int partition = getRuntimeContext().getIndexOfThisSubtask();

			while (running && cnt < numElementsToAppend + originalNumElements) {
				ctx.collect(new Tuple2<>(partition, cnt));
				cnt++;
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	}).setParallelism(parallelism);

	// the producer must not produce duplicates
	Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
	producerProperties.setProperty("retries", "0");
	producerProperties.putAll(secureProps);

	kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2FlinkPartitioner(parallelism))
		.setParallelism(parallelism);

	try {
		writeEnv.execute("Write sequence");
	}
	catch (Exception e) {
		throw new Exception("Failed to append sequence to Kafka; append job failed.", e);
	}

	LOG.info("Finished writing append sequence");

	// we need to validate the sequence, because kafka's producers are not exactly once
	LOG.info("Validating sequence");

	while (!getRunningJobs(client).isEmpty()) {
		Thread.sleep(50);
	}

	if (!validateSequence(topicName, parallelism, deserSchema, originalNumElements + numElementsToAppend)) {
		throw new Exception("Could not append a valid sequence to Kafka.");
	}
}
Example #15
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
protected String writeSequence(
		String baseTopicName,
		final int numElements,
		final int parallelism,
		final int replicationFactor) throws Exception {

	LOG.info("\n===================================\n" +
		"== Writing sequence of " + numElements + " into " + baseTopicName + " with p=" + parallelism + "\n" +
		"===================================");

	final TypeInformation<Tuple2<Integer, Integer>> resultType =
		TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

	final KeyedSerializationSchema<Tuple2<Integer, Integer>> serSchema =
		new KeyedSerializationSchemaWrapper<>(
			new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

	final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
		new KafkaDeserializationSchemaWrapper<>(
			new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

	final int maxNumAttempts = 10;

	for (int attempt = 1; attempt <= maxNumAttempts; attempt++) {

		final String topicName = baseTopicName + '-' + attempt;

		LOG.info("Writing attempt #" + attempt);

		// -------- Write the Sequence --------

		createTestTopic(topicName, parallelism, replicationFactor);

		StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
		writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
		writeEnv.getConfig().disableSysoutLogging();

		DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

			private boolean running = true;

			@Override
			public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
				int cnt = 0;
				int partition = getRuntimeContext().getIndexOfThisSubtask();

				while (running && cnt < numElements) {
					ctx.collect(new Tuple2<>(partition, cnt));
					cnt++;
				}
			}

			@Override
			public void cancel() {
				running = false;
			}
		}).setParallelism(parallelism);

		// the producer must not produce duplicates
		Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
		producerProperties.setProperty("retries", "0");
		producerProperties.putAll(secureProps);

		kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2FlinkPartitioner(parallelism))
			.setParallelism(parallelism);

		try {
			writeEnv.execute("Write sequence");
		}
		catch (Exception e) {
			LOG.error("Write attempt failed, trying again", e);
			deleteTestTopic(topicName);
			waitUntilNoJobIsRunning(client);
			continue;
		}

		LOG.info("Finished writing sequence");

		// -------- Validate the Sequence --------

		// we need to validate the sequence, because kafka's producers are not exactly once
		LOG.info("Validating sequence");

		waitUntilNoJobIsRunning(client);

		if (validateSequence(topicName, parallelism, deserSchema, numElements)) {
			// everything is good!
			return topicName;
		}
		else {
			deleteTestTopic(topicName);
			// fall through the loop
		}
	}

	throw new Exception("Could not write a valid sequence to Kafka after " + maxNumAttempts + " attempts");
}
Example #16
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
public void runBrokerFailureTest() throws Exception {
	final String topic = "brokerFailureTestTopic";

	final int parallelism = 2;
	final int numElementsPerPartition = 1000;
	final int totalElements = parallelism * numElementsPerPartition;
	final int failAfterElements = numElementsPerPartition / 3;

	createTestTopic(topic, parallelism, 2);

	DataGenerators.generateRandomizedIntegerSequence(
		StreamExecutionEnvironment.getExecutionEnvironment(), kafkaServer, topic,
		parallelism, numElementsPerPartition, true);

	// find leader to shut down
	int leaderId = kafkaServer.getLeaderToShutDown(topic);

	LOG.info("Leader to shutdown {}", leaderId);

	// run the topology (the consumers must handle the failures)

	DeserializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(parallelism);
	env.enableCheckpointing(500);
	env.setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

	env
		.addSource(kafkaSource)
		.map(new PartitionValidatingMapper(parallelism, 1))
		.map(new BrokerKillingMapper<Integer>(leaderId, failAfterElements))
		.addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

	BrokerKillingMapper.killedLeaderBefore = false;
	tryExecute(env, "Broker failure once test");

	// start a new broker:
	kafkaServer.restartBroker(leaderId);
}
Example #17
Source File: KafkaITCase.java, from Flink-CEPplus (Apache License 2.0)
/**
 * Kafka 2.0 specific test, ensuring Timestamps are properly written to and read from Kafka.
 */
@Test(timeout = 60000)
public void testTimestamps() throws Exception {

	final String topic = "tstopic";
	createTestTopic(topic, 3, 1);

	// ---------- Produce an event time stream into Kafka -------------------

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	DataStream<Long> streamWithTimestamps = env.addSource(new SourceFunction<Long>() {
		private static final long serialVersionUID = -2255115836471289626L;

		boolean running = true;

		@Override
		public void run(SourceContext<Long> ctx) throws Exception {
			long i = 0;
			while (running) {
				ctx.collectWithTimestamp(i, i * 2);
				if (i++ == 1110L) {
					running = false;
				}
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	});

	final TypeInformationSerializationSchema<Long> longSer =
		new TypeInformationSerializationSchema<>(Types.LONG, env.getConfig());

	FlinkKafkaProducer<Long> prod = new FlinkKafkaProducer<>(
		topic, new KeyedSerializationSchemaWrapper<>(longSer), standardProps,
		Optional.of(new FlinkKafkaPartitioner<Long>() {
			private static final long serialVersionUID = -6730989584364230617L;

			@Override
			public int partition(Long next, byte[] key, byte[] value, String targetTopic, int[] partitions) {
				return (int) (next % 3);
			}
		}));
	prod.setWriteTimestampToKafka(true);

	streamWithTimestamps.addSink(prod).setParallelism(3);

	env.execute("Produce some");

	// ---------- Consume stream from Kafka -------------------

	env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	FlinkKafkaConsumer<Long> kafkaSource = new FlinkKafkaConsumer<>(topic, new KafkaITCase.LimitedLongDeserializer(), standardProps);
	kafkaSource.assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Long>() {
		private static final long serialVersionUID = -4834111173247835189L;

		@Nullable
		@Override
		public Watermark checkAndGetNextWatermark(Long lastElement, long extractedTimestamp) {
			if (lastElement % 11 == 0) {
				return new Watermark(lastElement);
			}
			return null;
		}

		@Override
		public long extractTimestamp(Long element, long previousElementTimestamp) {
			return previousElementTimestamp;
		}
	});

	DataStream<Long> stream = env.addSource(kafkaSource);
	GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
	stream.transform("timestamp validating operator", objectTypeInfo, new TimestampValidatingOperator()).setParallelism(1);

	env.execute("Consume again");

	deleteTestTopic(topic);
}
Example #18
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
/**
 * Tests the proper consumption when having fewer Flink sources than Kafka partitions, so
 * one Flink source will read multiple Kafka partitions.
 */
public void runOneSourceMultiplePartitionsExactlyOnceTest() throws Exception {
	final String topic = "oneToManyTopic";
	final int numPartitions = 5;
	final int numElementsPerPartition = 1000;
	final int totalElements = numPartitions * numElementsPerPartition;
	final int failAfterElements = numElementsPerPartition / 3;

	final int parallelism = 2;

	createTestTopic(topic, numPartitions, 1);

	DataGenerators.generateRandomizedIntegerSequence(
		StreamExecutionEnvironment.getExecutionEnvironment(), kafkaServer, topic,
		numPartitions, numElementsPerPartition, true);

	// run the topology that fails and recovers

	DeserializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.enableCheckpointing(500);
	env.setParallelism(parallelism);
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
	env.getConfig().disableSysoutLogging();

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

	env
		.addSource(kafkaSource)
		.map(new PartitionValidatingMapper(numPartitions, 3))
		.map(new FailingIdentityMapper<Integer>(failAfterElements))
		.addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

	FailingIdentityMapper.failedBefore = false;
	tryExecute(env, "One-source-multi-partitions exactly once test");

	deleteTestTopic(topic);
}
Example #19
Source File: KafkaConsumerTestBase.java, from flink (Apache License 2.0)
/**
 * Tests the proper consumption when having a 1:1 correspondence between kafka partitions and
 * Flink sources.
 */
public void runOneToOneExactlyOnceTest() throws Exception {
	final String topic = "oneToOneTopic";
	final int parallelism = 5;
	final int numElementsPerPartition = 1000;
	final int totalElements = parallelism * numElementsPerPartition;
	final int failAfterElements = numElementsPerPartition / 3;

	createTestTopic(topic, parallelism, 1);

	DataGenerators.generateRandomizedIntegerSequence(
		StreamExecutionEnvironment.getExecutionEnvironment(), kafkaServer, topic,
		parallelism, numElementsPerPartition, true);

	// run the topology that fails and recovers

	DeserializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.enableCheckpointing(500);
	env.setParallelism(parallelism);
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
	env.getConfig().disableSysoutLogging();

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

	env
		.addSource(kafkaSource)
		.map(new PartitionValidatingMapper(parallelism, 1))
		.map(new FailingIdentityMapper<Integer>(failAfterElements))
		.addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

	FailingIdentityMapper.failedBefore = false;
	tryExecute(env, "One-to-one exactly once test");

	deleteTestTopic(topic);
}
Example #20
Source File: KafkaProducerTestBase.java, from flink (Apache License 2.0)
/**
 * This test sets KafkaProducer so that it will automatically flush the data and
 * fails the broker to check whether flushed records since last checkpoint were not duplicated.
 */
protected void testExactlyOnce(boolean regularSink, int sinksCount) throws Exception {
	final String topic = (regularSink ? "exactlyOnceTopicRegularSink" : "exactlyTopicCustomOperator") + sinksCount;
	final int partition = 0;
	final int numElements = 1000;
	final int failAfterElements = 333;

	for (int i = 0; i < sinksCount; i++) {
		createTestTopic(topic + i, 1, 1);
	}

	TypeInformationSerializationSchema<Integer> schema =
		new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());
	KeyedSerializationSchema<Integer> keyedSerializationSchema = new KeyedSerializationSchemaWrapper<>(schema);

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.enableCheckpointing(500);
	env.setParallelism(1);
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
	env.getConfig().disableSysoutLogging();

	Properties properties = new Properties();
	properties.putAll(standardProps);
	properties.putAll(secureProps);

	// process exactly failAfterElements number of elements and then shutdown Kafka broker and fail application
	List<Integer> expectedElements = getIntegersSequence(numElements);

	DataStream<Integer> inputStream = env
		.addSource(new IntegerSource(numElements))
		.map(new FailingIdentityMapper<Integer>(failAfterElements));

	for (int i = 0; i < sinksCount; i++) {
		FlinkKafkaPartitioner<Integer> partitioner = new FlinkKafkaPartitioner<Integer>() {
			@Override
			public int partition(Integer record, byte[] key, byte[] value, String targetTopic, int[] partitions) {
				return partition;
			}
		};

		if (regularSink) {
			StreamSink<Integer> kafkaSink = kafkaServer.getProducerSink(topic + i, keyedSerializationSchema, properties, partitioner);
			inputStream.addSink(kafkaSink.getUserFunction());
		} else {
			kafkaServer.produceIntoKafka(inputStream, topic + i, keyedSerializationSchema, properties, partitioner);
		}
	}

	FailingIdentityMapper.failedBefore = false;
	TestUtils.tryExecute(env, "Exactly once test");

	for (int i = 0; i < sinksCount; i++) {
		// assert that before failure we successfully snapshot/flushed all expected elements
		assertExactlyOnceForTopic(
			properties,
			topic + i,
			partition,
			expectedElements,
			KAFKA_READ_TIMEOUT);
		deleteTestTopic(topic + i);
	}
}
Example #21
Source File: Kafka010ITCase.java, from flink (Apache License 2.0)
/**
 * Kafka 0.10 specific test, ensuring Timestamps are properly written to and read from Kafka.
 */
@Ignore("This test is disabled because of: https://issues.apache.org/jira/browse/FLINK-9217")
@Test(timeout = 60000)
public void testTimestamps() throws Exception {

	final String topic = "tstopic";
	createTestTopic(topic, 3, 1);

	// ---------- Produce an event time stream into Kafka -------------------

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	DataStream<Long> streamWithTimestamps = env.addSource(new SourceFunction<Long>() {
		private static final long serialVersionUID = -2255105836471289626L;

		boolean running = true;

		@Override
		public void run(SourceContext<Long> ctx) throws Exception {
			long i = 0;
			while (running) {
				ctx.collectWithTimestamp(i, i * 2);
				if (i++ == 1000L) {
					running = false;
				}
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	});

	final TypeInformationSerializationSchema<Long> longSer =
		new TypeInformationSerializationSchema<>(Types.LONG, env.getConfig());

	FlinkKafkaProducer010.FlinkKafkaProducer010Configuration prod = FlinkKafkaProducer010.writeToKafkaWithTimestamps(
		streamWithTimestamps, topic, new KeyedSerializationSchemaWrapper<>(longSer), standardProps,
		new FlinkKafkaPartitioner<Long>() {
			private static final long serialVersionUID = -6730989584364230617L;

			@Override
			public int partition(Long next, byte[] key, byte[] value, String targetTopic, int[] partitions) {
				return (int) (next % 3);
			}
		});
	prod.setParallelism(3);
	prod.setWriteTimestampToKafka(true);

	env.execute("Produce some");

	// ---------- Consume stream from Kafka -------------------

	env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	FlinkKafkaConsumer010<Long> kafkaSource = new FlinkKafkaConsumer010<>(topic, new LimitedLongDeserializer(), standardProps);
	kafkaSource.assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Long>() {
		private static final long serialVersionUID = -4834111073247835189L;

		@Nullable
		@Override
		public Watermark checkAndGetNextWatermark(Long lastElement, long extractedTimestamp) {
			if (lastElement % 10 == 0) {
				return new Watermark(lastElement);
			}
			return null;
		}

		@Override
		public long extractTimestamp(Long element, long previousElementTimestamp) {
			return previousElementTimestamp;
		}
	});

	DataStream<Long> stream = env.addSource(kafkaSource);
	GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
	stream.transform("timestamp validating operator", objectTypeInfo, new TimestampValidatingOperator()).setParallelism(1);

	env.execute("Consume again");

	deleteTestTopic(topic);
}
Example #22
Source File: Kafka011ITCase.java, from flink (Apache License 2.0)
/**
 * Kafka 0.11 specific test, ensuring Timestamps are properly written to and read from Kafka.
 */
@Test(timeout = 60000)
public void testTimestamps() throws Exception {

	final String topic = "tstopic";
	createTestTopic(topic, 3, 1);

	// ---------- Produce an event time stream into Kafka -------------------

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	DataStream<Long> streamWithTimestamps = env.addSource(new SourceFunction<Long>() {
		private static final long serialVersionUID = -2255115836471289626L;

		boolean running = true;

		@Override
		public void run(SourceContext<Long> ctx) throws Exception {
			long i = 0;
			while (running) {
				ctx.collectWithTimestamp(i, i * 2);
				if (i++ == 1110L) {
					running = false;
				}
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	});

	final TypeInformationSerializationSchema<Long> longSer =
		new TypeInformationSerializationSchema<>(Types.LONG, env.getConfig());

	FlinkKafkaProducer011<Long> prod = new FlinkKafkaProducer011<>(
		topic, new KeyedSerializationSchemaWrapper<>(longSer), standardProps,
		Optional.of(new FlinkKafkaPartitioner<Long>() {
			private static final long serialVersionUID = -6730989584364230617L;

			@Override
			public int partition(Long next, byte[] key, byte[] value, String targetTopic, int[] partitions) {
				return (int) (next % 3);
			}
		}));
	prod.setWriteTimestampToKafka(true);

	streamWithTimestamps.addSink(prod).setParallelism(3);

	env.execute("Produce some");

	// ---------- Consume stream from Kafka -------------------

	env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	FlinkKafkaConsumer011<Long> kafkaSource = new FlinkKafkaConsumer011<>(topic, new LimitedLongDeserializer(), standardProps);
	kafkaSource.assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Long>() {
		private static final long serialVersionUID = -4834111173247835189L;

		@Nullable
		@Override
		public Watermark checkAndGetNextWatermark(Long lastElement, long extractedTimestamp) {
			if (lastElement % 11 == 0) {
				return new Watermark(lastElement);
			}
			return null;
		}

		@Override
		public long extractTimestamp(Long element, long previousElementTimestamp) {
			return previousElementTimestamp;
		}
	});

	DataStream<Long> stream = env.addSource(kafkaSource);
	GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
	stream.transform("timestamp validating operator", objectTypeInfo, new TimestampValidatingOperator()).setParallelism(1);

	env.execute("Consume again");

	deleteTestTopic(topic);
}
Example #23
Source File: KafkaITCase.java, from flink (Apache License 2.0)
/**
 * Kafka 2.0 specific test, ensuring Timestamps are properly written to and read from Kafka.
 */
@Test(timeout = 60000)
public void testTimestamps() throws Exception {

	final String topic = "tstopic";
	createTestTopic(topic, 3, 1);

	// ---------- Produce an event time stream into Kafka -------------------

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	DataStream<Long> streamWithTimestamps = env.addSource(new SourceFunction<Long>() {
		private static final long serialVersionUID = -2255115836471289626L;

		boolean running = true;

		@Override
		public void run(SourceContext<Long> ctx) throws Exception {
			long i = 0;
			while (running) {
				ctx.collectWithTimestamp(i, i * 2);
				if (i++ == 1110L) {
					running = false;
				}
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	});

	final TypeInformationSerializationSchema<Long> longSer =
		new TypeInformationSerializationSchema<>(Types.LONG, env.getConfig());

	FlinkKafkaProducer<Long> prod = new FlinkKafkaProducer<>(
		topic, new KeyedSerializationSchemaWrapper<>(longSer), standardProps,
		Optional.of(new FlinkKafkaPartitioner<Long>() {
			private static final long serialVersionUID = -6730989584364230617L;

			@Override
			public int partition(Long next, byte[] key, byte[] value, String targetTopic, int[] partitions) {
				return (int) (next % 3);
			}
		}));
	prod.setWriteTimestampToKafka(true);

	streamWithTimestamps.addSink(prod).setParallelism(3);

	env.execute("Produce some");

	// ---------- Consume stream from Kafka -------------------

	env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	FlinkKafkaConsumer<Long> kafkaSource = new FlinkKafkaConsumer<>(topic, new KafkaITCase.LimitedLongDeserializer(), standardProps);
	kafkaSource.assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Long>() {
		private static final long serialVersionUID = -4834111173247835189L;

		@Nullable
		@Override
		public Watermark checkAndGetNextWatermark(Long lastElement, long extractedTimestamp) {
			if (lastElement % 11 == 0) {
				return new Watermark(lastElement);
			}
			return null;
		}

		@Override
		public long extractTimestamp(Long element, long previousElementTimestamp) {
			return previousElementTimestamp;
		}
	});

	DataStream<Long> stream = env.addSource(kafkaSource);
	GenericTypeInfo<Object> objectTypeInfo = new GenericTypeInfo<>(Object.class);
	stream.transform("timestamp validating operator", objectTypeInfo, new TimestampValidatingOperator()).setParallelism(1);

	env.execute("Consume again");

	deleteTestTopic(topic);
}
Example #24
Source File: KafkaConsumerTestBase.java, from Flink-CEPplus (Apache License 2.0)
protected void writeAppendSequence(
		String topicName,
		final int originalNumElements,
		final int numElementsToAppend,
		final int parallelism) throws Exception {

	LOG.info("\n===================================\n" +
		"== Appending sequence of " + numElementsToAppend + " into " + topicName +
		"===================================");

	final TypeInformation<Tuple2<Integer, Integer>> resultType =
		TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

	final KeyedSerializationSchema<Tuple2<Integer, Integer>> serSchema =
		new KeyedSerializationSchemaWrapper<>(
			new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

	final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
		new KafkaDeserializationSchemaWrapper<>(
			new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

	// -------- Write the append sequence --------

	StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
	writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	writeEnv.getConfig().disableSysoutLogging();

	DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

		private boolean running = true;

		@Override
		public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
			int cnt = originalNumElements;
			int partition = getRuntimeContext().getIndexOfThisSubtask();

			while (running && cnt < numElementsToAppend + originalNumElements) {
				ctx.collect(new Tuple2<>(partition, cnt));
				cnt++;
			}
		}

		@Override
		public void cancel() {
			running = false;
		}
	}).setParallelism(parallelism);

	// the producer must not produce duplicates
	Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
	producerProperties.setProperty("retries", "0");
	producerProperties.putAll(secureProps);

	kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2FlinkPartitioner(parallelism))
		.setParallelism(parallelism);

	try {
		writeEnv.execute("Write sequence");
	}
	catch (Exception e) {
		throw new Exception("Failed to append sequence to Kafka; append job failed.", e);
	}

	LOG.info("Finished writing append sequence");

	// we need to validate the sequence, because kafka's producers are not exactly once
	LOG.info("Validating sequence");

	while (!getRunningJobs(client).isEmpty()) {
		Thread.sleep(50);
	}

	if (!validateSequence(topicName, parallelism, deserSchema, originalNumElements + numElementsToAppend)) {
		throw new Exception("Could not append a valid sequence to Kafka.");
	}
}
Example #25
Source File: KafkaConsumerTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
protected String writeSequence(
        String baseTopicName,
        final int numElements,
        final int parallelism,
        final int replicationFactor) throws Exception {

    LOG.info("\n===================================\n" +
        "== Writing sequence of " + numElements + " into " + baseTopicName + " with p=" + parallelism + "\n" +
        "===================================");

    final TypeInformation<Tuple2<Integer, Integer>> resultType =
        TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

    final KeyedSerializationSchema<Tuple2<Integer, Integer>> serSchema =
        new KeyedSerializationSchemaWrapper<>(
            new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

    final KafkaDeserializationSchema<Tuple2<Integer, Integer>> deserSchema =
        new KafkaDeserializationSchemaWrapper<>(
            new TypeInformationSerializationSchema<>(resultType, new ExecutionConfig()));

    final int maxNumAttempts = 10;

    for (int attempt = 1; attempt <= maxNumAttempts; attempt++) {

        final String topicName = baseTopicName + '-' + attempt;

        LOG.info("Writing attempt #" + attempt);

        // -------- Write the Sequence --------

        createTestTopic(topicName, parallelism, replicationFactor);

        StreamExecutionEnvironment writeEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        writeEnv.getConfig().setRestartStrategy(RestartStrategies.noRestart());
        writeEnv.getConfig().disableSysoutLogging();

        DataStream<Tuple2<Integer, Integer>> stream = writeEnv.addSource(new RichParallelSourceFunction<Tuple2<Integer, Integer>>() {

            private boolean running = true;

            @Override
            public void run(SourceContext<Tuple2<Integer, Integer>> ctx) throws Exception {
                int cnt = 0;
                int partition = getRuntimeContext().getIndexOfThisSubtask();

                while (running && cnt < numElements) {
                    ctx.collect(new Tuple2<>(partition, cnt));
                    cnt++;
                }
            }

            @Override
            public void cancel() {
                running = false;
            }
        }).setParallelism(parallelism);

        // the producer must not produce duplicates
        Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
        producerProperties.setProperty("retries", "0");
        producerProperties.putAll(secureProps);

        kafkaServer.produceIntoKafka(stream, topicName, serSchema, producerProperties, new Tuple2FlinkPartitioner(parallelism))
            .setParallelism(parallelism);

        try {
            writeEnv.execute("Write sequence");
        }
        catch (Exception e) {
            LOG.error("Write attempt failed, trying again", e);
            deleteTestTopic(topicName);
            waitUntilNoJobIsRunning(client);
            continue;
        }

        LOG.info("Finished writing sequence");

        // -------- Validate the Sequence --------

        // we need to validate the sequence, because kafka's producers are not exactly once
        LOG.info("Validating sequence");

        waitUntilNoJobIsRunning(client);

        if (validateSequence(topicName, parallelism, deserSchema, numElements)) {
            // everything is good!
            return topicName;
        }
        else {
            deleteTestTopic(topicName);
            // fall through the loop
        }
    }

    throw new Exception("Could not write a valid sequence to Kafka after " + maxNumAttempts + " attempts");
}
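A small aside on the resultType used throughout these helpers: as far as I can tell, the TypeHint form and the Types factory are interchangeable ways of describing the same Tuple2 type, so either can feed the schema. A tiny fragment (assuming the usual org.apache.flink.api.common.typeinfo and org.apache.flink.api.java.tuple imports):

// Equivalent ways of obtaining the TypeInformation used above;
// both describe Tuple2<Integer, Integer> and yield the same TupleTypeInfo.
TypeInformation<Tuple2<Integer, Integer>> viaHint =
    TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});

TypeInformation<Tuple2<Integer, Integer>> viaTypes =
    Types.TUPLE(Types.INT, Types.INT);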
Example #26
Source File: KafkaConsumerTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
public void runBrokerFailureTest() throws Exception {
    final String topic = "brokerFailureTestTopic";

    final int parallelism = 2;
    final int numElementsPerPartition = 1000;
    final int totalElements = parallelism * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;

    createTestTopic(topic, parallelism, 2);

    DataGenerators.generateRandomizedIntegerSequence(
        StreamExecutionEnvironment.getExecutionEnvironment(),
        kafkaServer,
        topic, parallelism, numElementsPerPartition, true);

    // find leader to shut down
    int leaderId = kafkaServer.getLeaderToShutDown(topic);

    LOG.info("Leader to shutdown {}", leaderId);

    // run the topology (the consumers must handle the failures)

    DeserializationSchema<Integer> schema =
        new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(parallelism);
    env.enableCheckpointing(500);
    env.setRestartStrategy(RestartStrategies.noRestart());
    env.getConfig().disableSysoutLogging();

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

    env
        .addSource(kafkaSource)
        .map(new PartitionValidatingMapper(parallelism, 1))
        .map(new BrokerKillingMapper<Integer>(leaderId, failAfterElements))
        .addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

    BrokerKillingMapper.killedLeaderBefore = false;
    tryExecute(env, "Broker failure once test");

    // start a new broker:
    kafkaServer.restartBroker(leaderId);
}
Example #27
Source File: KafkaConsumerTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * Tests the proper consumption when having more Flink sources than Kafka partitions, which means
 * that some Flink sources will read no partitions.
 */
public void runMultipleSourcesOnePartitionExactlyOnceTest() throws Exception {
    final String topic = "manyToOneTopic";
    final int numPartitions = 5;
    final int numElementsPerPartition = 1000;
    final int totalElements = numPartitions * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;

    final int parallelism = 8;

    createTestTopic(topic, numPartitions, 1);

    DataGenerators.generateRandomizedIntegerSequence(
        StreamExecutionEnvironment.getExecutionEnvironment(),
        kafkaServer,
        topic,
        numPartitions,
        numElementsPerPartition,
        true);

    // run the topology that fails and recovers

    DeserializationSchema<Integer> schema =
        new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(parallelism);
    // set the number of restarts to one; the failing mapper fails once, and after the restart the job succeeds
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
    env.getConfig().disableSysoutLogging();
    env.setBufferTimeout(0);

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

    env
        .addSource(kafkaSource)
        .map(new PartitionValidatingMapper(numPartitions, 1))
        .map(new FailingIdentityMapper<Integer>(failAfterElements))
        .addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

    FailingIdentityMapper.failedBefore = false;
    tryExecute(env, "multi-source-one-partitions exactly once test");

    deleteTestTopic(topic);
}
Example #28
Source File: KafkaConsumerTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * Tests the proper consumption when having fewer Flink sources than Kafka partitions, so
 * one Flink source will read multiple Kafka partitions.
 */
public void runOneSourceMultiplePartitionsExactlyOnceTest() throws Exception {
    final String topic = "oneToManyTopic";
    final int numPartitions = 5;
    final int numElementsPerPartition = 1000;
    final int totalElements = numPartitions * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;

    final int parallelism = 2;

    createTestTopic(topic, numPartitions, 1);

    DataGenerators.generateRandomizedIntegerSequence(
        StreamExecutionEnvironment.getExecutionEnvironment(),
        kafkaServer,
        topic,
        numPartitions,
        numElementsPerPartition,
        true);

    // run the topology that fails and recovers

    DeserializationSchema<Integer> schema =
        new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(parallelism);
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
    env.getConfig().disableSysoutLogging();

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

    env
        .addSource(kafkaSource)
        .map(new PartitionValidatingMapper(numPartitions, 3))
        .map(new FailingIdentityMapper<Integer>(failAfterElements))
        .addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

    FailingIdentityMapper.failedBefore = false;
    tryExecute(env, "One-source-multi-partitions exactly once test");

    deleteTestTopic(topic);
}
Example #29
Source File: KafkaConsumerTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * Tests the proper consumption when having a 1:1 correspondence between Kafka partitions and
 * Flink sources.
 */
public void runOneToOneExactlyOnceTest() throws Exception {

    final String topic = "oneToOneTopic";
    final int parallelism = 5;
    final int numElementsPerPartition = 1000;
    final int totalElements = parallelism * numElementsPerPartition;
    final int failAfterElements = numElementsPerPartition / 3;

    createTestTopic(topic, parallelism, 1);

    DataGenerators.generateRandomizedIntegerSequence(
        StreamExecutionEnvironment.getExecutionEnvironment(),
        kafkaServer,
        topic,
        parallelism,
        numElementsPerPartition,
        true);

    // run the topology that fails and recovers

    DeserializationSchema<Integer> schema =
        new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(parallelism);
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
    env.getConfig().disableSysoutLogging();

    Properties props = new Properties();
    props.putAll(standardProps);
    props.putAll(secureProps);

    FlinkKafkaConsumerBase<Integer> kafkaSource = kafkaServer.getConsumer(topic, schema, props);

    env
        .addSource(kafkaSource)
        .map(new PartitionValidatingMapper(parallelism, 1))
        .map(new FailingIdentityMapper<Integer>(failAfterElements))
        .addSink(new ValidatingExactlyOnceSink(totalElements)).setParallelism(1);

    FailingIdentityMapper.failedBefore = false;
    tryExecute(env, "One-to-one exactly once test");

    deleteTestTopic(topic);
}
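Outside of this test base, the same Integer schema plugs directly into the universal FlinkKafkaConsumer. The following standalone sketch is an illustration, not part of KafkaConsumerTestBase: the topic name, group id, and broker address are placeholders, and checkpointing is enabled because that is what gives the consumer its exactly-once state handling (offsets are committed as part of checkpoints).

import java.util.Properties;

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

public class IntegerConsumerSketch {

    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092"); // placeholder broker
        props.setProperty("group.id", "int-sequence-reader");     // placeholder group

        DeserializationSchema<Integer> schema =
            new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(500); // offsets are committed when checkpoints complete

        env.addSource(new FlinkKafkaConsumer<>("oneToOneTopic", schema, props)) // placeholder topic
            .print();

        env.execute("Read integer sequence");
    }
}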
Example #30
Source File: KafkaProducerTestBase.java From Flink-CEPplus with Apache License 2.0 | 4 votes |
/**
 * This test configures the KafkaProducer to flush data automatically and then fails the broker,
 * checking that records flushed since the last checkpoint are not duplicated.
 */
protected void testExactlyOnce(boolean regularSink, int sinksCount) throws Exception {
    final String topic = (regularSink ? "exactlyOnceTopicRegularSink" : "exactlyTopicCustomOperator") + sinksCount;
    final int partition = 0;
    final int numElements = 1000;
    final int failAfterElements = 333;

    for (int i = 0; i < sinksCount; i++) {
        createTestTopic(topic + i, 1, 1);
    }

    TypeInformationSerializationSchema<Integer> schema =
        new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());
    KeyedSerializationSchema<Integer> keyedSerializationSchema = new KeyedSerializationSchemaWrapper<>(schema);

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.enableCheckpointing(500);
    env.setParallelism(1);
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0));
    env.getConfig().disableSysoutLogging();

    Properties properties = new Properties();
    properties.putAll(standardProps);
    properties.putAll(secureProps);

    // process exactly failAfterElements elements, then shut down the Kafka broker and fail the application
    List<Integer> expectedElements = getIntegersSequence(numElements);

    DataStream<Integer> inputStream = env
        .addSource(new IntegerSource(numElements))
        .map(new FailingIdentityMapper<Integer>(failAfterElements));

    for (int i = 0; i < sinksCount; i++) {
        FlinkKafkaPartitioner<Integer> partitioner = new FlinkKafkaPartitioner<Integer>() {
            @Override
            public int partition(Integer record, byte[] key, byte[] value, String targetTopic, int[] partitions) {
                return partition;
            }
        };

        if (regularSink) {
            StreamSink<Integer> kafkaSink = kafkaServer.getProducerSink(topic + i, keyedSerializationSchema, properties, partitioner);
            inputStream.addSink(kafkaSink.getUserFunction());
        } else {
            kafkaServer.produceIntoKafka(inputStream, topic + i, keyedSerializationSchema, properties, partitioner);
        }
    }

    FailingIdentityMapper.failedBefore = false;
    TestUtils.tryExecute(env, "Exactly once test");

    for (int i = 0; i < sinksCount; i++) {
        // assert that before the failure we successfully snapshotted/flushed all expected elements
        assertExactlyOnceForTopic(
            properties,
            topic + i,
            partition,
            expectedElements,
            KAFKA_READ_TIMEOUT);
        deleteTestTopic(topic + i);
    }
}
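For completeness, a hedged producer-side fragment outside the test harness (not the test's kafkaServer.getProducerSink): it wires the same Integer schema through KeyedSerializationSchemaWrapper into the universal FlinkKafkaProducer with EXACTLY_ONCE semantics. The topic, broker address, and the `stream` variable are placeholders, and the four-argument (schema, properties, Semantic) constructor is assumed from the universal connector.

// Assumes: `stream` is an existing DataStream<Integer> and checkpointing is enabled on its environment,
// since EXACTLY_ONCE commits Kafka transactions on checkpoint completion.
Properties producerProps = new Properties();
producerProps.setProperty("bootstrap.servers", "localhost:9092");  // placeholder broker
producerProps.setProperty("transaction.timeout.ms", "60000");      // keep within the broker's transaction.max.timeout.ms

TypeInformationSerializationSchema<Integer> schema =
    new TypeInformationSerializationSchema<>(BasicTypeInfo.INT_TYPE_INFO, new ExecutionConfig());

FlinkKafkaProducer<Integer> producer = new FlinkKafkaProducer<>(
    "exactlyOnceDemoTopic",                                         // placeholder topic
    new KeyedSerializationSchemaWrapper<>(schema),
    producerProps,
    FlinkKafkaProducer.Semantic.EXACTLY_ONCE);

stream.addSink(producer);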