org.apache.flink.api.common.typeinfo.TypeInformation Scala Examples
The following examples show how to use org.apache.flink.api.common.typeinfo.TypeInformation.
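Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of the usual ways to obtain a TypeInformation in the Scala API: the createTypeInformation macro from the Scala API package object, the reflective TypeInformation.of factory, and a context bound on a generic method.

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala._

object TypeInformationSketch {
  // Derived at compile time by Flink's Scala type-analysis macro for a concrete type.
  val stringInfo: TypeInformation[String] = createTypeInformation[String]

  // Built reflectively from a Class[_] when no macro-derived instance is in scope.
  val longInfo: TypeInformation[java.lang.Long] = TypeInformation.of(classOf[java.lang.Long])

  // Generic code usually just demands an implicit instance via a context bound, as many examples below do.
  def arityOf[T: TypeInformation]: Int = implicitly[TypeInformation[T]].getArity
}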
Example 1
Source File: DataRow.scala From flink-elasticsearch-source-connector with Apache License 2.0
package com.mnubo.flink.streaming.connectors

import org.apache.commons.lang3.ClassUtils
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor

import scala.language.existentials

case class Value(v: Any, name: String, givenTypeInfo: Option[TypeInformation[_]] = None) {
  require(v != null || givenTypeInfo.isDefined, "You must pass a TypeInformation for null values")

  val typeInfo = givenTypeInfo match {
    case Some(ti) => ti
    case None => TypeExtractor.getForObject(v)
  }

  require(isAssignable(v, typeInfo.getTypeClass), s"data element '$v' is not compatible with class ${typeInfo.getTypeClass.getName}")

  private def isAssignable(value: Any, cl: Class[_]) = {
    if (value == null && classOf[AnyRef].isAssignableFrom(cl))
      true
    else
      ClassUtils.isAssignable(value.getClass, cl)
  }
}

object Value {
  def apply(v: Any, name: String, givenTypeInfo: TypeInformation[_]) = {
    new Value(v, name, Some(givenTypeInfo))
  }
}

class DataRow(private[connectors] val data: Array[Any], private[connectors] val info: DataRowTypeInfo) extends Product with Serializable {
  require(data != null, "data must not be null")
  require(info != null, "info must not be null")
  require(data.length == info.getArity, "data must be of the correct arity")

  def apply[T](i: Int): T =
    data(i).asInstanceOf[T]

  def apply[T](fieldExpression: String): T =
    apply(info.getFieldIndex(fieldExpression))

  override def productElement(n: Int): Any = apply[AnyRef](n)

  override def productArity = info.getArity

  override def canEqual(that: Any) = that.isInstanceOf[DataRow]

  override def equals(that: Any) =
    canEqual(that) &&
      data.sameElements(that.asInstanceOf[DataRow].data) &&
      info.getFieldNames.sameElements(that.asInstanceOf[DataRow].info.getFieldNames)

  override def hashCode = {
    var result = 1
    for (element <- data)
      result = 31 * result + (if (element == null) 0 else element.hashCode)
    result
  }

  override def toString =
    info.getFieldNames
      .zip(data.map(v => if (v == null) "null" else v.toString))
      .map { case (name, value) => s"$name=$value" }
      .mkString("DataRow(", ", ", ")")
}

object DataRow {
  def apply(data: Value*): DataRow = {
    require(data != null, "data cannot be null")
    require(!data.contains(null), "data value cannot be null")

    new DataRow(
      data.map(_.v).toArray,
      new DataRowTypeInfo(
        data.map(_.name),
        data.map(_.typeInfo)
      )
    )
  }
}
Example 2
Source File: FlinkSleepBlocker.scala From flink-parameter-server with Apache License 2.0
package hu.sztaki.ilab.ps.utils

import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._

object FlinkSleepBlocker {

  def block[T: TypeInformation](stream: DataStream[T], milliseconds: Long): DataStream[T] = {
    stream.forward.map(new RichMapFunction[T, T] {

      @transient lazy val sleeper: Unit = {
        Thread.sleep(milliseconds)
        ()
      }

      override def map(value: T): T = {
        sleeper
        value
      }
    }).setParallelism(stream.parallelism)
  }
}
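For context, a hypothetical call site for the blocker above (the events stream and the 5000 ms delay are made-up values, and an implicit TypeInformation[String] is assumed to be in scope via the Scala API imports) might look like:

  // Each parallel mapper sleeps once for five seconds before forwarding its first element.
  val delayed: DataStream[String] = FlinkSleepBlocker.block(events, 5000L)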
Example 3
Source File: DataSetMatcher.scala From piglet with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.engines._
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.backends.{SchemaClass => Event}
import dbis.piglet.cep.ops.MatchCollector
import org.apache.flink.api.common.typeinfo.TypeInformation
import dbis.piglet.cep.ops.SelectionStrategy
//import org.apache.flink.api.java.operators.CustomUnaryOperation
//import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.ListBuffer
//import org.apache.flink.api.java.DataSet
//import org.apache.flink.api.java.ExecutionEnvironment
import scala.collection.JavaConversions._
import org.apache.flink.api.scala._
import dbis.piglet.cep.ops.EngineConf

class DataSetMatcher[T <: Event: ClassTag: TypeInformation](
    input: DataSet[T],
    nfa: NFAController[T],
    flinkEnv: ExecutionEnvironment,
    sstr: SelectionStrategy = SelectionStrategy.FirstMatch,
    out: OutputStrategy = Combined) extends EngineConf[T](nfa, sstr) with java.io.Serializable {

  def compute(): DataSet[T] = {
    input.collect().foreach(event => engine.runEngine(event))
    flinkEnv.fromCollection(collector.convertEventsToArray().toSeq)
  }
}
Example 4
Source File: CustomDataStreamMatcher.scala From piglet with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.backends.{SchemaClass => Event}
import org.apache.flink.api.common.typeinfo.TypeInformation
//import org.apache.flink.api.java.ExecutionEnvironment
//import org.apache.flink.api.java.DataSet
import scala.collection.JavaConversions._
import org.apache.flink.streaming.api.scala._

class CustomDataStreamMatcher[T <: Event: ClassTag: TypeInformation](@transient val dataStream: DataStream[T]) {

  def matchNFA(nfa: NFAController[T], flinkEnv: StreamExecutionEnvironment, sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined) = {
    // println("create a new DataStream matcher")
    new DataStreamMatcher(dataStream, nfa, flinkEnv, sstr, out).compute()
  }
}

object CustomDataStreamMatcher {

  implicit def addDataSetMatcher[T <: Event: ClassTag: TypeInformation](@transient dataStream: DataStream[T]) = {
    // println("add a custom DataStream function")
    new CustomDataStreamMatcher(dataStream)
  }
}
Example 5
Source File: CustomDataSetMatcher.scala From piglet with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.backends.{SchemaClass => Event}
import org.apache.flink.api.common.typeinfo.TypeInformation
//import org.apache.flink.api.java.ExecutionEnvironment
//import org.apache.flink.api.java.DataSet
import scala.collection.JavaConversions._
import org.apache.flink.api.scala._

class CustomDataSetMatcher[T <: Event: ClassTag: TypeInformation](dataSet: DataSet[T]) {

  def matchNFA(nfa: NFAController[T], sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined) = {
    // println("create a new DataSet matcher")
    val flinkEnv = dataSet.getExecutionEnvironment
    new DataSetMatcher(dataSet, nfa, flinkEnv, sstr, out).compute()
  }
}

object CustomDataSetMatcher {

  implicit def addDataSetMatcher[T <: Event: ClassTag: TypeInformation](dataSet: DataSet[T]) = {
    // println("add a custom DataSet function")
    new CustomDataSetMatcher(dataSet)
  }
}
Example 6
Source File: DataStreamMatcher.scala From piglet with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.engines._
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.backends.{SchemaClass => Event}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.windowing.windows.GlobalWindow
import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows
import dbis.piglet.cep.ops.MatchCollector
import dbis.piglet.cep.ops.SelectionStrategy
//import org.apache.flink.api.java.operators.CustomUnaryOperation
import scala.collection.mutable.ListBuffer
//import org.apache.flink.api.java.DataSet
//import org.apache.flink.api.java.ExecutionEnvironment
import scala.collection.JavaConversions._
import org.apache.flink.streaming.api.scala._
import dbis.piglet.cep.ops.EngineConf
import org.apache.flink.util.Collector

class DataStreamMatcher[T <: Event: ClassTag: TypeInformation](
    @transient val input: DataStream[T],
    nfa: NFAController[T],
    flinkEnv: StreamExecutionEnvironment,
    sstr: SelectionStrategy = SelectionStrategy.FirstMatch,
    out: OutputStrategy = Combined) extends EngineConf[T](nfa, sstr) with java.io.Serializable {

  object DataStreamProcess {
    def customRun(gw: GlobalWindow, ts: Iterable[T], out: Collector[T]) = {
      ts.foreach { event => engine.runEngine(event) }
      val result = collector.convertEventsToArray()
      result.foreach { res => out.collect(res) }
    }
  }

  def compute(): DataStream[T] = {
    input.windowAll(GlobalWindows.create()).apply(DataStreamProcess.customRun _)
  }
}
Example 7
Source File: UTF8StringSchema.scala From piglet with Apache License 2.0
package dbis.piglet.backends.flink.streaming

import org.apache.commons.lang3.SerializationUtils
import org.apache.flink.streaming.util.serialization._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor

class UTF8StringSchema extends DeserializationSchema[String] with SerializationSchema[String] {

  override def deserialize(message: Array[Byte]): String = {
    new String(message, "UTF-8")
  }

  override def isEndOfStream(nextElement: String): Boolean = {
    false
  }

  override def serialize(element: String): Array[Byte] = {
    element.getBytes("UTF-8")
  }

  override def getProducedType(): TypeInformation[String] = {
    TypeExtractor.getForClass(classOf[String])
  }
}
Example 8
Source File: StreamFuncs.scala From piglet with Apache License 2.0
package dbis.piglet.backends.flink.streaming

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._

import scala.reflect.ClassTag
import dbis.piglet.backends._

class PigStream[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {

  def loadStream(env: StreamExecutionEnvironment, path: String, extract: (Array[String]) => T, delim: String = "\t"): DataStream[T] = {
    env.readTextFile(path).setParallelism(1).map(line => extract(line.split(delim, -1)))
  }

  def writeStream(path: String, result: DataStream[T], delim: String = ",") =
    result.map(_.mkString(delim)).writeAsText(path).setParallelism(1)

  def connect(env: StreamExecutionEnvironment, host: String, port: Int, extract: (Array[String]) => T, delim: String = "\t"): DataStream[T] = {
    env.socketTextStream(host, port).map(line => extract(line.split(delim, -1)))
  }

  def bind(host: String, port: Int, result: DataStream[T], delim: String = ",") = {
    result.map(_.mkString(delim) + "\n").writeToSocket(host, port, new UTF8StringSchema())
  }

  def zmqSubscribe(env: StreamExecutionEnvironment, addr: String, extract: (Array[String]) => T, delim: String = "\t"): DataStream[T] = {
    env.addSource(new ZmqSubscriber(addr)).map(line => extract(line.split(delim, -1)))
  }

  def zmqPublish(addr: String, result: DataStream[T], delim: String = ",") = {
    result.map(_.mkString(delim)).addSink(new ZmqPublisher(addr)).setParallelism(1)
  }
}

class TextLoader[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {

  def loadStream(env: StreamExecutionEnvironment, path: String, extract: (Array[String]) => T): DataStream[T] =
    env.readTextFile(path).map(line => extract(Array(line)))
}

object TextLoader extends java.io.Serializable {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): TextLoader[T] = {
    new TextLoader[T]
  }
}

object PigStream {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): PigStream[T] = {
    new PigStream
  }
}

class RDFStream[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {

  val pattern = "([^\"]\\S*|\".+?\")\\s*".r

  def rdfize(line: String): Array[String] = {
    val fields = pattern.findAllIn(line).map(_.trim)
    fields.toArray.slice(0, 3)
  }

  def loadStream(env: StreamExecutionEnvironment, path: String, extract: (Array[String]) => T): DataStream[T] = {
    env.readTextFile(path).map(line => extract(rdfize(line)))
  }

  def connect(env: StreamExecutionEnvironment, host: String, port: Int, extract: (Array[String]) => T): DataStream[T] = {
    env.socketTextStream(host, port).map(line => extract(rdfize(line)))
  }

  def zmqSubscribe(env: StreamExecutionEnvironment, addr: String, extract: (Array[String]) => T): DataStream[T] = {
    env.addSource(new ZmqSubscriber(addr)).map(line => extract(rdfize(line)))
  }
}

object RDFStream {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): RDFStream[T] = {
    new RDFStream
  }
}
Example 9
Source File: Storage.scala From piglet with Apache License 2.0
package dbis.piglet.backends.flink

import dbis.piglet.backends._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala._
import org.apache.flink.core.fs.FileSystem.WriteMode._

import scala.reflect.ClassTag

//-----------------------------------------------------------------------------------------------------

class PigStorage[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {

  def load(env: ExecutionEnvironment, path: String, extract: (Array[String]) => T, delim: String = "\t",
           skipFirstRow: Boolean = false, skipEmpty: Boolean = false, comments: String = ""): DataSet[T] = {

    val raw = env.readTextFile(path)
    val nonEmpty = if (skipEmpty) raw.filter { line => line.nonEmpty } else raw
    val nonComment = if (comments.nonEmpty) nonEmpty.filter { line => !line.startsWith(comments) } else nonEmpty

    val content = if (skipFirstRow) {
      val header = nonComment.first(1).collect().head
      nonComment.filter { line => line != header }
    } else
      nonComment

    content.map(line => line.split(delim, -1)).map(extract)
  }

  def write(path: String, result: DataSet[T], delim: String = ",") =
    result.map(_.mkString(delim)).writeAsText(path).setParallelism(1)
}

object PigStorage {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): PigStorage[T] = {
    new PigStorage[T]
  }
}

class RDFFileStorage[T: ClassTag: TypeInformation] extends java.io.Serializable {

  val pattern = "([^\"]\\S*|\".+?\")\\s*".r

  def rdfize(line: String): Array[String] = {
    val fields = pattern.findAllIn(line).map(_.trim)
    fields.toArray.slice(0, 3)
  }

  def load(env: ExecutionEnvironment, path: String, extract: (Array[String]) => T): DataSet[T] =
    env.readTextFile(path).map(line => extract(rdfize(line)))
}

object RDFFileStorage {
  def apply[T: ClassTag: TypeInformation](): RDFFileStorage[T] = {
    new RDFFileStorage[T]
  }
}
Example 10
Source File: PigFuncs.scala From piglet with Apache License 2.0
package dbis.piglet.backends.flink

import java.util.Random

import dbis.piglet.CommonPigFuncs
import dbis.piglet.backends._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.functions._
import org.apache.flink.api.scala._

import scala.reflect.ClassTag

class CustomSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) {
  def sample(withReplacement: Boolean, fraction: Double, seed: Long = new Random().nextLong()) = {
    dataSet.mapPartition(new SampleWithFraction[T](withReplacement, fraction, seed))
  }
}

object Sampler {
  implicit def addSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) = {
    new CustomSampler(dataSet)
  }
}

object PigFuncs extends CommonPigFuncs {
}
Example 11
Source File: package.scala From featran with Apache License 2.0
package com.spotify.featran

import com.esotericsoftware.kryo.serializers.JavaSerializer
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala.DataSet

import scala.reflect.ClassTag

package object flink {

  implicit object FlinkCollectionType extends CollectionType[DataSet] {

    // force fallback to default serializer
    private val Ti = TypeInformation.of(classOf[Any])

    override def map[A, B: ClassTag](ma: DataSet[A])(f: A => B): DataSet[B] = {
      implicit val tib = Ti.asInstanceOf[TypeInformation[B]]
      ma.map(f)
    }

    override def reduce[A](ma: DataSet[A])(f: (A, A) => A): DataSet[A] =
      ma.reduce(f)

    override def cross[A, B: ClassTag](ma: DataSet[A])(mb: DataSet[B]): DataSet[(A, B)] =
      ma.crossWithTiny(mb)

    override def pure[A, B: ClassTag](ma: DataSet[A])(b: B): DataSet[B] = {
      implicit val tib = Ti.asInstanceOf[TypeInformation[B]]
      val env = ma.getExecutionEnvironment
      // Kryo throws NPE on `Feature`, use Java serialization instead
      env.addDefaultKryoSerializer(classOf[FeatureSet[Any]], classOf[JavaSerializer])
      env.fromElements(b)
    }
  }
}
Example 12
Source File: FlinkKafkaCodecSerde.scala From cloudflow with Apache License 2.0
package cloudflow.flink

import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.connectors.kafka._

import cloudflow.streamlets.{ CodecInlet, CodecOutlet }

private[flink] class FlinkKafkaCodecSerializationSchema[T: TypeInformation](outlet: CodecOutlet[T], topic: String)
    extends KafkaSerializationSchema[T] {
  override def serialize(value: T, timestamp: java.lang.Long): ProducerRecord[Array[Byte], Array[Byte]] =
    new ProducerRecord(topic, outlet.codec.encode(value))
}

private[flink] class FlinkKafkaCodecDeserializationSchema[T: TypeInformation](inlet: CodecInlet[T])
    extends KafkaDeserializationSchema[T] {
  override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]]): T = inlet.codec.decode(record.value)
  override def isEndOfStream(value: T): Boolean = false
  override def getProducedType: TypeInformation[T] = implicitly[TypeInformation[T]]
}
Example 13
Source File: FlinkStreamletContextImpl.scala From cloudflow with Apache License 2.0
package cloudflow.flink

import scala.collection.JavaConverters._

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.DataStreamSink
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka._

import com.typesafe.config._

import cloudflow.streamlets._
import java.{ util ⇒ ju }

  override def writeStream[Out: TypeInformation](outlet: CodecOutlet[Out], stream: DataStream[Out]): DataStreamSink[Out] = {
    val topic            = findTopicForPort(outlet)
    val destTopic        = topic.name
    val bootstrapServers = topic.bootstrapServers.getOrElse(internalKafkaBootstrapServers)
    val propsMap = Map("bootstrap.servers" -> bootstrapServers, "batch.size" -> "0") ++ topic.kafkaProducerProperties

    val properties = new ju.Properties()
    properties.putAll(propsMap.asJava)

    stream.addSink(
      new FlinkKafkaProducer[Out](
        destTopic,
        new FlinkKafkaCodecSerializationSchema[Out](outlet, destTopic),
        properties,
        FlinkKafkaProducer.Semantic.AT_LEAST_ONCE
      )
    )
  }
}
Example 14
Source File: TestFlinkStreamletContext.scala From cloudflow with Apache License 2.0
package cloudflow.flink
package testkit

import org.apache.flink.streaming.api.scala._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.functions.sink.SinkFunction
import org.apache.flink.streaming.api.datastream.DataStreamSink

import com.typesafe.config._

import cloudflow.streamlets._

  override def writeStream[Out: TypeInformation](outlet: CodecOutlet[Out], stream: DataStream[Out]): DataStreamSink[Out] =
    outletTaps
      .find(_.portName == outlet.name)
      .map { _ ⇒
        stream.addSink(new SinkFunction[Out]() {
          override def invoke(out: Out) =
            TestFlinkStreamletContext.result.add(out.toString())
        })
      }
      .getOrElse(throw TestContextException(outlet.name, s"Bad test context, could not find destination for outlet ${outlet.name}"))
}

object TestFlinkStreamletContext {
  val result = new java.util.concurrent.ConcurrentLinkedQueue[String]()
}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg)
Example 15
Source File: EventDeSerializer.scala From flink-demos with Apache License 2.0
package com.dataartisans.flink.example.eventpattern.kafka

import java.nio.{ByteBuffer, ByteOrder}

import com.dataartisans.flink.example.eventpattern.Event
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.util.serialization.{DeserializationSchema, SerializationSchema}

class EventDeSerializer extends DeserializationSchema[Event] with SerializationSchema[Event] {

  override def deserialize(bytes: Array[Byte]): Event = {
    val buffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
    val address: Int = buffer.getInt(0)
    val eventType: Int = buffer.getInt(4)
    Event(address, eventType)
  }

  override def serialize(t: Event): Array[Byte] = {
    val byteBuffer = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN)
    byteBuffer.putInt(0, t.sourceAddress)
    byteBuffer.putInt(4, t.event)
    byteBuffer.array()
  }

  override def isEndOfStream(t: Event): Boolean = false

  override def getProducedType: TypeInformation[Event] = {
    createTypeInformation[Event]
  }
}
Example 16
Source File: DataRowRecordMarshaller.scala From flink-elasticsearch-source-connector with Apache License 2.0
package com.mnubo.flink.streaming.connectors

import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor

@SerialVersionUID(1L)
class DataRowRecordMarshaller extends RecordMarshaller[DataRow] {
  private var typeInfo: DataRowTypeInfo = null

  @transient
  private lazy val serializer = typeInfo
    .createSerializer(new ExecutionConfig)
    .asInstanceOf[DataRowSerializer]

  override def typeInformation = typeInfo

  override def configureFields(types: Seq[MarshallerFieldDescriptor]) = {
    val (fieldNames, typeInfos): (Seq[String], Seq[TypeInformation[_]]) = types
      .map { tp => tp.fieldName -> TypeExtractor.createTypeInfo(tp.fieldClass) }
      .unzip

    typeInfo = new DataRowTypeInfo(fieldNames, typeInfos)
  }

  override def createOrReuseInstance(fields: Seq[AnyRef], reuse: DataRow): DataRow =
    serializer.createOrReuseInstance(fields.toArray[AnyRef], reuse)
}
Example 17
Source File: FlinkTestKits.scala From flink-jpmml with GNU Affero General Public License v3.0
package io.radicalbit.flink.pmml.scala.utils

import io.radicalbit.flink.pmml.scala.sources.TemporizedSourceFunction
import io.radicalbit.flink.streaming.spec.core.FlinkTestKitCompanion
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.functions.sink.SinkFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.test.util.AbstractTestBase

import scala.collection.mutable
import scala.reflect.ClassTag

trait FlinkSourcedPipelineTestKit[IN1, IN2, OUT] extends AbstractTestBase {

  def executePipeline[IN1: TypeInformation: ClassTag, IN2: TypeInformation: ClassTag](in1: Seq[(Long, IN1)], in2: Seq[(Long, IN2)])(
      pipeline: (DataStream[IN1], DataStream[IN2]) => DataStream[OUT])(implicit companion: FlinkTestKitCompanion[OUT]) = {

    companion.testResults = mutable.MutableList[OUT]()

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val events = in1
      .union(in2)
      .sortBy(_._1)
      .collect {
        case (_, left: IN1) => (Some(left), None)
        case (_, right: IN2) => (None, Some(right))
      }

    val stream = env.addSource(new TemporizedSourceFunction[IN1, IN2](events))

    val stream1: DataStream[IN1] = stream.filter(either => either.isLeft).map(either => either.left.get)
    val stream2: DataStream[IN2] = stream.filter(either => either.isRight).map(either => either.right.get)

    pipeline(stream1, stream2)
      .addSink(new SinkFunction[OUT] {
        override def invoke(in: OUT) = {
          companion.testResults += in
        }
      })

    env.execute(this.getClass.getSimpleName)

    companion.testResults
  }
}
Example 18
Source File: ElasticsearchDataset.scala From flink-elasticsearch-source-connector with Apache License 2.0
package com.mnubo.flink.streaming.connectors.elasticsearch

import com.mnubo.flink.streaming.connectors._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.operators.DataSource
import org.apache.flink.api.java.typeutils.{PojoTypeInfo, TupleTypeInfoBase}
import org.apache.flink.api.scala._

import scala.reflect.ClassTag

object ElasticsearchDataset {

  def fromElasticsearchQuery[T: ClassTag: TypeInformation](env: ExecutionEnvironment,
                                                           index: String,
                                                           query: String,
                                                           nodes: Set[String] = Set("localhost"),
                                                           port: Int = 9200,
                                                           pojoFields: Array[String] = null): DataSet[T] = {
    val clazz = implicitly[ClassTag[T]].runtimeClass

    val marshaller =
      if (clazz == classOf[DataRow])
        new DataRowRecordMarshaller().asInstanceOf[RecordMarshaller[T]]
      else implicitly[TypeInformation[T]] match {
        case info: TupleTypeInfoBase[T] =>
          new TupleRecordMarshaller[T](info)
        case info: PojoTypeInfo[T] =>
          require(pojoFields != null, "POJO fields must be specified (not null) if output type is a POJO.")
          new PojoRecordMarshaller[T](info, pojoFields)
        case other =>
          throw new IllegalArgumentException(s"The type ${clazz.getName} has to be a tuple, a DataRow or pojo type.")
      }

    val inputFormat = new ElasticseachInputFormat[T](
      nodes,
      port,
      index,
      query,
      marshaller
    )

    // Not the most elegant, but can't wait for the input format to be configured to get the actual schema. Have to get it now.
    val schema = inputFormat.fetchSchema()

    marshaller.configureFields(schema)

    new DataSet[T](new DataSource[T](env.getJavaEnv, inputFormat, marshaller.typeInformation, getCallLocationName()))
  }
}
Example 19
Source File: RecordTransformer.scala From flink-elasticsearch-source-connector with Apache License 2.0
package com.mnubo.flink.streaming.connectors

import org.apache.flink.api.common.operators.Keys.ExpressionKeys._
import org.apache.flink.api.common.typeinfo.TypeInformation

import scala.annotation.tailrec
import scala.language.existentials
import scala.reflect.ClassTag

sealed trait FieldSpecification extends Serializable

case class ExistingField(name: String) extends FieldSpecification

case class NewField(name: String, typeInfo: TypeInformation[_]) extends FieldSpecification

trait RecordTransformer extends Serializable {
  val classTag = ClassTag[DataRow](classOf[DataRow])

  def typeInfo: DataRowTypeInfo

  def transform(dataRow: DataRow, values: Any*): DataRow
}

class FieldMapperRecordTransformer private[connectors] (srcTypeInfo: DataRowTypeInfo, fieldSpecifications: FieldSpecification*) extends RecordTransformer {
  require(srcTypeInfo != null, s"srcTypeInfo must not be null")
  require(fieldSpecifications != null, s"fieldSpecifications must not be null")
  require(fieldSpecifications.nonEmpty, s"fieldSpecifications must not be empty")
  require(!fieldSpecifications.contains(null), s"fieldSpecifications must not contain any nulls")

  override val typeInfo = {
    val (fieldNames, elementTypes) = fieldSpecifications.flatMap {
      case ExistingField(name) if name == SELECT_ALL_CHAR || name == SELECT_ALL_CHAR_SCALA =>
        srcTypeInfo.getFieldNames.zip(srcTypeInfo.getElementTypes)
      case ExistingField(name) =>
        Seq(name -> srcTypeInfo.getFieldType(name))
      case NewField(name, newFieldTypeInfo) =>
        Seq(name -> newFieldTypeInfo)
    }.unzip

    require(fieldNames.length == fieldNames.distinct.length, s"Fields can't have duplicates. Fields were $fieldNames.")

    new DataRowTypeInfo(fieldNames, elementTypes)
  }

  private def newFieldsNames = fieldSpecifications.collect { case newValue: NewField => newValue.name }

  override def transform(dataRow: DataRow, values: Any*): DataRow = {
    require(dataRow != null, s"dataRow must not be null")
    require(values != null, s"values must not be null")
    require(newFieldsNames.length == values.length, s"Must specify values for all new fields and only new fields. New fields are '$newFieldsNames'")

    val resultValues = new Array[Any](typeInfo.getArity)

    @tailrec
    def transform(index: Int, remainingSpecs: Seq[FieldSpecification], remainingValues: Seq[Any]): DataRow = {
      if (remainingSpecs.isEmpty) {
        new DataRow(resultValues, typeInfo)
      } else {
        val currentSpec = remainingSpecs.head

        currentSpec match {
          case ExistingField(name) if name == SELECT_ALL_CHAR || name == SELECT_ALL_CHAR_SCALA =>
            Array.copy(dataRow.data, 0, resultValues, index, dataRow.data.length)
            transform(index + dataRow.data.length, remainingSpecs.tail, remainingValues)
          case ExistingField(name) =>
            resultValues(index) = dataRow(name)
            transform(index + 1, remainingSpecs.tail, remainingValues)
          case NewField(name, _) =>
            resultValues(index) = remainingValues.head
            transform(index + 1, remainingSpecs.tail, remainingValues.tail)
        }
      }
    }

    transform(0, fieldSpecifications, values)
  }
}

object RecordTransformer {
  def mapFields(srcTypeInfo: DataRowTypeInfo, fieldSpecifications: FieldSpecification*): RecordTransformer = {
    new FieldMapperRecordTransformer(srcTypeInfo, fieldSpecifications: _*)
  }
}
Example 20
Source File: package.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import com.amazon.milan.compiler.flink.runtime.{UnwrapRecordsMapFunction, WrapRecordsMapFunction}
import com.amazon.milan.compiler.flink.testing.IntKeyValueRecord
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.common.typeutils.TypeSerializer
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.core.memory.{DataInputView, DataInputViewStreamWrapper, DataOutputView, DataOutputViewStreamWrapper}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.datastream.DataStream
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment

import scala.language.implicitConversions
import scala.util.Random

package object testutil {

  def getTestExecutionEnvironment: StreamExecutionEnvironment = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setBufferTimeout(0)
    env
  }

  def copyWithSerializer[T](value: T, serializer: TypeSerializer[T]): T = {
    val outputStream = new ByteArrayOutputStream()
    val outputView = new DataOutputViewStreamWrapper(outputStream)
    serializer.serialize(value, outputView)

    val bytes = outputStream.toByteArray
    val inputStream = new ByteArrayInputStream(bytes)
    val inputView = new DataInputViewStreamWrapper(inputStream)
    serializer.deserialize(inputView)
  }

  def copyData[T](writeValue: DataOutputView => Unit, readValue: DataInputView => T): T = {
    val outputStream = new ByteArrayOutputStream()
    val outputView = new DataOutputViewStreamWrapper(outputStream)
    writeValue(outputView)

    val bytes = outputStream.toByteArray
    val inputStream = new ByteArrayInputStream(bytes)
    val inputView = new DataInputViewStreamWrapper(inputStream)
    readValue(inputView)
  }

  def generateIntKeyValueRecords(recordCount: Int, keyCount: Int, maxValue: Int): List[IntKeyValueRecord] = {
    val rand = new Random(0)
    List.tabulate(recordCount)(_ => IntKeyValueRecord(rand.nextInt(keyCount), rand.nextInt(maxValue + 1)))
  }

  implicit class WrappedDataStreamExtensions[T >: Null, TKey >: Null <: Product](dataStream: DataStream[RecordWrapper[T, TKey]]) {
    def unwrap(recordTypeInformation: TypeInformation[T]): DataStream[T] = {
      val mapper = new UnwrapRecordsMapFunction[T, TKey](recordTypeInformation)
      this.dataStream.map(mapper)
    }

    def unwrap(): DataStream[T] = {
      val recordType = this.dataStream.getType.asInstanceOf[RecordWrapperTypeInformation[T, TKey]].valueTypeInformation
      this.unwrap(recordType)
    }
  }

  implicit class DataStreamExtensions[T >: Null](dataStream: DataStream[T]) {
    def wrap(recordTypeInformation: TypeInformation[T]): DataStream[RecordWrapper[T, Product]] = {
      val mapper = new WrapRecordsMapFunction[T](recordTypeInformation)
      this.dataStream.map(mapper)
    }

    def wrap(): DataStream[RecordWrapper[T, Product]] = {
      val recordType = this.dataStream.asInstanceOf[ResultTypeQueryable[T]].getProducedType
      this.wrap(recordType)
    }
  }
}
Example 21
Source File: MapFunctions.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.internal.LineageRecordFactory
import com.amazon.milan.compiler.flink.metrics.MetricFactory
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import com.amazon.milan.types.LineageRecord
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.util.OutputTag

object MapFunctions {
  val ProcessedRecordsCounterMetricName = "processed_record_count"
}

import com.amazon.milan.compiler.flink.runtime.MapFunctions._

abstract class MapFunctionWithLineage[TIn >: Null, TKey >: Null <: Product, TOut >: Null](
    outputTypeInformation: TypeInformation[TOut],
    keyTypeInformation: TypeInformation[TKey],
    lineageRecordFactory: LineageRecordFactory,
    lineageOutputTag: OutputTag[LineageRecord],
    metricFactory: MetricFactory)
  extends RichMapFunction[RecordWrapper[TIn, TKey], RecordWrapper[TOut, TKey]]
    with ResultTypeQueryable[RecordWrapper[TOut, TKey]] {

  @transient private lazy val processedRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, ProcessedRecordsCounterMetricName)

  protected def mapValue(in: TIn): TOut

  override def map(record: RecordWrapper[TIn, TKey]): RecordWrapper[TOut, TKey] = {
    this.processedRecordsCounter.increment()
    val mappedValue = this.mapValue(record.value)
    RecordWrapper.wrap(mappedValue, record.key, record.sequenceNumber)
  }

  override def getProducedType: TypeInformation[RecordWrapper[TOut, TKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInformation, this.keyTypeInformation)
}

abstract class KeyedMapFunctionWithLineage[TIn >: Null, TInKey >: Null <: Product, TKey, TOut >: Null](
    outputTypeInfo: TypeInformation[TOut],
    keyTypeInfo: TypeInformation[TInKey],
    lineageRecordFactory: LineageRecordFactory,
    lineageOutputTag: OutputTag[LineageRecord],
    metricFactory: MetricFactory)
  extends RichMapFunction[RecordWrapper[TIn, TInKey], RecordWrapper[TOut, TInKey]]
    with ResultTypeQueryable[RecordWrapper[TOut, TInKey]] {

  @transient private lazy val processedRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, ProcessedRecordsCounterMetricName)

  protected def getKey(recordKey: TInKey): TKey

  protected def mapValue(key: TKey, value: TIn): TOut

  override def map(record: RecordWrapper[TIn, TInKey]): RecordWrapper[TOut, TInKey] = {
    this.processedRecordsCounter.increment()
    val key = this.getKey(record.key)
    val mappedValue = this.mapValue(key, record.value)
    RecordWrapper.wrap(mappedValue, record.key, record.sequenceNumber)
  }

  override def getProducedType: TypeInformation[RecordWrapper[TOut, TInKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInfo, this.keyTypeInfo)
}
Example 22
Source File: ArgCompareProcessFunctions.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.TypeUtil
import org.apache.flink.api.common.typeinfo.TypeInformation

abstract class ArgCompareKeyedProcessFunction[T >: Null, TKey >: Null <: Product, TArg](
    recordTypeInformation: TypeInformation[T],
    keyTypeInformation: TypeInformation[TKey],
    argTypeInformation: TypeInformation[TArg])
  extends ScanKeyedProcessFunction[T, TKey, Option[TArg], T](None, keyTypeInformation, TypeUtil.createOptionTypeInfo(argTypeInformation), recordTypeInformation) {

  protected def getArg(value: T): TArg

  protected def greaterThan(arg1: TArg, arg2: TArg): Boolean

  override protected def process(state: Option[TArg], key: TKey, value: T): (Option[TArg], Option[T]) = {
    val valueArg = this.getArg(value)

    state match {
      case None =>
        (Some(valueArg), Some(value))
      case Some(stateArg) =>
        if (this.greaterThan(valueArg, stateArg)) {
          (Some(valueArg), Some(value))
        } else {
          (state, None)
        }
    }
  }
}

abstract class ArgCompareProcessFunction[T >: Null, TKey >: Null <: Product, TArg](
    recordTypeInformation: TypeInformation[T],
    keyTypeInformation: TypeInformation[TKey],
    argTypeInformation: TypeInformation[TArg])
  extends ScanProcessFunction[T, TKey, Option[TArg], T](None, keyTypeInformation, TypeUtil.createOptionTypeInfo(argTypeInformation), recordTypeInformation) {

  protected def getArg(value: T): TArg

  protected def greaterThan(arg1: TArg, arg2: TArg): Boolean

  override protected def process(state: Option[TArg], value: T): (Option[TArg], Option[T]) = {
    val valueArg = this.getArg(value)

    state match {
      case None =>
        (Some(valueArg), Some(value))
      case Some(stateArg) =>
        if (this.greaterThan(valueArg, stateArg)) {
          (Some(valueArg), Some(value))
        } else {
          (state, None)
        }
    }
  }
}
Example 23
Source File: KinesisDataSource.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import java.util.Properties

import com.amazon.milan.dataformats.DataInputFormat
import com.amazon.milan.serialization.MilanObjectMapper
import com.typesafe.scalalogging.Logger
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.DataStreamSource
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kinesis.FlinkKinesisConsumer
import org.apache.flink.streaming.connectors.kinesis.config.{AWSConfigConstants, ConsumerConfigConstants}
import org.apache.flink.streaming.connectors.kinesis.serialization.KinesisDeserializationSchema
import org.slf4j.LoggerFactory

object KinesisDataSource {
  private lazy val logger = Logger(LoggerFactory.getLogger(getClass))

  def addDataSource[T](env: StreamExecutionEnvironment,
                       streamName: String,
                       region: String,
                       dataFormat: DataInputFormat[T],
                       recordTypeInformation: TypeInformation[T]): DataStreamSource[T] = {
    this.logger.info(s"Creating Kinesis consumer for stream '$streamName', region '$region'.")

    val config = this.getConsumerProperties(region)
    val schema = new JsonDeserializationSchema[T](recordTypeInformation)
    val source = new FlinkKinesisConsumer[T](streamName, schema, config)
    env.addSource(source)
  }

  private def getConsumerProperties(region: String): Properties = {
    val config = new Properties()
    config.setProperty(AWSConfigConstants.AWS_REGION, region)
    config.setProperty(AWSConfigConstants.AWS_CREDENTIALS_PROVIDER, AWSConfigConstants.CredentialProvider.AUTO.toString)
    config.setProperty(ConsumerConfigConstants.STREAM_INITIAL_POSITION, ConsumerConfigConstants.InitialPosition.LATEST.toString)
    config
  }
}

class JsonDeserializationSchema[T](recordTypeInformation: TypeInformation[T]) extends KinesisDeserializationSchema[T] {
  override def deserialize(bytes: Array[Byte],
                           partitionKey: String,
                           seqNum: String,
                           approxArrivalTimestamp: Long,
                           stream: String,
                           shardId: String): T = {
    MilanObjectMapper.readValue[T](bytes, this.recordTypeInformation.getTypeClass)
  }

  override def getProducedType: TypeInformation[T] = this.recordTypeInformation
}
Example 24
Source File: RuntimeUtil.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import java.util

import com.amazon.milan.compiler.flink.generator.FlinkGeneratorException
import com.amazon.milan.serialization.MilanObjectMapper
import com.fasterxml.jackson.databind.`type`.TypeFactory
import org.apache.flink.api.common.typeinfo.TypeInformation

import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}

object RuntimeUtil {
  val typeName: String = getClass.getTypeName.stripSuffix("$")

  def loadJsonList[TElement: ClassTag](listJson: String): List[TElement] = {
    this.loadJsonArrayList[TElement](listJson).asScala.toList
  }

  def loadJsonArrayList[TElement: ClassTag](listJson: String): util.ArrayList[TElement] = {
    val typeFactory = TypeFactory.defaultInstance()
    val itemClass = classTag[TElement].runtimeClass.asInstanceOf[Class[TElement]]
    val javaType = typeFactory.constructCollectionType(classOf[util.ArrayList[TElement]], itemClass)
    MilanObjectMapper.readValue[util.ArrayList[TElement]](listJson, javaType)
  }

  def preventGenericTypeInformation[T](typeInfo: TypeInformation[T]): TypeInformation[T] = {
    if (typeInfo.getClass.getName.contains("__wrapper")) {
      throw new FlinkGeneratorException(s"Creating TypeInformation for '${typeInfo.getTypeClass.getName}' produced a GenericTypeInformation.")
    }

    typeInfo
  }
}
Example 25
Source File: LeftJoinKeyedCoProcessFunction.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.internal.JoinLineageRecordFactory
import com.amazon.milan.compiler.flink.metrics.MetricFactory
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import com.amazon.milan.types.LineageRecord
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.co.KeyedCoProcessFunction
import org.apache.flink.util.{Collector, OutputTag}

object LeftJoinCoProcessFunction {
  val LeftInputRecordsCounterMetricName = "left_input_record_count"
  val RightInputRecordsCounterMetricName = "right_input_record_count"
  val OutputRecordsCounterMetricName = "output_record_count"
}

import com.amazon.milan.compiler.flink.runtime.LeftJoinCoProcessFunction._

abstract class LeftJoinKeyedCoProcessFunction[TLeft >: Null, TRight >: Null, TKey >: Null <: Product, TOut >: Null](
    rightTypeInformation: TypeInformation[TRight],
    keyTypeInformation: TypeInformation[TKey],
    outputTypeInformation: TypeInformation[TOut],
    leftRecordIdExtractor: RecordIdExtractor[TLeft],
    rightRecordIdExtractor: RecordIdExtractor[TRight],
    outputRecordIdExtractor: RecordIdExtractor[TOut],
    lineageRecordFactory: JoinLineageRecordFactory,
    lineageOutputTag: OutputTag[LineageRecord],
    metricFactory: MetricFactory)
  extends KeyedCoProcessFunction[TKey, RecordWrapper[TLeft, TKey], RecordWrapper[TRight, TKey], RecordWrapper[TOut, TKey]]
    with ResultTypeQueryable[RecordWrapper[TOut, TKey]] {

  @transient private lazy val canProduceLineage =
    leftRecordIdExtractor.canExtractRecordId &&
      rightRecordIdExtractor.canExtractRecordId &&
      outputRecordIdExtractor.canExtractRecordId
  @transient private lazy val leftInputRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, LeftInputRecordsCounterMetricName)
  @transient private lazy val rightInputRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, RightInputRecordsCounterMetricName)
  @transient private lazy val outputRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, OutputRecordsCounterMetricName)

  @transient private var lastRightValue: ValueState[TRight] = _

  protected def map(left: TLeft, right: TRight): TOut

  protected def postCondition(left: TLeft, right: TRight): Boolean

  override def processElement1(leftRecord: RecordWrapper[TLeft, TKey],
                               context: KeyedCoProcessFunction[TKey, RecordWrapper[TLeft, TKey], RecordWrapper[TRight, TKey], RecordWrapper[TOut, TKey]]#Context,
                               collector: Collector[RecordWrapper[TOut, TKey]]): Unit = {
    this.leftInputRecordsCounter.increment()

    val leftValue = leftRecord.value
    val rightValue = this.lastRightValue.value()

    if (this.postCondition(leftValue, rightValue)) {
      val output = this.map(leftValue, rightValue)

      if (output != null) {
        if (this.canProduceLineage) {
          val lineageRecord = this.createLineageRecord(this.outputRecordIdExtractor(output), leftValue, rightValue)
          context.output(this.lineageOutputTag, lineageRecord)
        }

        collector.collect(RecordWrapper.wrap[TOut, TKey](output, leftRecord.key, 0))
        this.outputRecordsCounter.increment()
      }
    }
  }

  override def processElement2(rightRecord: RecordWrapper[TRight, TKey],
                               context: KeyedCoProcessFunction[TKey, RecordWrapper[TLeft, TKey], RecordWrapper[TRight, TKey], RecordWrapper[TOut, TKey]]#Context,
                               collector: Collector[RecordWrapper[TOut, TKey]]): Unit = {
    this.rightInputRecordsCounter.increment()
    this.lastRightValue.update(rightRecord.value)
  }

  override def open(parameters: Configuration): Unit = {
    val rightValueDescriptor = new ValueStateDescriptor[TRight]("lastRightValue", this.rightTypeInformation)
    this.lastRightValue = this.getRuntimeContext.getState(rightValueDescriptor)
  }

  override def getProducedType: TypeInformation[RecordWrapper[TOut, TKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInformation, this.keyTypeInformation)

  private def createLineageRecord(outputRecordId: String, leftRecord: TLeft, rightRecord: TRight): LineageRecord = {
    val sourceRecords =
      Option(leftRecord).toSeq.map(r => this.lineageRecordFactory.createLeftRecordPointer(this.leftRecordIdExtractor(r))) ++
        Option(rightRecord).toSeq.map(r => this.lineageRecordFactory.createRightRecordPointer(this.rightRecordIdExtractor(r)))

    this.lineageRecordFactory.createLineageRecord(outputRecordId, sourceRecords)
  }
}
Example 26
Source File: TimeWindowFlatMapProcessWindowFunction.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import java.lang
import java.time.Instant

import com.amazon.milan.compiler.flink.TypeUtil
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

abstract class TimeWindowFlatMapProcessWindowFunction[T >: Null, TInKey >: Null <: Product, TOutKey >: Null <: Product](
    recordTypeInfo: TypeInformation[T],
    outKeyTypeInfo: TypeInformation[TOutKey])
  extends ProcessWindowFunction[RecordWrapper[Option[T], TInKey], RecordWrapper[Option[T], TOutKey], TInKey, TimeWindow]
    with ResultTypeQueryable[RecordWrapper[Option[T], TOutKey]] {

  @transient private var sequenceNumberHelper: SequenceNumberHelper = _

  protected def addWindowStartTimeToKey(key: TInKey, windowStart: Instant): TOutKey

  override def getProducedType: TypeInformation[RecordWrapper[Option[T], TOutKey]] =
    RecordWrapperTypeInformation.wrap(TypeUtil.createOptionTypeInfo(this.recordTypeInfo), this.outKeyTypeInfo)

  override def process(key: TInKey,
                       context: ProcessWindowFunction[RecordWrapper[Option[T], TInKey], RecordWrapper[Option[T], TOutKey], TInKey, TimeWindow]#Context,
                       items: lang.Iterable[RecordWrapper[Option[T], TInKey]],
                       collector: Collector[RecordWrapper[Option[T], TOutKey]]): Unit = {
    val windowStartTime = Instant.ofEpochMilli(context.window().getStart)

    val record = items.iterator().next()
    val outKey = this.addWindowStartTimeToKey(record.key, windowStartTime)
    val outRecord = RecordWrapper.wrap(record.value, outKey, sequenceNumberHelper.increment())
    collector.collect(outRecord)
  }

  override def open(parameters: Configuration): Unit = {
    this.sequenceNumberHelper = new SequenceNumberHelper(this.getRuntimeContext)
  }
}
Example 27
Source File: UnpackOptionProcessFunction.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import com.typesafe.scalalogging.Logger
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.util.Collector
import org.slf4j.LoggerFactory

class UnpackOptionProcessFunction[T >: Null, TKey >: Null <: Product](recordType: TypeInformation[T], keyType: TypeInformation[TKey])
  extends ProcessFunction[RecordWrapper[Option[T], TKey], RecordWrapper[T, TKey]]
    with ResultTypeQueryable[RecordWrapper[T, TKey]] {

  @transient private lazy val logger = Logger(LoggerFactory.getLogger(getClass))

  override def processElement(record: RecordWrapper[Option[T], TKey],
                              context: ProcessFunction[RecordWrapper[Option[T], TKey], RecordWrapper[T, TKey]]#Context,
                              collector: Collector[RecordWrapper[T, TKey]]): Unit = {
    if (record.value.isDefined) {
      collector.collect(RecordWrapper.wrap(record.value.get, record.key, record.sequenceNumber))
    }
  }

  override def getProducedType: TypeInformation[RecordWrapper[T, TKey]] =
    RecordWrapperTypeInformation.wrap(this.recordType, this.keyType)
}
Example 28
Source File: ArrayRecordToTupleMapFunction.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.types.{ArrayRecord, RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.functions.MapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable

abstract class ArrayRecordToTupleMapFunction[T >: Null, TKey >: Null <: Product](
    outputTypeInformation: TypeInformation[T],
    keyTypeInformation: TypeInformation[TKey])
  extends MapFunction[RecordWrapper[ArrayRecord, TKey], RecordWrapper[T, TKey]]
    with ResultTypeQueryable[RecordWrapper[T, TKey]] {

  protected def getTuple(record: ArrayRecord): T

  override def map(record: RecordWrapper[ArrayRecord, TKey]): RecordWrapper[T, TKey] = {
    val tupleValue = this.getTuple(record.value)
    RecordWrapper.wrap[T, TKey](tupleValue, record.key, 0)
  }

  override def getProducedType: TypeInformation[RecordWrapper[T, TKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInformation, this.keyTypeInformation)
}
Example 29
Source File: IdentityFlatMapFunction.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.functions.FlatMapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.util.Collector

class IdentityFlatMapFunction[T >: Null, TKey >: Null <: Product](
    recordTypeInformation: TypeInformation[T],
    keyTypeInformation: TypeInformation[TKey])
  extends FlatMapFunction[RecordWrapper[T, TKey], RecordWrapper[T, TKey]]
    with ResultTypeQueryable[RecordWrapper[T, TKey]] {

  override def flatMap(record: RecordWrapper[T, TKey], collector: Collector[RecordWrapper[T, TKey]]): Unit = {
    collector.collect(record)
  }

  override def getProducedType: TypeInformation[RecordWrapper[T, TKey]] =
    RecordWrapperTypeInformation.wrap(this.recordTypeInformation, this.keyTypeInformation)
}
Example 30
Source File: DataSourceUtil.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.application.sources.FileDataSource
import com.amazon.milan.dataformats.DataInputFormat
import com.amazon.milan.compiler.flink.types.{ByteArrayDataFormatFlatMapFunction, ByteArrayInputFormat, ByteArrayRecordTypeInformation}
import com.typesafe.scalalogging.Logger
import org.apache.flink.api.common.io.FilePathFilter
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.{DataStreamSource, SingleOutputStreamOperator}
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.api.functions.source.FileProcessingMode
import org.slf4j.LoggerFactory

import scala.collection.JavaConverters._

object DataSourceUtil {
  private lazy val logger = Logger(LoggerFactory.getLogger(getClass))

  def addFileDataSource[T](env: StreamExecutionEnvironment,
                           path: String,
                           dataFormat: DataInputFormat[T],
                           configuration: FileDataSource.Configuration,
                           recordTypeInformation: TypeInformation[T]): SingleOutputStreamOperator[T] = {
    this.logger.info(s"Adding file '$path' as an input to the streaming environment. ")

    val inputFormat = new ByteArrayInputFormat
    inputFormat.setFilesFilter(FilePathFilter.createDefaultFilter())

    val processingMode = configuration.readMode match {
      case FileDataSource.ReadMode.Continuous => FileProcessingMode.PROCESS_CONTINUOUSLY
      case FileDataSource.ReadMode.Once => FileProcessingMode.PROCESS_ONCE
    }

    val changeCheckIntervalMs = processingMode match {
      case FileProcessingMode.PROCESS_CONTINUOUSLY => 5000L
      case _ => -1L
    }

    val inputLines = env.readFile(
      inputFormat,
      path,
      processingMode,
      changeCheckIntervalMs,
      new ByteArrayRecordTypeInformation)

    val mapper = new ByteArrayDataFormatFlatMapFunction[T](dataFormat, recordTypeInformation)
    inputLines.flatMap(mapper)
  }

  def addListDataSource[T](env: StreamExecutionEnvironment,
                           values: List[T],
                           runForever: Boolean,
                           recordTypeInformation: TypeInformation[T]): DataStreamSource[T] = {
    if (runForever) {
      // If we don't want the source to terminate after the elements run out then we need to use a custom source
      // function rather than env.fromCollection. In order to not cause duplicate records to be sent from multiple
      // copies of the source function we set the parallelism to 1.
      val source = new ListSourceFunction[T](values, runForever)
      env.addSource(source, recordTypeInformation).setParallelism(1)
    } else {
      env.fromCollection(values.asJavaCollection, recordTypeInformation)
    }
  }
}
Example 31
Source File: TypeInformationDataInputFormat.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.dataformats

import java.io.{ByteArrayInputStream, EOFException, InputStream}

import com.amazon.milan.dataformats.DataInputFormat
import com.amazon.milan.typeutil.TypeDescriptor
import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.common.typeutils.TypeSerializer
import org.apache.flink.core.memory.DataInputViewStreamWrapper

class TypeInformationDataInputFormat[T](typeInfo: TypeInformation[T]) extends DataInputFormat[T] {
  @transient private lazy val serializer = this.createSerializer()

  override def getGenericArguments: List[TypeDescriptor[_]] = {
    // This class is not intended to be serialized by GenericTypedJsonSerializer, so this should not be called.
    throw new UnsupportedOperationException()
  }

  override def setGenericArguments(genericArgs: List[TypeDescriptor[_]]): Unit = {
    // This class is not intended to be deserialized by GenericTypedJsonDeserializer, so this should not be called.
    throw new UnsupportedOperationException()
  }

  override def readValue(bytes: Array[Byte], offset: Int, length: Int): Option[T] = {
    val input = new DataInputViewStreamWrapper(new ByteArrayInputStream(bytes, offset, length))
    Some(this.serializer.deserialize(input))
  }

  override def readValues(stream: InputStream): TraversableOnce[T] = {
    val input = new DataInputViewStreamWrapper(stream)
    Stream.continually(0)
      .map(_ =>
        try {
          Some(this.serializer.deserialize(input))
        } catch {
          case _: EOFException => None
        })
      .takeWhile(_.isDefined)
      .map(_.get)
  }

  private def createSerializer(): TypeSerializer[T] = {
    val config = new ExecutionConfig()
    this.typeInfo.createSerializer(config)
  }
}
Example 32
Source File: JsonDeserializationSchema.scala From milan with Apache License 2.0
package com.amazon.milan.compiler.flink.serialization

import com.amazon.milan.serialization.MilanObjectMapper
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.streaming.connectors.kinesis.serialization.KinesisDeserializationSchema

import scala.reflect.{ClassTag, classTag}

object JsonDeserializationSchema {
  private val objectMapper = new MilanObjectMapper()
}

class JsonDeserializationSchema[T: ClassTag] extends KinesisDeserializationSchema[T] with Serializable {
  override def deserialize(bytes: Array[Byte],
                           partitionKey: String,
                           seqNum: String,
                           approxArrivalTimestamp: Long,
                           stream: String,
                           shardId: String): T = {
    JsonDeserializationSchema.objectMapper.readValue[T](bytes, classTag[T].runtimeClass.asInstanceOf[Class[T]])
  }

  override def getProducedType: TypeInformation[T] = {
    TypeExtractor.getForClass(classTag[T].runtimeClass.asInstanceOf[Class[T]])
  }
}