org.apache.flink.api.common.typeinfo.TypeInformation Scala Examples

The following examples show how to use org.apache.flink.api.common.typeinfo.TypeInformation. Each example is taken from an open-source project; the project name and license are listed in the header above its code.
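Before the project-specific examples, a minimal orientation sketch (not taken from any of the projects below) showing the two common ways of obtaining a TypeInformation instance in the Scala API:

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala._

object TypeInformationBasics {
  // Derived at compile time by the Scala API macro that the wildcard import brings into scope.
  val tupleInfo: TypeInformation[(String, Int)] = createTypeInformation[(String, Int)]

  // Built reflectively from a Class token; falls back to generic serialization if analysis fails.
  val stringInfo: TypeInformation[String] = TypeInformation.of(classOf[String])
}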
Example 1
Source File: DataRow.scala    From flink-elasticsearch-source-connector   with Apache License 2.0
package com.mnubo.flink.streaming.connectors

import org.apache.commons.lang3.ClassUtils
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor

import scala.language.existentials

case class Value(v: Any, name: String, givenTypeInfo: Option[TypeInformation[_]] = None) {
  require(v != null || givenTypeInfo.isDefined, "You must pass a TypeInformation for null values")

  val typeInfo = givenTypeInfo match {
    case Some(ti) => ti
    case None => TypeExtractor.getForObject(v)
  }

  require(isAssignable(v, typeInfo.getTypeClass), s"data element '$v' is not compatible with class ${typeInfo.getTypeClass.getName}")

  private def isAssignable(value: Any, cl: Class[_]) = {
    if (value == null)
      // A null value is only compatible with reference types; never call getClass on it.
      classOf[AnyRef].isAssignableFrom(cl)
    else
      ClassUtils.isAssignable(value.getClass, cl)
  }
}

object Value {
  def apply(v: Any, name: String, givenTypeInfo: TypeInformation[_]) = {
    new Value(v, name, Some(givenTypeInfo))
  }
}



class DataRow(private [connectors] val data: Array[Any], private [connectors] val info: DataRowTypeInfo) extends Product with Serializable {
  require(data != null, "data must not be null")
  require(info != null, "info must not be null")
  require(data.length == info.getArity, "data must be of the correct arity")

  def apply[T](i: Int): T =
    data(i).asInstanceOf[T]

  def apply[T](fieldExpression: String): T =
    apply(info.getFieldIndex(fieldExpression))

  override def productElement(n: Int): Any =
    apply[AnyRef](n)

  override def productArity =
    info.getArity

  override def canEqual(that: Any) =
    that.isInstanceOf[DataRow]

  override def equals(that: Any) =
    canEqual(that) && data.sameElements(that.asInstanceOf[DataRow].data) && info.getFieldNames.sameElements(that.asInstanceOf[DataRow].info.getFieldNames)

  override def hashCode = {
    var result = 1

    for (element <- data)
      result = 31 * result + (if (element == null) 0 else element.hashCode)

    result
  }

  override def toString =
    info.getFieldNames
      .zip(data.map(v => if (v == null) "null" else v.toString))
      .map{case (name, value) => s"$name=$value"}
      .mkString("DataRow(", ", ", ")")
}

object DataRow {
  
  def apply(data: Value*): DataRow = {
    require(data != null, "data cannot be null")
    require(!data.contains(null), "data value cannot be null")

    new DataRow(
      data.map(_.v).toArray,
      new DataRowTypeInfo(
        data.map(_.name),
        data.map(_.typeInfo)
      )
    )
  }
} 
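A hedged usage sketch for the Value and DataRow API above; the field names and values here are invented for illustration:

import com.mnubo.flink.streaming.connectors.{DataRow, Value}
import org.apache.flink.api.common.typeinfo.TypeInformation

object DataRowUsage {
  def main(args: Array[String]): Unit = {
    val row = DataRow(
      Value(42, "id"),                                             // type info inferred via TypeExtractor
      Value("alice", "name"),
      Value(null, "nickname", TypeInformation.of(classOf[String])) // null values need explicit type info
    )

    println(row[String]("name")) // access by field name
    println(row[Int](0))         // access by position
    println(row)                 // DataRow(id=42, name=alice, nickname=null)
  }
}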
Example 2
Source File: FlinkSleepBlocker.scala    From flink-parameter-server   with Apache License 2.0
package hu.sztaki.ilab.ps.utils

import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._

object FlinkSleepBlocker {

  
  def block[T: TypeInformation](stream: DataStream[T], milliseconds: Long): DataStream[T] = {
    stream.forward.map(new RichMapFunction[T, T] {
      // Evaluated lazily, at most once per task instance: the first call to map() sleeps,
      // later calls see the already-initialized Unit value and pass records through immediately.
      @transient lazy val sleeper: Unit = {
        Thread.sleep(milliseconds)
        ()
      }

      override def map(value: T): T = {
        sleeper
        value
      }

    }).setParallelism(stream.parallelism)
  }

} 
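A brief usage sketch, with an invented stream and delay, showing how the blocker is meant to be applied:

import hu.sztaki.ilab.ps.utils.FlinkSleepBlocker
import org.apache.flink.streaming.api.scala._

object SleepBlockerUsage {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val numbers: DataStream[Int] = env.fromElements(1, 2, 3)

    // Each parallel mapper sleeps once, lazily, before forwarding its first element.
    val delayed: DataStream[Int] = FlinkSleepBlocker.block(numbers, milliseconds = 5000L)

    delayed.print()
    env.execute("sleep-blocker-example")
  }
}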
Example 3
Source File: DataSetMatcher.scala    From piglet   with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.engines._
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.backends.{SchemaClass => Event}
import dbis.piglet.cep.ops.MatchCollector
import org.apache.flink.api.common.typeinfo.TypeInformation
import dbis.piglet.cep.ops.SelectionStrategy
//import org.apache.flink.api.java.operators.CustomUnaryOperation
//import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.ListBuffer
//import org.apache.flink.api.java.DataSet
//import org.apache.flink.api.java.ExecutionEnvironment
import scala.collection.JavaConversions._
import org.apache.flink.api.scala._
import dbis.piglet.cep.ops.EngineConf

class DataSetMatcher[T <: Event: ClassTag: TypeInformation](input: DataSet[T], nfa: NFAController[T], flinkEnv: ExecutionEnvironment, sstr: SelectionStrategy = SelectionStrategy.FirstMatch, out: OutputStrategy = Combined) extends EngineConf[T](nfa, sstr) with java.io.Serializable {
  def compute(): DataSet[T] = {
    input.collect().foreach(event => engine.runEngine(event))
    flinkEnv.fromCollection(collector.convertEventsToArray().toSeq)
  }

} 
Example 4
Source File: CustomDataStreamMatcher.scala    From piglet   with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.backends.{SchemaClass => Event}
import org.apache.flink.api.common.typeinfo.TypeInformation
//import org.apache.flink.api.java.ExecutionEnvironment
//import org.apache.flink.api.java.DataSet
import scala.collection.JavaConversions._
import org.apache.flink.streaming.api.scala._

class CustomDataStreamMatcher[T <: Event: ClassTag: TypeInformation](@transient val dataStream: DataStream[T]) {

  def matchNFA(nfa: NFAController[T], flinkEnv: StreamExecutionEnvironment, sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined)  = {
    // println("create a new DataStream matcher")
    new DataStreamMatcher(dataStream, nfa, flinkEnv, sstr, out).compute()
  }

}

object CustomDataStreamMatcher {

  implicit def addDataSetMatcher[T <: Event: ClassTag: TypeInformation](@transient dataStream: DataStream[T]) = {
    // println("add a custom DataStream function")
    new CustomDataStreamMatcher(dataStream)
  }
} 
Example 5
Source File: CustomDataSetMatcher.scala    From piglet   with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.backends.{SchemaClass => Event}
import org.apache.flink.api.common.typeinfo.TypeInformation
//import org.apache.flink.api.java.ExecutionEnvironment
//import org.apache.flink.api.java.DataSet
import scala.collection.JavaConversions._
import org.apache.flink.api.scala._

class CustomDataSetMatcher[T <: Event: ClassTag: TypeInformation](dataSet: DataSet[T]) {

  def matchNFA(nfa: NFAController[T], sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined)  = {
    // println("create a new DataSet matcher")
    val flinkEnv = dataSet.getExecutionEnvironment
    new DataSetMatcher(dataSet, nfa, flinkEnv, sstr, out).compute()
  }

}

object CustomDataSetMatcher {

  implicit def addDataSetMatcher[T <: Event: ClassTag: TypeInformation](dataSet: DataSet[T]) = {
    // println("add a custom DataSet function")
    new CustomDataSetMatcher(dataSet)
  }
} 
Example 6
Source File: DataStreamMatcher.scala    From piglet   with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.engines._
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.backends.{SchemaClass => Event}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.windowing.windows.GlobalWindow
import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows
import dbis.piglet.cep.ops.MatchCollector
import dbis.piglet.cep.ops.SelectionStrategy
//import org.apache.flink.api.java.operators.CustomUnaryOperation
import scala.collection.mutable.ListBuffer
//import org.apache.flink.api.java.DataSet
//import org.apache.flink.api.java.ExecutionEnvironment
import scala.collection.JavaConversions._
import org.apache.flink.streaming.api.scala._
import dbis.piglet.cep.ops.EngineConf
import org.apache.flink.util.Collector


class DataStreamMatcher[T <: Event: ClassTag: TypeInformation](@transient val input: DataStream[T], nfa: NFAController[T], flinkEnv: StreamExecutionEnvironment, sstr: SelectionStrategy = SelectionStrategy.FirstMatch, out: OutputStrategy = Combined) extends EngineConf[T](nfa, sstr) with java.io.Serializable {
  object DataStreamProcess {
    def customRun(gw: GlobalWindow, ts: Iterable[T], out: Collector[T]) = {
      ts.foreach { event => engine.runEngine(event)}
      val result = collector.convertEventsToArray()
      result.foreach { res => out.collect(res) }
    }
  }
  def compute(): DataStream[T] = {
    input.windowAll(GlobalWindows.create()).apply(DataStreamProcess.customRun _)   
  }

} 
Example 7
Source File: UTF8StringSchema.scala    From piglet   with Apache License 2.0
package dbis.piglet.backends.flink.streaming

import org.apache.commons.lang3.SerializationUtils
import org.apache.flink.streaming.util.serialization._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor

class UTF8StringSchema extends DeserializationSchema[String] with SerializationSchema[String] {

  override def deserialize(message: Array[Byte]): String = {
    new String(message, "UTF-8")
  }   

  override def isEndOfStream(nextElement: String): Boolean = {
    false
  }   

  override def serialize(element: String): Array[Byte] = {
    element.getBytes("UTF-8")
  }   

  override def getProducedType(): TypeInformation[String] = {
    TypeExtractor.getForClass(classOf[String])
  }   
} 
Example 8
Source File: StreamFuncs.scala    From piglet   with Apache License 2.0
package dbis.piglet.backends.flink.streaming

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._
import scala.reflect.ClassTag
import dbis.piglet.backends._

class PigStream[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {

  def loadStream(env: StreamExecutionEnvironment, path: String, extract: (Array[String]) => T, delim: String = "\t"): DataStream[T] = {
    env.readTextFile(path).setParallelism(1).map(line => extract(line.split(delim, -1)))
  }

  def writeStream(path: String, result: DataStream[T], delim: String = ",") = result.map(_.mkString(delim)).writeAsText(path).setParallelism(1)

  def connect(env: StreamExecutionEnvironment, host: String, port: Int, extract: (Array[String]) => T, delim: String = "\t"): DataStream[T] = {
    env.socketTextStream(host,port).map(line => extract(line.split(delim, -1)))
  }

  def bind(host: String, port: Int, result: DataStream[T], delim: String = ",") = {
    result.map(_.mkString(delim) + "\n").writeToSocket(host, port, new UTF8StringSchema())
  }

  def zmqSubscribe(env: StreamExecutionEnvironment, addr: String, extract: (Array[String]) => T, delim: String = "\t"): DataStream[T] = {
    env.addSource(new ZmqSubscriber(addr)).map(line => extract(line.split(delim, -1)))
  }

  def zmqPublish(addr: String, result: DataStream[T], delim: String = ",") = {
    result.map(_.mkString(delim)).addSink(new ZmqPublisher(addr)).setParallelism(1)
  }
}

class TextLoader[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {
  def loadStream(env: StreamExecutionEnvironment, path: String, extract: (Array[String]) => T): DataStream[T] =
    env.readTextFile(path).map(line => extract(Array(line)))
}

object TextLoader extends java.io.Serializable {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): TextLoader[T] = {
    new TextLoader[T]
  }
}

object PigStream {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): PigStream[T] = {
    new PigStream
  }
}

class RDFStream[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {

  val pattern = "([^\"]\\S*|\".+?\")\\s*".r

  def rdfize(line: String): Array[String] = {
    val fields = pattern.findAllIn(line).map(_.trim)
    fields.toArray.slice(0, 3)
  }

  def loadStream(env: StreamExecutionEnvironment, path: String, extract: (Array[String]) => T): DataStream[T] = {
    env.readTextFile(path).map(line => extract(rdfize(line)))
  }

  def connect(env: StreamExecutionEnvironment, host: String, port: Int, extract: (Array[String]) => T): DataStream[T] = {
    env.socketTextStream(host,port).map(line => extract(rdfize(line)))
  }

 def zmqSubscribe(env: StreamExecutionEnvironment, addr: String, extract: (Array[String]) => T): DataStream[T] = {
    env.addSource(new ZmqSubscriber(addr)).map(line => extract(rdfize(line)))
  }

}

object RDFStream {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): RDFStream[T] = {
    new RDFStream
  }
} 
Example 9
Source File: Storage.scala    From piglet   with Apache License 2.0
package dbis.piglet.backends.flink

import dbis.piglet.backends._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala._
import org.apache.flink.core.fs.FileSystem.WriteMode._

import scala.reflect.ClassTag


//-----------------------------------------------------------------------------------------------------

class PigStorage[T <: SchemaClass :ClassTag: TypeInformation] extends java.io.Serializable {
  def load(env: ExecutionEnvironment, path: String,  extract: (Array[String]) => T, delim: String = "\t",
      skipFirstRow: Boolean = false, skipEmpty: Boolean = false, comments: String = ""): DataSet[T] = {
    
    val raw = env.readTextFile(path) 
    val nonEmpty = if(skipEmpty) raw.filter { line => line.nonEmpty } else raw
    val nonComment = if(comments.nonEmpty) nonEmpty.filter { line => !line.startsWith(comments) } else nonEmpty
    val content = if(skipFirstRow) {
      val header = nonComment.first(1).collect().head
      nonComment.filter { line => line != header }
    } else 
      nonComment
      
    
    content.map(line => line.split(delim, -1)).map(extract)
  }

  def write(path: String, result: DataSet[T], delim: String = ",") = result.map(_.mkString(delim)).writeAsText(path).setParallelism(1)
}

object PigStorage {
  def apply[T <: SchemaClass :ClassTag: TypeInformation](): PigStorage[T] = {
    new PigStorage[T]
  }
}


class RDFFileStorage[T:ClassTag: TypeInformation] extends java.io.Serializable {
  val pattern = "([^\"]\\S*|\".+?\")\\s*".r

  def rdfize(line: String): Array[String] = {
    val fields = pattern.findAllIn(line).map(_.trim)
    fields.toArray.slice(0, 3)
  }

  def load(env: ExecutionEnvironment, path: String, extract: (Array[String]) => T): DataSet[T] =
    env.readTextFile(path).map(line => extract(rdfize(line)))
}

object RDFFileStorage {
  def apply[T:ClassTag: TypeInformation](): RDFFileStorage[T] = {
    new RDFFileStorage[T]
  }
} 
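A hedged sketch of RDFFileStorage with a plain case class; the Triple type, the extraction function, and the file path are invented for illustration:

import dbis.piglet.backends.flink.RDFFileStorage
import org.apache.flink.api.scala._

case class Triple(subject: String, predicate: String, obj: String)

object RDFFileStorageUsage {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // rdfize() tokenizes each line into at most three fields, which are mapped to a Triple.
    val triples: DataSet[Triple] = RDFFileStorage[Triple]()
      .load(env, "file:///tmp/triples.nt", fields => Triple(fields(0), fields(1), fields(2)))

    triples.first(10).print()
  }
}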
Example 10
Source File: PigFuncs.scala    From piglet   with Apache License 2.0
package dbis.piglet.backends.flink

import java.util.Random

import dbis.piglet.CommonPigFuncs
import dbis.piglet.backends._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.functions._
import org.apache.flink.api.scala._

import scala.reflect.ClassTag

class CustomSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) {
  def sample(withReplacement: Boolean, fraction: Double, seed: Long = new Random().nextLong()) = {
    dataSet.mapPartition(new SampleWithFraction[T](withReplacement, fraction, seed))
  }

}

object Sampler {
  implicit def addSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) = {
    new CustomSampler(dataSet)
  }
}

object PigFuncs extends CommonPigFuncs {
} 
Example 11
Source File: package.scala    From featran   with Apache License 2.0
package com.spotify.featran

import com.esotericsoftware.kryo.serializers.JavaSerializer
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala.DataSet

import scala.reflect.ClassTag

package object flink {

  
  implicit object FlinkCollectionType extends CollectionType[DataSet] {
    // force fallback to default serializer
    private val Ti = TypeInformation.of(classOf[Any])

    override def map[A, B: ClassTag](ma: DataSet[A])(f: A => B): DataSet[B] = {
      implicit val tib = Ti.asInstanceOf[TypeInformation[B]]
      ma.map(f)
    }
    override def reduce[A](ma: DataSet[A])(f: (A, A) => A): DataSet[A] =
      ma.reduce(f)

    override def cross[A, B: ClassTag](ma: DataSet[A])(mb: DataSet[B]): DataSet[(A, B)] =
      ma.crossWithTiny(mb)

    override def pure[A, B: ClassTag](ma: DataSet[A])(b: B): DataSet[B] = {
      implicit val tib = Ti.asInstanceOf[TypeInformation[B]]
      val env = ma.getExecutionEnvironment
      // Kryo throws NPE on `Feature`, use Java serialization instead
      env.addDefaultKryoSerializer(classOf[FeatureSet[Any]], classOf[JavaSerializer])
      env.fromElements(b)
    }
  }
} 
Example 12
Source File: FlinkKafkaCodecSerde.scala    From cloudflow   with Apache License 2.0
package cloudflow.flink

import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.clients.consumer.ConsumerRecord

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.connectors.kafka._

import cloudflow.streamlets.{ CodecInlet, CodecOutlet }

private[flink] class FlinkKafkaCodecSerializationSchema[T: TypeInformation](outlet: CodecOutlet[T], topic: String)
    extends KafkaSerializationSchema[T] {
  override def serialize(value: T, timestamp: java.lang.Long): ProducerRecord[Array[Byte], Array[Byte]] =
    new ProducerRecord(topic, outlet.codec.encode(value))
}

private[flink] class FlinkKafkaCodecDeserializationSchema[T: TypeInformation](inlet: CodecInlet[T]) extends KafkaDeserializationSchema[T] {
  override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]]): T = inlet.codec.decode(record.value)
  override def isEndOfStream(value: T): Boolean                                 = false
  override def getProducedType: TypeInformation[T]                              = implicitly[TypeInformation[T]]
} 
Example 13
Source File: FlinkStreamletContextImpl.scala    From cloudflow   with Apache License 2.0
package cloudflow.flink

import scala.collection.JavaConverters._

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.DataStreamSink
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka._

import com.typesafe.config._
import cloudflow.streamlets._
import java.{ util ⇒ ju }


  override def writeStream[Out: TypeInformation](outlet: CodecOutlet[Out], stream: DataStream[Out]): DataStreamSink[Out] = {

    val topic            = findTopicForPort(outlet)
    val destTopic        = topic.name
    val bootstrapServers = topic.bootstrapServers.getOrElse(internalKafkaBootstrapServers)

    val propsMap = Map("bootstrap.servers" -> bootstrapServers, "batch.size" -> "0") ++
          topic.kafkaProducerProperties

    val properties = new ju.Properties()
    properties.putAll(propsMap.asJava)

    stream.addSink(
      new FlinkKafkaProducer[Out](
        destTopic,
        new FlinkKafkaCodecSerializationSchema[Out](outlet, destTopic),
        properties,
        FlinkKafkaProducer.Semantic.AT_LEAST_ONCE
      )
    )
  }
} 
Example 14
Source File: TestFlinkStreamletContext.scala    From cloudflow   with Apache License 2.0
package cloudflow.flink
package testkit

import org.apache.flink.streaming.api.scala._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.functions.sink.SinkFunction
import org.apache.flink.streaming.api.datastream.DataStreamSink

import com.typesafe.config._
import cloudflow.streamlets._


  override def writeStream[Out: TypeInformation](outlet: CodecOutlet[Out], stream: DataStream[Out]): DataStreamSink[Out] =
    outletTaps
      .find(_.portName == outlet.name)
      .map { _ ⇒
        stream.addSink(new SinkFunction[Out]() {
          override def invoke(out: Out) =
            TestFlinkStreamletContext.result.add(out.toString())
        })
      }
      .getOrElse(throw TestContextException(outlet.name, s"Bad test context, could not find destination for outlet ${outlet.name}"))
}

object TestFlinkStreamletContext {
  val result = new java.util.concurrent.ConcurrentLinkedQueue[String]()
}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg) 
Example 15
Source File: EventDeSerializer.scala    From flink-demos   with Apache License 2.0
package com.dataartisans.flink.example.eventpattern.kafka

import java.nio.{ByteBuffer, ByteOrder}

import com.dataartisans.flink.example.eventpattern.Event
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.util.serialization.{DeserializationSchema, SerializationSchema}


class EventDeSerializer extends DeserializationSchema[Event] with SerializationSchema[Event] {
  
  override def deserialize(bytes: Array[Byte]): Event = {
    val buffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
    val address: Int = buffer.getInt(0)
    val eventType: Int = buffer.getInt(4)
    Event(address, eventType)
  }

  override def serialize(t: Event): Array[Byte] = {
    val byteBuffer = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN)
    byteBuffer.putInt(0, t.sourceAddress)
    byteBuffer.putInt(4, t.event)
    byteBuffer.array()
  }

  override def isEndOfStream(t: Event): Boolean = false

  override def getProducedType: TypeInformation[Event] = {
    createTypeInformation[Event]
  }
} 
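A self-contained round-trip check of the schema above, not tied to Kafka; it assumes Event is the simple two-field (source address, event type) case class that serialize() reads from:

import com.dataartisans.flink.example.eventpattern.Event
import com.dataartisans.flink.example.eventpattern.kafka.EventDeSerializer

object EventDeSerializerRoundTrip {
  def main(args: Array[String]): Unit = {
    val schema = new EventDeSerializer

    val original = Event(42, 7)            // sourceAddress = 42, event type = 7
    val bytes = schema.serialize(original) // 8 little-endian bytes: address, then event type
    val restored = schema.deserialize(bytes)

    assert(restored == original)
    println(restored)
  }
}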
Example 16
Source File: DataRowRecordMarshaller.scala    From flink-elasticsearch-source-connector   with Apache License 2.0
package com.mnubo.flink.streaming.connectors

import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor

@SerialVersionUID(1L)
class DataRowRecordMarshaller extends RecordMarshaller[DataRow] {
  private var typeInfo: DataRowTypeInfo = null

  @transient
  private lazy val serializer = typeInfo
    .createSerializer(new ExecutionConfig)
    .asInstanceOf[DataRowSerializer]

  override def typeInformation =
    typeInfo

  override def configureFields(types: Seq[MarshallerFieldDescriptor]) = {
    val (fieldNames, typeInfos): (Seq[String], Seq[TypeInformation[_]]) =
      types
        .map { tp =>
          tp.fieldName -> TypeExtractor.createTypeInfo(tp.fieldClass)
        }
        .unzip

    typeInfo = new DataRowTypeInfo(fieldNames, typeInfos)
  }

  override def createOrReuseInstance(fields: Seq[AnyRef], reuse: DataRow): DataRow =
    serializer.createOrReuseInstance(fields.toArray[AnyRef], reuse)
} 
Example 17
Source File: FlinkTestKits.scala    From flink-jpmml   with GNU Affero General Public License v3.0
package io.radicalbit.flink.pmml.scala.utils

import io.radicalbit.flink.pmml.scala.sources.TemporizedSourceFunction
import io.radicalbit.flink.streaming.spec.core.FlinkTestKitCompanion
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.functions.sink.SinkFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.test.util.AbstractTestBase

import scala.collection.mutable
import scala.reflect.ClassTag

trait FlinkSourcedPipelineTestKit[IN1, IN2, OUT] extends AbstractTestBase {

  def executePipeline[IN1: TypeInformation: ClassTag, IN2: TypeInformation: ClassTag](
      in1: Seq[(Long, IN1)],
      in2: Seq[(Long, IN2)])(pipeline: (DataStream[IN1], DataStream[IN2]) => DataStream[OUT])(
      implicit companion: FlinkTestKitCompanion[OUT]) = {

    companion.testResults = mutable.MutableList[OUT]()

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val events = in1
      .union(in2)
      .sortBy(_._1)
      .collect {
        case (_, left: IN1) => (Some(left), None)
        case (_, right: IN2) => (None, Some(right))
      }

    val stream = env.addSource(new TemporizedSourceFunction[IN1, IN2](events))

    val stream1: DataStream[IN1] = stream.filter(either => either.isLeft).map(either => either.left.get)
    val stream2: DataStream[IN2] = stream.filter(either => either.isRight).map(either => either.right.get)

    pipeline(stream1, stream2)
      .addSink(new SinkFunction[OUT] {
        override def invoke(in: OUT) = {
          companion.testResults += in
        }
      })

    env.execute(this.getClass.getSimpleName)

    companion.testResults
  }

} 
Example 18
Source File: ElasticsearchDataset.scala    From flink-elasticsearch-source-connector   with Apache License 2.0
package com.mnubo.flink.streaming.connectors.elasticsearch

import com.mnubo.flink.streaming.connectors._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.operators.DataSource
import org.apache.flink.api.java.typeutils.{PojoTypeInfo, TupleTypeInfoBase}
import org.apache.flink.api.scala._

import scala.reflect.ClassTag

object ElasticsearchDataset {
  
  def fromElasticsearchQuery[T : ClassTag: TypeInformation](env: ExecutionEnvironment,
                                                            index: String,
                                                            query: String,
                                                            nodes: Set[String] = Set("localhost"),
                                                            port: Int = 9200,
                                                            pojoFields: Array[String] = null): DataSet[T] = {
    val clazz =
      implicitly[ClassTag[T]].runtimeClass

    val marshaller =
      if (clazz == classOf[DataRow])
        new DataRowRecordMarshaller().asInstanceOf[RecordMarshaller[T]]
      else
        implicitly[TypeInformation[T]] match {
          case info: TupleTypeInfoBase[T] =>
            new TupleRecordMarshaller[T](info)
          case info: PojoTypeInfo[T] =>
            require(pojoFields != null, "POJO fields must be specified (not null) if output type is a POJO.")
            new PojoRecordMarshaller[T](info, pojoFields)
          case other =>
            throw new IllegalArgumentException(s"The type ${clazz.getName} has to be a tuple, a DataRow or pojo type.")
        }

    val inputFormat =
      new ElasticseachInputFormat[T](
        nodes,
        port,
        index,
        query,
        marshaller
      )

    // Not the most elegant, but can't wait for the input format to be configured to get the actual schema. Have to get it now.
    val schema = inputFormat.fetchSchema()
    marshaller.configureFields(schema)

    new DataSet[T](new DataSource[T](env.getJavaEnv, inputFormat, marshaller.typeInformation, getCallLocationName()))
  }


} 
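A hedged usage sketch; the index name, query, and hosts are placeholders, and the implicit TypeInformation[DataRow] is assumed to come from the org.apache.flink.api.scala wildcard import:

import com.mnubo.flink.streaming.connectors.DataRow
import com.mnubo.flink.streaming.connectors.elasticsearch.ElasticsearchDataset
import org.apache.flink.api.scala._

object ElasticsearchDatasetUsage {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // Every document matching the query becomes a DataRow whose schema is fetched from the index.
    val rows: DataSet[DataRow] = ElasticsearchDataset.fromElasticsearchQuery[DataRow](
      env,
      index = "events",
      query = """{"query": {"match_all": {}}}""",
      nodes = Set("localhost"),
      port  = 9200
    )

    rows.first(10).print()
  }
}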
Example 19
Source File: RecordTransformer.scala    From flink-elasticsearch-source-connector   with Apache License 2.0
package com.mnubo.flink.streaming.connectors

import org.apache.flink.api.common.operators.Keys.ExpressionKeys._
import org.apache.flink.api.common.typeinfo.TypeInformation

import scala.annotation.tailrec
import scala.language.existentials
import scala.reflect.ClassTag

sealed trait FieldSpecification extends Serializable

case class ExistingField(name: String) extends FieldSpecification

case class NewField(name: String, typeInfo: TypeInformation[_]) extends FieldSpecification

trait RecordTransformer extends Serializable {
  val classTag = ClassTag[DataRow](classOf[DataRow])
  def typeInfo : DataRowTypeInfo
  def transform(dataRow: DataRow, values:Any*) : DataRow
}

class FieldMapperRecordTransformer private[connectors](srcTypeInfo:DataRowTypeInfo, fieldSpecifications: FieldSpecification*) extends RecordTransformer {
  require(srcTypeInfo != null, s"srcTypeInfo must not be null")
  require(fieldSpecifications != null, s"fieldSpecifications must not be null")
  require(fieldSpecifications.nonEmpty, s"fieldSpecifications must not be empty")
  require(!fieldSpecifications.contains(null), s"fieldSpecifications must not contain any nulls")

  override val typeInfo = {
    val (fieldNames, elementTypes) = fieldSpecifications.flatMap {
      case ExistingField(name) if name == SELECT_ALL_CHAR || name == SELECT_ALL_CHAR_SCALA => srcTypeInfo.getFieldNames.zip(srcTypeInfo.getElementTypes)
      case ExistingField(name) => Seq(name -> srcTypeInfo.getFieldType(name))
      case NewField(name, newFieldTypeInfo) => Seq(name -> newFieldTypeInfo)
    }.unzip
    require(fieldNames.length == fieldNames.distinct.length, s"Fields can't have duplicates. Fields were $fieldNames.")
    new DataRowTypeInfo(fieldNames, elementTypes)
  }

  private def newFieldsNames = fieldSpecifications.collect{ case newValue: NewField => newValue.name }

  override def transform(dataRow: DataRow, values:Any*) : DataRow = {
    require(dataRow != null, s"dataRow must not be null")
    require(values != null, s"values must not be null")
    require(newFieldsNames.length == values.length, s"Must specify values for all new fields and only new fields. New fields are '$newFieldsNames'")

    val resultValues = new Array[Any](typeInfo.getArity)
    @tailrec
    def transform(index:Int, remainingSpecs: Seq[FieldSpecification], remainingValues:Seq[Any]) : DataRow = {
      if(remainingSpecs.isEmpty) {
        new DataRow(resultValues, typeInfo)
      } else {
        val currentSpec = remainingSpecs.head
        currentSpec match {
          case ExistingField(name) if name == SELECT_ALL_CHAR || name == SELECT_ALL_CHAR_SCALA =>
            Array.copy(dataRow.data, 0, resultValues, index, dataRow.data.length)
            transform(index + dataRow.data.length, remainingSpecs.tail, remainingValues)
          case ExistingField(name) =>
            resultValues(index) = dataRow(name)
            transform(index + 1, remainingSpecs.tail, remainingValues)
          case NewField(name, _) =>
            resultValues(index) = remainingValues.head
            transform(index + 1, remainingSpecs.tail, remainingValues.tail)
        }
      }
    }
    transform(0, fieldSpecifications, values)
  }
}

object RecordTransformer {
  def mapFields(srcTypeInfo: DataRowTypeInfo, fieldSpecifications: FieldSpecification*) : RecordTransformer = {
    new FieldMapperRecordTransformer(srcTypeInfo, fieldSpecifications:_*)
  }
} 
Example 20
Source File: package.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import com.amazon.milan.compiler.flink.runtime.{UnwrapRecordsMapFunction, WrapRecordsMapFunction}
import com.amazon.milan.compiler.flink.testing.IntKeyValueRecord
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.common.typeutils.TypeSerializer
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.core.memory.{DataInputView, DataInputViewStreamWrapper, DataOutputView, DataOutputViewStreamWrapper}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.datastream.DataStream
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment

import scala.language.implicitConversions
import scala.util.Random


package object testutil {
  def getTestExecutionEnvironment: StreamExecutionEnvironment = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setBufferTimeout(0)
    env
  }

  def copyWithSerializer[T](value: T, serializer: TypeSerializer[T]): T = {
    val outputStream = new ByteArrayOutputStream()
    val outputView = new DataOutputViewStreamWrapper(outputStream)
    serializer.serialize(value, outputView)

    val bytes = outputStream.toByteArray
    val inputStream = new ByteArrayInputStream(bytes)
    val inputView = new DataInputViewStreamWrapper(inputStream)
    serializer.deserialize(inputView)
  }

  def copyData[T](writeValue: DataOutputView => Unit, readValue: DataInputView => T): T = {
    val outputStream = new ByteArrayOutputStream()
    val outputView = new DataOutputViewStreamWrapper(outputStream)
    writeValue(outputView)

    val bytes = outputStream.toByteArray
    val inputStream = new ByteArrayInputStream(bytes)
    val inputView = new DataInputViewStreamWrapper(inputStream)
    readValue(inputView)
  }

  def generateIntKeyValueRecords(recordCount: Int, keyCount: Int, maxValue: Int): List[IntKeyValueRecord] = {
    val rand = new Random(0)
    List.tabulate(recordCount)(_ => IntKeyValueRecord(rand.nextInt(keyCount), rand.nextInt(maxValue + 1)))
  }

  implicit class WrappedDataStreamExtensions[T >: Null, TKey >: Null <: Product](dataStream: DataStream[RecordWrapper[T, TKey]]) {
    def unwrap(recordTypeInformation: TypeInformation[T]): DataStream[T] = {
      val mapper = new UnwrapRecordsMapFunction[T, TKey](recordTypeInformation)
      this.dataStream.map(mapper)
    }

    def unwrap(): DataStream[T] = {
      val recordType = this.dataStream.getType.asInstanceOf[RecordWrapperTypeInformation[T, TKey]].valueTypeInformation
      this.unwrap(recordType)
    }
  }

  implicit class DataStreamExtensions[T >: Null](dataStream: DataStream[T]) {
    def wrap(recordTypeInformation: TypeInformation[T]): DataStream[RecordWrapper[T, Product]] = {
      val mapper = new WrapRecordsMapFunction[T](recordTypeInformation)
      this.dataStream.map(mapper)
    }

    def wrap(): DataStream[RecordWrapper[T, Product]] = {
      val recordType = this.dataStream.asInstanceOf[ResultTypeQueryable[T]].getProducedType
      this.wrap(recordType)
    }
  }

} 
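A small sketch of copyWithSerializer from the package object above, round-tripping a value through a Flink TypeSerializer (the String type and value are illustrative):

import com.amazon.milan.compiler.flink.testutil.copyWithSerializer
import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation

object CopyWithSerializerUsage {
  def main(args: Array[String]): Unit = {
    val serializer = TypeInformation.of(classOf[String]).createSerializer(new ExecutionConfig())

    // Serializes to a byte array and deserializes again, yielding an equal copy.
    val copy = copyWithSerializer("hello", serializer)
    println(copy)
  }
}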
Example 21
Source File: MapFunctions.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.internal.LineageRecordFactory
import com.amazon.milan.compiler.flink.metrics.MetricFactory
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import com.amazon.milan.types.LineageRecord
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.util.OutputTag


object MapFunctions {
  val ProcessedRecordsCounterMetricName = "processed_record_count"
}

import com.amazon.milan.compiler.flink.runtime.MapFunctions._


abstract class MapFunctionWithLineage[TIn >: Null, TKey >: Null <: Product, TOut >: Null](outputTypeInformation: TypeInformation[TOut],
                                                                                          keyTypeInformation: TypeInformation[TKey],
                                                                                          lineageRecordFactory: LineageRecordFactory,
                                                                                          lineageOutputTag: OutputTag[LineageRecord],
                                                                                          metricFactory: MetricFactory)
  extends RichMapFunction[RecordWrapper[TIn, TKey], RecordWrapper[TOut, TKey]]
    with ResultTypeQueryable[RecordWrapper[TOut, TKey]] {

  @transient private lazy val processedRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, ProcessedRecordsCounterMetricName)

  protected def mapValue(in: TIn): TOut

  override def map(record: RecordWrapper[TIn, TKey]): RecordWrapper[TOut, TKey] = {
    this.processedRecordsCounter.increment()
    val mappedValue = this.mapValue(record.value)
    RecordWrapper.wrap(mappedValue, record.key, record.sequenceNumber)
  }

  override def getProducedType: TypeInformation[RecordWrapper[TOut, TKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInformation, this.keyTypeInformation)
}


abstract class KeyedMapFunctionWithLineage[TIn >: Null, TInKey >: Null <: Product, TKey, TOut >: Null](outputTypeInfo: TypeInformation[TOut],
                                                                                                       keyTypeInfo: TypeInformation[TInKey],
                                                                                                       lineageRecordFactory: LineageRecordFactory,
                                                                                                       lineageOutputTag: OutputTag[LineageRecord],
                                                                                                       metricFactory: MetricFactory)
  extends RichMapFunction[RecordWrapper[TIn, TInKey], RecordWrapper[TOut, TInKey]]
    with ResultTypeQueryable[RecordWrapper[TOut, TInKey]] {

  @transient private lazy val processedRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, ProcessedRecordsCounterMetricName)

  
  protected def getKey(recordKey: TInKey): TKey

  protected def mapValue(key: TKey, value: TIn): TOut

  override def map(record: RecordWrapper[TIn, TInKey]): RecordWrapper[TOut, TInKey] = {
    this.processedRecordsCounter.increment()
    val key = this.getKey(record.key)
    val mappedValue = this.mapValue(key, record.value)
    RecordWrapper.wrap(mappedValue, record.key, record.sequenceNumber)
  }

  override def getProducedType: TypeInformation[RecordWrapper[TOut, TInKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInfo, this.keyTypeInfo)
} 
Example 22
Source File: ArgCompareProcessFunctions.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.TypeUtil
import org.apache.flink.api.common.typeinfo.TypeInformation


abstract class ArgCompareKeyedProcessFunction[T >: Null, TKey >: Null <: Product, TArg](recordTypeInformation: TypeInformation[T],
                                                                                        keyTypeInformation: TypeInformation[TKey],
                                                                                        argTypeInformation: TypeInformation[TArg])
  extends ScanKeyedProcessFunction[T, TKey, Option[TArg], T](None, keyTypeInformation, TypeUtil.createOptionTypeInfo(argTypeInformation), recordTypeInformation) {

  protected def getArg(value: T): TArg

  protected def greaterThan(arg1: TArg, arg2: TArg): Boolean

  override protected def process(state: Option[TArg], key: TKey, value: T): (Option[TArg], Option[T]) = {
    val valueArg = this.getArg(value)

    state match {
      case None =>
        (Some(valueArg), Some(value))

      case Some(stateArg) =>
        if (this.greaterThan(valueArg, stateArg)) {
          (Some(valueArg), Some(value))
        }
        else {
          (state, None)
        }
    }
  }
}


abstract class ArgCompareProcessFunction[T >: Null, TKey >: Null <: Product, TArg](recordTypeInformation: TypeInformation[T],
                                                                                   keyTypeInformation: TypeInformation[TKey],
                                                                                   argTypeInformation: TypeInformation[TArg])
  extends ScanProcessFunction[T, TKey, Option[TArg], T](None, keyTypeInformation, TypeUtil.createOptionTypeInfo(argTypeInformation), recordTypeInformation) {

  protected def getArg(value: T): TArg

  protected def greaterThan(arg1: TArg, arg2: TArg): Boolean

  override protected def process(state: Option[TArg], value: T): (Option[TArg], Option[T]) = {
    val valueArg = this.getArg(value)

    state match {
      case None =>
        (Some(valueArg), Some(value))

      case Some(stateArg) =>
        if (this.greaterThan(valueArg, stateArg)) {
          (Some(valueArg), Some(value))
        }
        else {
          (state, None)
        }
    }
  }
} 
Example 23
Source File: KinesisDataSource.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import java.util.Properties

import com.amazon.milan.dataformats.DataInputFormat
import com.amazon.milan.serialization.MilanObjectMapper
import com.typesafe.scalalogging.Logger
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.DataStreamSource
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kinesis.FlinkKinesisConsumer
import org.apache.flink.streaming.connectors.kinesis.config.{AWSConfigConstants, ConsumerConfigConstants}
import org.apache.flink.streaming.connectors.kinesis.serialization.KinesisDeserializationSchema
import org.slf4j.LoggerFactory


object KinesisDataSource {
  private lazy val logger = Logger(LoggerFactory.getLogger(getClass))

  def addDataSource[T](env: StreamExecutionEnvironment,
                       streamName: String,
                       region: String,
                       dataFormat: DataInputFormat[T],
                       recordTypeInformation: TypeInformation[T]): DataStreamSource[T] = {
    this.logger.info(s"Creating Kinesis consumer for stream '$streamName', region '$region'.")

    val config = this.getConsumerProperties(region)
    val schema = new JsonDeserializationSchema[T](recordTypeInformation)

    val source = new FlinkKinesisConsumer[T](streamName, schema, config)
    env.addSource(source)
  }

  private def getConsumerProperties(region: String): Properties = {
    val config = new Properties()

    config.setProperty(AWSConfigConstants.AWS_REGION, region)
    config.setProperty(AWSConfigConstants.AWS_CREDENTIALS_PROVIDER, AWSConfigConstants.CredentialProvider.AUTO.toString)
    config.setProperty(ConsumerConfigConstants.STREAM_INITIAL_POSITION, ConsumerConfigConstants.InitialPosition.LATEST.toString)

    config
  }
}



class JsonDeserializationSchema[T](recordTypeInformation: TypeInformation[T])
  extends KinesisDeserializationSchema[T] {

  override def deserialize(bytes: Array[Byte],
                           partitionKey: String,
                           seqNum: String,
                           approxArrivalTimestamp: Long,
                           stream: String,
                           shardId: String): T = {
    MilanObjectMapper.readValue[T](bytes, this.recordTypeInformation.getTypeClass)
  }

  override def getProducedType: TypeInformation[T] =
    this.recordTypeInformation
} 
Example 24
Source File: RuntimeUtil.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import java.util

import com.amazon.milan.compiler.flink.generator.FlinkGeneratorException
import com.amazon.milan.serialization.MilanObjectMapper
import com.fasterxml.jackson.databind.`type`.TypeFactory
import org.apache.flink.api.common.typeinfo.TypeInformation

import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}


object RuntimeUtil {
  val typeName: String = getClass.getTypeName.stripSuffix("$")

  def loadJsonList[TElement: ClassTag](listJson: String): List[TElement] = {
    this.loadJsonArrayList[TElement](listJson).asScala.toList
  }

  def loadJsonArrayList[TElement: ClassTag](listJson: String): util.ArrayList[TElement] = {
    val typeFactory = TypeFactory.defaultInstance()
    val itemClass = classTag[TElement].runtimeClass.asInstanceOf[Class[TElement]]
    val javaType = typeFactory.constructCollectionType(classOf[util.ArrayList[TElement]], itemClass)
    MilanObjectMapper.readValue[util.ArrayList[TElement]](listJson, javaType)
  }

  def preventGenericTypeInformation[T](typeInfo: TypeInformation[T]): TypeInformation[T] = {
    if (typeInfo.getClass.getName.contains("__wrapper")) {
      throw new FlinkGeneratorException(s"Creating TypeInformation for '${typeInfo.getTypeClass.getName}' produced a GenericTypeInformation.")
    }

    typeInfo
  }
} 
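A short, hedged sketch of RuntimeUtil.loadJsonList; the JSON payload is illustrative and assumes MilanObjectMapper can bind it to the element class:

import com.amazon.milan.compiler.flink.runtime.RuntimeUtil

object RuntimeUtilUsage {
  def main(args: Array[String]): Unit = {
    val json = """["alpha", "beta", "gamma"]"""

    // Deserializes the JSON array into a typed List using the element's ClassTag.
    val values: List[String] = RuntimeUtil.loadJsonList[String](json)

    assert(values == List("alpha", "beta", "gamma"))
    println(values)
  }
}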
Example 25
Source File: LeftJoinKeyedCoProcessFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.internal.JoinLineageRecordFactory
import com.amazon.milan.compiler.flink.metrics.MetricFactory
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import com.amazon.milan.types.LineageRecord
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.co.KeyedCoProcessFunction
import org.apache.flink.util.{Collector, OutputTag}


object LeftJoinCoProcessFunction {
  val LeftInputRecordsCounterMetricName = "left_input_record_count"
  val RightInputRecordsCounterMetricName = "right_input_record_count"
  val OutputRecordsCounterMetricName = "output_record_count"
}

import com.amazon.milan.compiler.flink.runtime.LeftJoinCoProcessFunction._


abstract class LeftJoinKeyedCoProcessFunction[TLeft >: Null, TRight >: Null, TKey >: Null <: Product, TOut >: Null](rightTypeInformation: TypeInformation[TRight],
                                                                                                                    keyTypeInformation: TypeInformation[TKey],
                                                                                                                    outputTypeInformation: TypeInformation[TOut],
                                                                                                                    leftRecordIdExtractor: RecordIdExtractor[TLeft],
                                                                                                                    rightRecordIdExtractor: RecordIdExtractor[TRight],
                                                                                                                    outputRecordIdExtractor: RecordIdExtractor[TOut],
                                                                                                                    lineageRecordFactory: JoinLineageRecordFactory,
                                                                                                                    lineageOutputTag: OutputTag[LineageRecord],
                                                                                                                    metricFactory: MetricFactory)
  extends KeyedCoProcessFunction[TKey, RecordWrapper[TLeft, TKey], RecordWrapper[TRight, TKey], RecordWrapper[TOut, TKey]]
    with ResultTypeQueryable[RecordWrapper[TOut, TKey]] {

  @transient private lazy val canProduceLineage = leftRecordIdExtractor.canExtractRecordId && rightRecordIdExtractor.canExtractRecordId && outputRecordIdExtractor.canExtractRecordId
  @transient private lazy val leftInputRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, LeftInputRecordsCounterMetricName)
  @transient private lazy val rightInputRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, RightInputRecordsCounterMetricName)
  @transient private lazy val outputRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, OutputRecordsCounterMetricName)

  @transient private var lastRightValue: ValueState[TRight] = _

  protected def map(left: TLeft, right: TRight): TOut

  protected def postCondition(left: TLeft, right: TRight): Boolean

  override def processElement1(leftRecord: RecordWrapper[TLeft, TKey],
                               context: KeyedCoProcessFunction[TKey, RecordWrapper[TLeft, TKey], RecordWrapper[TRight, TKey], RecordWrapper[TOut, TKey]]#Context,
                               collector: Collector[RecordWrapper[TOut, TKey]]): Unit = {
    this.leftInputRecordsCounter.increment()

    val leftValue = leftRecord.value
    val rightValue = this.lastRightValue.value()

    if (this.postCondition(leftValue, rightValue)) {
      val output = this.map(leftValue, rightValue)

      if (output != null) {
        if (this.canProduceLineage) {
          val lineageRecord = this.createLineageRecord(this.outputRecordIdExtractor(output), leftValue, rightValue)
          context.output(this.lineageOutputTag, lineageRecord)
        }

        collector.collect(RecordWrapper.wrap[TOut, TKey](output, leftRecord.key, 0))
        this.outputRecordsCounter.increment()
      }
    }
  }

  override def processElement2(rightRecord: RecordWrapper[TRight, TKey],
                               context: KeyedCoProcessFunction[TKey, RecordWrapper[TLeft, TKey], RecordWrapper[TRight, TKey], RecordWrapper[TOut, TKey]]#Context,
                               collector: Collector[RecordWrapper[TOut, TKey]]): Unit = {
    this.rightInputRecordsCounter.increment()
    this.lastRightValue.update(rightRecord.value)
  }

  override def open(parameters: Configuration): Unit = {
    val rightValueDescriptor = new ValueStateDescriptor[TRight]("lastRightValue", this.rightTypeInformation)
    this.lastRightValue = this.getRuntimeContext.getState(rightValueDescriptor)
  }

  override def getProducedType: TypeInformation[RecordWrapper[TOut, TKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInformation, this.keyTypeInformation)

  private def createLineageRecord(outputRecordId: String, leftRecord: TLeft, rightRecord: TRight): LineageRecord = {
    val sourceRecords =
      Option(leftRecord).toSeq.map(r => this.lineageRecordFactory.createLeftRecordPointer(this.leftRecordIdExtractor(r))) ++
        Option(rightRecord).toSeq.map(r => this.lineageRecordFactory.createRightRecordPointer(this.rightRecordIdExtractor(r)))

    this.lineageRecordFactory.createLineageRecord(outputRecordId, sourceRecords)
  }
} 
Example 26
Source File: TimeWindowFlatMapProcessWindowFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import java.lang
import java.time.Instant

import com.amazon.milan.compiler.flink.TypeUtil
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector


abstract class TimeWindowFlatMapProcessWindowFunction[T >: Null, TInKey >: Null <: Product, TOutKey >: Null <: Product](recordTypeInfo: TypeInformation[T],
                                                                                                                        outKeyTypeInfo: TypeInformation[TOutKey])
  extends ProcessWindowFunction[RecordWrapper[Option[T], TInKey], RecordWrapper[Option[T], TOutKey], TInKey, TimeWindow]
    with ResultTypeQueryable[RecordWrapper[Option[T], TOutKey]] {

  @transient private var sequenceNumberHelper: SequenceNumberHelper = _

  protected def addWindowStartTimeToKey(key: TInKey, windowStart: Instant): TOutKey

  override def getProducedType: TypeInformation[RecordWrapper[Option[T], TOutKey]] =
    RecordWrapperTypeInformation.wrap(TypeUtil.createOptionTypeInfo(this.recordTypeInfo), this.outKeyTypeInfo)

  override def process(key: TInKey,
                       context: ProcessWindowFunction[RecordWrapper[Option[T], TInKey], RecordWrapper[Option[T], TOutKey], TInKey, TimeWindow]#Context,
                       items: lang.Iterable[RecordWrapper[Option[T], TInKey]],
                       collector: Collector[RecordWrapper[Option[T], TOutKey]]): Unit = {
    val windowStartTime = Instant.ofEpochMilli(context.window().getStart)

    val record = items.iterator().next()
    val outKey = this.addWindowStartTimeToKey(record.key, windowStartTime)
    val outRecord = RecordWrapper.wrap(record.value, outKey, sequenceNumberHelper.increment())
    collector.collect(outRecord)
  }

  override def open(parameters: Configuration): Unit = {
    this.sequenceNumberHelper = new SequenceNumberHelper(this.getRuntimeContext)
  }
} 
Example 27
Source File: UnpackOptionProcessFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import com.typesafe.scalalogging.Logger
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.util.Collector
import org.slf4j.LoggerFactory


class UnpackOptionProcessFunction[T >: Null, TKey >: Null <: Product](recordType: TypeInformation[T],
                                                                      keyType: TypeInformation[TKey])
  extends ProcessFunction[RecordWrapper[Option[T], TKey], RecordWrapper[T, TKey]]
    with ResultTypeQueryable[RecordWrapper[T, TKey]] {

  @transient private lazy val logger = Logger(LoggerFactory.getLogger(getClass))

  override def processElement(record: RecordWrapper[Option[T], TKey],
                              context: ProcessFunction[RecordWrapper[Option[T], TKey], RecordWrapper[T, TKey]]#Context,
                              collector: Collector[RecordWrapper[T, TKey]]): Unit = {
    if (record.value.isDefined) {
      collector.collect(RecordWrapper.wrap(record.value.get, record.key, record.sequenceNumber))
    }
  }

  override def getProducedType: TypeInformation[RecordWrapper[T, TKey]] =
    RecordWrapperTypeInformation.wrap(this.recordType, this.keyType)
} 
Example 28
Source File: ArrayRecordToTupleMapFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.types.{ArrayRecord, RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.functions.MapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable


abstract class ArrayRecordToTupleMapFunction[T >: Null, TKey >: Null <: Product](outputTypeInformation: TypeInformation[T],
                                                                                 keyTypeInformation: TypeInformation[TKey])
  extends MapFunction[RecordWrapper[ArrayRecord, TKey], RecordWrapper[T, TKey]]
    with ResultTypeQueryable[RecordWrapper[T, TKey]] {

  protected def getTuple(record: ArrayRecord): T

  override def map(record: RecordWrapper[ArrayRecord, TKey]): RecordWrapper[T, TKey] = {
    val tupleValue = this.getTuple(record.value)
    RecordWrapper.wrap[T, TKey](tupleValue, record.key, 0)
  }

  override def getProducedType: TypeInformation[RecordWrapper[T, TKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInformation, this.keyTypeInformation)
} 
Example 29
Source File: IdentityFlatMapFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.functions.FlatMapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.util.Collector



class IdentityFlatMapFunction[T >: Null, TKey >: Null <: Product](recordTypeInformation: TypeInformation[T],
                                                                  keyTypeInformation: TypeInformation[TKey])
  extends FlatMapFunction[RecordWrapper[T, TKey], RecordWrapper[T, TKey]]
    with ResultTypeQueryable[RecordWrapper[T, TKey]] {

  override def flatMap(record: RecordWrapper[T, TKey], collector: Collector[RecordWrapper[T, TKey]]): Unit = {
    collector.collect(record)
  }

  override def getProducedType: TypeInformation[RecordWrapper[T, TKey]] =
    RecordWrapperTypeInformation.wrap(this.recordTypeInformation, this.keyTypeInformation)
} 
Example 30
Source File: DataSourceUtil.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.application.sources.FileDataSource
import com.amazon.milan.dataformats.DataInputFormat
import com.amazon.milan.compiler.flink.types.{ByteArrayDataFormatFlatMapFunction, ByteArrayInputFormat, ByteArrayRecordTypeInformation}
import com.typesafe.scalalogging.Logger
import org.apache.flink.api.common.io.FilePathFilter
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.{DataStreamSource, SingleOutputStreamOperator}
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.api.functions.source.FileProcessingMode
import org.slf4j.LoggerFactory

import scala.collection.JavaConverters._


object DataSourceUtil {
  private lazy val logger = Logger(LoggerFactory.getLogger(getClass))

  def addFileDataSource[T](env: StreamExecutionEnvironment,
                           path: String,
                           dataFormat: DataInputFormat[T],
                           configuration: FileDataSource.Configuration,
                           recordTypeInformation: TypeInformation[T]): SingleOutputStreamOperator[T] = {
    this.logger.info(s"Adding file '$path' as an input to the streaming environment. ")

    val inputFormat = new ByteArrayInputFormat
    inputFormat.setFilesFilter(FilePathFilter.createDefaultFilter())

    val processingMode = configuration.readMode match {
      case FileDataSource.ReadMode.Continuous => FileProcessingMode.PROCESS_CONTINUOUSLY
      case FileDataSource.ReadMode.Once => FileProcessingMode.PROCESS_ONCE
    }

    val changeCheckIntervalMs = processingMode match {
      case FileProcessingMode.PROCESS_CONTINUOUSLY => 5000L
      case _ => -1L
    }

    val inputLines = env.readFile(
      inputFormat,
      path,
      processingMode,
      changeCheckIntervalMs,
      new ByteArrayRecordTypeInformation)

    val mapper = new ByteArrayDataFormatFlatMapFunction[T](dataFormat, recordTypeInformation)
    inputLines.flatMap(mapper)
  }

  def addListDataSource[T](env: StreamExecutionEnvironment,
                           values: List[T],
                           runForever: Boolean,
                           recordTypeInformation: TypeInformation[T]): DataStreamSource[T] = {
    if (runForever) {
      // If we don't want the source to terminate after the elements run out then we need to use a custom source
      // function rather than env.fromCollection. In order to not cause duplicate records to be sent from multiple
      // copies of the source function we set the parallelism to 1.
      val source = new ListSourceFunction[T](values, runForever)
      env.addSource(source, recordTypeInformation).setParallelism(1)
    }
    else {
      env.fromCollection(values.asJavaCollection, recordTypeInformation)
    }
  }
} 
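A hedged sketch of addListDataSource; the record type and values are made up for illustration:

import com.amazon.milan.compiler.flink.runtime.DataSourceUtil
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment

object ListDataSourceUsage {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val source = DataSourceUtil.addListDataSource[String](
      env,
      values = List("a", "b", "c"),
      runForever = false, // terminate once the list is exhausted
      recordTypeInformation = TypeInformation.of(classOf[String])
    )

    source.print()
    env.execute("list-source-example")
  }
}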
Example 31
Source File: TypeInformationDataInputFormat.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.dataformats

import java.io.{ByteArrayInputStream, EOFException, InputStream}

import com.amazon.milan.dataformats.DataInputFormat
import com.amazon.milan.typeutil.TypeDescriptor
import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.common.typeutils.TypeSerializer
import org.apache.flink.core.memory.DataInputViewStreamWrapper



class TypeInformationDataInputFormat[T](typeInfo: TypeInformation[T]) extends DataInputFormat[T] {
  @transient private lazy val serializer = this.createSerializer()

  override def getGenericArguments: List[TypeDescriptor[_]] = {
    // This class is not intended to be serialized by GenericTypedJsonSerializer, so this should not be called.
    throw new UnsupportedOperationException()
  }

  override def setGenericArguments(genericArgs: List[TypeDescriptor[_]]): Unit = {
    // This class is not intended to be deserialized by GenericTypedJsonDeserializer, so this should not be called.
    throw new UnsupportedOperationException()
  }

  override def readValue(bytes: Array[Byte], offset: Int, length: Int): Option[T] = {
    val input = new DataInputViewStreamWrapper(new ByteArrayInputStream(bytes, offset, length))
    Some(this.serializer.deserialize(input))
  }

  override def readValues(stream: InputStream): TraversableOnce[T] = {
    val input = new DataInputViewStreamWrapper(stream)
    Stream.continually(0)
      .map(_ =>
        try {
          Some(this.serializer.deserialize(input))
        }
        catch {
          case _: EOFException => None
        })
      .takeWhile(_.isDefined)
      .map(_.get)
  }

  private def createSerializer(): TypeSerializer[T] = {
    val config = new ExecutionConfig()
    this.typeInfo.createSerializer(config)
  }
} 
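A round-trip sketch for the input format above: serialize one value with the serializer derived from the same TypeInformation, then read it back (the String type and value are illustrative):

import java.io.ByteArrayOutputStream

import com.amazon.milan.compiler.flink.dataformats.TypeInformationDataInputFormat
import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.core.memory.DataOutputViewStreamWrapper

object TypeInformationDataInputFormatRoundTrip {
  def main(args: Array[String]): Unit = {
    val typeInfo: TypeInformation[String] = TypeInformation.of(classOf[String])

    // Write one value in Flink's binary format.
    val serializer = typeInfo.createSerializer(new ExecutionConfig())
    val outputStream = new ByteArrayOutputStream()
    serializer.serialize("hello", new DataOutputViewStreamWrapper(outputStream))
    val bytes = outputStream.toByteArray

    // Read it back through the input format.
    val format = new TypeInformationDataInputFormat[String](typeInfo)
    println(format.readValue(bytes, 0, bytes.length)) // Some(hello)
  }
}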
Example 32
Source File: JsonDeserializationSchema.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.serialization

import com.amazon.milan.serialization.MilanObjectMapper
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.streaming.connectors.kinesis.serialization.KinesisDeserializationSchema

import scala.reflect.{ClassTag, classTag}


object JsonDeserializationSchema {
  private val objectMapper = new MilanObjectMapper()
}



class JsonDeserializationSchema[T: ClassTag] extends KinesisDeserializationSchema[T] with Serializable {
  override def deserialize(bytes: Array[Byte],
                           partitionKey: String,
                           seqNum: String,
                           approxArrivalTimestamp: Long,
                           stream: String,
                           shardId: String): T = {
    JsonDeserializationSchema.objectMapper.readValue[T](bytes, classTag[T].runtimeClass.asInstanceOf[Class[T]])
  }

  override def getProducedType: TypeInformation[T] = {
    TypeExtractor.getForClass(classTag[T].runtimeClass.asInstanceOf[Class[T]])
  }
}
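A minimal usage sketch of the schema above; a String payload keeps it self-contained, while in practice T would be an application record type that MilanObjectMapper can bind:

import com.amazon.milan.compiler.flink.serialization.JsonDeserializationSchema

object JsonDeserializationSchemaUsage {
  def main(args: Array[String]): Unit = {
    val schema = new JsonDeserializationSchema[String]

    // The Kinesis-specific arguments (partition key, sequence number, ...) are metadata only;
    // this schema ignores them and binds the payload with the Jackson object mapper.
    val value = schema.deserialize("\"hello\"".getBytes("UTF-8"), "partition-key", "seq-0", 0L, "stream-name", "shard-0")
    println(value)                  // hello
    println(schema.getProducedType) // String type information via TypeExtractor
  }
}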