org.apache.flink.api.common.typeinfo.TypeInformation Scala Examples

The following examples show how to use org.apache.flink.api.common.typeinfo.TypeInformation. Each example is taken from an open-source project; the project name and license are listed in the header above its code.
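Before the project-specific examples, a minimal orientation sketch (not taken from any of the projects below) showing the two common ways of obtaining a TypeInformation instance in the Scala API:

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala._

object TypeInformationBasics {
  // Derived at compile time by the Scala API macro that the wildcard import brings into scope.
  val tupleInfo: TypeInformation[(String, Int)] = createTypeInformation[(String, Int)]

  // Built reflectively from a Class token; falls back to generic serialization if analysis fails.
  val stringInfo: TypeInformation[String] = TypeInformation.of(classOf[String])
}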
Example 1
Source File: DataRow.scala    From flink-elasticsearch-source-connector   with Apache License 2.0
package com.mnubo.flink.streaming.connectors

import org.apache.commons.lang3.ClassUtils
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor

import scala.language.existentials

case class Value(v: Any, name: String, givenTypeInfo: Option[TypeInformation[_]] = None) {
  require(v != null || givenTypeInfo.isDefined, "You must pass a TypeInformation for null values")

  val typeInfo = givenTypeInfo match {
    case Some(ti) => ti
    case None => TypeExtractor.getForObject(v)
  }

  require(isAssignable(v, typeInfo.getTypeClass), s"data element '$v' is not compatible with class ${typeInfo.getTypeClass.getName}")

  private def isAssignable(value: Any, cl: Class[_]) = {
    if (value == null)
      // A null value is only compatible with reference types; never call getClass on it.
      classOf[AnyRef].isAssignableFrom(cl)
    else
      ClassUtils.isAssignable(value.getClass, cl)
  }
}

object Value {
  def apply(v: Any, name: String, givenTypeInfo: TypeInformation[_]) = {
    new Value(v, name, Some(givenTypeInfo))
  }
}



class DataRow(private [connectors] val data: Array[Any], private [connectors] val info: DataRowTypeInfo) extends Product with Serializable {
  require(data != null, "data must not be null")
  require(info != null, "info must not be null")
  require(data.length == info.getArity, "data must be of the correct arity")

  def apply[T](i: Int): T =
    data(i).asInstanceOf[T]

  def apply[T](fieldExpression: String): T =
    apply(info.getFieldIndex(fieldExpression))

  override def productElement(n: Int): Any =
    apply[AnyRef](n)

  override def productArity =
    info.getArity

  override def canEqual(that: Any) =
    that.isInstanceOf[DataRow]

  override def equals(that: Any) =
    canEqual(that) && data.sameElements(that.asInstanceOf[DataRow].data) && info.getFieldNames.sameElements(that.asInstanceOf[DataRow].info.getFieldNames)

  override def hashCode = {
    var result = 1

    for (element <- data)
      result = 31 * result + (if (element == null) 0 else element.hashCode)

    result
  }

  override def toString =
    info.getFieldNames
      .zip(data.map(v => if (v == null) "null" else v.toString))
      .map{case (name, value) => s"$name=$value"}
      .mkString("DataRow(", ", ", ")")
}

object DataRow {
  
  def apply(data: Value*): DataRow = {
    require(data != null, "data cannot be null")
    require(!data.contains(null), "data value cannot be null")

    new DataRow(
      data.map(_.v).toArray,
      new DataRowTypeInfo(
        data.map(_.name),
        data.map(_.typeInfo)
      )
    )
  }
} 
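A hedged usage sketch for the Value and DataRow API above; the field names and values here are invented for illustration:

import com.mnubo.flink.streaming.connectors.{DataRow, Value}
import org.apache.flink.api.common.typeinfo.TypeInformation

object DataRowUsage {
  def main(args: Array[String]): Unit = {
    val row = DataRow(
      Value(42, "id"),                                             // type info inferred via TypeExtractor
      Value("alice", "name"),
      Value(null, "nickname", TypeInformation.of(classOf[String])) // null values need explicit type info
    )

    println(row[String]("name")) // access by field name
    println(row[Int](0))         // access by position
    println(row)                 // DataRow(id=42, name=alice, nickname=null)
  }
}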
Example 2
Source File: FlinkSleepBlocker.scala    From flink-parameter-server   with Apache License 2.0
package hu.sztaki.ilab.ps.utils

import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._

object FlinkSleepBlocker {

  
  def block[T: TypeInformation](stream: DataStream[T], milliseconds: Long): DataStream[T] = {
    stream.forward.map(new RichMapFunction[T, T] {
      // Evaluated lazily, at most once per task instance: the first call to map() sleeps,
      // later calls see the already-initialized Unit value and pass records through immediately.
      @transient lazy val sleeper: Unit = {
        Thread.sleep(milliseconds)
        ()
      }

      override def map(value: T): T = {
        sleeper
        value
      }

    }).setParallelism(stream.parallelism)
  }

} 
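A brief usage sketch, with an invented stream and delay, showing how the blocker is meant to be applied:

import hu.sztaki.ilab.ps.utils.FlinkSleepBlocker
import org.apache.flink.streaming.api.scala._

object SleepBlockerUsage {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val numbers: DataStream[Int] = env.fromElements(1, 2, 3)

    // Each parallel mapper sleeps once, lazily, before forwarding its first element.
    val delayed: DataStream[Int] = FlinkSleepBlocker.block(numbers, milliseconds = 5000L)

    delayed.print()
    env.execute("sleep-blocker-example")
  }
}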
Example 3
Source File: DataSetMatcher.scala    From piglet   with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.engines._
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.backends.{SchemaClass => Event}
import dbis.piglet.cep.ops.MatchCollector
import org.apache.flink.api.common.typeinfo.TypeInformation
import dbis.piglet.cep.ops.SelectionStrategy
//import org.apache.flink.api.java.operators.CustomUnaryOperation
//import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.ListBuffer
//import org.apache.flink.api.java.DataSet
//import org.apache.flink.api.java.ExecutionEnvironment
import scala.collection.JavaConversions._
import org.apache.flink.api.scala._
import dbis.piglet.cep.ops.EngineConf

class DataSetMatcher[T <: Event: ClassTag: TypeInformation](input: DataSet[T], nfa: NFAController[T], flinkEnv: ExecutionEnvironment, sstr: SelectionStrategy = SelectionStrategy.FirstMatch, out: OutputStrategy = Combined) extends EngineConf[T](nfa, sstr) with java.io.Serializable {
  def compute(): DataSet[T] = {
    input.collect().foreach(event => engine.runEngine(event))
    flinkEnv.fromCollection(collector.convertEventsToArray().toSeq)
  }

} 
Example 4
Source File: CustomDataStreamMatcher.scala    From piglet   with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.backends.{SchemaClass => Event}
import org.apache.flink.api.common.typeinfo.TypeInformation
//import org.apache.flink.api.java.ExecutionEnvironment
//import org.apache.flink.api.java.DataSet
import scala.collection.JavaConversions._
import org.apache.flink.streaming.api.scala._

class CustomDataStreamMatcher[T <: Event: ClassTag: TypeInformation](@transient val dataStream: DataStream[T]) {

  def matchNFA(nfa: NFAController[T], flinkEnv: StreamExecutionEnvironment, sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined)  = {
    // println("create a new DataStream matcher")
    new DataStreamMatcher(dataStream, nfa, flinkEnv, sstr, out).compute()
  }

}

object CustomDataStreamMatcher {

  implicit def addDataSetMatcher[T <: Event: ClassTag: TypeInformation](@transient dataStream: DataStream[T]) = {
    // println("add a custom DataStream function")
    new CustomDataStreamMatcher(dataStream)
  }
} 
Example 5
Source File: CustomDataSetMatcher.scala    From piglet   with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.backends.{SchemaClass => Event}
import org.apache.flink.api.common.typeinfo.TypeInformation
//import org.apache.flink.api.java.ExecutionEnvironment
//import org.apache.flink.api.java.DataSet
import scala.collection.JavaConversions._
import org.apache.flink.api.scala._

class CustomDataSetMatcher[T <: Event: ClassTag: TypeInformation](dataSet: DataSet[T]) {

  def matchNFA(nfa: NFAController[T], sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined)  = {
    // println("create a new DataSet matcher")
    val flinkEnv = dataSet.getExecutionEnvironment
    new DataSetMatcher(dataSet, nfa, flinkEnv, sstr, out).compute()
  }

}

object CustomDataSetMatcher {

  implicit def addDataSetMatcher[T <: Event: ClassTag: TypeInformation](dataSet: DataSet[T]) = {
    // println("add a custom DataSet function")
    new CustomDataSetMatcher(dataSet)
  }
} 
Example 6
Source File: DataStreamMatcher.scala    From piglet   with Apache License 2.0
package dbis.piglet.cep.flink

import scala.reflect.ClassTag
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.engines._
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.backends.{SchemaClass => Event}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.windowing.windows.GlobalWindow
import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows
import dbis.piglet.cep.ops.MatchCollector
import dbis.piglet.cep.ops.SelectionStrategy
//import org.apache.flink.api.java.operators.CustomUnaryOperation
import scala.collection.mutable.ListBuffer
//import org.apache.flink.api.java.DataSet
//import org.apache.flink.api.java.ExecutionEnvironment
import scala.collection.JavaConversions._
import org.apache.flink.streaming.api.scala._
import dbis.piglet.cep.ops.EngineConf
import org.apache.flink.util.Collector


class DataStreamMatcher[T <: Event: ClassTag: TypeInformation](@transient val input: DataStream[T], nfa: NFAController[T], flinkEnv: StreamExecutionEnvironment, sstr: SelectionStrategy = SelectionStrategy.FirstMatch, out: OutputStrategy = Combined) extends EngineConf[T](nfa, sstr) with java.io.Serializable {
  object DataStreamProcess {
    def customRun(gw: GlobalWindow, ts: Iterable[T], out: Collector[T]) = {
      ts.foreach { event => engine.runEngine(event)}
      val result = collector.convertEventsToArray()
      result.foreach { res => out.collect(res) }
    }
  }
  def compute(): DataStream[T] = {
    input.windowAll(GlobalWindows.create()).apply(DataStreamProcess.customRun _)   
  }

} 
Example 7
Source File: UTF8StringSchema.scala    From piglet   with Apache License 2.0
package dbis.piglet.backends.flink.streaming

import org.apache.commons.lang3.SerializationUtils
import org.apache.flink.streaming.util.serialization._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor

class UTF8StringSchema extends DeserializationSchema[String] with SerializationSchema[String] {

  override def deserialize(message: Array[Byte]): String = {
    new String(message, "UTF-8")
  }   

  override def isEndOfStream(nextElement: String): Boolean = {
    false
  }   

  override def serialize(element: String): Array[Byte] = {
    element.getBytes("UTF-8")
  }   

  override def getProducedType(): TypeInformation[String] = {
    TypeExtractor.getForClass(classOf[String])
  }   
} 
Example 8
Source File: StreamFuncs.scala    From piglet   with Apache License 2.0
package dbis.piglet.backends.flink.streaming

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._
import scala.reflect.ClassTag
import dbis.piglet.backends._

class PigStream[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {

  def loadStream(env: StreamExecutionEnvironment, path: String, extract: (Array[String]) => T, delim: String = "\t"): DataStream[T] = {
    env.readTextFile(path).setParallelism(1).map(line => extract(line.split(delim, -1)))
  }

  def writeStream(path: String, result: DataStream[T], delim: String = ",") = result.map(_.mkString(delim)).writeAsText(path).setParallelism(1)

  def connect(env: StreamExecutionEnvironment, host: String, port: Int, extract: (Array[String]) => T, delim: String = "\t"): DataStream[T] = {
    env.socketTextStream(host,port).map(line => extract(line.split(delim, -1)))
  }

  def bind(host: String, port: Int, result: DataStream[T], delim: String = ",") = {
    result.map(_.mkString(delim) + "\n").writeToSocket(host, port, new UTF8StringSchema())
  }

  def zmqSubscribe(env: StreamExecutionEnvironment, addr: String, extract: (Array[String]) => T, delim: String = "\t"): DataStream[T] = {
    env.addSource(new ZmqSubscriber(addr)).map(line => extract(line.split(delim, -1)))
  }

  def zmqPublish(addr: String, result: DataStream[T], delim: String = ",") = {
    result.map(_.mkString(delim)).addSink(new ZmqPublisher(addr)).setParallelism(1)
  }
}

class TextLoader[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {
  def loadStream(env: StreamExecutionEnvironment, path: String, extract: (Array[String]) => T): DataStream[T] =
    env.readTextFile(path).map(line => extract(Array(line)))
}

object TextLoader extends java.io.Serializable {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): TextLoader[T] = {
    new TextLoader[T]
  }
}

object PigStream {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): PigStream[T] = {
    new PigStream
  }
}

class RDFStream[T <: SchemaClass: ClassTag: TypeInformation] extends java.io.Serializable {

  val pattern = "([^\"]\\S*|\".+?\")\\s*".r

  def rdfize(line: String): Array[String] = {
    val fields = pattern.findAllIn(line).map(_.trim)
    fields.toArray.slice(0, 3)
  }

  def loadStream(env: StreamExecutionEnvironment, path: String, extract: (Array[String]) => T): DataStream[T] = {
    env.readTextFile(path).map(line => extract(rdfize(line)))
  }

  def connect(env: StreamExecutionEnvironment, host: String, port: Int, extract: (Array[String]) => T): DataStream[T] = {
    env.socketTextStream(host,port).map(line => extract(rdfize(line)))
  }

 def zmqSubscribe(env: StreamExecutionEnvironment, addr: String, extract: (Array[String]) => T): DataStream[T] = {
    env.addSource(new ZmqSubscriber(addr)).map(line => extract(rdfize(line)))
  }

}

object RDFStream {
  def apply[T <: SchemaClass: ClassTag: TypeInformation](): RDFStream[T] = {
    new RDFStream
  }
} 
Example 9
Source File: Storage.scala    From piglet   with Apache License 2.0
package dbis.piglet.backends.flink

import dbis.piglet.backends._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala._
import org.apache.flink.core.fs.FileSystem.WriteMode._

import scala.reflect.ClassTag


//-----------------------------------------------------------------------------------------------------

class PigStorage[T <: SchemaClass :ClassTag: TypeInformation] extends java.io.Serializable {
  def load(env: ExecutionEnvironment, path: String,  extract: (Array[String]) => T, delim: String = "\t",
      skipFirstRow: Boolean = false, skipEmpty: Boolean = false, comments: String = ""): DataSet[T] = {
    
    val raw = env.readTextFile(path) 
    val nonEmpty = if(skipEmpty) raw.filter { line => line.nonEmpty } else raw
    val nonComment = if(comments.nonEmpty) nonEmpty.filter { line => !line.startsWith(comments) } else nonEmpty
    val content = if(skipFirstRow) {
      val header = nonComment.first(1).collect().head
      nonComment.filter { line => line != header }
    } else 
      nonComment
      
    
    content.map(line => line.split(delim, -1)).map(extract)
  }

  def write(path: String, result: DataSet[T], delim: String = ",") = result.map(_.mkString(delim)).writeAsText(path).setParallelism(1)
}

object PigStorage {
  def apply[T <: SchemaClass :ClassTag: TypeInformation](): PigStorage[T] = {
    new PigStorage[T]
  }
}


class RDFFileStorage[T:ClassTag: TypeInformation] extends java.io.Serializable {
  val pattern = "([^\"]\\S*|\".+?\")\\s*".r

  def rdfize(line: String): Array[String] = {
    val fields = pattern.findAllIn(line).map(_.trim)
    fields.toArray.slice(0, 3)
  }

  def load(env: ExecutionEnvironment, path: String, extract: (Array[String]) => T): DataSet[T] =
    env.readTextFile(path).map(line => extract(rdfize(line)))
}

object RDFFileStorage {
  def apply[T:ClassTag: TypeInformation](): RDFFileStorage[T] = {
    new RDFFileStorage[T]
  }
} 
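A hedged sketch of RDFFileStorage with a plain case class; the Triple type, the extraction function, and the file path are invented for illustration:

import dbis.piglet.backends.flink.RDFFileStorage
import org.apache.flink.api.scala._

case class Triple(subject: String, predicate: String, obj: String)

object RDFFileStorageUsage {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // rdfize() tokenizes each line into at most three fields, which are mapped to a Triple.
    val triples: DataSet[Triple] = RDFFileStorage[Triple]()
      .load(env, "file:///tmp/triples.nt", fields => Triple(fields(0), fields(1), fields(2)))

    triples.first(10).print()
  }
}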
Example 10
Source File: PigFuncs.scala    From piglet   with Apache License 2.0
package dbis.piglet.backends.flink

import java.util.Random

import dbis.piglet.CommonPigFuncs
import dbis.piglet.backends._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.functions._
import org.apache.flink.api.scala._

import scala.reflect.ClassTag

class CustomSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) {
  def sample(withReplacement: Boolean, fraction: Double, seed: Long = new Random().nextLong()) = {
    dataSet.mapPartition(new SampleWithFraction[T](withReplacement, fraction, seed))
  }

}

object Sampler {
  implicit def addSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) = {
    new CustomSampler(dataSet)
  }
}

object PigFuncs extends CommonPigFuncs {
} 
Example 11
Source File: package.scala    From featran   with Apache License 2.0
package com.spotify.featran

import com.esotericsoftware.kryo.serializers.JavaSerializer
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.scala.DataSet

import scala.reflect.ClassTag

package object flink {

  
  implicit object FlinkCollectionType extends CollectionType[DataSet] {
    // force fallback to default serializer
    private val Ti = TypeInformation.of(classOf[Any])

    override def map[A, B: ClassTag](ma: DataSet[A])(f: A => B): DataSet[B] = {
      implicit val tib = Ti.asInstanceOf[TypeInformation[B]]
      ma.map(f)
    }
    override def reduce[A](ma: DataSet[A])(f: (A, A) => A): DataSet[A] =
      ma.reduce(f)

    override def cross[A, B: ClassTag](ma: DataSet[A])(mb: DataSet[B]): DataSet[(A, B)] =
      ma.crossWithTiny(mb)

    override def pure[A, B: ClassTag](ma: DataSet[A])(b: B): DataSet[B] = {
      implicit val tib = Ti.asInstanceOf[TypeInformation[B]]
      val env = ma.getExecutionEnvironment
      // Kryo throws NPE on `Feature`, use Java serialization instead
      env.addDefaultKryoSerializer(classOf[FeatureSet[Any]], classOf[JavaSerializer])
      env.fromElements(b)
    }
  }
} 
Example 12
Source File: FlinkKafkaCodecSerde.scala    From cloudflow   with Apache License 2.0
package cloudflow.flink

import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.clients.consumer.ConsumerRecord

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.connectors.kafka._

import cloudflow.streamlets.{ CodecInlet, CodecOutlet }

private[flink] class FlinkKafkaCodecSerializationSchema[T: TypeInformation](outlet: CodecOutlet[T], topic: String)
    extends KafkaSerializationSchema[T] {
  override def serialize(value: T, timestamp: java.lang.Long): ProducerRecord[Array[Byte], Array[Byte]] =
    new ProducerRecord(topic, outlet.codec.encode(value))
}

private[flink] class FlinkKafkaCodecDeserializationSchema[T: TypeInformation](inlet: CodecInlet[T]) extends KafkaDeserializationSchema[T] {
  override def deserialize(record: ConsumerRecord[Array[Byte], Array[Byte]]): T = inlet.codec.decode(record.value)
  override def isEndOfStream(value: T): Boolean                                 = false
  override def getProducedType: TypeInformation[T]                              = implicitly[TypeInformation[T]]
} 
Example 13
Source File: FlinkStreamletContextImpl.scala    From cloudflow   with Apache License 2.0
package cloudflow.flink

import scala.collection.JavaConverters._

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.DataStreamSink
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka._

import com.typesafe.config._
import cloudflow.streamlets._
import java.{ util ⇒ ju }


  override def writeStream[Out: TypeInformation](outlet: CodecOutlet[Out], stream: DataStream[Out]): DataStreamSink[Out] = {

    val topic            = findTopicForPort(outlet)
    val destTopic        = topic.name
    val bootstrapServers = topic.bootstrapServers.getOrElse(internalKafkaBootstrapServers)

    val propsMap = Map("bootstrap.servers" -> bootstrapServers, "batch.size" -> "0") ++
          topic.kafkaProducerProperties

    val properties = new ju.Properties()
    properties.putAll(propsMap.asJava)

    stream.addSink(
      new FlinkKafkaProducer[Out](
        destTopic,
        new FlinkKafkaCodecSerializationSchema[Out](outlet, destTopic),
        properties,
        FlinkKafkaProducer.Semantic.AT_LEAST_ONCE
      )
    )
  }
} 
Example 14
Source File: TestFlinkStreamletContext.scala    From cloudflow   with Apache License 2.0
package cloudflow.flink
package testkit

import org.apache.flink.streaming.api.scala._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.functions.sink.SinkFunction
import org.apache.flink.streaming.api.datastream.DataStreamSink

import com.typesafe.config._
import cloudflow.streamlets._


  override def writeStream[Out: TypeInformation](outlet: CodecOutlet[Out], stream: DataStream[Out]): DataStreamSink[Out] =
    outletTaps
      .find(_.portName == outlet.name)
      .map { _ ⇒
        stream.addSink(new SinkFunction[Out]() {
          override def invoke(out: Out) =
            TestFlinkStreamletContext.result.add(out.toString())
        })
      }
      .getOrElse(throw TestContextException(outlet.name, s"Bad test context, could not find destination for outlet ${outlet.name}"))
}

object TestFlinkStreamletContext {
  val result = new java.util.concurrent.ConcurrentLinkedQueue[String]()
}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg) 
Example 15
Source File: EventDeSerializer.scala    From flink-demos   with Apache License 2.0
package com.dataartisans.flink.example.eventpattern.kafka

import java.nio.{ByteBuffer, ByteOrder}

import com.dataartisans.flink.example.eventpattern.Event
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.util.serialization.{DeserializationSchema, SerializationSchema}


class EventDeSerializer extends DeserializationSchema[Event] with SerializationSchema[Event] {
  
  override def deserialize(bytes: Array[Byte]): Event = {
    val buffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN)
    val address: Int = buffer.getInt(0)
    val eventType: Int = buffer.getInt(4)
    Event(address, eventType)
  }

  override def serialize(t: Event): Array[Byte] = {
    val byteBuffer = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN)
    byteBuffer.putInt(0, t.sourceAddress)
    byteBuffer.putInt(4, t.event)
    byteBuffer.array()
  }

  override def isEndOfStream(t: Event): Boolean = false

  override def getProducedType: TypeInformation[Event] = {
    createTypeInformation[Event]
  }
} 
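A self-contained round-trip check of the schema above, not tied to Kafka; it assumes Event is the simple two-field (source address, event type) case class that serialize() reads from:

import com.dataartisans.flink.example.eventpattern.Event
import com.dataartisans.flink.example.eventpattern.kafka.EventDeSerializer

object EventDeSerializerRoundTrip {
  def main(args: Array[String]): Unit = {
    val schema = new EventDeSerializer

    val original = Event(42, 7)            // sourceAddress = 42, event type = 7
    val bytes = schema.serialize(original) // 8 little-endian bytes: address, then event type
    val restored = schema.deserialize(bytes)

    assert(restored == original)
    println(restored)
  }
}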
Example 16
Source File: DataRowRecordMarshaller.scala    From flink-elasticsearch-source-connector   with Apache License 2.0
package com.mnubo.flink.streaming.connectors

import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor

@SerialVersionUID(1L)
class DataRowRecordMarshaller extends RecordMarshaller[DataRow] {
  private var typeInfo: DataRowTypeInfo = null

  @transient
  private lazy val serializer = typeInfo
    .createSerializer(new ExecutionConfig)
    .asInstanceOf[DataRowSerializer]

  override def typeInformation =
    typeInfo

  override def configureFields(types: Seq[MarshallerFieldDescriptor]) = {
    val (fieldNames, typeInfos): (Seq[String], Seq[TypeInformation[_]]) =
      types
        .map { tp =>
          tp.fieldName -> TypeExtractor.createTypeInfo(tp.fieldClass)
        }
        .unzip

    typeInfo = new DataRowTypeInfo(fieldNames, typeInfos)
  }

  override def createOrReuseInstance(fields: Seq[AnyRef], reuse: DataRow): DataRow =
    serializer.createOrReuseInstance(fields.toArray[AnyRef], reuse)
} 
Example 17
Source File: FlinkTestKits.scala    From flink-jpmml   with GNU Affero General Public License v3.0
package io.radicalbit.flink.pmml.scala.utils

import io.radicalbit.flink.pmml.scala.sources.TemporizedSourceFunction
import io.radicalbit.flink.streaming.spec.core.FlinkTestKitCompanion
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.functions.sink.SinkFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.test.util.AbstractTestBase

import scala.collection.mutable
import scala.reflect.ClassTag

trait FlinkSourcedPipelineTestKit[IN1, IN2, OUT] extends AbstractTestBase {

  def executePipeline[IN1: TypeInformation: ClassTag, IN2: TypeInformation: ClassTag](
      in1: Seq[(Long, IN1)],
      in2: Seq[(Long, IN2)])(pipeline: (DataStream[IN1], DataStream[IN2]) => DataStream[OUT])(
      implicit companion: FlinkTestKitCompanion[OUT]) = {

    companion.testResults = mutable.MutableList[OUT]()

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val events = in1
      .union(in2)
      .sortBy(_._1)
      .collect {
        case (_, left: IN1) => (Some(left), None)
        case (_, right: IN2) => (None, Some(right))
      }

    val stream = env.addSource(new TemporizedSourceFunction[IN1, IN2](events))

    val stream1: DataStream[IN1] = stream.filter(either => either.isLeft).map(either => either.left.get)
    val stream2: DataStream[IN2] = stream.filter(either => either.isRight).map(either => either.right.get)

    pipeline(stream1, stream2)
      .addSink(new SinkFunction[OUT] {
        override def invoke(in: OUT) = {
          companion.testResults += in
        }
      })

    env.execute(this.getClass.getSimpleName)

    companion.testResults
  }

} 
Example 18
Source File: ElasticsearchDataset.scala    From flink-elasticsearch-source-connector   with Apache License 2.0
package com.mnubo.flink.streaming.connectors.elasticsearch

import com.mnubo.flink.streaming.connectors._
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.operators.DataSource
import org.apache.flink.api.java.typeutils.{PojoTypeInfo, TupleTypeInfoBase}
import org.apache.flink.api.scala._

import scala.reflect.ClassTag

object ElasticsearchDataset {
  
  def fromElasticsearchQuery[T : ClassTag: TypeInformation](env: ExecutionEnvironment,
                                                            index: String,
                                                            query: String,
                                                            nodes: Set[String] = Set("localhost"),
                                                            port: Int = 9200,
                                                            pojoFields: Array[String] = null): DataSet[T] = {
    val clazz =
      implicitly[ClassTag[T]].runtimeClass

    val marshaller =
      if (clazz == classOf[DataRow])
        new DataRowRecordMarshaller().asInstanceOf[RecordMarshaller[T]]
      else
        implicitly[TypeInformation[T]] match {
          case info: TupleTypeInfoBase[T] =>
            new TupleRecordMarshaller[T](info)
          case info: PojoTypeInfo[T] =>
            require(pojoFields != null, "POJO fields must be specified (not null) if output type is a POJO.")
            new PojoRecordMarshaller[T](info, pojoFields)
          case other =>
            throw new IllegalArgumentException(s"The type ${clazz.getName} has to be a tuple, a DataRow or pojo type.")
        }

    val inputFormat =
      new ElasticseachInputFormat[T](
        nodes,
        port,
        index,
        query,
        marshaller
      )

    // Not the most elegant, but can't wait for the input format to be configured to get the actual schema. Have to get it now.
    val schema = inputFormat.fetchSchema()
    marshaller.configureFields(schema)

    new DataSet[T](new DataSource[T](env.getJavaEnv, inputFormat, marshaller.typeInformation, getCallLocationName()))
  }


} 
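A hedged usage sketch; the index name, query, and hosts are placeholders, and the implicit TypeInformation[DataRow] is assumed to come from the org.apache.flink.api.scala wildcard import:

import com.mnubo.flink.streaming.connectors.DataRow
import com.mnubo.flink.streaming.connectors.elasticsearch.ElasticsearchDataset
import org.apache.flink.api.scala._

object ElasticsearchDatasetUsage {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // Every document matching the query becomes a DataRow whose schema is fetched from the index.
    val rows: DataSet[DataRow] = ElasticsearchDataset.fromElasticsearchQuery[DataRow](
      env,
      index = "events",
      query = """{"query": {"match_all": {}}}""",
      nodes = Set("localhost"),
      port  = 9200
    )

    rows.first(10).print()
  }
}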
Example 19
Source File: RecordTransformer.scala    From flink-elasticsearch-source-connector   with Apache License 2.0
package com.mnubo.flink.streaming.connectors

import org.apache.flink.api.common.operators.Keys.ExpressionKeys._
import org.apache.flink.api.common.typeinfo.TypeInformation

import scala.annotation.tailrec
import scala.language.existentials
import scala.reflect.ClassTag

sealed trait FieldSpecification extends Serializable

case class ExistingField(name: String) extends FieldSpecification

case class NewField(name: String, typeInfo: TypeInformation[_]) extends FieldSpecification

trait RecordTransformer extends Serializable {
  val classTag = ClassTag[DataRow](classOf[DataRow])
  def typeInfo : DataRowTypeInfo
  def transform(dataRow: DataRow, values:Any*) : DataRow
}

class FieldMapperRecordTransformer private[connectors](srcTypeInfo:DataRowTypeInfo, fieldSpecifications: FieldSpecification*) extends RecordTransformer {
  require(srcTypeInfo != null, s"srcTypeInfo must not be null")
  require(fieldSpecifications != null, s"fieldSpecifications must not be null")
  require(fieldSpecifications.nonEmpty, s"fieldSpecifications must not be empty")
  require(!fieldSpecifications.contains(null), s"fieldSpecifications must not contain any nulls")

  override val typeInfo = {
    val (fieldNames, elementTypes) = fieldSpecifications.flatMap {
      case ExistingField(name) if name == SELECT_ALL_CHAR || name == SELECT_ALL_CHAR_SCALA => srcTypeInfo.getFieldNames.zip(srcTypeInfo.getElementTypes)
      case ExistingField(name) => Seq(name -> srcTypeInfo.getFieldType(name))
      case NewField(name, newFieldTypeInfo) => Seq(name -> newFieldTypeInfo)
    }.unzip
    require(fieldNames.length == fieldNames.distinct.length, s"Fields can't have duplicates. Fields were $fieldNames.")
    new DataRowTypeInfo(fieldNames, elementTypes)
  }

  private def newFieldsNames = fieldSpecifications.collect{ case newValue: NewField => newValue.name }

  override def transform(dataRow: DataRow, values:Any*) : DataRow = {
    require(dataRow != null, s"dataRow must not be null")
    require(values != null, s"values must not be null")
    require(newFieldsNames.length == values.length, s"Must specify values for all new fields and only new fields. New fields are '$newFieldsNames'")

    val resultValues = new Array[Any](typeInfo.getArity)
    @tailrec
    def transform(index:Int, remainingSpecs: Seq[FieldSpecification], remainingValues:Seq[Any]) : DataRow = {
      if(remainingSpecs.isEmpty) {
        new DataRow(resultValues, typeInfo)
      } else {
        val currentSpec = remainingSpecs.head
        currentSpec match {
          case ExistingField(name) if name == SELECT_ALL_CHAR || name == SELECT_ALL_CHAR_SCALA =>
            Array.copy(dataRow.data, 0, resultValues, index, dataRow.data.length)
            transform(index + dataRow.data.length, remainingSpecs.tail, remainingValues)
          case ExistingField(name) =>
            resultValues(index) = dataRow(name)
            transform(index + 1, remainingSpecs.tail, remainingValues)
          case NewField(name, _) =>
            resultValues(index) = remainingValues.head
            transform(index + 1, remainingSpecs.tail, remainingValues.tail)
        }
      }
    }
    transform(0, fieldSpecifications, values)
  }
}

object RecordTransformer {
  def mapFields(srcTypeInfo: DataRowTypeInfo, fieldSpecifications: FieldSpecification*) : RecordTransformer = {
    new FieldMapperRecordTransformer(srcTypeInfo, fieldSpecifications:_*)
  }
} 
Example 20
Source File: package.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import com.amazon.milan.compiler.flink.runtime.{UnwrapRecordsMapFunction, WrapRecordsMapFunction}
import com.amazon.milan.compiler.flink.testing.IntKeyValueRecord
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.common.typeutils.TypeSerializer
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.core.memory.{DataInputView, DataInputViewStreamWrapper, DataOutputView, DataOutputViewStreamWrapper}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.datastream.DataStream
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment

import scala.language.implicitConversions
import scala.util.Random


package object testutil {
  def getTestExecutionEnvironment: StreamExecutionEnvironment = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setBufferTimeout(0)
    env
  }

  def copyWithSerializer[T](value: T, serializer: TypeSerializer[T]): T = {
    val outputStream = new ByteArrayOutputStream()
    val outputView = new DataOutputViewStreamWrapper(outputStream)
    serializer.serialize(value, outputView)

    val bytes = outputStream.toByteArray
    val inputStream = new ByteArrayInputStream(bytes)
    val inputView = new DataInputViewStreamWrapper(inputStream)
    serializer.deserialize(inputView)
  }

  def copyData[T](writeValue: DataOutputView => Unit, readValue: DataInputView => T): T = {
    val outputStream = new ByteArrayOutputStream()
    val outputView = new DataOutputViewStreamWrapper(outputStream)
    writeValue(outputView)

    val bytes = outputStream.toByteArray
    val inputStream = new ByteArrayInputStream(bytes)
    val inputView = new DataInputViewStreamWrapper(inputStream)
    readValue(inputView)
  }

  def generateIntKeyValueRecords(recordCount: Int, keyCount: Int, maxValue: Int): List[IntKeyValueRecord] = {
    val rand = new Random(0)
    List.tabulate(recordCount)(_ => IntKeyValueRecord(rand.nextInt(keyCount), rand.nextInt(maxValue + 1)))
  }

  implicit class WrappedDataStreamExtensions[T >: Null, TKey >: Null <: Product](dataStream: DataStream[RecordWrapper[T, TKey]]) {
    def unwrap(recordTypeInformation: TypeInformation[T]): DataStream[T] = {
      val mapper = new UnwrapRecordsMapFunction[T, TKey](recordTypeInformation)
      this.dataStream.map(mapper)
    }

    def unwrap(): DataStream[T] = {
      val recordType = this.dataStream.getType.asInstanceOf[RecordWrapperTypeInformation[T, TKey]].valueTypeInformation
      this.unwrap(recordType)
    }
  }

  implicit class DataStreamExtensions[T >: Null](dataStream: DataStream[T]) {
    def wrap(recordTypeInformation: TypeInformation[T]): DataStream[RecordWrapper[T, Product]] = {
      val mapper = new WrapRecordsMapFunction[T](recordTypeInformation)
      this.dataStream.map(mapper)
    }

    def wrap(): DataStream[RecordWrapper[T, Product]] = {
      val recordType = this.dataStream.asInstanceOf[ResultTypeQueryable[T]].getProducedType
      this.wrap(recordType)
    }
  }

} 
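A small sketch of copyWithSerializer from the package object above, round-tripping a value through a Flink TypeSerializer (the String type and value are illustrative):

import com.amazon.milan.compiler.flink.testutil.copyWithSerializer
import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation

object CopyWithSerializerUsage {
  def main(args: Array[String]): Unit = {
    val serializer = TypeInformation.of(classOf[String]).createSerializer(new ExecutionConfig())

    // Serializes to a byte array and deserializes again, yielding an equal copy.
    val copy = copyWithSerializer("hello", serializer)
    println(copy)
  }
}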
Example 21
Source File: MapFunctions.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.internal.LineageRecordFactory
import com.amazon.milan.compiler.flink.metrics.MetricFactory
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import com.amazon.milan.types.LineageRecord
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.util.OutputTag


object MapFunctions {
  val ProcessedRecordsCounterMetricName = "processed_record_count"
}

import com.amazon.milan.compiler.flink.runtime.MapFunctions._


abstract class MapFunctionWithLineage[TIn >: Null, TKey >: Null <: Product, TOut >: Null](outputTypeInformation: TypeInformation[TOut],
                                                                                          keyTypeInformation: TypeInformation[TKey],
                                                                                          lineageRecordFactory: LineageRecordFactory,
                                                                                          lineageOutputTag: OutputTag[LineageRecord],
                                                                                          metricFactory: MetricFactory)
  extends RichMapFunction[RecordWrapper[TIn, TKey], RecordWrapper[TOut, TKey]]
    with ResultTypeQueryable[RecordWrapper[TOut, TKey]] {

  @transient private lazy val processedRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, ProcessedRecordsCounterMetricName)

  protected def mapValue(in: TIn): TOut

  override def map(record: RecordWrapper[TIn, TKey]): RecordWrapper[TOut, TKey] = {
    this.processedRecordsCounter.increment()
    val mappedValue = this.mapValue(record.value)
    RecordWrapper.wrap(mappedValue, record.key, record.sequenceNumber)
  }

  override def getProducedType: TypeInformation[RecordWrapper[TOut, TKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInformation, this.keyTypeInformation)
}


abstract class KeyedMapFunctionWithLineage[TIn >: Null, TInKey >: Null <: Product, TKey, TOut >: Null](outputTypeInfo: TypeInformation[TOut],
                                                                                                       keyTypeInfo: TypeInformation[TInKey],
                                                                                                       lineageRecordFactory: LineageRecordFactory,
                                                                                                       lineageOutputTag: OutputTag[LineageRecord],
                                                                                                       metricFactory: MetricFactory)
  extends RichMapFunction[RecordWrapper[TIn, TInKey], RecordWrapper[TOut, TInKey]]
    with ResultTypeQueryable[RecordWrapper[TOut, TInKey]] {

  @transient private lazy val processedRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, ProcessedRecordsCounterMetricName)

  
  protected def getKey(recordKey: TInKey): TKey

  protected def mapValue(key: TKey, value: TIn): TOut

  override def map(record: RecordWrapper[TIn, TInKey]): RecordWrapper[TOut, TInKey] = {
    this.processedRecordsCounter.increment()
    val key = this.getKey(record.key)
    val mappedValue = this.mapValue(key, record.value)
    RecordWrapper.wrap(mappedValue, record.key, record.sequenceNumber)
  }

  override def getProducedType: TypeInformation[RecordWrapper[TOut, TInKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInfo, this.keyTypeInfo)
} 
Example 22
Source File: ArgCompareProcessFunctions.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.TypeUtil
import org.apache.flink.api.common.typeinfo.TypeInformation


abstract class ArgCompareKeyedProcessFunction[T >: Null, TKey >: Null <: Product, TArg](recordTypeInformation: TypeInformation[T],
                                                                                        keyTypeInformation: TypeInformation[TKey],
                                                                                        argTypeInformation: TypeInformation[TArg])
  extends ScanKeyedProcessFunction[T, TKey, Option[TArg], T](None, keyTypeInformation, TypeUtil.createOptionTypeInfo(argTypeInformation), recordTypeInformation) {

  protected def getArg(value: T): TArg

  protected def greaterThan(arg1: TArg, arg2: TArg): Boolean

  override protected def process(state: Option[TArg], key: TKey, value: T): (Option[TArg], Option[T]) = {
    val valueArg = this.getArg(value)

    state match {
      case None =>
        (Some(valueArg), Some(value))

      case Some(stateArg) =>
        if (this.greaterThan(valueArg, stateArg)) {
          (Some(valueArg), Some(value))
        }
        else {
          (state, None)
        }
    }
  }
}


abstract class ArgCompareProcessFunction[T >: Null, TKey >: Null <: Product, TArg](recordTypeInformation: TypeInformation[T],
                                                                                   keyTypeInformation: TypeInformation[TKey],
                                                                                   argTypeInformation: TypeInformation[TArg])
  extends ScanProcessFunction[T, TKey, Option[TArg], T](None, keyTypeInformation, TypeUtil.createOptionTypeInfo(argTypeInformation), recordTypeInformation) {

  protected def getArg(value: T): TArg

  protected def greaterThan(arg1: TArg, arg2: TArg): Boolean

  override protected def process(state: Option[TArg], value: T): (Option[TArg], Option[T]) = {
    val valueArg = this.getArg(value)

    state match {
      case None =>
        (Some(valueArg), Some(value))

      case Some(stateArg) =>
        if (this.greaterThan(valueArg, stateArg)) {
          (Some(valueArg), Some(value))
        }
        else {
          (state, None)
        }
    }
  }
} 
Example 23
Source File: KinesisDataSource.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import java.util.Properties

import com.amazon.milan.dataformats.DataInputFormat
import com.amazon.milan.serialization.MilanObjectMapper
import com.typesafe.scalalogging.Logger
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.DataStreamSource
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kinesis.FlinkKinesisConsumer
import org.apache.flink.streaming.connectors.kinesis.config.{AWSConfigConstants, ConsumerConfigConstants}
import org.apache.flink.streaming.connectors.kinesis.serialization.KinesisDeserializationSchema
import org.slf4j.LoggerFactory


object KinesisDataSource {
  private lazy val logger = Logger(LoggerFactory.getLogger(getClass))

  def addDataSource[T](env: StreamExecutionEnvironment,
                       streamName: String,
                       region: String,
                       dataFormat: DataInputFormat[T],
                       recordTypeInformation: TypeInformation[T]): DataStreamSource[T] = {
    this.logger.info(s"Creating Kinesis consumer for stream '$streamName', region '$region'.")

    val config = this.getConsumerProperties(region)
    val schema = new JsonDeserializationSchema[T](recordTypeInformation)

    val source = new FlinkKinesisConsumer[T](streamName, schema, config)
    env.addSource(source)
  }

  private def getConsumerProperties(region: String): Properties = {
    val config = new Properties()

    config.setProperty(AWSConfigConstants.AWS_REGION, region)
    config.setProperty(AWSConfigConstants.AWS_CREDENTIALS_PROVIDER, AWSConfigConstants.CredentialProvider.AUTO.toString)
    config.setProperty(ConsumerConfigConstants.STREAM_INITIAL_POSITION, ConsumerConfigConstants.InitialPosition.LATEST.toString)

    config
  }
}



class JsonDeserializationSchema[T](recordTypeInformation: TypeInformation[T])
  extends KinesisDeserializationSchema[T] {

  override def deserialize(bytes: Array[Byte],
                           partitionKey: String,
                           seqNum: String,
                           approxArrivalTimestamp: Long,
                           stream: String,
                           shardId: String): T = {
    MilanObjectMapper.readValue[T](bytes, this.recordTypeInformation.getTypeClass)
  }

  override def getProducedType: TypeInformation[T] =
    this.recordTypeInformation
} 
Example 24
Source File: RuntimeUtil.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import java.util

import com.amazon.milan.compiler.flink.generator.FlinkGeneratorException
import com.amazon.milan.serialization.MilanObjectMapper
import com.fasterxml.jackson.databind.`type`.TypeFactory
import org.apache.flink.api.common.typeinfo.TypeInformation

import scala.collection.JavaConverters._
import scala.reflect.{ClassTag, classTag}


object RuntimeUtil {
  val typeName: String = getClass.getTypeName.stripSuffix("$")

  def loadJsonList[TElement: ClassTag](listJson: String): List[TElement] = {
    this.loadJsonArrayList[TElement](listJson).asScala.toList
  }

  def loadJsonArrayList[TElement: ClassTag](listJson: String): util.ArrayList[TElement] = {
    val typeFactory = TypeFactory.defaultInstance()
    val itemClass = classTag[TElement].runtimeClass.asInstanceOf[Class[TElement]]
    val javaType = typeFactory.constructCollectionType(classOf[util.ArrayList[TElement]], itemClass)
    MilanObjectMapper.readValue[util.ArrayList[TElement]](listJson, javaType)
  }

  def preventGenericTypeInformation[T](typeInfo: TypeInformation[T]): TypeInformation[T] = {
    if (typeInfo.getClass.getName.contains("__wrapper")) {
      throw new FlinkGeneratorException(s"Creating TypeInformation for '${typeInfo.getTypeClass.getName}' produced a GenericTypeInformation.")
    }

    typeInfo
  }
} 
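A short, hedged sketch of RuntimeUtil.loadJsonList; the JSON payload is illustrative and assumes MilanObjectMapper can bind it to the element class:

import com.amazon.milan.compiler.flink.runtime.RuntimeUtil

object RuntimeUtilUsage {
  def main(args: Array[String]): Unit = {
    val json = """["alpha", "beta", "gamma"]"""

    // Deserializes the JSON array into a typed List using the element's ClassTag.
    val values: List[String] = RuntimeUtil.loadJsonList[String](json)

    assert(values == List("alpha", "beta", "gamma"))
    println(values)
  }
}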
Example 25
Source File: LeftJoinKeyedCoProcessFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.internal.JoinLineageRecordFactory
import com.amazon.milan.compiler.flink.metrics.MetricFactory
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import com.amazon.milan.types.LineageRecord
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.co.KeyedCoProcessFunction
import org.apache.flink.util.{Collector, OutputTag}


object LeftJoinCoProcessFunction {
  val LeftInputRecordsCounterMetricName = "left_input_record_count"
  val RightInputRecordsCounterMetricName = "right_input_record_count"
  val OutputRecordsCounterMetricName = "output_record_count"
}

import com.amazon.milan.compiler.flink.runtime.LeftJoinCoProcessFunction._


abstract class LeftJoinKeyedCoProcessFunction[TLeft >: Null, TRight >: Null, TKey >: Null <: Product, TOut >: Null](rightTypeInformation: TypeInformation[TRight],
                                                                                                                    keyTypeInformation: TypeInformation[TKey],
                                                                                                                    outputTypeInformation: TypeInformation[TOut],
                                                                                                                    leftRecordIdExtractor: RecordIdExtractor[TLeft],
                                                                                                                    rightRecordIdExtractor: RecordIdExtractor[TRight],
                                                                                                                    outputRecordIdExtractor: RecordIdExtractor[TOut],
                                                                                                                    lineageRecordFactory: JoinLineageRecordFactory,
                                                                                                                    lineageOutputTag: OutputTag[LineageRecord],
                                                                                                                    metricFactory: MetricFactory)
  extends KeyedCoProcessFunction[TKey, RecordWrapper[TLeft, TKey], RecordWrapper[TRight, TKey], RecordWrapper[TOut, TKey]]
    with ResultTypeQueryable[RecordWrapper[TOut, TKey]] {

  @transient private lazy val canProduceLineage = leftRecordIdExtractor.canExtractRecordId && rightRecordIdExtractor.canExtractRecordId && outputRecordIdExtractor.canExtractRecordId
  @transient private lazy val leftInputRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, LeftInputRecordsCounterMetricName)
  @transient private lazy val rightInputRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, RightInputRecordsCounterMetricName)
  @transient private lazy val outputRecordsCounter = this.metricFactory.createCounter(this.getRuntimeContext, OutputRecordsCounterMetricName)

  @transient private var lastRightValue: ValueState[TRight] = _

  protected def map(left: TLeft, right: TRight): TOut

  protected def postCondition(left: TLeft, right: TRight): Boolean

  override def processElement1(leftRecord: RecordWrapper[TLeft, TKey],
                               context: KeyedCoProcessFunction[TKey, RecordWrapper[TLeft, TKey], RecordWrapper[TRight, TKey], RecordWrapper[TOut, TKey]]#Context,
                               collector: Collector[RecordWrapper[TOut, TKey]]): Unit = {
    this.leftInputRecordsCounter.increment()

    val leftValue = leftRecord.value
    val rightValue = this.lastRightValue.value()

    if (this.postCondition(leftValue, rightValue)) {
      val output = this.map(leftValue, rightValue)

      if (output != null) {
        if (this.canProduceLineage) {
          val lineageRecord = this.createLineageRecord(this.outputRecordIdExtractor(output), leftValue, rightValue)
          context.output(this.lineageOutputTag, lineageRecord)
        }

        collector.collect(RecordWrapper.wrap[TOut, TKey](output, leftRecord.key, 0))
        this.outputRecordsCounter.increment()
      }
    }
  }

  override def processElement2(rightRecord: RecordWrapper[TRight, TKey],
                               context: KeyedCoProcessFunction[TKey, RecordWrapper[TLeft, TKey], RecordWrapper[TRight, TKey], RecordWrapper[TOut, TKey]]#Context,
                               collector: Collector[RecordWrapper[TOut, TKey]]): Unit = {
    this.rightInputRecordsCounter.increment()
    this.lastRightValue.update(rightRecord.value)
  }

  override def open(parameters: Configuration): Unit = {
    val rightValueDescriptor = new ValueStateDescriptor[TRight]("lastRightValue", this.rightTypeInformation)
    this.lastRightValue = this.getRuntimeContext.getState(rightValueDescriptor)
  }

  override def getProducedType: TypeInformation[RecordWrapper[TOut, TKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInformation, this.keyTypeInformation)

  private def createLineageRecord(outputRecordId: String, leftRecord: TLeft, rightRecord: TRight): LineageRecord = {
    val sourceRecords =
      Option(leftRecord).toSeq.map(r => this.lineageRecordFactory.createLeftRecordPointer(this.leftRecordIdExtractor(r))) ++
        Option(rightRecord).toSeq.map(r => this.lineageRecordFactory.createRightRecordPointer(this.rightRecordIdExtractor(r)))

    this.lineageRecordFactory.createLineageRecord(outputRecordId, sourceRecords)
  }
} 
Example 26
Source File: TimeWindowFlatMapProcessWindowFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import java.lang
import java.time.Instant

import com.amazon.milan.compiler.flink.TypeUtil
import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector


abstract class TimeWindowFlatMapProcessWindowFunction[T >: Null, TInKey >: Null <: Product, TOutKey >: Null <: Product](recordTypeInfo: TypeInformation[T],
                                                                                                                        outKeyTypeInfo: TypeInformation[TOutKey])
  extends ProcessWindowFunction[RecordWrapper[Option[T], TInKey], RecordWrapper[Option[T], TOutKey], TInKey, TimeWindow]
    with ResultTypeQueryable[RecordWrapper[Option[T], TOutKey]] {

  @transient private var sequenceNumberHelper: SequenceNumberHelper = _

  protected def addWindowStartTimeToKey(key: TInKey, windowStart: Instant): TOutKey

  override def getProducedType: TypeInformation[RecordWrapper[Option[T], TOutKey]] =
    RecordWrapperTypeInformation.wrap(TypeUtil.createOptionTypeInfo(this.recordTypeInfo), this.outKeyTypeInfo)

  override def process(key: TInKey,
                       context: ProcessWindowFunction[RecordWrapper[Option[T], TInKey], RecordWrapper[Option[T], TOutKey], TInKey, TimeWindow]#Context,
                       items: lang.Iterable[RecordWrapper[Option[T], TInKey]],
                       collector: Collector[RecordWrapper[Option[T], TOutKey]]): Unit = {
    val windowStartTime = Instant.ofEpochMilli(context.window().getStart)

    val record = items.iterator().next()
    val outKey = this.addWindowStartTimeToKey(record.key, windowStartTime)
    val outRecord = RecordWrapper.wrap(record.value, outKey, sequenceNumberHelper.increment())
    collector.collect(outRecord)
  }

  override def open(parameters: Configuration): Unit = {
    this.sequenceNumberHelper = new SequenceNumberHelper(this.getRuntimeContext)
  }
} 
Example 27
Source File: UnpackOptionProcessFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import com.typesafe.scalalogging.Logger
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.util.Collector
import org.slf4j.LoggerFactory


class UnpackOptionProcessFunction[T >: Null, TKey >: Null <: Product](recordType: TypeInformation[T],
                                                                      keyType: TypeInformation[TKey])
  extends ProcessFunction[RecordWrapper[Option[T], TKey], RecordWrapper[T, TKey]]
    with ResultTypeQueryable[RecordWrapper[T, TKey]] {

  @transient private lazy val logger = Logger(LoggerFactory.getLogger(getClass))

  override def processElement(record: RecordWrapper[Option[T], TKey],
                              context: ProcessFunction[RecordWrapper[Option[T], TKey], RecordWrapper[T, TKey]]#Context,
                              collector: Collector[RecordWrapper[T, TKey]]): Unit = {
    if (record.value.isDefined) {
      collector.collect(RecordWrapper.wrap(record.value.get, record.key, record.sequenceNumber))
    }
  }

  override def getProducedType: TypeInformation[RecordWrapper[T, TKey]] =
    RecordWrapperTypeInformation.wrap(this.recordType, this.keyType)
} 
Example 28
Source File: ArrayRecordToTupleMapFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.types.{ArrayRecord, RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.functions.MapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable


abstract class ArrayRecordToTupleMapFunction[T >: Null, TKey >: Null <: Product](outputTypeInformation: TypeInformation[T],
                                                                                 keyTypeInformation: TypeInformation[TKey])
  extends MapFunction[RecordWrapper[ArrayRecord, TKey], RecordWrapper[T, TKey]]
    with ResultTypeQueryable[RecordWrapper[T, TKey]] {

  protected def getTuple(record: ArrayRecord): T

  override def map(record: RecordWrapper[ArrayRecord, TKey]): RecordWrapper[T, TKey] = {
    val tupleValue = this.getTuple(record.value)
    RecordWrapper.wrap[T, TKey](tupleValue, record.key, 0)
  }

  override def getProducedType: TypeInformation[RecordWrapper[T, TKey]] =
    RecordWrapperTypeInformation.wrap(this.outputTypeInformation, this.keyTypeInformation)
} 
Example 29
Source File: IdentityFlatMapFunction.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.compiler.flink.types.{RecordWrapper, RecordWrapperTypeInformation}
import org.apache.flink.api.common.functions.FlatMapFunction
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.ResultTypeQueryable
import org.apache.flink.util.Collector



class IdentityFlatMapFunction[T >: Null, TKey >: Null <: Product](recordTypeInformation: TypeInformation[T],
                                                                  keyTypeInformation: TypeInformation[TKey])
  extends FlatMapFunction[RecordWrapper[T, TKey], RecordWrapper[T, TKey]]
    with ResultTypeQueryable[RecordWrapper[T, TKey]] {

  override def flatMap(record: RecordWrapper[T, TKey], collector: Collector[RecordWrapper[T, TKey]]): Unit = {
    collector.collect(record)
  }

  override def getProducedType: TypeInformation[RecordWrapper[T, TKey]] =
    RecordWrapperTypeInformation.wrap(this.recordTypeInformation, this.keyTypeInformation)
} 
Example 30
Source File: DataSourceUtil.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.runtime

import com.amazon.milan.application.sources.FileDataSource
import com.amazon.milan.dataformats.DataInputFormat
import com.amazon.milan.compiler.flink.types.{ByteArrayDataFormatFlatMapFunction, ByteArrayInputFormat, ByteArrayRecordTypeInformation}
import com.typesafe.scalalogging.Logger
import org.apache.flink.api.common.io.FilePathFilter
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.datastream.{DataStreamSource, SingleOutputStreamOperator}
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.api.functions.source.FileProcessingMode
import org.slf4j.LoggerFactory

import scala.collection.JavaConverters._


object DataSourceUtil {
  private lazy val logger = Logger(LoggerFactory.getLogger(getClass))

  def addFileDataSource[T](env: StreamExecutionEnvironment,
                           path: String,
                           dataFormat: DataInputFormat[T],
                           configuration: FileDataSource.Configuration,
                           recordTypeInformation: TypeInformation[T]): SingleOutputStreamOperator[T] = {
    this.logger.info(s"Adding file '$path' as an input to the streaming environment. ")

    val inputFormat = new ByteArrayInputFormat
    inputFormat.setFilesFilter(FilePathFilter.createDefaultFilter())

    val processingMode = configuration.readMode match {
      case FileDataSource.ReadMode.Continuous => FileProcessingMode.PROCESS_CONTINUOUSLY
      case FileDataSource.ReadMode.Once => FileProcessingMode.PROCESS_ONCE
    }

    val changeCheckIntervalMs = processingMode match {
      case FileProcessingMode.PROCESS_CONTINUOUSLY => 5000L
      case _ => -1L
    }

    val inputLines = env.readFile(
      inputFormat,
      path,
      processingMode,
      changeCheckIntervalMs,
      new ByteArrayRecordTypeInformation)

    val mapper = new ByteArrayDataFormatFlatMapFunction[T](dataFormat, recordTypeInformation)
    inputLines.flatMap(mapper)
  }

  def addListDataSource[T](env: StreamExecutionEnvironment,
                           values: List[T],
                           runForever: Boolean,
                           recordTypeInformation: TypeInformation[T]): DataStreamSource[T] = {
    if (runForever) {
      // If we don't want the source to terminate after the elements run out then we need to use a custom source
      // function rather than env.fromCollection. In order to not cause duplicate records to be sent from multiple
      // copies of the source function we set the parallelism to 1.
      val source = new ListSourceFunction[T](values, runForever)
      env.addSource(source, recordTypeInformation).setParallelism(1)
    }
    else {
      env.fromCollection(values.asJavaCollection, recordTypeInformation)
    }
  }
} 
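A hedged sketch of addListDataSource; the record type and values are made up for illustration:

import com.amazon.milan.compiler.flink.runtime.DataSourceUtil
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment

object ListDataSourceUsage {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val source = DataSourceUtil.addListDataSource[String](
      env,
      values = List("a", "b", "c"),
      runForever = false, // terminate once the list is exhausted
      recordTypeInformation = TypeInformation.of(classOf[String])
    )

    source.print()
    env.execute("list-source-example")
  }
}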
Example 31
Source File: TypeInformationDataInputFormat.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.dataformats

import java.io.{ByteArrayInputStream, EOFException, InputStream}

import com.amazon.milan.dataformats.DataInputFormat
import com.amazon.milan.typeutil.TypeDescriptor
import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.common.typeutils.TypeSerializer
import org.apache.flink.core.memory.DataInputViewStreamWrapper



class TypeInformationDataInputFormat[T](typeInfo: TypeInformation[T]) extends DataInputFormat[T] {
  @transient private lazy val serializer = this.createSerializer()

  override def getGenericArguments: List[TypeDescriptor[_]] = {
    // This class is not intended to be serialized by GenericTypedJsonSerializer, so this should not be called.
    throw new UnsupportedOperationException()
  }

  override def setGenericArguments(genericArgs: List[TypeDescriptor[_]]): Unit = {
    // This class is not intended to be deserialized by GenericTypedJsonDeserializer, so this should not be called.
    throw new UnsupportedOperationException()
  }

  override def readValue(bytes: Array[Byte], offset: Int, length: Int): Option[T] = {
    val input = new DataInputViewStreamWrapper(new ByteArrayInputStream(bytes, offset, length))
    Some(this.serializer.deserialize(input))
  }

  override def readValues(stream: InputStream): TraversableOnce[T] = {
    val input = new DataInputViewStreamWrapper(stream)
    Stream.continually(0)
      .map(_ =>
        try {
          Some(this.serializer.deserialize(input))
        }
        catch {
          case _: EOFException => None
        })
      .takeWhile(_.isDefined)
      .map(_.get)
  }

  private def createSerializer(): TypeSerializer[T] = {
    val config = new ExecutionConfig()
    this.typeInfo.createSerializer(config)
  }
} 
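A round-trip sketch for the input format above: serialize one value with the serializer derived from the same TypeInformation, then read it back (the String type and value are illustrative):

import java.io.ByteArrayOutputStream

import com.amazon.milan.compiler.flink.dataformats.TypeInformationDataInputFormat
import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.core.memory.DataOutputViewStreamWrapper

object TypeInformationDataInputFormatRoundTrip {
  def main(args: Array[String]): Unit = {
    val typeInfo: TypeInformation[String] = TypeInformation.of(classOf[String])

    // Write one value in Flink's binary format.
    val serializer = typeInfo.createSerializer(new ExecutionConfig())
    val outputStream = new ByteArrayOutputStream()
    serializer.serialize("hello", new DataOutputViewStreamWrapper(outputStream))
    val bytes = outputStream.toByteArray

    // Read it back through the input format.
    val format = new TypeInformationDataInputFormat[String](typeInfo)
    println(format.readValue(bytes, 0, bytes.length)) // Some(hello)
  }
}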
Example 32
Source File: JsonDeserializationSchema.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.serialization

import com.amazon.milan.serialization.MilanObjectMapper
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.api.java.typeutils.TypeExtractor
import org.apache.flink.streaming.connectors.kinesis.serialization.KinesisDeserializationSchema

import scala.reflect.{ClassTag, classTag}


object JsonDeserializationSchema {
  private val objectMapper = new MilanObjectMapper()
}



class JsonDeserializationSchema[T: ClassTag] extends KinesisDeserializationSchema[T] with Serializable {
  override def deserialize(bytes: Array[Byte],
                           partitionKey: String,
                           seqNum: String,
                           approxArrivalTimestamp: Long,
                           stream: String,
                           shardId: String): T = {
    JsonDeserializationSchema.objectMapper.readValue[T](bytes, classTag[T].runtimeClass.asInstanceOf[Class[T]])
  }

  override def getProducedType: TypeInformation[T] = {
    TypeExtractor.getForClass(classTag[T].runtimeClass.asInstanceOf[Class[T]])
  }
}
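A minimal usage sketch of the schema above; a String payload keeps it self-contained, while in practice T would be an application record type that MilanObjectMapper can bind:

import com.amazon.milan.compiler.flink.serialization.JsonDeserializationSchema

object JsonDeserializationSchemaUsage {
  def main(args: Array[String]): Unit = {
    val schema = new JsonDeserializationSchema[String]

    // The Kinesis-specific arguments (partition key, sequence number, ...) are metadata only;
    // this schema ignores them and binds the payload with the Jackson object mapper.
    val value = schema.deserialize("\"hello\"".getBytes("UTF-8"), "partition-key", "seq-0", 0L, "stream-name", "shard-0")
    println(value)                  // hello
    println(schema.getProducedType) // String type information via TypeExtractor
  }
}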