org.apache.spark.api.java.JavaSparkContext Scala Examples
The following examples show how to use org.apache.spark.api.java.JavaSparkContext.
Each example is taken from an open-source project; the project, source file, and license are noted in the heading above each listing.
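Before the project examples, here is a minimal, hypothetical sketch (not taken from any of the projects below) of the two usual ways a JavaSparkContext is obtained: built directly from a SparkConf, or wrapped around an existing Scala SparkContext, which is what most of the examples on this page do.

import java.util.Arrays

import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext

object JavaSparkContextSketch {
  def main(args: Array[String]): Unit = {
    // Build a JavaSparkContext directly from a SparkConf.
    val conf = new SparkConf().setMaster("local[*]").setAppName("jsc-sketch")
    val jsc = new JavaSparkContext(conf)

    // A JavaSparkContext is a thin wrapper: jsc.sc exposes the underlying
    // Scala SparkContext, and an existing SparkContext can be wrapped with
    // JavaSparkContext.fromSparkContext (or simply new JavaSparkContext(sc)).
    val wrapped = JavaSparkContext.fromSparkContext(jsc.sc)

    // The Java API mirrors the Scala one, but takes Java collections.
    val rdd = jsc.parallelize(Arrays.asList(Integer.valueOf(1), Integer.valueOf(2), Integer.valueOf(3)))
    println(rdd.count()) // 3

    jsc.stop()
  }
}

Stopping the JavaSparkContext also stops the wrapped SparkContext, which is why the test traits in the later examples call stop() in their teardown hooks.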
Example 1
Source File: RRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {

  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)
    runner.compute(parentIterator, partition.index)
  }
}

// Companion object of RRDD; only the JavaSparkContext-based helper is shown in this excerpt.
private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 2
Source File: RiakPythonHelper.scala From spark-riak-connector with Apache License 2.0
package com.basho.riak.spark.util.python

import com.basho.riak.spark._
import com.basho.riak.spark.rdd.RiakRDD
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.api.java.JavaRDD
import com.basho.riak.spark.writer.WriteConf
import org.apache.spark.rdd.RDD
import java.util.ArrayList
import scala.collection.JavaConversions._

class RiakPythonHelper {

  implicit val pickling = new PicklingUtils()

  def riakBucket(jsc: JavaSparkContext, bucketName: String, bucketType: String): RiakRDD[(String, Any)] = {
    jsc.sc.riakBucket(bucketName, bucketType)
  }

  def saveToRiak(jrdd: JavaRDD[Array[Byte]], bucketName: String, bucketType: String) = {
    jrdd.rdd.unpickle().saveToRiak(bucketName, bucketType, WriteConf())
  }

  def query2iKeys[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String, keys: ArrayList[K]) =
    jsc.sc.riakBucket(bucketName, bucketType).query2iKeys(index, keys: _*)

  def queryBucketKeys(jsc: JavaSparkContext, bucketName: String, bucketType: String, keys: ArrayList[String]) =
    jsc.sc.riakBucket(bucketName, bucketType).queryBucketKeys(keys: _*)

  def partitionBy2iRanges[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String, ranges: ArrayList[ArrayList[K]]) = {
    val r = ranges.map(x => (x(0), x(1)))
    jsc.sc.riakBucket(bucketName, bucketType).partitionBy2iRanges(index, r: _*)
  }

  def partitionBy2iKeys[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String, keys: ArrayList[K]) =
    jsc.sc.riakBucket(bucketName, bucketType).partitionBy2iKeys(index, keys: _*)

  def pickleRows(rdd: RDD[_]): RDD[Array[Byte]] = rdd.pickle()

  def javaRDD(rdd: RDD[_]) = JavaRDD.fromRDD(rdd)
}
Example 3
Source File: HBaseSQLContext.scala From Backup-Repo with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.OverrideCatalog
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan}
import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies}

class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) { self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override protected[sql] lazy val catalog: HBaseCatalog =
    new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog

  experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource)

  @transient
  override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
    val batches =
      Batch("Add exchange", Once, EnsureRequirements(self)) ::
      Batch("Add coprocessor", Once, AddCoprocessor(self)) :: Nil
  }
}
Example 4
Source File: SparkSuite.scala From spark-sorted with Apache License 2.0
package com.tresata.spark.sorted

import org.scalactic.Equality
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{ Dataset, SparkSession }

object SparkSuite {
  lazy val spark: SparkSession = {
    val session = SparkSession.builder
      .master("local[*]")
      .appName("test")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.ui.enabled", false)
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    session
  }
  lazy val sc: SparkContext = spark.sparkContext

  lazy val jsc = new JavaSparkContext(sc)
  def javaSparkContext() = jsc
}

trait SparkSuite {
  implicit lazy val spark: SparkSession = SparkSuite.spark
  implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext

  implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] {
    private def toCounts[Y](s: Seq[Y]): Map[Y, Int] = s.groupBy(identity).mapValues(_.size)

    def areEqual(a: RDD[X], b: Any): Boolean = b match {
      case s: Seq[_] => toCounts(a.collect) == toCounts(s)
      case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect)
    }
  }

  implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] =
    new Equality[GroupSorted[K, V]] {
      def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b)
    }

  implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] =
    new Equality[Dataset[X]] {
      def areEqual(a: Dataset[X], b: Any): Boolean = b match {
        case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd)
        case x => rddEq.areEqual(a.rdd, x)
      }
    }
}
Example 5
Source File: RRDD.scala From sparkoscope with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {

  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)
    runner.compute(parentIterator, partition.index)
  }
}

// Companion object of RRDD; only the JavaSparkContext-based helper is shown in this excerpt.
private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 6
Source File: RRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {

  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)
    runner.compute(parentIterator, partition.index)
  }
}

// Companion object of RRDD; only the JavaSparkContext-based helper is shown in this excerpt.
private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 7
Source File: CustomCodeEntryPoint.scala From seahorse with Apache License 2.0
package ai.deepsense.workflowexecutor.customcode

import java.util.concurrent.TimeoutException
import java.util.concurrent.atomic.AtomicReference

import scala.annotation.tailrec
import scala.concurrent.duration._
import scala.concurrent.{Await, Promise}

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.{SparkConf, SparkContext}

import ai.deepsense.commons.utils.Logging
import ai.deepsense.deeplang._
import ai.deepsense.sparkutils.SparkSQLSession

class CustomCodeEntryPoint(
    val sparkContext: SparkContext,
    val sparkSQLSession: SparkSQLSession,
    val dataFrameStorage: DataFrameStorage,
    val operationExecutionDispatcher: OperationExecutionDispatcher)
  extends Logging {

  import ai.deepsense.workflowexecutor.customcode.CustomCodeEntryPoint._

  def getSparkContext: JavaSparkContext = sparkContext
  def getSparkSQLSession: SparkSQLSession = sparkSQLSession
  def getNewSparkSQLSession: SparkSQLSession = sparkSQLSession.newSession()
  def getSparkConf: SparkConf = sparkContext.getConf

  private val codeExecutor: AtomicReference[Promise[CustomCodeExecutor]] =
    new AtomicReference(Promise())
  private val pythonPort: AtomicReference[Promise[Int]] =
    new AtomicReference(Promise())

  def getCodeExecutor(timeout: Duration): CustomCodeExecutor =
    getFromPromise(codeExecutor.get, timeout)
  def getPythonPort(timeout: Duration): Int =
    getFromPromise(pythonPort.get, timeout)

  def registerCodeExecutor(newCodeExecutor: CustomCodeExecutor): Unit =
    replacePromise(codeExecutor, newCodeExecutor)
  def registerCallbackServerPort(newPort: Int): Unit =
    replacePromise(pythonPort, newPort)

  def retrieveInputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getInputDataFrame(workflowId, nodeId, portNumber).get
  def retrieveOutputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getOutputDataFrame(workflowId, nodeId, portNumber).get
  def registerOutputDataFrame(
      workflowId: String, nodeId: String, portNumber: Int, dataFrame: DataFrame): Unit =
    dataFrameStorage.setOutputDataFrame(workflowId, nodeId, portNumber, dataFrame)

  def executionCompleted(workflowId: String, nodeId: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Right(()))
  def executionFailed(workflowId: String, nodeId: String, error: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Left(error))
}

object CustomCodeEntryPoint {
  private case class PromiseReplacedException() extends Exception

  @tailrec
  private def getFromPromise[T](promise: => Promise[T], timeout: Duration): T = {
    try {
      Await.result(promise.future, timeout)
    } catch {
      case e: TimeoutException => throw e
      case e: PromiseReplacedException => getFromPromise(promise, timeout)
    }
  }

  private def replacePromise[T](promise: AtomicReference[Promise[T]], newValue: T): Unit = {
    val oldPromise = promise.getAndSet(Promise.successful(newValue))
    try {
      oldPromise.failure(new PromiseReplacedException)
    } catch {
      // The oldPromise will have been completed always, except for the first time.
      // The illegal state is expected, but we have to complete the oldPromise,
      // since someone might be waiting on it.
      case e: IllegalStateException => ()
    }
  }

  case class CustomCodeEntryPointConfig(
    pyExecutorSetupTimeout: Duration = 5.seconds)
}
Example 8
Source File: HBaseSparkSession.scala From Heracles with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.Analyzer
import org.apache.spark.sql.catalyst.catalog.ExternalCatalog
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.SparkPlanner
import org.apache.spark.sql.hbase.execution.{HBaseSourceAnalysis, HBaseStrategies}
import org.apache.spark.sql.internal.{BaseSessionStateBuilder, SQLConf, SessionState, SharedState}

class HBaseSparkSession(sc: SparkContext) extends SparkSession(sc) { self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  @transient
  override lazy val sessionState: SessionState = new HBaseSessionStateBuilder(this).build()

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override lazy val sharedState: SharedState =
    new HBaseSharedState(sc, this.sqlContext)
}

class HBaseSessionStateBuilder(session: SparkSession, parentState: Option[SessionState] = None)
  extends BaseSessionStateBuilder(session) {

  override lazy val conf: SQLConf = new HBaseSQLConf

  override protected def newBuilder: NewBuilder = new HBaseSessionStateBuilder(_, _)

  override lazy val experimentalMethods: ExperimentalMethods = {
    val result = new ExperimentalMethods
    result.extraStrategies = Seq(
      (new SparkPlanner(session.sparkContext, conf, new ExperimentalMethods) with HBaseStrategies)
        .HBaseDataSource)
    result
  }

  override lazy val analyzer: Analyzer = {
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules: Seq[Rule[LogicalPlan]] =
        new FindDataSourceTable(session) +:
        new ResolveSQLOnFile(session) +:
        customResolutionRules

      override val postHocResolutionRules: Seq[Rule[LogicalPlan]] =
        PreprocessTableCreation(session) +:
        PreprocessTableInsertion(conf) +:
        DataSourceAnalysis(conf) +:
        HBaseSourceAnalysis(session) +:
        customPostHocResolutionRules

      override val extendedCheckRules = customCheckRules
    }
  }
}

class HBaseSharedState(sc: SparkContext, sqlContext: SQLContext) extends SharedState(sc) {
  override lazy val externalCatalog: ExternalCatalog =
    new HBaseCatalog(sqlContext, sc.hadoopConfiguration)
}
Example 9
Source File: HBaseSQLContext.scala From Spark-SQL-on-HBase with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.OverrideCatalog
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.{EnsureRequirements, SparkPlan}
import org.apache.spark.sql.hbase.execution.{AddCoprocessor, HBaseStrategies}

class HBaseSQLContext(sc: SparkContext) extends SQLContext(sc) { self =>

  def this(sparkContext: JavaSparkContext) = this(sparkContext.sc)

  protected[sql] override lazy val conf: SQLConf = new HBaseSQLConf

  HBaseConfiguration.merge(
    sc.hadoopConfiguration, HBaseConfiguration.create(sc.hadoopConfiguration))

  @transient
  override protected[sql] lazy val catalog: HBaseCatalog =
    new HBaseCatalog(this, sc.hadoopConfiguration) with OverrideCatalog

  experimental.extraStrategies = Seq((new SparkPlanner with HBaseStrategies).HBaseDataSource)

  @transient
  override protected[sql] val prepareForExecution = new RuleExecutor[SparkPlan] {
    val batches =
      Batch("Add exchange", Once, EnsureRequirements(self)) ::
      Batch("Add coprocessor", Once, AddCoprocessor(self)) :: Nil
  }
}
Example 10
Source File: SystemArg.scala From mist with Apache License 2.0
package mist.api

import mist.api.data.JsMap
import org.apache.spark.{SparkContext, SparkSessionUtils}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaStreamingContext

trait SystemArg[A] extends ArgDef[A] {
  final def validate(params: JsMap): Extraction[Unit] = Extracted(())
}

object SystemArg {

  def apply[A](tags: Seq[String], f: => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = f
    override def describe() = Seq(InternalArgument(tags))
  }

  def apply[A](tags: Seq[String], f: FullFnContext => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = ctx match {
      case c: FullFnContext => f(c)
      case _ =>
        val desc = s"Unknown type of job context ${ctx.getClass.getSimpleName} " +
          s"expected ${FullFnContext.getClass.getSimpleName}"
        Failed.InternalError(desc)
    }
    override def describe() = Seq(InternalArgument(tags))
  }
}

trait SparkArgs {

  val sparkContextArg: ArgDef[SparkContext] = SystemArg(
    Seq.empty,
    c => Extracted(c.sc)
  )

  val streamingContextArg: ArgDef[StreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => {
      val ssc = StreamingContext.getActiveOrCreate(() => new StreamingContext(ctx.sc, ctx.streamingDuration))
      Extracted(ssc)
    }
  )

  val sqlContextArg: ArgDef[SQLContext] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(SQLContext.getOrCreate).extract(ctx)
  )

  // HiveContext should be cached per jvm
  // see #325
  val hiveContextArg: ArgDef[HiveContext] = new SystemArg[HiveContext] {

    var cache: HiveContext = _

    override def extract(ctx: FnContext): Extraction[HiveContext] = synchronized {
      ctx match {
        case c: FullFnContext =>
          if (cache == null)
            cache = new HiveContext(c.sc)
          Extracted(cache)
        case _ =>
          Failed.InternalError(s"Unknown type of job context ${ctx.getClass.getSimpleName} expected ${FullFnContext.getClass.getSimpleName}")
      }
    }

    override def describe(): Seq[ArgInfo] = Seq(InternalArgument(
      Seq(ArgInfo.HiveContextTag, ArgInfo.SqlContextTag)))
  }

  val javaSparkContextArg: ArgDef[JavaSparkContext] = sparkContextArg.map(sc => new JavaSparkContext(sc))

  val javaStreamingContextArg: ArgDef[JavaStreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => streamingContextArg.map(scc => new JavaStreamingContext(scc)).extract(ctx))

  val sparkSessionArg: ArgDef[SparkSession] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, false)).extract(ctx)
  )

  val sparkSessionWithHiveArg: ArgDef[SparkSession] = SystemArg(
    Seq(ArgInfo.SqlContextTag, ArgInfo.HiveContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, true)).extract(ctx))
}

object SparkArgs extends SparkArgs
Example 11
Source File: MistScContext.scala From mist with Apache License 2.0
package io.hydrosphere.mist.worker

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.{Duration => SDuration}
import org.apache.spark.{SparkConf, SparkContext, SparkSessionUtils}

import scala.collection.mutable
import scala.concurrent.duration.Duration

class MistScContext(
  val sc: SparkContext,
  val namespace: String,
  val streamingDuration: SDuration = SDuration(40 * 1000)
) {

  private val jars = mutable.Buffer.empty[String]

  def isK8S: Boolean = sc.getConf.get("spark.master").startsWith("k8s://")

  def addJar(artifact: SparkArtifact): Unit = synchronized {
    val path = if (isK8S) artifact.url else artifact.local.getAbsolutePath
    if (!jars.contains(path)) {
      sc.addJar(path)
      jars += path
    }
  }

  def getUIAddress(): Option[String] = SparkUtils.getSparkUiAddress(sc)

  //TODO: can we call that inside python directly using setupConfiguration?
  // python support
  def sparkConf: SparkConf = sc.getConf

  // python support
  def javaContext: JavaSparkContext = new JavaSparkContext(sc)

  // python support
  def sqlContext: SQLContext = new SQLContext(sc)

  // python support
  def hiveContext: HiveContext = new HiveContext(sc)

  def sparkSession(enableHive: Boolean): SparkSession = SparkSessionUtils.getOrCreate(sc, enableHive)

  def stop(): Unit = {
    sc.stop()
  }
}

object MistScContext {

  def apply(id: String, streamingDuration: Duration, sparkConf: SparkConf): MistScContext = {
    val upd = sparkConf.clone()
      .setAppName(id)
      .set("spark.streaming.stopSparkContextByDefault", "false")

    val duration = SDuration(streamingDuration.toMillis)
    val sc = new SparkContext(upd)
    new MistScContext(sc, id, duration)
  }

  def apply(id: String, streamingDuration: Duration): MistScContext = apply(id, streamingDuration, new SparkConf())
}
Example 12
Source File: InfinispanJavaRDD.scala From infinispan-spark with Apache License 2.0
package org.infinispan.spark.rdd

import org.apache.spark.api.java.{JavaPairRDD, JavaSparkContext}
import org.infinispan.query.dsl.Query
import org.infinispan.spark._
import org.infinispan.spark.config.ConnectorConfiguration

import scala.annotation.varargs
import scala.reflect.ClassTag

object InfinispanJavaRDD {

  def createInfinispanRDD[K, V](jsc: JavaSparkContext, config: ConnectorConfiguration): InfinispanJavaRDD[K, V] = {
    createInfinispanRDD(jsc.sc, config, new PerServerSplitter)
  }

  def createInfinispanRDD[K, V](jsc: JavaSparkContext, config: ConnectorConfiguration, splitter: Splitter): InfinispanJavaRDD[K, V] = {
    val infinispanRDD = new InfinispanRDD[K, V](jsc.sc, config, splitter)
    implicit val keyClassTag = ClassTag.AnyRef.asInstanceOf[ClassTag[K]]
    implicit val valueClassTag = ClassTag.AnyRef.asInstanceOf[ClassTag[V]]
    new InfinispanJavaRDD[K, V](infinispanRDD)
  }

  def write[K, V](pairRDD: JavaPairRDD[K, V], config: ConnectorConfiguration) = pairRDD.rdd.writeToInfinispan(config)
}

class InfinispanJavaRDD[K, V](rdd: InfinispanRDD[K, V])
                             (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) with CacheManagementAware {

  def filterByQuery[R](q: Query): JavaPairRDD[K, R] = {
    val filteredRDD = rdd.filterByQuery[R](q)
    implicit val converted = ClassTag.AnyRef.asInstanceOf[ClassTag[R]]
    JavaPairRDD.fromRDD[K, R](filteredRDD)
  }

  def filterByQuery[R](q: String): JavaPairRDD[K, R] = {
    val filteredRDD = rdd.filterByQuery[R](q)
    implicit val converted = ClassTag.AnyRef.asInstanceOf[ClassTag[R]]
    JavaPairRDD.fromRDD[K, R](filteredRDD)
  }

  @varargs def filterByCustom[R](filterFactory: String, params: AnyRef*): JavaPairRDD[K, R] = {
    val filteredRDD = rdd.filterByCustom[R](filterFactory, params: _*)
    implicit val converted = ClassTag.AnyRef.asInstanceOf[ClassTag[R]]
    JavaPairRDD.fromRDD[K, R](filteredRDD)
  }

  override def count() = rdd.count()

  override def cacheAdmin(): CacheAdmin = rdd.cacheAdmin()
}
Example 13
Source File: JavaSpark.scala From infinispan-spark with Apache License 2.0
package org.infinispan.spark.test

import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.SparkSession
import org.infinispan.spark.serializer._
import org.scalatest.{BeforeAndAfterAll, Suite}

trait JavaSpark extends BeforeAndAfterAll {
  this: Suite with RemoteTest =>

  private lazy val config: SparkConf = new SparkConf().setMaster("local[4]")
    .setAppName(this.getClass.getName)
    .set("spark.serializer", classOf[JBossMarshallingSerializer].getName)
    .set("spark.driver.host", "127.0.0.1")

  protected var sparkSession: SparkSession = _
  protected var jsc: JavaSparkContext = _

  override protected def beforeAll(): Unit = {
    sparkSession = SparkSession.builder().config(config).getOrCreate()
    jsc = new JavaSparkContext(sparkSession.sparkContext)
    super.beforeAll()
  }

  override protected def afterAll(): Unit = {
    jsc.stop()
    sparkSession.stop()
    super.afterAll()
  }
}
Example 14
Source File: JavaSparkStream.scala From infinispan-spark with Apache License 2.0
package org.infinispan.spark.test

import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.infinispan.spark.serializer.JBossMarshallingSerializer
import org.scalatest.{BeforeAndAfterEach, Suite}

trait JavaSparkStream extends BeforeAndAfterEach {
  this: Suite with RemoteTest =>

  private lazy val config: SparkConf = new SparkConf().setMaster("local[4]")
    .setAppName(this.getClass.getName)
    .set("spark.serializer", classOf[JBossMarshallingSerializer].getName)
    .set("spark.driver.host", "127.0.0.1")

  protected var jssc: JavaStreamingContext = _
  protected var jsc: JavaSparkContext = _

  override protected def beforeEach(): Unit = {
    jsc = new JavaSparkContext(config)
    jssc = new JavaStreamingContext(jsc, Seconds(1))
    getRemoteCache.clear()
    super.beforeEach()
  }

  override protected def afterEach(): Unit = {
    jssc.stop(stopSparkContext = true)
    jsc.stop()
    super.afterEach()
  }
}
Example 15
Source File: CustomCodeEntryPoint.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.workflowexecutor.customcode

import java.util.concurrent.TimeoutException
import java.util.concurrent.atomic.AtomicReference

import scala.annotation.tailrec
import scala.concurrent.duration._
import scala.concurrent.{Await, Promise}

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.DataFrame
import org.apache.spark.{SparkConf, SparkContext}

import io.deepsense.commons.utils.Logging
import io.deepsense.deeplang._
import io.deepsense.sparkutils.SparkSQLSession

class CustomCodeEntryPoint(
    val sparkContext: SparkContext,
    val sparkSQLSession: SparkSQLSession,
    val dataFrameStorage: DataFrameStorage,
    val operationExecutionDispatcher: OperationExecutionDispatcher)
  extends Logging {

  import io.deepsense.workflowexecutor.customcode.CustomCodeEntryPoint._

  def getSparkContext: JavaSparkContext = sparkContext
  def getSparkSQLSession: SparkSQLSession = sparkSQLSession
  def getNewSparkSQLSession: SparkSQLSession = sparkSQLSession.newSession()
  def getSparkConf: SparkConf = sparkContext.getConf

  private val codeExecutor: AtomicReference[Promise[CustomCodeExecutor]] =
    new AtomicReference(Promise())
  private val pythonPort: AtomicReference[Promise[Int]] =
    new AtomicReference(Promise())

  def getCodeExecutor(timeout: Duration): CustomCodeExecutor =
    getFromPromise(codeExecutor.get, timeout)
  def getPythonPort(timeout: Duration): Int =
    getFromPromise(pythonPort.get, timeout)

  def registerCodeExecutor(newCodeExecutor: CustomCodeExecutor): Unit =
    replacePromise(codeExecutor, newCodeExecutor)
  def registerCallbackServerPort(newPort: Int): Unit =
    replacePromise(pythonPort, newPort)

  def retrieveInputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getInputDataFrame(workflowId, nodeId, portNumber).get
  def retrieveOutputDataFrame(workflowId: String, nodeId: String, portNumber: Int): DataFrame =
    dataFrameStorage.getOutputDataFrame(workflowId, nodeId, portNumber).get
  def registerOutputDataFrame(
      workflowId: String, nodeId: String, portNumber: Int, dataFrame: DataFrame): Unit =
    dataFrameStorage.setOutputDataFrame(workflowId, nodeId, portNumber, dataFrame)

  def executionCompleted(workflowId: String, nodeId: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Right(()))
  def executionFailed(workflowId: String, nodeId: String, error: String): Unit =
    operationExecutionDispatcher.executionEnded(workflowId, nodeId, Left(error))
}

object CustomCodeEntryPoint {
  private case class PromiseReplacedException() extends Exception

  @tailrec
  private def getFromPromise[T](promise: => Promise[T], timeout: Duration): T = {
    try {
      Await.result(promise.future, timeout)
    } catch {
      case e: TimeoutException => throw e
      case e: PromiseReplacedException => getFromPromise(promise, timeout)
    }
  }

  private def replacePromise[T](promise: AtomicReference[Promise[T]], newValue: T): Unit = {
    val oldPromise = promise.getAndSet(Promise.successful(newValue))
    try {
      oldPromise.failure(new PromiseReplacedException)
    } catch {
      // The oldPromise will have been completed always, except for the first time.
      // The illegal state is expected, but we have to complete the oldPromise,
      // since someone might be waiting on it.
      case e: IllegalStateException => ()
    }
  }

  case class CustomCodeEntryPointConfig(
    pyExecutorSetupTimeout: Duration = 5.seconds)
}