org.apache.spark.api.java.JavaRDD Scala Examples
The following examples show how to use org.apache.spark.api.java.JavaRDD.
The source file, the project it comes from, and its license are noted above each example.
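To ground the examples, here is a minimal, hypothetical sketch (not drawn from any of the projects below) of how a JavaRDD is usually obtained and how it relates to the underlying Scala RDD:

import java.util.Arrays

import org.apache.spark.SparkConf
import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
import org.apache.spark.rdd.RDD

object JavaRDDBasics {
  def main(args: Array[String]): Unit = {
    val jsc = new JavaSparkContext(new SparkConf().setAppName("javardd-basics").setMaster("local[*]"))

    // Build a JavaRDD from a java.util.List.
    val javaRdd: JavaRDD[Int] = jsc.parallelize(Arrays.asList(1, 2, 3, 4))

    // JavaRDD is a thin wrapper: .rdd exposes the underlying Scala RDD,
    // and JavaRDD.fromRDD wraps a Scala RDD back up for Java callers.
    val scalaRdd: RDD[Int] = javaRdd.rdd
    val wrappedAgain: JavaRDD[Int] = JavaRDD.fromRDD(scalaRdd)

    println(wrappedAgain.count())  // 4
    jsc.stop()
  }
}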
Example 1
Source File: Word2VecModelWrapper.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))
  }

  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
}
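The wrapper above is Spark's internal PySpark bridge; the Word2VecModel calls it delegates to are public MLlib API. A minimal Scala sketch, with an illustrative toy corpus (the object name and data are assumptions, not part of the project above):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

object Word2VecSynonyms {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "word2vec-synonyms")

    // Tiny toy corpus: each document is a Seq of tokens.
    val corpus = sc.parallelize(Seq(
      "spark is a fast engine".split(" ").toSeq,
      "spark is a distributed engine".split(" ").toSeq))

    val model: Word2VecModel = new Word2Vec().setVectorSize(10).setMinCount(1).fit(corpus)

    // findSynonyms returns (word, similarity) pairs, which the Python
    // wrapper above repackages as parallel Java lists.
    model.findSynonyms("spark", 2).foreach { case (word, sim) =>
      println(s"$word -> $sim")
    }
    sc.stop()
  }
}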
Example 2
Source File: MatrixFactorizationModelWrapper.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}
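This wrapper delegates to the public MatrixFactorizationModel API, which is normally produced by ALS training. A hedged sketch of the direct Scala usage (the ratings, object name, and parameter values are illustrative):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}

object AlsRecommendations {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "als-recommendations")

    // Toy (user, product, rating) triples.
    val ratings = sc.parallelize(Seq(
      Rating(1, 10, 5.0), Rating(1, 11, 1.0),
      Rating(2, 10, 4.0), Rating(2, 12, 2.0)))

    val model: MatrixFactorizationModel =
      ALS.train(ratings, rank = 5, iterations = 10, lambda = 0.01)

    // The Python wrapper above exposes these same calls; from Scala they
    // can be used directly.
    println(model.predict(1, 12))
    model.recommendProducts(1, 2).foreach(println)

    sc.stop()
  }
}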
Example 3
Source File: KernelDensity.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD

private object KernelDensity {

  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
}
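normPdf above is the Gaussian kernel that the public KernelDensity estimator sums over its sample. A minimal sketch of the public API (sample data, bandwidth, and evaluation points are illustrative):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.stat.KernelDensity

object KernelDensityExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "kernel-density")
    val sample = sc.parallelize(Seq(1.0, 1.5, 2.0, 2.2, 3.0, 5.0))

    // estimate() evaluates the kernel density estimate at each given point.
    val densities: Array[Double] = new KernelDensity()
      .setSample(sample)
      .setBandwidth(0.5)
      .estimate(Array(1.0, 2.0, 4.0))

    densities.foreach(println)
    sc.stop()
  }
}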
Example 4
Source File: RRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {

  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)
    runner.compute(parentIterator, partition.index)
  }
}

private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
      JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 5
Source File: MatrixFactorizationModelWrapper.scala From iolap with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 6
Source File: KernelDensity.scala From iolap with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Experimental
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD

private object KernelDensity {

  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
}
Example 7
Source File: Word2VecModelWrapper.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{ArrayList => JArrayList, List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = {
    rdd.rdd.map(model.transform)
  }

  // Transforms a single word into its vector representation.
  def transform(word: String): Vector = model.transform(word)

  def findSynonyms(word: String, num: Int): JList[Object] = {
    val vec = transform(word)
    findSynonyms(vec, num)
  }

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    val result = model.findSynonyms(vector, num)
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map({ case (k, v) => (k, v.toList.asJava) }).asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
}
Example 8
Source File: MatrixFactorizationModelWrapper.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 9
Source File: KernelDensity.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD

private object KernelDensity {

  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
}
Example 10
Source File: Word2VecModelWrapper.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))
  }

  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
}
Example 11
Source File: MatrixFactorizationModelWrapper.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 12
Source File: KernelDensity.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD

private object KernelDensity {

  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
}
Example 13
Source File: PythonSQLUtils.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.execution.arrow.ArrowConverters
import org.apache.spark.sql.types.DataType

private[sql] object PythonSQLUtils {

  def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText)

  // This is needed when generating SQL documentation for built-in functions.
  def listBuiltinFunctionInfos(): Array[ExpressionInfo] = {
    FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray
  }

  def arrowPayloadToDataFrame(
      payloadRDD: JavaRDD[Array[Byte]],
      schemaString: String,
      sqlContext: SQLContext): DataFrame = {
    ArrowConverters.toDataFrame(payloadRDD, schemaString, sqlContext)
  }
}
Example 14
Source File: Word2VecModelWrapper.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{ArrayList => JArrayList, List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def transform(rdd: JavaRDD[String]): JavaRDD[Vector] = {
    rdd.rdd.map(model.transform)
  }

  // Transforms a single word into its vector representation.
  def transform(word: String): Vector = model.transform(word)

  def findSynonyms(word: String, num: Int): JList[Object] = {
    val vec = transform(word)
    findSynonyms(vec, num)
  }

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    val result = model.findSynonyms(vector, num)
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map({ case (k, v) => (k, v.toList.asJava) }).asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
}
Example 15
Source File: MatrixFactorizationModelWrapper.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 16
Source File: KernelDensity.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD

private object KernelDensity {

  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
}
Example 17
Source File: Neo4jJavaIntegration.scala From neo4j-spark-connector with Apache License 2.0
package org.neo4j.spark

import java.util

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.neo4j.spark.dataframe.Neo4jDataFrame
import org.neo4j.spark.rdd.{Neo4jRowRDD, Neo4jTupleRDD}

import scala.collection.JavaConverters._

object Neo4jJavaIntegration {

  def rowRDD(sc: SparkContext, query: String, parameters: java.util.Map[String, AnyRef]) =
    new Neo4jRowRDD(sc, query,
      if (parameters == null) Seq.empty else parameters.asScala.toSeq).toJavaRDD()

  def tupleRDD(sc: SparkContext, query: String,
               parameters: java.util.Map[String, AnyRef]): JavaRDD[util.Map[String, AnyRef]] = {
    val params = if (parameters == null) Seq.empty else parameters.asScala.toSeq
    Neo4jTupleRDD(sc, query, params)
      .map((t) => new util.LinkedHashMap[String, AnyRef](t.toMap.asJava)
        .asInstanceOf[util.Map[String, AnyRef]])
      .toJavaRDD()
  }

  def dataFrame(sqlContext: SQLContext, query: String, parameters: java.util.Map[String, AnyRef],
                schemaInfo: util.Map[String, String]) = {
    Neo4jDataFrame(sqlContext, query, parameters.asScala.toSeq, schemaInfo.asScala.toSeq: _*)
  }
}
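A hedged usage sketch of the helper above from Scala. It assumes a SparkContext whose configuration already carries the connector's Neo4j connection settings (e.g. the bolt URL and credentials); the query, label, and object name are illustrative:

import org.apache.spark.SparkContext
import org.neo4j.spark.Neo4jJavaIntegration

object Neo4jJavaIntegrationUsage {
  def namesRdd(sc: SparkContext) = {
    // No query parameters in this sketch; rowRDD also accepts null here.
    val params = new java.util.HashMap[String, AnyRef]()

    // rowRDD wraps the Cypher result as a JavaRDD of Rows, as defined above.
    Neo4jJavaIntegration.rowRDD(sc, "MATCH (p:Person) RETURN p.name AS name", params)
  }
}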
Example 18
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.feature

import java.lang.{Iterable => JavaIterable}

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.spark.SparkException
import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.unsafe.hash.Murmur3_x86_32._
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.Utils

object HashingTF {

  private val seed = 42

  private[spark] def murmur3Hash(term: Any): Int = {
    term match {
      case null => seed
      case b: Boolean => hashInt(if (b) 1 else 0, seed)
      case b: Byte => hashInt(b, seed)
      case s: Short => hashInt(s, seed)
      case i: Int => hashInt(i, seed)
      case l: Long => hashLong(l, seed)
      case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed)
      case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed)
      case s: String =>
        val utf8 = UTF8String.fromString(s)
        hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed)
      case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " +
        s"support type ${term.getClass.getCanonicalName} of input data.")
    }
  }
}
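murmur3Hash above is the term-hashing function that HashingTF.transform uses by default in Spark 2.x. A minimal sketch of the public API (the documents, feature count, and object name are illustrative):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.HashingTF

object HashingTFExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "hashing-tf")
    val docs = sc.parallelize(Seq(
      Seq("spark", "rdd", "spark"),
      Seq("hashing", "trick")))

    // Each term is hashed (by default with the murmur3 scheme shown above)
    // into one of numFeatures buckets, producing sparse term-frequency vectors.
    val tf = new HashingTF(numFeatures = 1 << 10)
    val vectors = tf.transform(docs)

    vectors.collect().foreach(println)
    sc.stop()
  }
}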
Example 19
Source File: RRDD.scala From sparkoscope with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {

  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)
    runner.compute(parentIterator, partition.index)
  }
}

private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
      JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 20
Source File: KernelDensity.scala From sparkoscope with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD

private object KernelDensity {

  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
}
Example 21
Source File: MatrixFactorizationModelWrapper.scala From sparkoscope with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 22
Source File: Word2VecModelWrapper.scala From sparkoscope with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))
  }

  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
}
Example 23
Source File: PythonSQLUtils.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.api.python

import java.io.InputStream
import java.nio.channels.Channels

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.python.PythonRDDServer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.execution.arrow.ArrowConverters
import org.apache.spark.sql.types.DataType

private[sql] object PythonSQLUtils {

  def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText)

  // This is needed when generating SQL documentation for built-in functions.
  def listBuiltinFunctionInfos(): Array[ExpressionInfo] = {
    FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray
  }
}

private[sql] class ArrowRDDServer(sqlContext: SQLContext) extends PythonRDDServer {

  override protected def streamToRDD(input: InputStream): RDD[Array[Byte]] = {
    // Create array to consume iterator so that we can safely close the inputStream
    val batches = ArrowConverters.getBatchesFromStream(Channels.newChannel(input)).toArray
    // Parallelize the record batches to create an RDD
    JavaRDD.fromRDD(sqlContext.sparkContext.parallelize(batches, batches.length))
  }
}
Example 24
Source File: SparkInflux.scala From reactiveinflux-spark with Apache License 2.0
package com.pygmalios.reactiveinflux.spark.jawa

import com.pygmalios.reactiveinflux.ReactiveInfluxDbName
import com.pygmalios.reactiveinflux.jawa.{Conversions, PointNoTime}
import com.pygmalios.reactiveinflux.spark._
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.streaming.api.java.JavaDStream

import scala.concurrent.duration._

class SparkInflux(val dbName: String,
                  val awaitAtMostMillis: Long) {
  private implicit val reactiveInfluxDbName = ReactiveInfluxDbName(dbName)
  private implicit val awaitAtMost = awaitAtMostMillis.millis

  def saveToInflux[T <: PointNoTime](javaRdd: JavaRDD[T]): Unit = {
    javaRdd.rdd.map(Conversions.toScala).saveToInflux()
  }

  def saveToInflux[T <: PointNoTime](javaDStream: JavaDStream[T]): Unit = {
    javaDStream.dstream.map(Conversions.toScala).saveToInflux()
  }
}
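A hedged sketch of how the class above might be driven. It assumes a JavaRDD of already-built PointNoTime values and uses an illustrative database name and timeout:

import com.pygmalios.reactiveinflux.jawa.PointNoTime
import com.pygmalios.reactiveinflux.spark.jawa.SparkInflux
import org.apache.spark.api.java.JavaRDD

object SparkInfluxUsage {
  def writeAll(points: JavaRDD[PointNoTime]): Unit = {
    // Constructor arguments mirror the class definition above:
    // target database name and how long to await each write.
    val sparkInflux = new SparkInflux("example_db", awaitAtMostMillis = 5000)
    sparkInflux.saveToInflux(points)
  }
}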
Example 25
Source File: BisectingKMeansModel.scala From bisecting-kmeans with Apache License 2.0
package org.apache.spark.mllib.bisectingkmeans

import breeze.linalg.{Vector => BV, norm => breezeNorm}

import org.apache.spark.Logging
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

  def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = {
    val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]()
    this.node.toLinkageMatrix.foreach { x =>
      val row = new java.util.ArrayList[java.lang.Double]()
      row.add(x._1.toDouble)
      row.add(x._2.toDouble)
      row.add(x._3.toDouble)
      row.add(x._4.toDouble)
      javaList.add(row)
    }
    javaList
  }
}
Example 26
Source File: RiakPythonHelper.scala From spark-riak-connector with Apache License 2.0
package com.basho.riak.spark.util.python

import com.basho.riak.spark._
import com.basho.riak.spark.rdd.RiakRDD
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.api.java.JavaRDD
import com.basho.riak.spark.writer.WriteConf
import org.apache.spark.rdd.RDD
import java.util.ArrayList
import scala.collection.JavaConversions._

class RiakPythonHelper {

  implicit val pickling = new PicklingUtils()

  def riakBucket(jsc: JavaSparkContext, bucketName: String, bucketType: String): RiakRDD[(String, Any)] = {
    jsc.sc.riakBucket(bucketName, bucketType)
  }

  def saveToRiak(jrdd: JavaRDD[Array[Byte]], bucketName: String, bucketType: String) = {
    jrdd.rdd.unpickle().saveToRiak(bucketName, bucketType, WriteConf())
  }

  def query2iKeys[K](jsc: JavaSparkContext, bucketName: String, bucketType: String, index: String,
                     keys: ArrayList[K]) =
    jsc.sc.riakBucket(bucketName, bucketType).query2iKeys(index, keys: _*)

  def queryBucketKeys(jsc: JavaSparkContext, bucketName: String, bucketType: String,
                      keys: ArrayList[String]) =
    jsc.sc.riakBucket(bucketName, bucketType).queryBucketKeys(keys: _*)

  def partitionBy2iRanges[K](jsc: JavaSparkContext, bucketName: String, bucketType: String,
                             index: String, ranges: ArrayList[ArrayList[K]]) = {
    val r = ranges.map(x => (x(0), x(1)))
    jsc.sc.riakBucket(bucketName, bucketType).partitionBy2iRanges(index, r: _*)
  }

  def partitionBy2iKeys[K](jsc: JavaSparkContext, bucketName: String, bucketType: String,
                           index: String, keys: ArrayList[K]) =
    jsc.sc.riakBucket(bucketName, bucketType).partitionBy2iKeys(index, keys: _*)

  def pickleRows(rdd: RDD[_]): RDD[Array[Byte]] = rdd.pickle()

  def javaRDD(rdd: RDD[_]) = JavaRDD.fromRDD(rdd)
}
Example 27
Source File: Subtract.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.common

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.ImageUtil
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

class Subtract extends ConfigurableStop {

  override val authorEmail: String = "[email protected]"
  override val description: String = "Delete the existing data in the right table from the left table"
  override val inportList: List[String] = List(Port.LeftPort, Port.RightPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  override def setProperties(map: Map[String, Any]): Unit = {
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor: List[PropertyDescriptor] = List()
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/common/Subtract.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.CommonGroup)
  }

  override def initialize(ctx: ProcessContext): Unit = {
  }

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()

    val leftDF = in.read(Port.LeftPort)
    val rightDF = in.read(Port.RightPort)

    val outDF = leftDF.except(rightDF)
    out.write(outDF)
  }
}
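The perform method above boils down to DataFrame.except. A minimal standalone sketch of that operation outside the piflow framework (column names, rows, and the object name are illustrative):

import org.apache.spark.sql.SparkSession

object ExceptExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("except-example").getOrCreate()
    import spark.implicits._

    val left = Seq((1, "a"), (2, "b"), (3, "c")).toDF("id", "value")
    val right = Seq((2, "b")).toDF("id", "value")

    // Same operation the Subtract stop performs between its left and right ports:
    // keep rows present in `left` but not in `right`.
    left.except(right).show()

    spark.stop()
  }
}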
Example 28
Source File: RRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {

  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)
    runner.compute(parentIterator, partition.index)
  }
}

private[r] object RRDD {

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int):
      JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 29
Source File: KernelDensity.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.stat

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.rdd.RDD

private object KernelDensity {

  def normPdf(
      mean: Double,
      standardDeviation: Double,
      logStandardDeviationPlusHalfLog2Pi: Double,
      x: Double): Double = {
    val x0 = x - mean
    val x1 = x0 / standardDeviation
    val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi
    math.exp(logDensity)
  }
}
Example 30
Source File: AssociationRules.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.fpm

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.fpm.AssociationRules.Rule
import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset
import org.apache.spark.rdd.RDD

  @Since("1.5.0")
  def javaConsequent: java.util.List[Item] = {
    consequent.toList.asJava
  }

  override def toString: String = {
    s"${antecedent.mkString("{", ",", "}")} => " +
      s"${consequent.mkString("{", ",", "}")}: ${confidence}"
  }
  }
}
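The Rule accessors above belong to the objects produced by FPGrowth's rule generation. A minimal sketch of that flow (the transactions, thresholds, and object name are illustrative):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.fpm.FPGrowth

object AssociationRulesExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "association-rules")
    val transactions = sc.parallelize(Seq(
      Array("bread", "butter"),
      Array("bread", "butter", "milk"),
      Array("bread", "milk")))

    val model = new FPGrowth().setMinSupport(0.5).run(transactions)

    // generateAssociationRules yields the Rule objects whose toString /
    // javaConsequent accessors are shown above.
    model.generateAssociationRules(0.8).collect().foreach(println)
    sc.stop()
  }
}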
Example 31
Source File: MatrixFactorizationModelWrapper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.api.python

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel)
  extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) {

  def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] =
    predict(SerDe.asTupleRDD(userAndProducts.rdd))

  def getUserFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(userFeatures.map {
      case (user, feature) => (user, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def getProductFeatures: RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(productFeatures.map {
      case (product, feature) => (product, Vectors.dense(feature))
    }.asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]])
  }

  def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = {
    SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]])
  }
}
Example 32
Source File: Word2VecModelWrapper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.api.python

import java.util.{List => JList, Map => JMap}

import scala.collection.JavaConverters._

import org.apache.spark.SparkContext
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.{Vector, Vectors}

private[python] class Word2VecModelWrapper(model: Word2VecModel) {

  def findSynonyms(vector: Vector, num: Int): JList[Object] = {
    prepareResult(model.findSynonyms(vector, num))
  }

  private def prepareResult(result: Array[(String, Double)]) = {
    val similarity = Vectors.dense(result.map(_._2))
    val words = result.map(_._1)
    List(words, similarity).map(_.asInstanceOf[Object]).asJava
  }

  def getVectors: JMap[String, JList[Float]] = {
    model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava
  }

  def save(sc: SparkContext, path: String): Unit = model.save(sc, path)
}
Example 33
Source File: KMeansModel.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.clustering

import scala.collection.JavaConverters._

import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkContext
import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.pmml.PMMLExportable
import org.apache.spark.mllib.util.{Loader, Saveable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}

class KMeansModel @Since("1.1.0") (@Since("1.0.0") val clusterCenters: Array[Vector])
  extends Saveable with Serializable with PMMLExportable {

  // Total number of clusters.
  @Since("0.8.0")
  def k: Int = clusterCenters.length

  @Since("0.8.0")
  def computeCost(data: RDD[Vector]): Double = {
    val centersWithNorm = clusterCentersWithNorm
    val bcCentersWithNorm = data.context.broadcast(centersWithNorm)
    data.map(p => KMeans.pointCost(bcCentersWithNorm.value, new VectorWithNorm(p))).sum()
  }

  private def clusterCentersWithNorm: Iterable[VectorWithNorm] =
    clusterCenters.map(new VectorWithNorm(_))

  @Since("1.4.0")
  override def save(sc: SparkContext, path: String): Unit = {
    KMeansModel.SaveLoadV1_0.save(sc, this, path)
  }

  override protected def formatVersion: String = "1.0"
}

@Since("1.4.0")
object KMeansModel extends Loader[KMeansModel] {

  @Since("1.4.0")
  override def load(sc: SparkContext, path: String): KMeansModel = {
    KMeansModel.SaveLoadV1_0.load(sc, path)
  }

  private case class Cluster(id: Int, point: Vector)

  private object Cluster {
    def apply(r: Row): Cluster = {
      Cluster(r.getInt(0), r.getAs[Vector](1))
    }
  }

  private[clustering] object SaveLoadV1_0 {

    private val thisFormatVersion = "1.0"

    private[clustering] val thisClassName = "org.apache.spark.mllib.clustering.KMeansModel"

    def save(sc: SparkContext, model: KMeansModel, path: String): Unit = {
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val metadata = compact(render(
        ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ ("k" -> model.k)))
      sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path))
      val dataRDD = sc.parallelize(model.clusterCenters.zipWithIndex).map { case (point, id) =>
        Cluster(id, point)
      }
      spark.createDataFrame(dataRDD).write.parquet(Loader.dataPath(path))
    }

    def load(sc: SparkContext, path: String): KMeansModel = {
      implicit val formats = DefaultFormats
      val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
      val (className, formatVersion, metadata) = Loader.loadMetadata(sc, path)
      assert(className == thisClassName)
      assert(formatVersion == thisFormatVersion)
      val k = (metadata \ "k").extract[Int]
      val centroids = spark.read.parquet(Loader.dataPath(path))
      Loader.checkSchema[Cluster](centroids.schema)
      val localCentroids = centroids.rdd.map(Cluster.apply).collect()
      assert(k == localCentroids.length)
      new KMeansModel(localCentroids.sortBy(_.id).map(_.point))
    }
  }
}
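A hedged sketch of the save/load path implemented above, driven through the public KMeans API (the data, k, object name, and output path are illustrative):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

object KMeansSaveLoad {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "kmeans-save-load")
    val data = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
      Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)))

    val model = KMeans.train(data, k = 2, maxIterations = 20)
    println(s"cost = ${model.computeCost(data)}")

    // save/load use the Parquet + JSON-metadata layout implemented above.
    model.save(sc, "/tmp/kmeans-model")
    val reloaded = KMeansModel.load(sc, "/tmp/kmeans-model")
    reloaded.clusterCenters.foreach(println)
    sc.stop()
  }
}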