breeze.linalg.SparseVector Scala Examples
The following examples show how to use breeze.linalg.SparseVector.
Each example is taken from an open-source project; the project name, source file, and license are noted above the code.
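Before the project examples, here is a minimal, self-contained sketch of the parts of the breeze.linalg.SparseVector API that the examples below lean on: construction from (index, value) pairs, construction from a dense array, element access, addition, summing, and iteration over the active entries. The sizes and values are arbitrary illustrations, not taken from any of the projects.

import breeze.linalg.{SparseVector, sum}

object SparseVectorBasics extends App {
  // A length-10 vector with three stored (active) entries.
  val x = SparseVector[Double](10)((0, 1.0), (3, 2.5), (7, 4.0))

  // A sparse vector built from a plain dense array of values.
  val y = SparseVector(Array(0.0, 1.0, 0.0, 0.5, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0))

  println(x(3))    // 2.5 -- random access by index
  println(sum(x))  // 7.5 -- sum over all entries

  val z = x + y    // elementwise addition of two vectors of the same length
  println(sum(z))  // 11.0

  // Walk only the stored entries, the way the GibbsSample example below does.
  var i = 0
  while (i < x.activeSize) {
    println(s"index=${x.indexAt(i)} value=${x.valueAt(i)}")
    i += 1
  }
}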
Example 1
Source File: Kinship.scala From seqspark with Apache License 2.0

package org.dizhang.seqspark.stat

import org.apache.spark.rdd.RDD
import org.dizhang.seqspark.ds._
import breeze.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.SparkContext
import scala.collection.mutable.ArrayBuffer

// The excerpt below is truncated in the original listing; the enclosing object
// declaration is restored here so that the fragment compiles.
object Kinship {

  /** Return the indices 0 until size with the (sorted) indices in nums removed. */
  def removeNums(size: Int, nums: IndexedSeq[Int]): IndexedSeq[Int] = {
    var j: Int = 0
    var i: Int = 0
    val res = ArrayBuffer[Int]()
    while (i < size) {
      if (j >= nums.length) {
        res.+=(i)
      } else if (i == nums(j)) {
        j += 1
      } else {
        res.+=(i)
      }
      i += 1
    }
    res.toIndexedSeq
  }
}
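As a quick worked call of the removeNums helper above (arguments chosen arbitrarily): removing the sorted positions 1 and 3 from the index range 0 until 5 keeps the remaining indices.

val kept = Kinship.removeNums(5, IndexedSeq(1, 3))
// kept == Vector(0, 2, 4): indices 1 and 3 are skipped, everything else is kept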
Example 2
Source File: LibSvmTest.scala From scio with Apache License 2.0

package com.spotify.scio.extra.libsvm

import breeze.linalg.SparseVector
import com.spotify.scio.testing.PipelineSpec

class LibSvmTest extends PipelineSpec {
  val expected = List(
    (0.0, SparseVector[Double](34)((0, 1), (8, 1), (18, 1), (20, 1), (23, 1), (33, 1))),
    (1.0, SparseVector[Double](34)((2, 1), (8, 1), (18, 1), (20, 1), (29, 1), (33, 1))),
    (0.0, SparseVector[Double](34)((0, 1), (8, 1), (19, 1), (20, 1), (23, 1), (33, 1)))
  )

  val data = List(
    "0 1:1 9:1 19:1 21:1 24:1 34:1",
    "1 3:1 9:1 19:1 21:1 30:1 34:1",
    "0 1:1 9:1 20:1 21:1 24:1 34:1"
  )

  "libSVMCollection" should "parse libsvm files" in {
    runWithContext { sc =>
      val res = libSVMCollection(sc.parallelize(data))
      res should containInAnyOrder(expected)
    }
  }

  it should "parse libsvm files with length" in {
    runWithContext { sc =>
      val res = libSVMCollection(sc.parallelize(data), 34)
      res should containInAnyOrder(expected)
    }
  }
}
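The test above pins down how libsvm's 1-based "index:value" tokens map onto 0-based SparseVector indices. The snippet below is only a rough sketch of that mapping for a single line, not scio's actual libSVMCollection parser; parseLibSvmLine is a made-up helper name.

import breeze.linalg.SparseVector

def parseLibSvmLine(line: String, numFeatures: Int): (Double, SparseVector[Double]) = {
  val tokens = line.trim.split("\\s+")
  val label = tokens.head.toDouble
  val entries = tokens.tail.map { t =>
    val Array(i, v) = t.split(":")
    (i.toInt - 1, v.toDouble) // libsvm indices are 1-based, breeze indices are 0-based
  }
  (label, SparseVector[Double](numFeatures)(entries: _*))
}

// parseLibSvmLine("0 1:1 9:1 19:1 21:1 24:1 34:1", 34) yields label 0.0 and a
// vector whose active indices are 0, 8, 18, 20, 23, 33 -- matching `expected` above.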
Example 3
Source File: BreezeSpec.scala From scio with Apache License 2.0

package com.spotify.scio.extra

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector}
import breeze.stats.distributions.Rand
import com.spotify.scio.extra.Breeze._
import com.twitter.algebird.Semigroup
import org.scalacheck._

trait BreezeSpec[M[_], T] extends PropertySpec {
  val dimension = 10
  val rows = 20
  val cols = 10
  val fRand = Rand.uniform.map(_.toFloat)
  val m: Gen[M[T]]
  def ms: Gen[List[M[T]]] = Gen.listOf[M[T]](m)
  def plus(x: M[T], y: M[T])(implicit sg: Semigroup[M[T]]): M[T] = sg.plus(x, y)
  def sumOption(xs: Iterable[M[T]])(implicit sg: Semigroup[M[T]]): Option[M[T]] =
    sg.sumOption(xs)
}

class FloatDenseVectorSpec extends BreezeSpec[DenseVector, Float] {
  val m = Gen.const(dimension).map(DenseVector.rand[Float](_, fRand))
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class DoubleDenseVectorSpec extends BreezeSpec[DenseVector, Double] {
  val m = Gen.const(dimension).map(DenseVector.rand[Double](_))
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class FloatDenseMatrixSpec extends BreezeSpec[DenseMatrix, Float] {
  val m = Gen.const((rows, cols)).map {
    case (r, c) => DenseMatrix.rand[Float](r, c, fRand)
  }
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class DoubleDenseMatrixSpec extends BreezeSpec[DenseMatrix, Double] {
  val m = Gen.const((rows, cols)).map {
    case (r, c) => DenseMatrix.rand[Double](r, c)
  }
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class FloatSparseVectorSpec extends BreezeSpec[SparseVector, Float] {
  val m = Gen
    .const(dimension)
    .map(d => SparseVector(DenseVector.rand[Float](d, fRand).data))
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}

class DoubleSparseVectorSpec extends BreezeSpec[SparseVector, Double] {
  val m = Gen
    .const(dimension)
    .map(d => SparseVector(DenseVector.rand[Double](d).data))
  property("plus") {
    forAll(m, m)((x, y) => plus(x, y) == x + y)
  }
  property("sumOption") {
    forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _))
  }
}
Example 4
Source File: PassiveAggressiveBinaryModelEvaluation.scala From flink-parameter-server with Apache License 2.0

package hu.sztaki.ilab.ps.test.utils

import breeze.linalg.{DenseVector, SparseVector}
import hu.sztaki.ilab.ps.passive.aggressive.algorithm.PassiveAggressiveBinaryAlgorithm
import org.slf4j.LoggerFactory

class PassiveAggressiveBinaryModelEvaluation

object PassiveAggressiveBinaryModelEvaluation {

  private val log = LoggerFactory.getLogger(classOf[PassiveAggressiveBinaryModelEvaluation])

  def accuracy(model: DenseVector[Double],
               testLines: Traversable[(SparseVector[Double], Option[Boolean])],
               featureCount: Int,
               pac: PassiveAggressiveBinaryAlgorithm): Double = {
    var tt = 0
    var ff = 0
    var tf = 0
    var ft = 0
    var cnt = 0
    testLines.foreach { case (vector, label) =>
      label match {
        case Some(lab) =>
          val real = lab
          val predicted = pac.predict(vector, model)
          (real, predicted) match {
            case (true, true)   => tt += 1
            case (false, false) => ff += 1
            case (true, false)  => tf += 1
            case (false, true)  => ft += 1
          }
          cnt += 1
        case _ => throw new IllegalStateException("Labels should not be missing.")
      }
    }

    val percent = ((tt + ff).toDouble / cnt) * 100
    percent
  }
}
Example 5
Source File: PassiveAggressiveMultiModelEvaluation.scala From flink-parameter-server with Apache License 2.0

package hu.sztaki.ilab.ps.test.utils

import breeze.linalg.{DenseMatrix, SparseVector}
import hu.sztaki.ilab.ps.passive.aggressive.algorithm.PassiveAggressiveMulticlassAlgorithm
import org.slf4j.LoggerFactory

class PassiveAggressiveMultiModelEvaluation

object PassiveAggressiveMultiModelEvaluation {

  private val log = LoggerFactory.getLogger(classOf[PassiveAggressiveMultiModelEvaluation])

  def accuracy(model: DenseMatrix[Double],
               testLines: Traversable[(SparseVector[Double], Option[Int])],
               featureCount: Int,
               pac: PassiveAggressiveMulticlassAlgorithm): Double = {
    var hit = 0
    var cnt = 0
    testLines.foreach { case (vector, label) =>
      label match {
        case Some(l) =>
          if (pac.predict(vector, model) == l) hit += 1
          cnt += 1
        case _ => throw new IllegalStateException("Labels should not be missing.")
      }
    }

    val percent = (hit.toDouble / cnt) * 100
    percent
  }
}
Example 6
Source File: driver.scala From proxcocoa with Apache License 2.0

package l1distopt

import breeze.linalg.SparseVector
import l1distopt.utils._
import l1distopt.solvers._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkConf}

object driver {

  def main(args: Array[String]) {

    val options = args.map { arg =>
      arg.dropWhile(_ == '-').split('=') match {
        case Array(opt, v) => (opt -> v)
        case Array(opt)    => (opt -> "true")
        case _ => throw new IllegalArgumentException("Invalid argument: " + arg)
      }
    }.toMap

    // read in inputs
    val master = options.getOrElse("master", "local[4]")
    val trainFile = options.getOrElse("trainFile", "")
    val numFeatures = options.getOrElse("numFeatures", "0").toInt
    val numSplits = options.getOrElse("numSplits", "1").toInt
    val testFile = options.getOrElse("testFile", "")

    // algorithm-specific inputs
    val eta = options.getOrElse("eta", "1.0").toDouble // elastic net parameter: 1.0 = lasso, 0.0 = ridge regression
    val lambda = options.getOrElse("lambda", "0.01").toDouble // regularization parameter
    val numRounds = options.getOrElse("numRounds", "200").toInt // number of outer iterations, called T in the paper
    val localIterFrac = options.getOrElse("localIterFrac", "1.0").toDouble // fraction of local points to be processed per round, H = localIterFrac * n
    val debugIter = options.getOrElse("debugIter", "10").toInt // set to -1 to turn off debugging output
    val seed = options.getOrElse("seed", "0").toInt // set seed for debug purposes

    // print out inputs
    println("master: " + master)
    println("trainFile: " + trainFile)
    println("numFeatures: " + numFeatures)
    println("numSplits: " + numSplits)
    println("testfile: " + testFile)
    println("eta " + eta)
    println("lambda: " + lambda)
    println("numRounds: " + numRounds)
    println("localIterFrac:" + localIterFrac)
    println("debugIter " + debugIter)
    println("seed " + seed)

    // start spark context
    val conf = new SparkConf().setMaster(master)
      .setAppName("Sparse-CoCoA")
      .setJars(SparkContext.jarOfObject(this).toSeq)
    val sc = new SparkContext(conf)

    // NOTE: the original file loads the training data here (producing `data`, `labels`,
    // `params` and `debug`); that portion is omitted from this excerpt.
    val finalAlphaCoCoA = ProxCoCoAp.runProxCoCoAp(data, labels, params, debug)

    sc.stop()
  }
}
Example 7
Source File: NearestNeighbors.scala From SparkSMOTE with MIT License

package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer

object NearestNeighbors {

  def runNearestNeighbors(data: RDD[Array[(LabeledPoint, Int, Int)]],
                          kNN: Int,
                          sampleData: Array[(LabeledPoint, Int, Int)]): Array[(String, Array[((Int, Int), Double)])] = {

    val globalNearestNeighborsByIndex = data
      .mapPartitionsWithIndex(localNearestNeighbors(_, _, kNN, sampleData))
      .groupByKey()
      .map(x => (x._1, x._2.toArray.sortBy(r => r._2).take(kNN)))
      .collect()

    globalNearestNeighborsByIndex
  }

  private def localNearestNeighbors(partitionIndex: Long,
                                    iter: Iterator[Array[(LabeledPoint, Int, Int)]],
                                    kNN: Int,
                                    sampleData: Array[(LabeledPoint, Int, Int)]): Iterator[(String, ((Int, Int), Double))] = {

    var result = List[(String, ((Int, Int), Double))]()
    val dataArr = iter.next
    val nLocal = dataArr.size - 1
    val sampleDataSize = sampleData.size - 1

    val kLocalNeighbors = Array.fill[distanceIndex](sampleDataSize + 1)(null)
    for {
      i1 <- 0 to sampleDataSize
    } kLocalNeighbors(i1) = distanceIndex(sampleData(i1)._3.toInt, sampleData(i1)._2.toInt,
      DenseVector.zeros[Double](kNN) + Int.MaxValue.toDouble, DenseVector.zeros[Int](kNN))

    for (i <- 0 to nLocal) {
      val currentPoint = dataArr(i)
      val features = currentPoint._1.features
      val rowId = currentPoint._3.toInt
      for (j <- 0 to sampleDataSize) {
        val samplePartitionId = sampleData(j)._2
        val sampleRowId = sampleData(j)._3
        val sampleFeatures = sampleData(j)._1.features
        if (!((rowId == sampleRowId) & (samplePartitionId == partitionIndex))) {
          val distance = Math.sqrt(sum((sampleFeatures - features) :* (sampleFeatures - features)))
          if (distance < max(kLocalNeighbors(j).distanceVector)) {
            val indexToReplace = argmax(kLocalNeighbors(j).distanceVector)
            kLocalNeighbors(j).distanceVector(indexToReplace) = distance
            kLocalNeighbors(j).neighborRowId(indexToReplace) = rowId
          }
        }
      }
    }

    for (m <- 0 to sampleDataSize) {
      for (l <- 0 to kNN - 1) {
        val key = kLocalNeighbors(m).partitionId.toString + "," + kLocalNeighbors(m).sampleRowId.toString
        val tup = (partitionIndex.toInt, kLocalNeighbors(m).neighborRowId(l))
        result.::=(key, (tup, kLocalNeighbors(m).distanceVector(l)))
      }
    }
    result.iterator
  }
}
Example 8
Source File: loadData.scala From SparkSMOTE with MIT License

package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object loadData {

  def readDelimitedData(sc: SparkContext,
                        path: String,
                        numFeatures: Int,
                        delimiter: String,
                        numPartitions: Int): RDD[(LabeledPoint, Int, Int)] = {

    val data = sc.textFile(path)
      .filter { x => x.split(delimiter)(0).toDouble == 1.0 }
      .repartition(numPartitions)
      .mapPartitions { x => Iterator(x.toArray) }

    val formatData = data.mapPartitionsWithIndex { (partitionId, iter) =>
      var result = List[(LabeledPoint, Int, Int)]()
      val dataArray = iter.next
      val dataArraySize = dataArray.size - 1
      var rowCount = dataArraySize
      for (i <- 0 to dataArraySize) {
        val parts = dataArray(i).split(delimiter)
        result.::=((LabeledPoint(parts(0).toDouble, DenseVector(parts.slice(1, numFeatures + 1)).map(_.toDouble)), partitionId.toInt, rowCount))
        rowCount = rowCount - 1
      }
      result.iterator
    }

    formatData
  }
}
Example 9
Source File: SMOTE.scala From SparkSMOTE with MIT License

package SMOTE

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer
import utils._

object SMOTE {

  def runSMOTE(sc: SparkContext,
               inPath: String,
               outPath: String,
               numFeatures: Int,
               oversamplingPctg: Double,
               kNN: Int,
               delimiter: String,
               numPartitions: Int): Unit = {

    val rand = new Random()

    val data = loadData.readDelimitedData(sc, inPath, numFeatures, delimiter, numPartitions)

    val dataArray = data.mapPartitions(x => Iterator(x.toArray)).cache()

    val numObs = dataArray.map(x => x.size).reduce(_ + _)

    println("Number of Filtered Observations " + numObs.toString)

    val roundPctg = oversamplingPctg
    val sampleData = dataArray
      .flatMap(x => x)
      .sample(withReplacement = false, fraction = roundPctg, seed = 1L)
      .collect()
      .sortBy(r => (r._2, r._3)) // without replacement

    println("Sample Data Count " + sampleData.size.toString)

    val globalNearestNeighbors = NearestNeighbors.runNearestNeighbors(dataArray, kNN, sampleData)

    var randomNearestNeighbor = globalNearestNeighbors
      .map(x => (x._1.split(",")(0).toInt, x._1.split(",")(1).toInt, x._2(rand.nextInt(kNN))))
      .sortBy(r => (r._1, r._2))

    var sampleDataNearestNeighbors = randomNearestNeighbor
      .zip(sampleData)
      .map(x => (x._1._3._1._1, x._1._2, x._1._3._1._2, x._2._1))

    val syntheticData = dataArray
      .mapPartitionsWithIndex(createSyntheticData(_, _, sampleDataNearestNeighbors, delimiter))
      .persist()

    println("Synthetic Data Count " + syntheticData.count.toString)
    val newData = syntheticData.union(sc.textFile(inPath))
    println("New Line Count " + newData.count.toString)
    newData.saveAsTextFile(outPath)
  }

  private def createSyntheticData(partitionIndex: Long,
                                  iter: Iterator[Array[(LabeledPoint, Int, Int)]],
                                  sampleDataNN: Array[(Int, Int, Int, LabeledPoint)],
                                  delimiter: String): Iterator[String] = {

    var result = List[String]()
    val dataArr = iter.next
    val nLocal = dataArr.size - 1
    val sampleDataNNSize = sampleDataNN.size - 1
    val rand = new Random()

    for (j <- 0 to sampleDataNNSize) {
      val partitionId = sampleDataNN(j)._1
      val neighborId = sampleDataNN(j)._3
      val sampleFeatures = sampleDataNN(j)._4.features
      if (partitionId == partitionIndex.toInt) {
        val currentPoint = dataArr(neighborId)
        val features = currentPoint._1.features
        sampleFeatures += (sampleFeatures - features) * rand.nextDouble
        result.::=("1.0" + delimiter + sampleFeatures.toArray.mkString(delimiter))
      }
    }
    result.iterator
  }
}
Example 10
Source File: TensorLDAModelTest.scala From spectrallda-tensorspark with Apache License 2.0

package edu.uci.eecs.spectralLDA.algorithm

import breeze.linalg.{DenseMatrix, DenseVector, SparseVector, norm}
import breeze.numerics.abs
import org.scalatest._
import org.apache.spark.SparkContext
import edu.uci.eecs.spectralLDA.testharness.Context

class TensorLDAModelTest extends FlatSpec with Matchers {

  private val sc: SparkContext = Context.getSparkContext

  "Multinomial log-likelihood" should "be correct" in {
    val p = DenseVector[Double](0.2, 0.5, 0.3)
    val x1 = DenseVector[Double](20, 50, 30)
    val x2 = DenseVector[Double](40, 40, 20)

    abs(TensorLDAModel.multinomialLogLikelihood(p, x1) - (-4.697546)) should be <= 1e-6
    abs(TensorLDAModel.multinomialLogLikelihood(p, x2) - (-15.42038)) should be <= 1e-6
  }
}
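The expected constants in that test are log multinomial probabilities. As a hedged sketch of where -4.697546 comes from (this is the standard multinomial log-PMF, not necessarily the project's exact implementation; multinomialLogPmf is a made-up name):

import breeze.linalg.{DenseVector, sum}
import breeze.numerics.{lgamma, log}

// log of the multinomial PMF: log(n! / prod_i x_i!) + sum_i x_i * log(p_i), with n = sum_i x_i
def multinomialLogPmf(p: DenseVector[Double], x: DenseVector[Double]): Double = {
  val n = sum(x)
  lgamma(n + 1.0) - sum(lgamma(x + 1.0)) + sum(x *:* log(p))
}

// multinomialLogPmf(DenseVector(0.2, 0.5, 0.3), DenseVector[Double](20, 50, 30))
// is approximately -4.697546, matching the first expected value above.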
Example 11
Source File: GibbsSample.scala From glintlda with MIT License

package glintlda

import breeze.linalg.{DenseVector, SparseVector, sum}
import glintlda.util.FastRNG

// The excerpt is truncated in the original listing (the GibbsSample class itself is omitted);
// the enclosing companion object declaration is restored here so that the fragment compiles.
object GibbsSample {

  def apply(sv: SparseVector[Int], random: FastRNG, topics: Int): GibbsSample = {
    val totalTokens = sum(sv)
    val sample = new GibbsSample(new Array[Int](totalTokens), new Array[Int](totalTokens))
    var i = 0
    var current = 0
    while (i < sv.activeSize) {
      val index = sv.indexAt(i)
      var value = sv.valueAt(i)
      while (value > 0) {
        sample.features(current) = index
        sample.topics(current) = random.nextPositiveInt() % topics
        current += 1
        value -= 1
      }
      i += 1
    }
    sample
  }
}
Example 12
Source File: ScoreTest.scala From seqspark with Apache License 2.0

package org.dizhang.seqspark.stat

import breeze.linalg.{*, CSCMatrix, DenseMatrix, DenseVector, SparseVector}
import org.dizhang.seqspark.stat.HypoTest.NullModel.{Fitted => SNM}
import org.dizhang.seqspark.util.General._

object ScoreTest {

  def apply(nm: SNM, x: CSCMatrix[Double]): ScoreTest = {
    Sparse(nm, x)
  }

  def apply(nm: SNM, x: DenseMatrix[Double]): ScoreTest = {
    Dense(nm, x)
  }

  def apply(nm: SNM, x: DenseVector[Double]): ScoreTest = {
    Dense(nm, DenseVector.horzcat(x))
  }

  def apply(nm: SNM, x: SparseVector[Double]): ScoreTest = {
    Sparse(nm, SparseVector.horzcat(x))
  }

  def apply(nm: SNM, x1: DenseMatrix[Double], x2: CSCMatrix[Double]): ScoreTest = {
    Mixed(nm, x1, x2)
  }

  case class Sparse(nm: SNM, x: CSCMatrix[Double]) extends ScoreTest {
    val score = (nm.residuals.toDenseMatrix * x).toDenseVector / nm.a
    lazy val variance = {
      val c = nm.xs
      val IccInv = nm.invInfo * nm.a
      val Igg = (colMultiply(x, nm.b).t * x).toDense
      val Icg = (c(::, *) *:* nm.b).t * x
      val Igc = Icg.t
      (Igg - Igc * IccInv * Icg) / nm.a
    }
  }

  case class Dense(nm: SNM, x: DenseMatrix[Double]) extends ScoreTest {
    val score = x.t * nm.residuals / nm.a
    lazy val variance = {
      val c = nm.xs
      val IccInv = nm.invInfo * nm.a
      val Igg = (x(::, *) *:* nm.b).t * x
      val Icg = (c(::, *) *:* nm.b).t * x
      val Igc = Icg.t
      (Igg - Igc * IccInv * Icg) / nm.a
    }
  }

  case class Mixed(nm: SNM, x1: DenseMatrix[Double], x2: CSCMatrix[Double]) extends ScoreTest {
    private val dense = Dense(nm, x1)
    private val sparse = Sparse(nm, x2)
    val score = DenseVector.vertcat(dense.score, sparse.score)
    lazy val variance = {
      val v1 = dense.variance
      val v4 = sparse.variance
      val v2 = {
        val c = nm.xs
        val IccInv = nm.invInfo * nm.a
        val Igg = (x1(::, *) *:* nm.b).t * x2
        val Icg = (c(::, *) *:* nm.b).t * x2
        val Igc = x1.t * (c(::, *) *:* nm.b).t
        (Igg - Igc * IccInv * Icg) / nm.a
      }
      val v3 = v2.t
      val v12 = DenseMatrix.horzcat(v1, v2)
      val v34 = DenseMatrix.horzcat(v3, v4)
      DenseMatrix.vertcat(v12, v34)
    }
  }

  case class Mock(score: DenseVector[Double], variance: DenseMatrix[Double]) extends ScoreTest
}

@SerialVersionUID(7778780001L)
sealed trait ScoreTest extends HypoTest {
  def score: DenseVector[Double]
  def variance: DenseMatrix[Double]
}
Example 13
Source File: NewsgroupsPipeline.scala From keystone with Apache License 2.0

package keystoneml.pipelines.text

import breeze.linalg.SparseVector
import keystoneml.evaluation.MulticlassClassifierEvaluator
import keystoneml.loaders.NewsgroupsDataLoader
import keystoneml.nodes.learning.NaiveBayesEstimator
import keystoneml.nodes.nlp._
import keystoneml.nodes.stats.TermFrequency
import keystoneml.nodes.util.{CommonSparseFeatures, MaxClassifier}
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.pipelines.Logging
import scopt.OptionParser
import keystoneml.workflow.Pipeline

object NewsgroupsPipeline extends Logging {
  val appName = "NewsgroupsPipeline"

  def run(sc: SparkContext, conf: NewsgroupsConfig): Pipeline[String, Int] = {

    val trainData = NewsgroupsDataLoader(sc, conf.trainLocation)
    val numClasses = NewsgroupsDataLoader.classes.length

    // Build the classifier estimator
    logInfo("Training classifier")
    val predictor = Trim andThen
      LowerCase() andThen
      Tokenizer() andThen
      NGramsFeaturizer(1 to conf.nGrams) andThen
      TermFrequency(x => 1) andThen
      (CommonSparseFeatures[Seq[String]](conf.commonFeatures), trainData.data) andThen
      (NaiveBayesEstimator[SparseVector[Double]](numClasses), trainData.data, trainData.labels) andThen
      MaxClassifier

    // Evaluate the classifier
    logInfo("Evaluating classifier")
    val testData = NewsgroupsDataLoader(sc, conf.testLocation)
    val testLabels = testData.labels
    val testResults = predictor(testData.data)
    val eval = new MulticlassClassifierEvaluator(numClasses).evaluate(testResults, testLabels)

    logInfo("\n" + eval.summary(NewsgroupsDataLoader.classes))

    predictor
  }

  case class NewsgroupsConfig(
    trainLocation: String = "",
    testLocation: String = "",
    nGrams: Int = 2,
    commonFeatures: Int = 100000)

  def parse(args: Array[String]): NewsgroupsConfig = new OptionParser[NewsgroupsConfig](appName) {
    head(appName, "0.1")
    opt[String]("trainLocation") required() action { (x, c) => c.copy(trainLocation = x) }
    opt[String]("testLocation") required() action { (x, c) => c.copy(testLocation = x) }
    opt[Int]("nGrams") action { (x, c) => c.copy(nGrams = x) }
    opt[Int]("commonFeatures") action { (x, c) => c.copy(commonFeatures = x) }
  }.parse(args, NewsgroupsConfig()).get

  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName(appName)
    conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit.
    val sc = new SparkContext(conf)
    val appConfig = parse(args)
    run(sc, appConfig)
    sc.stop()
  }
}
Example 14
Source File: AmazonReviewsPipeline.scala From keystone with Apache License 2.0

package keystoneml.pipelines.text

import breeze.linalg.SparseVector
import keystoneml.evaluation.BinaryClassifierEvaluator
import keystoneml.loaders.{AmazonReviewsDataLoader, LabeledData}
import keystoneml.nodes.learning.LogisticRegressionEstimator
import keystoneml.nodes.nlp._
import keystoneml.nodes.stats.TermFrequency
import keystoneml.nodes.util.CommonSparseFeatures
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.pipelines.Logging
import scopt.OptionParser
import keystoneml.workflow.Pipeline

object AmazonReviewsPipeline extends Logging {
  val appName = "AmazonReviewsPipeline"

  def run(spark: SparkSession, conf: AmazonReviewsConfig): Pipeline[String, Double] = {
    val amazonTrainData = AmazonReviewsDataLoader(spark, conf.trainLocation, conf.threshold).labeledData
    val trainData = LabeledData(amazonTrainData.repartition(conf.numParts).cache())

    val training = trainData.data
    val labels = trainData.labels

    // Build the classifier estimator
    val predictor = Trim andThen
      LowerCase() andThen
      Tokenizer() andThen
      NGramsFeaturizer(1 to conf.nGrams) andThen
      TermFrequency(x => 1) andThen
      (CommonSparseFeatures[Seq[String]](conf.commonFeatures), training) andThen
      (LogisticRegressionEstimator[SparseVector[Double]](numClasses = 2, numIters = conf.numIters), training, labels)

    // Evaluate the classifier
    val amazonTestData = AmazonReviewsDataLoader(spark, conf.testLocation, conf.threshold).labeledData
    val testData = LabeledData(amazonTestData.repartition(conf.numParts).cache())
    val testLabels = testData.labels
    val testResults = predictor(testData.data)
    val eval = BinaryClassifierEvaluator.evaluate(testResults.get.map(_ > 0), testLabels.map(_ > 0))

    logInfo("\n" + eval.summary())
    predictor
  }

  case class AmazonReviewsConfig(
    trainLocation: String = "",
    testLocation: String = "",
    threshold: Double = 3.5,
    nGrams: Int = 2,
    commonFeatures: Int = 100000,
    numIters: Int = 20,
    numParts: Int = 512)

  def parse(args: Array[String]): AmazonReviewsConfig = new OptionParser[AmazonReviewsConfig](appName) {
    head(appName, "0.1")
    opt[String]("trainLocation") required() action { (x, c) => c.copy(trainLocation = x) }
    opt[String]("testLocation") required() action { (x, c) => c.copy(testLocation = x) }
    opt[Double]("threshold") action { (x, c) => c.copy(threshold = x) }
    opt[Int]("nGrams") action { (x, c) => c.copy(nGrams = x) }
    opt[Int]("commonFeatures") action { (x, c) => c.copy(commonFeatures = x) }
    opt[Int]("numIters") action { (x, c) => c.copy(numIters = x) }
    opt[Int]("numParts") action { (x, c) => c.copy(numParts = x) }
  }.parse(args, AmazonReviewsConfig()).get

  def main(args: Array[String]) = {
    val conf = new SparkConf().setAppName(appName)
    conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit.
    val spark = SparkSession.builder.config(conf).getOrCreate()
    val appConfig = parse(args)
    run(spark, appConfig)
    spark.stop()
  }
}
Example 15
Source File: AllSparseFeatures.scala From keystone with Apache License 2.0

package keystoneml.nodes.util

import breeze.linalg.SparseVector
import org.apache.spark.rdd.RDD
import keystoneml.workflow.Estimator

import scala.reflect.ClassTag

case class AllSparseFeatures[T: ClassTag]() extends Estimator[Seq[(T, Double)], SparseVector[Double]] {
  override def fit(data: RDD[Seq[(T, Double)]]): SparseFeatureVectorizer[T] = {
    val featureOccurrences = data.flatMap(_.map(_._1))
    // zip with unique ids and take the smallest unique id for a given feature to get
    // a deterministic ordering
    val featuresWithUniqueId = featureOccurrences.zipWithUniqueId().reduceByKey {
      (x, y) => Math.min(x, y)
    }
    val featureSpace = featuresWithUniqueId.sortBy(_._2).map(_._1)
      .collect().zipWithIndex.toMap
    new SparseFeatureVectorizer(featureSpace)
  }
}
Example 16
Source File: SparseFeatureVectorizer.scala From keystone with Apache License 2.0

package keystoneml.nodes.util

import breeze.linalg.SparseVector
import keystoneml.workflow.Transformer

class SparseFeatureVectorizer[T](featureSpace: Map[T, Int])
    extends Transformer[Seq[(T, Double)], SparseVector[Double]] {

  private def transformVector(in: Seq[(T, Double)], featureSpaceMap: Map[T, Int]): SparseVector[Double] = {
    val features = in.map(f => (featureSpaceMap.get(f._1), f._2))
      .filter(_._1.isDefined)
      .map(f => (f._1.get, f._2.toDouble))
    SparseVector(featureSpaceMap.size)(features: _*)
  }

  override def apply(in: Seq[(T, Double)]): SparseVector[Double] = {
    transformVector(in, featureSpace)
  }
}
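A brief usage sketch of the vectorizer above, with a made-up three-feature space: features found in the map become active entries, and unknown features are silently dropped.

val vectorizer = new SparseFeatureVectorizer[String](Map("cat" -> 0, "dog" -> 1, "fish" -> 2))

// "bird" is not in the feature space, so it is dropped.
val vec = vectorizer(Seq(("cat", 1.0), ("bird", 5.0), ("dog", 2.0)))
// vec is a SparseVector[Double] of length 3 with vec(0) == 1.0, vec(1) == 2.0 and vec(2) == 0.0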
Example 17
Source File: NGramsHashingTF.scala From keystone with Apache License 2.0

package keystoneml.nodes.nlp

import java.lang.Integer.{rotateLeft => rotl}

import breeze.linalg.SparseVector
import keystoneml.workflow.Transformer

import scala.collection.mutable

// NOTE: this excerpt is truncated in the original listing. The enclosing NGramsHashingTF
// transformer declaration and its MurmurHash-style helpers (mix, finalizeHash, seqSeed),
// as well as the minOrder, maxOrder and numFeatures members referenced below, are omitted.

private final def avalanche(hash: Int): Int = {
  var h = hash
  h ^= h >>> 16
  h *= 0x85ebca6b
  h ^= h >>> 13
  h *= 0xc2b2ae35
  h ^= h >>> 16
  h
}

def nonNegativeMod(x: Int, mod: Int): Int = {
  val rawMod = x % mod
  rawMod + (if (rawMod < 0) mod else 0)
}

def apply(line: Seq[String]): SparseVector[Double] = {
  val hashes = new Array[Integer](line.length)
  var i = 0
  while (i < line.length) {
    hashes(i) = line(i).##
    i += 1
  }

  var j = 0
  val termFrequencies = mutable.HashMap.empty[Int, Double]
  i = 0
  while (i + minOrder <= line.length) {
    var order = minOrder
    var h = seqSeed
    j = i
    while (j < i + minOrder) {
      h = mix(h, hashes(j))
      j += 1
    }
    val feature = nonNegativeMod(finalizeHash(h, order), numFeatures)
    termFrequencies.put(feature, termFrequencies.getOrElse(feature, 0.0) + 1.0)

    order = minOrder + 1
    while (order <= maxOrder && i + order <= line.length) {
      h = mix(h, hashes(i + order - 1))
      val feature = nonNegativeMod(finalizeHash(h, order), numFeatures)
      termFrequencies.put(feature, termFrequencies.getOrElse(feature, 0.0) + 1.0)
      order += 1
    }
    i += 1
  }
  SparseVector(numFeatures)(termFrequencies.toSeq: _*)
}
} // closes the enclosing transformer class that is omitted from this excerpt
Example 18
Source File: HashingTF.scala From keystone with Apache License 2.0

package keystoneml.nodes.nlp

import breeze.linalg.SparseVector
import keystoneml.workflow.Transformer

case class HashingTF[T <: Seq[Any]](numFeatures: Int) extends Transformer[T, SparseVector[Double]] {

  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  def apply(document: T): SparseVector[Double] = {
    val termFrequencies = scala.collection.mutable.HashMap.empty[Int, Double]
    document.foreach { term =>
      val i = nonNegativeMod(term.##, numFeatures)
      termFrequencies.put(i, termFrequencies.getOrElse(i, 0.0) + 1.0)
    }
    SparseVector(numFeatures)(termFrequencies.toSeq: _*)
  }
}
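And a small usage sketch of HashingTF (token values are arbitrary): each token is hashed into one of numFeatures buckets, and repeated tokens accumulate counts in the same bucket.

val tf = HashingTF[Seq[String]](1024)
val vec = tf(Seq("spark", "breeze", "spark"))

// Barring a hash collision, vec has two active entries and the bucket
// that "spark" maps to holds 2.0.
println(vec.activeSize)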
Example 19
Source File: MLlibUtils.scala From keystone with Apache License 2.0

package keystoneml.utils

import breeze.linalg.{SparseVector, DenseMatrix, DenseVector}

// The excerpt is truncated in the original listing; the enclosing object declaration
// is restored here so that the fragment compiles.
object MLlibUtils {

  def breezeVectorToMLlib(breezeVector: breeze.linalg.Vector[Double]): org.apache.spark.mllib.linalg.Vector = {
    breezeVector match {
      case v: DenseVector[Double] =>
        if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) {
          new org.apache.spark.mllib.linalg.DenseVector(v.data)
        } else {
          new org.apache.spark.mllib.linalg.DenseVector(v.toArray) // Can't use underlying array directly, so make a new one
        }
      case v: SparseVector[Double] =>
        if (v.index.length == v.used) {
          new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index, v.data)
        } else {
          new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used))
        }
      case v: breeze.linalg.Vector[_] =>
        sys.error("Unsupported Breeze vector type: " + v.getClass.getName)
    }
  }
}
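For illustration, a hedged sketch of converting a breeze SparseVector with the helper above (requires spark-mllib on the classpath; the values are arbitrary):

import breeze.linalg.SparseVector

val bv = SparseVector[Double](5)((1, 3.0), (4, 7.0))
val mllibVec = MLlibUtils.breezeVectorToMLlib(bv)
// mllibVec is an org.apache.spark.mllib.linalg.SparseVector of size 5
// with indices Array(1, 4) and values Array(3.0, 7.0)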
Example 20
Source File: SparseArray.scala From scalismo-faces with Apache License 2.0

package scalismo.faces.numerics

import java.util

import breeze.linalg.SparseVector

private[numerics] class SparseArray(var index: Array[Int], var data: Array[Double], var nnz: Int, val length: Int) {
  require(nnz <= length, "too many non zeros")
  require(data.length == index.length, "data and index have different length")
  require(data.length >= nnz, "data array is too short")

  def activeSize = nnz

  def update(i: Int, v: Double): Unit = {
    val offset = findOffset(i)
    if (offset >= 0)
      data(offset) = v
    else {
      val insert = ~offset
      nnz += 1
      if (nnz > index.length) reallocate()
      // insert
      // move right part
      System.arraycopy(index, insert, index, insert + 1, nnz - insert - 1)
      System.arraycopy(data, insert, data, insert + 1, nnz - insert - 1)
      // insert data
      index(insert) = i
      data(insert) = v
    }
  }

  private def reallocate() = {
    val newLength = math.max(nnz + 1, index.length * 2)
    val _index = new Array[Int](newLength)
    val _data = new Array[Double](newLength)
    System.arraycopy(index, 0, _index, 0, index.length)
    System.arraycopy(data, 0, _data, 0, data.length)
    index = _index
    data = _data
  }

  def apply(i: Int): Double = {
    val offset = findOffset(i)
    if (offset >= 0)
      data(offset)
    else
      0.0
  }

  def findOffset(i: Int): Int = util.Arrays.binarySearch(index, 0, nnz, i)

  def toDense: Array[Double] = {
    val arr = new Array[Double](length)
    var i = 0
    while (i < nnz) {
      val ind = index(i)
      arr(ind) = data(i)
      i += 1
    }
    arr
  }
}

object SparseArray {
  def apply(vector: SparseVector[Double]): SparseArray = {
    val nnz = vector.activeSize
    val index = new Array[Int](nnz)
    val data = new Array[Double](nnz)
    System.arraycopy(vector.index, 0, index, 0, nnz)
    System.arraycopy(vector.data, 0, data, 0, nnz)
    new SparseArray(index, data, nnz, vector.length)
  }
}
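To close, a short hedged sketch of how the SparseArray above behaves (it is private[numerics], so this only compiles inside the scalismo.faces.numerics package; the values are arbitrary):

import breeze.linalg.SparseVector

// Build a SparseArray from an existing breeze SparseVector.
val sa = SparseArray(SparseVector[Double](6)((1, 2.0), (4, 5.0)))

sa(1)        // 2.0 -- stored entry
sa(3)        // 0.0 -- not stored
sa(3) = 9.0  // inserts a new non-zero, reallocating the backing arrays if needed
sa.toDense   // Array(0.0, 2.0, 0.0, 9.0, 5.0, 0.0)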