org.apache.spark.ml.feature.CountVectorizerModel Scala Examples
The following examples show how to use org.apache.spark.ml.feature.CountVectorizerModel.
Each example is drawn from an open-source project; the source file and license are noted above the code.
Example 1
Source File: CountVectorizerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.sql.SparkSession

object CountVectorizerExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("CountVectorizerExample")
      .getOrCreate()

    // $example on$
    val df = spark.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
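For reference, the show(false) call above should print something along these lines, assuming the fitted vocabulary is ordered a, b, c (the index order of equally frequent terms may vary):

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+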
Example 2
Source File: LocalCountVectorizerModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.linalg.Vectors

import scala.collection.mutable

class LocalCountVectorizerModel(override val sparkTransformer: CountVectorizerModel)
  extends LocalTransformer[CountVectorizerModel] {

  override def transform(localData: LocalData): LocalData = {
    import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
    val dict  = sparkTransformer.vocabulary.zipWithIndex.toMap
    val minTf = sparkTransformer.getMinTF

    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val newCol = column.data.map(_.asInstanceOf[List[String]]).map { arr =>
          // count occurrences of each in-vocabulary token
          val termCounts = mutable.HashMap.empty[Int, Double]
          var tokenCount = 0L
          arr.foreach { token =>
            dict.get(token) foreach { index =>
              val storedValue = termCounts.getOrElseUpdate(index, 0.0)
              termCounts.update(index, storedValue + 1.0)
            }
            tokenCount += 1
          }
          // minTF >= 1.0 is an absolute count; below 1.0 it is a fraction of document length
          val eTF = if (minTf >= 1.0) minTf else tokenCount * minTf
          val eCounts =
            if (sparkTransformer.getBinary) {
              termCounts filter (_._2 >= eTF) map (_._1 -> 1.0) toSeq
            } else {
              termCounts filter (_._2 >= eTF) toSeq
            }
          Vectors.sparse(dict.size, eCounts.toList).toList
        }
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newCol))
      case None => localData
    }
  }
}

object LocalCountVectorizerModel
  extends SimpleModelLoader[CountVectorizerModel]
  with TypedTransformerConverter[CountVectorizerModel] {

  override def build(metadata: Metadata, data: LocalData): CountVectorizerModel = {
    val vocabulary = data.column("vocabulary").get.data.head.asInstanceOf[Seq[String]].toArray
    val inst = new CountVectorizerModel(metadata.uid, vocabulary)
    inst
      .setInputCol(metadata.paramMap("inputCol").toString)
      .setOutputCol(metadata.paramMap("outputCol").toString)
      .set(inst.binary, metadata.paramMap("binary").asInstanceOf[Boolean])
      .set(inst.minDF, metadata.paramMap("minDF").toString.toDouble)
      .set(inst.minTF, metadata.paramMap("minTF").toString.toDouble)
      .set(inst.vocabSize, metadata.paramMap("vocabSize").asInstanceOf[Number].intValue())
  }

  override implicit def toLocal(
    transformer: CountVectorizerModel
  ) = new LocalCountVectorizerModel(transformer)
}
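The minTF handling above mirrors Spark's own CountVectorizerModel semantics. A minimal standalone sketch of the same counting rule, outside the serving wrapper (the function name and signature are illustrative, not part of spark-ml-serving):

import scala.collection.mutable

// Illustrative sketch of the term-frequency filter used above.
def countTokens(tokens: Seq[String], vocab: Map[String, Int],
                minTF: Double, binary: Boolean): Seq[(Int, Double)] = {
  val counts = mutable.HashMap.empty[Int, Double]
  tokens.foreach { t =>
    vocab.get(t).foreach(i => counts(i) = counts.getOrElse(i, 0.0) + 1.0)
  }
  // minTF >= 1.0 is an absolute count; below 1.0 it is a fraction of document length
  val threshold = if (minTF >= 1.0) minTF else tokens.size * minTF
  val kept = counts.toSeq.filter { case (_, c) => c >= threshold }
  if (binary) kept.map { case (i, _) => i -> 1.0 } else kept
}

// e.g. countTokens(Seq("a", "b", "b"), Map("a" -> 0, "b" -> 1), minTF = 2.0, binary = false)
// keeps only index 1 ("b") with count 2.0; "a" falls below the threshold.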
Example 3
Source File: CountVectorizerOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.param.Param

class CountVectorizerOp extends SimpleSparkOp[CountVectorizerModel] {
  override val Model: OpModel[SparkBundleContext, CountVectorizerModel] =
    new OpModel[SparkBundleContext, CountVectorizerModel] {
      override val klazz: Class[CountVectorizerModel] = classOf[CountVectorizerModel]

      override def opName: String = Bundle.BuiltinOps.feature.count_vectorizer

      override def store(model: Model, obj: CountVectorizerModel)
                        (implicit context: BundleContext[SparkBundleContext]): Model = {
        model.withValue("vocabulary", Value.stringList(obj.vocabulary)).
          withValue("binary", Value.boolean(obj.getBinary)).
          withValue("min_tf", Value.double(obj.getMinTF))
      }

      override def load(model: Model)
                       (implicit context: BundleContext[SparkBundleContext]): CountVectorizerModel = {
        new CountVectorizerModel(uid = "",
          vocabulary = model.value("vocabulary").getStringList.toArray).
          setBinary(model.value("binary").getBoolean).
          setMinTF(model.value("min_tf").getDouble)
      }
    }

  override def sparkLoad(uid: String,
                         shape: NodeShape,
                         model: CountVectorizerModel): CountVectorizerModel = {
    new CountVectorizerModel(uid = uid, vocabulary = model.vocabulary)
      .setBinary(model.getBinary)
      .setMinTF(model.getMinTF)
  }

  override def sparkInputs(obj: CountVectorizerModel): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: CountVectorizerModel): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
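Note that store and load round-trip only vocabulary, binary, and min_tf: fitting-time parameters such as minDF and vocabSize shape the vocabulary during fit but are not needed to apply an already-fitted model at serving time.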
Example 4
Source File: CountVectorizerExample.scala From sparkoscope with Apache License 2.0
(The code is identical to Example 1 above.)
Example 5
Source File: CountVectorizerExample.scala From multi-tenancy-spark with Apache License 2.0
(The code is identical to Example 1 above.)
Example 6
Source File: OpCountVectorizer.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.sparkwrappers.generic.SwUnaryModel
import com.salesforce.op.stages.sparkwrappers.specific.OpEstimatorWrapper
import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
import org.apache.spark.sql.Dataset

// NOTE: the enclosing class declaration is elided in this excerpt.

  def setVocabSize(value: Int): this.type = {
    getSparkMlStage().get.setVocabSize(value)
    this
  }

  override def fit(dataset: Dataset[_]): SwUnaryModel[TextList, OPVector, CountVectorizerModel] = {
    val model = super.fit(dataset)
    val vocab = model.getSparkMlStage().map(_.vocabulary).getOrElse(Array.empty[String])
    val tf = getTransientFeatures()
    val metadataCols = for {
      f <- tf
      word <- vocab
    } yield OpVectorColumnMetadata(
      parentFeatureName = Seq(f.name),
      parentFeatureType = Seq(f.typeName),
      grouping = None, // TODO do we want to test each word for label pred?
      indicatorValue = Option(word)
    )

    model.setMetadata(
      OpVectorMetadata(getOutputFeatureName, metadataCols,
        Transmogrifier.inputFeaturesToHistory(tf, stageName)).toMetadata
    )
    model
  }
}
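The for-comprehension in fit builds one OpVectorColumnMetadata entry per (input feature, vocabulary word) pair, so downstream stages can map each index of the output vector back to the word and the feature it came from.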
Example 7
Source File: CountVectorizerExample.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

// NOTE: the object and model definitions are elided in this excerpt;
// see Examples 1 and 9 for the full pattern.

    // transform() converts one DataFrame into another DataFrame
    cvm.transform(df).select("features").show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 8
Source File: CountVectorizerExample.scala From Spark-2.3.1 with Apache License 2.0
(The code is identical to Example 1 above.)
Example 9
Source File: CountVectorizerExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object CountVectorizerExample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("CounterVectorizerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val df = sqlContext.createDataFrame(Seq(
      (0, Array("a", "b", "c")),
      (1, Array("a", "b", "b", "c", "a"))
    )).toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    // alternatively, define CountVectorizerModel with a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    cvModel.transform(df).select("features").show()
    // $example off$
  }
}
// scalastyle:on println
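This variant targets pre-2.0 Spark APIs, where SQLContext (built from a SparkContext) is the entry point rather than SparkSession.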
Example 10
Source File: CountVectorizerDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

object CountVectorizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val df = spark.createDataFrame(
      Seq((0, Array("Jason", "David")),
        (1, Array("David", "Martin")),
        (2, Array("Martin", "Jason")),
        (3, Array("Jason", "Daiel")),
        (4, Array("Daiel", "Martin")),
        (5, Array("Moahmed", "Jason")),
        (6, Array("David", "David")),
        (7, Array("Jason", "Martin")))).toDF("id", "name")
    df.show(false)

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("name")
      .setOutputCol("features")
      .setVocabSize(3)
      .setMinDF(2)
      .fit(df)

    val feature = cvModel.transform(df)
    feature.show(false)

    spark.stop()
  }
}
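With setVocabSize(3) and setMinDF(2), the fitted vocabulary should be the three most frequent names (Jason, Martin, David); Daiel clears the minDF threshold but falls outside the top three by total count, and Moahmed appears in only one row.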