Python pyspark.ml.feature.Tokenizer() Examples
The following are 4 code examples of pyspark.ml.feature.Tokenizer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyspark.ml.feature, or try the search function.
Example #1
Source File: ops_names.py From onnxmltools with MIT License | 6 votes |
def build_sparkml_operator_name_map():
    """Build the lookup from Spark ML classes to their qualified names.

    Returns:
        dict: Keys are the Spark ML classes listed below. Each value is
        the module prefix the class is grouped under
        (``pyspark.ml.feature.``, ``pyspark.ml.classification.`` or
        ``pyspark.ml.regression.``) followed by the class's ``__name__``.
    """
    # One (prefix, classes) pair per module keeps the name-building rule in
    # a single place instead of repeating the comprehension three times.
    classes_by_prefix = (
        ("pyspark.ml.feature.", (
            Binarizer,
            BucketedRandomProjectionLSHModel,
            Bucketizer,
            ChiSqSelectorModel,
            CountVectorizerModel,
            DCT,
            ElementwiseProduct,
            HashingTF,
            IDFModel,
            ImputerModel,
            IndexToString,
            MaxAbsScalerModel,
            MinHashLSHModel,
            MinMaxScalerModel,
            NGram,
            Normalizer,
            OneHotEncoderModel,
            PCAModel,
            PolynomialExpansion,
            QuantileDiscretizer,
            RegexTokenizer,
            StandardScalerModel,
            StopWordsRemover,
            StringIndexerModel,
            Tokenizer,
            VectorAssembler,
            VectorIndexerModel,
            VectorSlicer,
            Word2VecModel,
        )),
        ("pyspark.ml.classification.", (
            LinearSVCModel,
            LogisticRegressionModel,
            DecisionTreeClassificationModel,
            GBTClassificationModel,
            RandomForestClassificationModel,
            NaiveBayesModel,
            MultilayerPerceptronClassificationModel,
            OneVsRestModel,
        )),
        # GBTRegressionModel used to appear twice here; the duplicate only
        # overwrote the same key, so it is listed once.
        ("pyspark.ml.regression.", (
            AFTSurvivalRegressionModel,
            DecisionTreeRegressionModel,
            GBTRegressionModel,
            GeneralizedLinearRegressionModel,
            IsotonicRegressionModel,
            LinearRegressionModel,
            RandomForestRegressionModel,
        )),
    )

    res = {}
    for prefix, classes in classes_by_prefix:
        res.update({cls: prefix + cls.__name__ for cls in classes})
    return res
Example #2
Source File: test_tokenizer.py From onnxmltools with MIT License | 6 votes |
def test_tokenizer(self):
    """Convert a Spark ML ``Tokenizer`` to ONNX and compare both outputs.

    A one-row DataFrame is tokenized by Spark, the same ``Tokenizer`` is
    converted with ``convert_sparkml``, and the ONNX model's
    ``prediction`` output is compared with the Spark result.
    """
    data = self.spark.createDataFrame([("a b c",)], ["text"])
    model = Tokenizer(inputCol='text', outputCol='words')
    predicted = model.transform(data)

    input_types = [('text', StringTensorType([1, 1]))]
    model_onnx = convert_sparkml(model, 'Sparkml Tokenizer', input_types)
    self.assertTrue(model_onnx is not None)

    # Spark's token lists, expanded so each token gets its own column.
    spark_words = predicted.toPandas().words
    expected = spark_words.apply(pandas.Series).values

    # The ONNX input is the raw text column reshaped to [1, 1].
    text_column = data.toPandas().text
    data_np = text_column.values.reshape([1, 1])

    # Persist the data and models; index 3 of the returned paths is the
    # ONNX model file that is executed below.
    paths = save_data_models(
        data_np,
        expected,
        model,
        model_onnx,
        basename="SparkmlTokenizer",
    )
    onnx_model_path = paths[3]

    output, _ = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #3
Source File: spark_ml_pipline.py From Hanhan-Spark-Python with MIT License | 5 votes |
def main():
    """Score a text-classification pipeline with and without tuning.

    Reads the training and testing parquet inputs, fits a
    Tokenizer -> HashingTF -> LogisticRegression pipeline once with fixed
    parameters and once through 2-fold cross validation over a parameter
    grid, evaluates both on the test data, and saves the two scores (one
    per line) as a text file at ``output``.
    """
    # Read training and testing data as DataFrames.
    sql_context = SQLContext(sc)
    train_df = sql_context.read.parquet(training_input)
    test_df = sql_context.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # Baseline: fixed parameters, no tuning.
    baseline_hashing_tf = HashingTF(
        inputCol=tokenizer.getOutputCol(),
        outputCol="features",
        numFeatures=1000,
    )
    baseline_lr = LogisticRegression(maxIter=20, regParam=0.1)
    baseline_pipeline = Pipeline(
        stages=[tokenizer, baseline_hashing_tf, baseline_lr]
    )
    baseline_model = baseline_pipeline.fit(train_df)
    baseline_score = evaluator.evaluate(baseline_model.transform(test_df))

    # Tuned: cross validation over feature count and regularization.
    hashing_tf = HashingTF(
        inputCol=tokenizer.getOutputCol(),
        outputCol="features",
    )
    log_reg = LogisticRegression(maxIter=20)
    param_grid = (
        ParamGridBuilder()
        .addGrid(hashing_tf.numFeatures, [1000, 5000, 10000])
        .addGrid(
            log_reg.regParam,
            [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        )
        .build()
    )
    tuned_pipeline = Pipeline(stages=[tokenizer, hashing_tf, log_reg])
    cross_validator = CrossValidator(
        estimator=tuned_pipeline,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=2,
    )
    cv_model = cross_validator.fit(train_df)

    # Predict on the test documents; cv_model uses the best model found.
    tuned_score = evaluator.evaluate(cv_model.transform(test_df))

    # Baseline score on the first line, tuned score on the second.
    report = '\n'.join([str(baseline_score), str(tuned_score)])
    sc.parallelize([report]).saveAsTextFile(output)
Example #4
Source File: test_search_2.py From spark-sklearn with Apache License 2.0 | 5 votes |
def test_cv_lasso_with_mllib_featurization(self):
    """Grid-search a Lasso over features produced by a Spark ML pipeline.

    Reviews are tokenized and hashed with Spark, converted to pandas, and
    passed to ``GridSearchCV``. The test checks that one result is
    recorded per ``lasso__alpha`` candidate.
    """
    rows = [
        ('hi there', 0.0),
        ('what is up', 1.0),
        ('huh', 1.0),
        ('now is the time', 5.0),
        ('for what', 0.0),
        ('the spark was there', 5.0),
        ('and so', 3.0),
        ('were many socks', 0.0),
        ('really', 1.0),
        ('too cool', 2.0),
    ]
    reviews = self.sql.createDataFrame(rows, ["review", "rating"])

    # Feature extraction using MLlib.
    tokenizer = Tokenizer(inputCol="review", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features",
                          numFeatures=20000)
    featurizer = Pipeline(stages=[tokenizer, hashingTF])
    featurized = featurizer.fit(reviews).transform(reviews)

    # Expose the hashed features under the "review" name for scikit-learn.
    selected = featurized.select(featurized.features.alias("review"), "rating")
    df = self.converter.toPandas(selected)

    lasso_pipeline = SKL_Pipeline([('lasso', SKL_Lasso())])
    alphas = (0.001, 0.005, 0.01)
    parameters = {'lasso__alpha': alphas}

    grid_search = GridSearchCV(self.sc, lasso_pipeline, parameters)
    skl_gs = grid_search.fit(df.review.values, df.rating.values)

    assert len(skl_gs.cv_results_['params']) == len(alphas)