Python pyspark.ml.classification.RandomForestClassifier() Examples
The following are 3 code examples of pyspark.ml.classification.RandomForestClassifier(), taken from open-source projects; the source file and license are noted above each example.
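Before the examples, here is a minimal, self-contained sketch of the usual RandomForestClassifier workflow: assemble feature columns into a vector, fit the classifier inside a Pipeline, and inspect the predictions. The toy DataFrame, column names, and numTrees setting are illustrative assumptions, not code from the projects below.

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

spark = SparkSession.builder.master("local[*]").appName("rf-sketch").getOrCreate()

# Tiny in-memory dataset with two numeric features and a binary label (illustrative only).
df = spark.createDataFrame(
    [(0.0, 1.2, 0.7), (1.0, 3.4, 2.1), (0.0, 0.9, 0.3), (1.0, 2.8, 1.9)],
    ["label", "x1", "x2"],
)

# Combine the raw columns into the single vector column expected by the classifier.
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20)
model = Pipeline(stages=[assembler, rf]).fit(df)

# Each row gets a predicted class and a class-probability vector.
model.transform(df).select("label", "prediction", "probability").show()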
Example #1
Source File: random_forest.py From gnomad_methods with MIT License
from typing import Dict

import pyspark.ml


def get_features_importance(
    rf_pipeline: pyspark.ml.PipelineModel,
    rf_index: int = -2,
    assembler_index: int = -3,
) -> Dict[str, float]:
    """
    Extract the features importance from a Pipeline model containing a RandomForestClassifier stage.

    :param rf_pipeline: Input pipeline
    :param rf_index: index of the RandomForestClassifier stage
    :param assembler_index: index of the VectorAssembler stage
    :return: feature importance for each feature in the RF model
    """
    # Recover the original column names from the VectorAssembler inputs,
    # stripping the "_indexed" suffix added by upstream indexing stages.
    feature_names = [
        x[: -len("_indexed")] if x.endswith("_indexed") else x
        for x in rf_pipeline.stages[assembler_index].getInputCols()
    ]
    # Pair each input column with its importance from the fitted forest.
    return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances))
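As a quick illustration of how this helper might be used, the hypothetical sketch below fits a two-stage pipeline and then asks for the importances. It assumes get_features_importance from the snippet above is in scope; the toy data, column names, and the explicit rf_index/assembler_index values are assumptions for this sketch (here the assembler and the forest are the last two stages, so the defaults of -3/-2 do not apply).

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

spark = SparkSession.builder.getOrCreate()

# Hypothetical training data: two numeric features and a binary label.
df = spark.createDataFrame(
    [(1.0, 0.2, 0.0), (3.5, 1.1, 1.0), (0.7, 0.9, 0.0), (2.9, 1.4, 1.0)],
    ["depth", "quality", "label"],
)

assembler = VectorAssembler(inputCols=["depth", "quality"], outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
fitted = Pipeline(stages=[assembler, rf]).fit(df)

# The assembler and the forest are the last two stages of this pipeline,
# so the default indices (-3 and -2) are overridden.
importances = get_features_importance(fitted, rf_index=-1, assembler_index=-2)
print(importances)  # {'depth': ..., 'quality': ...}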
Example #2
Source File: test_random_forest_classifier.py From onnxmltools with MIT License
def test_random_forrest_classification(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
        VectorUDT())
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(
        inputCol="features", outputCol="indexedFeatures",
        maxCategories=10, handleInvalid='keep')
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml RandomForest Classifier',
        [('label', StringTensorType([1, 1])),
         ('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = {
        'label': data.toPandas().label.values,
        'features': data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(
        ['indexedLabel', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Example #3
Source File: datasetClassifier.py From mmtf-pyspark with Apache License 2.0
def main(argv):
    # Name of prediction column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("datasetClassifier") \
        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()["features"]
    featureCount = len(vector)
    print(f"Feature count : {featureCount}")

    classCount = int(data.select(label).distinct().count())
    print(f"Class count : {classCount}")

    print(f"Dataset size (unbalanced) : {data.count()}")
    data.groupby(label).count().show(classCount)

    data = datasetBalancer.downsample(data, label, 1)
    print(f"Dataset size (balanced) : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    # DecisionTree
    dtc = DecisionTreeClassifier()
    mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k, v in matrics.items():
        print(f"{k}\t{v}")

    # RandomForest
    rfc = RandomForestClassifier()
    mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k, v in matrics.items():
        print(f"{k}\t{v}")

    # LogisticRegression
    lr = LogisticRegression()
    mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k, v in matrics.items():
        print(f"{k}\t{v}")

    # MultilayerPerceptronClassifier
    layers = [featureCount, 10, classCount]
    mpc = MultilayerPerceptronClassifier().setLayers(layers) \
        .setBlockSize(128) \
        .setSeed(1234) \
        .setMaxIter(200)
    mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
    matrics = mcc.fit(data)
    for k, v in matrics.items():
        print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f sec." % (end - start))