Python pyspark.sql.functions.rand() Examples
The following are 6 code examples of pyspark.sql.functions.rand().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyspark.sql.functions, or try the search function.
Example #1
Source File: taar_ensemble.py From telemetry-airflow with Mozilla Public License 2.0 | 6 votes |
def cross_validation_split(dataset, k_folds):
    """Split *dataset* into ``k_folds`` disjoint folds for cross-validation.

    A uniform [0, 1) random column named ``"rand"`` is appended, and each
    fold is the subset of rows whose random value falls in that fold's
    ``[lower, upper)`` interval of width ``1 / k_folds``.

    Returns a list of cached Spark DataFrames, one per fold.
    """
    fold_width = 1.0 / k_folds
    tagged = dataset.select("*", rand().alias("rand"))
    folds = []
    for fold_idx in range(k_folds):
        lower = fold_idx * fold_width
        upper = (fold_idx + 1) * fold_width
        in_fold = (tagged["rand"] >= lower) & (tagged["rand"] < upper)
        folds.append(tagged.filter(in_fold).cache())
    return folds
Example #2
Source File: taar_ensemble.py From python_mozetl with MIT License | 6 votes |
def cross_validation_split(dataset, k_folds):
    """Partition *dataset* into ``k_folds`` folds, returned as a list.

    Rows are tagged with a uniform [0, 1) random column ``"rand"`` and
    bucketed by which ``1 / k_folds``-wide interval their value lands in.
    Each returned DataFrame is cached.
    """
    step = 1.0 / k_folds
    tagged = dataset.select("*", rand().alias("rand"))
    return [
        tagged.filter(
            (tagged["rand"] >= i * step) & (tagged["rand"] < (i + 1) * step)
        ).cache()
        for i in range(k_folds)
    ]
Example #3
Source File: tests.py From LearningApacheSpark with MIT License | 5 votes |
def _transform(self, dataset):
    """Add a ``prediction`` column: the feature plus seeded random noise.

    The noise is ``rand(0)`` (seed 0, so deterministic per partition
    layout) scaled by the configured induced error.
    """
    noise = rand(0) * self.getInducedError()
    return dataset.withColumn("prediction", dataset.feature + noise)
Example #4
Source File: tuning.py From LearningApacheSpark with MIT License | 5 votes |
def _fit(self, dataset):
    """Run k-fold cross-validation over all estimator param maps.

    The dataset is tagged with a seeded uniform random column; each of the
    ``numFolds`` folds uses one ``1/nFolds``-wide interval of that column
    as the validation split and its complement as the training split.
    All param maps are fitted per fold on a thread pool, and the
    evaluator metric is averaged across folds. The best param map is then
    refit on the full dataset.

    Returns a ``CrossValidatorModel`` wrapping the best model, the
    per-param-map averaged metrics, and (optionally) all sub-models.
    """
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    # Tag every row once with a seeded uniform [0, 1) value; fold
    # membership is decided by interval tests against this column.
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels
    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    subModels = None
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
        # One slot per (fold, param map) pair.
        subModels = [[None for j in range(numModels)] for i in range(nFolds)]
    for i in range(nFolds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
        validation = df.filter(condition).cache()
        train = df.filter(~condition).cache()
        tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
        # imap_unordered yields (index, metric, subModel) as fits finish;
        # each fold contributes 1/nFolds of a model's averaged score.
        for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
            metrics[j] += (metric / nFolds)
            if collectSubModelsParam:
                subModels[i][j] = subModel
        validation.unpersist()
        train.unpersist()
    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    # Refit the winning param map on the *full* dataset.
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics, subModels))
Example #5
Source File: tuning.py From LearningApacheSpark with MIT License | 5 votes |
def _fit(self, dataset):
    """Fit all estimator param maps on one random train/validation split.

    Rows whose seeded random value is ``>= trainRatio`` form the
    validation set; the complement forms the training set. All fits run
    on a thread pool, the evaluator scores each param map once, and the
    best map is refit on the full dataset.

    Returns a ``TrainValidationSplitModel`` with the best model, the
    per-param-map metrics, and (optionally) all sub-models.
    """
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    tRatio = self.getOrDefault(self.trainRatio)
    seed = self.getOrDefault(self.seed)
    randCol = self.uid + "_rand"
    # Seeded uniform [0, 1) column drives the train/validation split.
    df = dataset.select("*", rand(seed).alias(randCol))
    condition = (df[randCol] >= tRatio)
    validation = df.filter(condition).cache()
    train = df.filter(~condition).cache()
    subModels = None
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
        subModels = [None for i in range(numModels)]
    tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    metrics = [None] * numModels
    # imap_unordered yields (index, metric, subModel) as fits finish.
    for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
        metrics[j] = metric
        if collectSubModelsParam:
            subModels[j] = subModel
    train.unpersist()
    validation.unpersist()
    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    # Refit the winning param map on the *full* dataset.
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(TrainValidationSplitModel(bestModel, metrics, subModels))
Example #6
Source File: utils.py From dist-keras with GNU General Public License v3.0 | 5 votes |
def shuffle(dataset):
    """Return the given Spark DataFrame with its rows randomly reordered.

    # Arguments
        dataset: dataframe. A Spark Dataframe.

    The reordered frame is cached before being returned.
    """
    reordered = dataset.orderBy(rand())
    reordered.cache()
    return reordered