Python pyspark.sql.DataFrame() Examples
The following are 30 code examples of pyspark.sql.DataFrame(), collected from open source projects. Each example lists its original project, source file, and license.
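As a quick orientation before the examples, here is a minimal sketch of building a pyspark.sql.DataFrame from local data; the column names and values are purely illustrative.

from pyspark.sql import Row, SparkSession

# Minimal sketch: build a small DataFrame from local rows (illustrative data only).
spark = SparkSession.builder.master("local[2]").appName("dataframe-examples").getOrCreate()
df = spark.createDataFrame([
    Row(id=1, query="apple pie", clicks=3),
    Row(id=2, query="pear tart", clicks=1),
])
df.printSchema()
df.show()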
Example #1
Source File: helpers.py From search-MjoLniR with MIT License
def require_output_table(
    self, partition_spec_spec, metadata_fn=None,
    mode='overwrite',
):
    @self._post_process_transform.append
    def post(df: DataFrame, kwargs: Dict):
        mt.write_partition(
            df, kwargs['output_table'], kwargs['output_path'],
            self._resolve_partition_spec(kwargs, partition_spec_spec),
            mode=mode)
        if metadata_fn is not None:
            spark = df.sql_ctx.sparkSession
            metadata = metadata_fn(spark.read.parquet(kwargs['output_path']))
            write_metadata(kwargs['output_path'], metadata)

    self.add_argument('--output-table', required=True)
    self.add_argument('--output-path', required=True)
Example #2
Source File: helpers.py From SMV with Apache License 2.0
def smvExpandStruct(self, *cols):
    """Expand structure type column to a group of columns

        Args:
            cols (\*string): column names to expand

        Example:
            input DF:
                [id: string, address: struct<state:string, zip:string, street:string>]

            >>> df.smvExpandStruct("address")

            output DF:
                [id: string, state: string, zip: string, street: string]

        Returns:
            (DataFrame): DF with expanded columns
    """
    jdf = self._jPythonHelper.smvExpandStruct(self._jdf, smv_copy_array(self._sc, *cols))
    return DataFrame(jdf, self._sql_ctx)
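For comparison, plain PySpark can achieve the same expansion with a struct wildcard select; the sketch below assumes a DataFrame with the input schema shown in the docstring.

# Plain-PySpark equivalent of the expansion above (df is assumed to have an
# `address` struct column as in the docstring's input schema).
df_expanded = df.select("id", "address.*")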
Example #3
Source File: norm_query_clustering.py From search-MjoLniR with MIT License
def filter_min_sessions_per_norm_query(min_sessions: int) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        w = Window.partitionBy('wikiid', 'norm_query')
        return (
            df.withColumn(
                'has_min_sessions',
                at_least_n_distinct('session_id', min_sessions).over(w))
            .where(F.col('has_min_sessions'))
            .drop('has_min_sessions'))
    return transform
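Since at_least_n_distinct is a search-MjoLniR helper, here is a rough equivalent of the same filter written with only built-in PySpark window functions; the column names come from the example above and the threshold logic is an assumption, not the project's implementation.

from pyspark.sql import DataFrame, Window, functions as F

def filter_min_sessions(df: DataFrame, min_sessions: int) -> DataFrame:
    # Count distinct sessions per (wikiid, norm_query) group and keep groups
    # that meet the threshold; collect_set stands in for at_least_n_distinct.
    w = Window.partitionBy('wikiid', 'norm_query')
    return (
        df.withColumn('n_sessions', F.size(F.collect_set('session_id').over(w)))
        .where(F.col('n_sessions') >= min_sessions)
        .drop('n_sessions'))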
Example #4
Source File: feature_vectors.py From search-MjoLniR with MIT License
def resample_clicks_to_query_page(
    df_cluster: DataFrame, random_seed: Optional[int],
    samples_per_wiki: int
) -> mt.Transformer:
    # Resamples the click log by proxy of resampling clusters, such
    # that a complete cluster is either included or excluded from the
    # resulting dataset.
    # TODO: Evaluate alternative resampling, such as perhaps only dropping from
    # clusters where all clicks were to the top result (implying an "easy" search).
    mt.check_schema(df_cluster, mt.QueryClustering)
    return mt.seq_transform([
        # Grab only the parts of the query log we need to make the resulting sampled QueryPage
        lambda df: df.select('query', 'wikiid', 'session_id', 'hit_page_ids'),
        mt.join_cluster_by_query(df_cluster),
        # [1] is because sample returns a tuple of (page_counts, df)
        mt.temp_rename_col(
            'cluster_id', 'norm_query_id',
            lambda df: mjolnir.sampling.sample(
                df, random_seed, samples_per_wiki)[1]),
        lambda df: df.withColumn(
            'page_id', F.explode('hit_page_ids')).drop('hit_page_ids')
    ])
Example #5
Source File: feature_engineering.py From search-MjoLniR with MIT License
def explode_features(df, features=None):
    """Convert feature vector into individual columns

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None

    Returns
    -------
    pyspark.sql.DataFrame
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(features, idx):
        return float(features[idx])
    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
    cols = [extract_feature_udf('features', F.lit(idx)).alias(name)
            for idx, name in enumerate(features)]
    return df.select('*', *cols)
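A hypothetical call to explode_features, passing the feature names explicitly rather than relying on the vector column's metadata; the DataFrame and feature names here are assumptions for illustration only.

# Hypothetical usage of explode_features defined above (df_vectors and the
# feature names are illustrative, not from the original project).
df_exploded = explode_features(df_vectors, features=["bm25", "popularity", "incoming_links"])
df_exploded.select("bm25", "popularity").show(5)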
Example #6
Source File: feature_vectors.py From search-MjoLniR with MIT License
def transform(
    query_clicks: HivePartition,
    query_clustering: HivePartition,
    samples_per_wiki: int,
    random_seed: Optional[int],
    wikis: List[str],
    brokers: str,
    topic_request: str,
    topic_response: str,
    feature_set: str,
    **kwargs
) -> DataFrame:
    transformer = mt.seq_transform([
        mt.restrict_wikis(wikis),
        resample_clicks_to_query_page(
            query_clustering.df, random_seed, samples_per_wiki),
        feature_vectors.transformer(
            brokers, topic_request, topic_response, feature_set)
    ])
    return transformer(query_clicks.df)
Example #7
Source File: feature_engineering.py From search-MjoLniR with MIT License
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    new_feat_list = df.schema['features'].metadata['features'] + cols
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list}))
Example #8
Source File: feature_vectors.py From search-MjoLniR with MIT License
def collect_features(
    kafka_config: ClientConfig, feature_set: str
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        df_features, fnames_accu = mjolnir.features.collect(
            df,
            model='featureset:' + feature_set,
            brokers=kafka_config,
            indices=mt.ContentIndices())
        # Collect the accumulator to get feature names
        df_features.cache().count()
        # Future transformations have to be extra careful to not lose this metadata
        return _add_meta(df_features, 'features', {
            'feature_set': feature_set,
            'features': _check_features(fnames_accu),
            'collected_at': datetime.datetime.now().isoformat()
        })
    return transform
Example #9
Source File: feature_selection.py From search-MjoLniR with MIT License
def select_features(
    wiki: str,
    num_features: int,
    metadata: Dict
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        # Compute the "best" features, per some metric
        sc = df.sql_ctx.sparkSession.sparkContext
        features = metadata['input_feature_meta']['features']
        selected = mjolnir.feature_engineering.select_features(
            sc, df, features, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = selected

        # Rebuild the `features` col with only the selected features
        keep_cols = metadata['default_cols'] + selected
        df_selected = df.select(*keep_cols)
        assembler = VectorAssembler(
            inputCols=selected, outputCol='features')
        return assembler.transform(df_selected).drop(*selected)
    return transform
Example #10
Source File: feature_selection.py From search-MjoLniR with MIT License
def transformer(
    df_label: DataFrame, temp_dir: str,
    wikis: List[str], num_features: int
) -> mt.Transformer:
    mt.check_schema(df_label, mt.LabeledQueryPage)
    # Hack to transfer metadata between transformations. This is populated in
    # time since `select_features` does direct computation of the features.
    metadata = cast(Dict, {'wiki_features': {}})
    return mt.seq_transform([
        mt.restrict_wikis(wikis),
        mt.join_labels(df_label),
        explode_features(metadata),
        mt.cache_to_disk(temp_dir, partition_by='wikiid'),
        mt.for_each_item('wikiid', wikis, lambda wiki: select_features(
            wiki, num_features, metadata)),
        attach_feature_metadata(metadata),
        # While we used the labels for selecting features, they are not part of the feature vectors.
        # Allow them to be joined with any other label set for export to training.
        lambda df: df.drop('cluster_id', 'label'),
        lambda df: df.repartition(200, 'wikiid', 'query'),
    ])
Example #11
Source File: make_folds.py From search-MjoLniR with MIT License
def convert_svmrank_to_xgboost(df: DataFrame) -> DataFrame:
    def convert_one(row: Row) -> Row:
        # For now place the .xgb right next to the svmrank files. Naming/path
        # options could be added if needed later.
        out_path = row.path + '.xgb'
        _convert_xgboost_remote(row.path, out_path)
        return Row(**dict(
            row.asDict(),
            vec_format='xgboost',
            path=out_path))

    # Each row represents potentially gigabytes, convince spark
    # to create a partition per row.
    rdd_xgb = mt.partition_per_row(df.rdd).map(convert_one)
    df_xgb = df.sql_ctx.createDataFrame(rdd_xgb, df.schema)  # type: ignore
    # Return both the xgb and svmrank datasets since
    # we aren't purging the related files. df is safe to reuse since
    # svmrank conversion returns a new dataframe with no lineage.
    return df.union(df_xgb)
Example #12
Source File: sys_exec.py From cadCAD with MIT License
def to_spark_df(rdd: RDD, spark: SparkSession, init_condition: dict = None):
    # Typefull
    if init_condition is not None:
        return to_spark(rdd, init_condition)
    # Typeless
    else:
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
        warnings.simplefilter(action='ignore', category=UserWarning)
        pdf_from_rdd: DataFrame = to_pandas(rdd)
        result = spark.createDataFrame(pdf_from_rdd)
        del pdf_from_rdd
        return result
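For reference, the same pandas-to-Spark conversion path can be exercised directly with Arrow enabled; this standalone sketch uses illustrative data rather than cadCAD's simulation output.

import pandas as pd
from pyspark.sql import SparkSession

# Standalone sketch of converting a pandas DataFrame to Spark with Arrow enabled.
spark = SparkSession.builder.master("local[2]").getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

pdf = pd.DataFrame({"run": [0, 1], "timestep": [10, 10]})
sdf = spark.createDataFrame(pdf)
sdf.show()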
Example #13
Source File: clustering.py From LearningApacheSpark with MIT License
def assignClusters(self, dataset):
    """
    Run the PIC algorithm and returns a cluster assignment for each input vertex.

    :param dataset:
        A dataset with columns src, dst, weight representing the affinity matrix,
        which is the matrix A in the PIC paper. Suppose the src column value is i,
        the dst column value is j, the weight column value is similarity s,,ij,,
        which must be nonnegative. This is a symmetric matrix and hence
        s,,ij,, = s,,ji,,. For any (i, j) with nonzero similarity, there should be
        either (i, j, s,,ij,,) or (j, i, s,,ji,,) in the input. Rows with i = j are
        ignored, because we assume s,,ij,, = 0.0.
    :return:
        A dataset that contains columns of vertex id and the corresponding cluster for
        the id. The schema of it will be:
        - id: Long
        - cluster: Int

    .. versionadded:: 2.4.0
    """
    self._transfer_params_to_java()
    jdf = self._java_obj.assignClusters(dataset._jdf)
    return DataFrame(jdf, dataset.sql_ctx)
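A hedged usage sketch of PowerIterationClustering.assignClusters (Spark 2.4+); the tiny affinity matrix below is illustrative, and an existing SparkSession named spark is assumed.

from pyspark.ml.clustering import PowerIterationClustering

# Toy affinity matrix with (src, dst, weight) rows; spark is an existing SparkSession.
affinity = spark.createDataFrame(
    [(0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0), (3, 4, 1.0), (4, 0, 0.1)],
    ["src", "dst", "weight"])
pic = PowerIterationClustering(k=2, weightCol="weight")
assignments = pic.assignClusters(affinity)  # DataFrame with id and cluster columns
assignments.show()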
Example #14
Source File: common.py From LearningApacheSpark with MIT License
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj
Example #15
Source File: recommendation.py From LearningApacheSpark with MIT License
def _prepare(cls, ratings):
    if isinstance(ratings, RDD):
        pass
    elif isinstance(ratings, DataFrame):
        ratings = ratings.rdd
    else:
        raise TypeError("Ratings should be represented by either an RDD or a DataFrame, "
                        "but got %s." % type(ratings))
    first = ratings.first()
    if isinstance(first, Rating):
        pass
    elif isinstance(first, (tuple, list)):
        ratings = ratings.map(lambda x: Rating(*x))
    else:
        raise TypeError("Expect a Rating or a tuple/list, but got %s." % type(first))
    return ratings
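Illustrative inputs that _prepare is meant to normalize for pyspark.mllib's ALS: either an RDD of Rating objects (or tuples), or a DataFrame whose rows look like (user, product, rating). The sketch assumes existing sc and spark handles; treat it as an approximation, not project-verified usage.

from pyspark.mllib.recommendation import ALS, Rating

# Two equivalent input shapes; sc and spark are assumed to exist.
ratings_rdd = sc.parallelize([Rating(1, 10, 4.0), Rating(2, 10, 5.0)])
ratings_df = spark.createDataFrame([(1, 10, 4.0), (1, 20, 3.0)], ["user", "product", "rating"])

model_from_rdd = ALS.train(ratings_rdd, rank=5, iterations=5)
model_from_df = ALS.train(ratings_df, rank=5, iterations=5)  # goes through _prepare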
Example #16
Source File: common.py From LearningApacheSpark with MIT License
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(data)
    return obj
Example #17
Source File: tests.py From LearningApacheSpark with MIT License
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
Example #18
Source File: helpers.py From SMV with Apache License 2.0
def smvTopNRecs(self, maxElems, *cols):
    """For each group, return the top N records according to a given ordering

        Example:
            >>> df.smvGroupBy("id").smvTopNRecs(3, col("amt").desc())

            This will keep the 3 largest amt records for each id

        Args:
            maxElems (int): maximum number of records per group
            cols (\*str): columns defining the ordering

        Returns:
            (DataFrame): result of taking top records from groups
    """
    return DataFrame(self.sgd.smvTopNRecs(maxElems, smv_copy_array(self.df._sc, *cols)), self.df.sql_ctx)
Example #19
Source File: feature_engineering.py From search-MjoLniR with MIT License
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features}))
Example #20
Source File: helpers.py From SMV with Apache License 2.0
def smvPivotSum(self, pivotCols, valueCols, baseOutput):
    """Perform SmvPivot, then sum the results.

        Please refer smvPivot's document for context and details of the SmvPivot operation.

        Args:
            pivotCols (list(list(str))): list of lists of column names to pivot
            valueCols (list(string)): names of value columns to sum
            baseOutput (list(str)): expected names pivoted column

        Examples:
            For example, given a DataFrame df that represents the table

            +-----+-------+---------+-------+
            | id  | month | product | count |
            +=====+=======+=========+=======+
            | 1   | 5/14  | A       | 100   |
            +-----+-------+---------+-------+
            | 1   | 6/14  | B       | 200   |
            +-----+-------+---------+-------+
            | 1   | 5/14  | B       | 300   |
            +-----+-------+---------+-------+

            we can use

            >>> df.smvGroupBy("id").smvPivotSum([["month", "product"]], ["count"], ["5_14_A", "5_14_B", "6_14_A", "6_14_B"])

            to produce the following output

            +-----+--------------+--------------+--------------+--------------+
            | id  | count_5_14_A | count_5_14_B | count_6_14_A | count_6_14_B |
            +=====+==============+==============+==============+==============+
            | 1   | 100          | 300          | NULL         | 200          |
            +-----+--------------+--------------+--------------+--------------+

        Returns:
            (DataFrame): result of pivot sum
    """
    return DataFrame(self.sgd.smvPivotSum(smv_copy_array(self.df._sc, *pivotCols),
                                          smv_copy_array(self.df._sc, *valueCols),
                                          smv_copy_array(self.df._sc, *baseOutput)),
                     self.df.sql_ctx)
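For comparison, here is a rough plain-PySpark equivalent of the pivot-and-sum in the docstring, using groupBy().pivot(); the resulting column names differ from SMV's count_5_14_A convention, the data mirrors the docstring's table, and spark is an assumed existing SparkSession.

from pyspark.sql import functions as F

# Approximate plain-PySpark version of the docstring's example (column naming differs).
df = spark.createDataFrame(
    [(1, "5/14", "A", 100), (1, "6/14", "B", 200), (1, "5/14", "B", 300)],
    ["id", "month", "product", "count"])
(df.withColumn("pivot_key", F.concat_ws("_", "month", "product"))
   .groupBy("id").pivot("pivot_key").sum("count")
   .show())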
Example #21
Source File: helpers.py From SMV with Apache License 2.0
def smvRePartition(self, numParts):
    """Repartition SmvGroupedData using specified partitioner on the keys. A
        HashPartitioner with the specified number of partitions will be used.

        This method is used in the cases that the key-space is very large. In the
        current Spark DF's groupBy method, the entire key-space is actually loaded
        into executor's memory, which is very dangerous when the key space is big.
        The regular DF's repartition function doesn't solve this issue since a random
        repartition will not guaranteed to reduce the key-space on each executor.
        In that case we need to use this function to linearly reduce the key-space.

        Example:
            >>> df.smvGroupBy("k1", "k2").smvRePartition(32).agg(sum("v") as "v")
    """
    jgdadp = self.sgd.smvRePartition(numParts)
    df = DataFrame(jgdadp.toDF(), self.df.sql_ctx)
    return SmvGroupedData(df, self.keys, jgdadp)
Example #22
Source File: helpers.py From SMV with Apache License 2.0
def smvPercentRank(self, value_cols, ignoreNull=True):
    """Compute the percent rank of a sequence of columns within a group in a given DataFrame.

        Used Spark's `percentRank` window function. The precent rank is defined as
        `R/(N-1)`, where `R` is the base 0 rank, and `N` is the population size. Under
        this definition, min value (R=0) has percent rank `0.0`, and max value has
        percent rank `1.0`.

        For each column for which the percent rank is computed (e.g. "v"), an additional
        column is added to the output, `v_pctrnk`

        All other columns in the input are untouched and propagated to the output.

        Args:
            value_cols (list(str)): columns to calculate percentRank on
            ignoreNull (boolean): if true, null values's percent ranks will be nulls, otherwise,
                as Spark sort considers null smaller than any value, nulls percent ranks will
                be zeros. Default true.

        Example:
            >>> df.smvGroupBy('g, 'g2).smvPercentRank(["v1", "v2", "v3"])
    """
    return DataFrame(self.sgd.smvPercentRank(smv_copy_array(self.df._sc, *value_cols), ignoreNull),
                     self.df.sql_ctx)
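The underlying Spark primitive is percent_rank over a window; a minimal sketch for a single column follows, with the null handling omitted and the DataFrame and column names assumed.

from pyspark.sql import Window, functions as F

# Percent rank of v1 within each (g, g2) group; df and column names are assumptions.
w = Window.partitionBy("g", "g2").orderBy("v1")
df_ranked = df.withColumn("v1_pctrnk", F.percent_rank().over(w))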
Example #23
Source File: clustering.py From LearningApacheSpark with MIT License
def cluster(self):
    """
    DataFrame of predicted cluster centers for each training data point.
    """
    return self._call_java("cluster")
Example #24
Source File: clustering.py From LearningApacheSpark with MIT License
def predictions(self):
    """
    DataFrame produced by the model's `transform` method.
    """
    return self._call_java("predictions")
Example #25
Source File: regression.py From LearningApacheSpark with MIT License
def evaluate(self, dataset):
    """
    Evaluates the model on a test dataset.

    :param dataset:
        Test dataset to evaluate model on, where dataset is an
        instance of :py:class:`pyspark.sql.DataFrame`
    """
    if not isinstance(dataset, DataFrame):
        raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
    java_lr_summary = self._call_java("evaluate", dataset)
    return LinearRegressionSummary(java_lr_summary)
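A hedged sketch of calling evaluate on a fitted LinearRegressionModel; train_df and test_df are assumed to be DataFrames with features and label columns.

from pyspark.ml.regression import LinearRegression

# Fit on a training DataFrame, then evaluate on a held-out test DataFrame.
lr = LinearRegression(featuresCol="features", labelCol="label")
model = lr.fit(train_df)
summary = model.evaluate(test_df)
print(summary.rootMeanSquaredError, summary.r2)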
Example #26
Source File: regression.py From LearningApacheSpark with MIT License
def evaluateEachIteration(self, dataset, loss):
    """
    Method to compute error or loss for every iteration of gradient boosting.

    :param dataset:
        Test dataset to evaluate model on, where dataset is an
        instance of :py:class:`pyspark.sql.DataFrame`
    :param loss:
        The loss function used to compute error.
        Supported options: squared, absolute
    """
    return self._call_java("evaluateEachIteration", dataset, loss)
Example #27
Source File: regression.py From LearningApacheSpark with MIT License
def evaluate(self, dataset):
    """
    Evaluates the model on a test dataset.

    :param dataset:
        Test dataset to evaluate model on, where dataset is an
        instance of :py:class:`pyspark.sql.DataFrame`
    """
    if not isinstance(dataset, DataFrame):
        raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
    java_glr_summary = self._call_java("evaluate", dataset)
    return GeneralizedLinearRegressionSummary(java_glr_summary)
Example #28
Source File: regression.py From LearningApacheSpark with MIT License
def numInstances(self):
    """
    Number of instances in DataFrame predictions.
    """
    return self._call_java("numInstances")
Example #29
Source File: util.py From LearningApacheSpark with MIT License
def convertMatrixColumnsFromML(dataset, *cols):
    """
    Converts matrix columns in an input DataFrame to the
    :py:class:`pyspark.mllib.linalg.Matrix` type from the new
    :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` package.

    :param dataset:
        input dataset
    :param cols:
        a list of matrix columns to be converted.
        Old matrix columns will be ignored. If unspecified, all new
        matrix columns will be converted except nested ones.
    :return:
        the input dataset with new matrix columns converted to the
        old matrix type

    >>> import pyspark
    >>> from pyspark.ml.linalg import Matrices
    >>> from pyspark.mllib.util import MLUtils
    >>> df = spark.createDataFrame(
    ...     [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
    ...     Matrices.dense(2, 2, range(4)))], ["id", "x", "y"])
    >>> r1 = MLUtils.convertMatrixColumnsFromML(df).first()
    >>> isinstance(r1.x, pyspark.mllib.linalg.SparseMatrix)
    True
    >>> isinstance(r1.y, pyspark.mllib.linalg.DenseMatrix)
    True
    >>> r2 = MLUtils.convertMatrixColumnsFromML(df, "x").first()
    >>> isinstance(r2.x, pyspark.mllib.linalg.SparseMatrix)
    True
    >>> isinstance(r2.y, pyspark.ml.linalg.DenseMatrix)
    True
    """
    if not isinstance(dataset, DataFrame):
        raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
    return callMLlibFunc("convertMatrixColumnsFromML", dataset, list(cols))
Example #30
Source File: helpers.py From search-MjoLniR with MIT License
def _wiki_features(df: DataFrame, wiki: str) -> List[str]:
    meta = df.schema['features'].metadata
    if 'wiki_features' in meta:
        return meta['wiki_features'][wiki]
    else:
        return meta['features']