Python pyspark.sql.DataFrame() Examples
The following are 30 code examples of pyspark.sql.DataFrame(), collected from open source projects. Each example lists its original project, source file, and license.
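As a quick orientation before the examples, here is a minimal sketch of building a pyspark.sql.DataFrame from local data; the column names and values are purely illustrative.

from pyspark.sql import Row, SparkSession

# Minimal sketch: build a small DataFrame from local rows (illustrative data only).
spark = SparkSession.builder.master("local[2]").appName("dataframe-examples").getOrCreate()
df = spark.createDataFrame([
    Row(id=1, query="apple pie", clicks=3),
    Row(id=2, query="pear tart", clicks=1),
])
df.printSchema()
df.show()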
Example #1
Source File: helpers.py From search-MjoLniR with MIT License
def require_output_table(
    self, partition_spec_spec, metadata_fn=None,
    mode='overwrite',
):
    @self._post_process_transform.append
    def post(df: DataFrame, kwargs: Dict):
        mt.write_partition(
            df, kwargs['output_table'], kwargs['output_path'],
            self._resolve_partition_spec(kwargs, partition_spec_spec),
            mode=mode)
        if metadata_fn is not None:
            spark = df.sql_ctx.sparkSession
            metadata = metadata_fn(spark.read.parquet(kwargs['output_path']))
            write_metadata(kwargs['output_path'], metadata)

    self.add_argument('--output-table', required=True)
    self.add_argument('--output-path', required=True)
Example #2
Source File: helpers.py From SMV with Apache License 2.0
def smvExpandStruct(self, *cols):
    """Expand structure type column to a group of columns

        Args:
            cols (\*string): column names to expand

        Example:
            input DF:
                [id: string, address: struct<state:string, zip:string, street:string>]

            >>> df.smvExpandStruct("address")

            output DF:
                [id: string, state: string, zip: string, street: string]

        Returns:
            (DataFrame): DF with expanded columns
    """
    jdf = self._jPythonHelper.smvExpandStruct(self._jdf, smv_copy_array(self._sc, *cols))
    return DataFrame(jdf, self._sql_ctx)
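For comparison, plain PySpark can achieve the same expansion with a struct wildcard select; the sketch below assumes a DataFrame with the input schema shown in the docstring.

# Plain-PySpark equivalent of the expansion above (df is assumed to have an
# `address` struct column as in the docstring's input schema).
df_expanded = df.select("id", "address.*")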
Example #3
Source File: norm_query_clustering.py From search-MjoLniR with MIT License
def filter_min_sessions_per_norm_query(min_sessions: int) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        w = Window.partitionBy('wikiid', 'norm_query')
        return (
            df.withColumn(
                'has_min_sessions',
                at_least_n_distinct('session_id', min_sessions).over(w))
            .where(F.col('has_min_sessions'))
            .drop('has_min_sessions'))
    return transform
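Since at_least_n_distinct is a search-MjoLniR helper, here is a rough equivalent of the same filter written with only built-in PySpark window functions; the column names come from the example above and the threshold logic is an assumption, not the project's implementation.

from pyspark.sql import DataFrame, Window, functions as F

def filter_min_sessions(df: DataFrame, min_sessions: int) -> DataFrame:
    # Count distinct sessions per (wikiid, norm_query) group and keep groups
    # that meet the threshold; collect_set stands in for at_least_n_distinct.
    w = Window.partitionBy('wikiid', 'norm_query')
    return (
        df.withColumn('n_sessions', F.size(F.collect_set('session_id').over(w)))
        .where(F.col('n_sessions') >= min_sessions)
        .drop('n_sessions'))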
Example #4
Source File: feature_vectors.py From search-MjoLniR with MIT License
def resample_clicks_to_query_page(
    df_cluster: DataFrame, random_seed: Optional[int],
    samples_per_wiki: int
) -> mt.Transformer:
    # Resamples the click log by proxy of resampling clusters, such
    # that a complete cluster is either included or excluded from the
    # resulting dataset.
    # TODO: Evaluate alternative resampling, such as perhaps only dropping from
    # clusters where all clicks were to the top result (implying an "easy" search).
    mt.check_schema(df_cluster, mt.QueryClustering)
    return mt.seq_transform([
        # Grab only the parts of the query log we need to make the resulting sampled QueryPage
        lambda df: df.select('query', 'wikiid', 'session_id', 'hit_page_ids'),
        mt.join_cluster_by_query(df_cluster),
        # [1] is because sample returns a tuple of (page_counts, df)
        mt.temp_rename_col(
            'cluster_id', 'norm_query_id',
            lambda df: mjolnir.sampling.sample(
                df, random_seed, samples_per_wiki)[1]),
        lambda df: df.withColumn(
            'page_id', F.explode('hit_page_ids')).drop('hit_page_ids')
    ])
Example #5
Source File: feature_engineering.py From search-MjoLniR with MIT License
def explode_features(df, features=None):
    """Convert feature vector into individual columns

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    features : list of str or None

    Returns
    -------
    pyspark.sql.DataFrame
    """
    if features is None:
        features = df.schema['features'].metadata['features']

    def extract_feature(features, idx):
        return float(features[idx])
    extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
    cols = [extract_feature_udf('features', F.lit(idx)).alias(name)
            for idx, name in enumerate(features)]
    return df.select('*', *cols)
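A hypothetical call to explode_features, passing the feature names explicitly rather than relying on the vector column's metadata; the DataFrame and feature names here are assumptions for illustration only.

# Hypothetical usage of explode_features defined above (df_vectors and the
# feature names are illustrative, not from the original project).
df_exploded = explode_features(df_vectors, features=["bm25", "popularity", "incoming_links"])
df_exploded.select("bm25", "popularity").show(5)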
Example #6
Source File: feature_vectors.py From search-MjoLniR with MIT License
def transform(
    query_clicks: HivePartition,
    query_clustering: HivePartition,
    samples_per_wiki: int,
    random_seed: Optional[int],
    wikis: List[str],
    brokers: str,
    topic_request: str,
    topic_response: str,
    feature_set: str,
    **kwargs
) -> DataFrame:
    transformer = mt.seq_transform([
        mt.restrict_wikis(wikis),
        resample_clicks_to_query_page(
            query_clustering.df, random_seed, samples_per_wiki),
        feature_vectors.transformer(
            brokers, topic_request, topic_response, feature_set)
    ])
    return transformer(query_clicks.df)
Example #7
Source File: feature_engineering.py From search-MjoLniR with MIT License
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    new_feat_list = df.schema['features'].metadata['features'] + cols
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list}))
Example #8
Source File: feature_vectors.py From search-MjoLniR with MIT License
def collect_features(
    kafka_config: ClientConfig, feature_set: str
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        df_features, fnames_accu = mjolnir.features.collect(
            df,
            model='featureset:' + feature_set,
            brokers=kafka_config,
            indices=mt.ContentIndices())
        # Collect the accumulator to get feature names
        df_features.cache().count()
        # Future transformations have to be extra careful to not lose this metadata
        return _add_meta(df_features, 'features', {
            'feature_set': feature_set,
            'features': _check_features(fnames_accu),
            'collected_at': datetime.datetime.now().isoformat()
        })
    return transform
Example #9
Source File: feature_selection.py From search-MjoLniR with MIT License
def select_features(
    wiki: str,
    num_features: int,
    metadata: Dict
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        # Compute the "best" features, per some metric
        sc = df.sql_ctx.sparkSession.sparkContext
        features = metadata['input_feature_meta']['features']
        selected = mjolnir.feature_engineering.select_features(
            sc, df, features, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = selected

        # Rebuild the `features` col with only the selected features
        keep_cols = metadata['default_cols'] + selected
        df_selected = df.select(*keep_cols)
        assembler = VectorAssembler(
            inputCols=selected, outputCol='features')
        return assembler.transform(df_selected).drop(*selected)
    return transform
Example #10
Source File: feature_selection.py From search-MjoLniR with MIT License
def transformer(
    df_label: DataFrame, temp_dir: str,
    wikis: List[str], num_features: int
) -> mt.Transformer:
    mt.check_schema(df_label, mt.LabeledQueryPage)
    # Hack to transfer metadata between transformations. This is populated in
    # time since `select_features` does direct computation of the features.
    metadata = cast(Dict, {'wiki_features': {}})
    return mt.seq_transform([
        mt.restrict_wikis(wikis),
        mt.join_labels(df_label),
        explode_features(metadata),
        mt.cache_to_disk(temp_dir, partition_by='wikiid'),
        mt.for_each_item('wikiid', wikis, lambda wiki: select_features(
            wiki, num_features, metadata)),
        attach_feature_metadata(metadata),
        # While we used the labels for selecting features, they are not part of the feature vectors.
        # Allow them to be joined with any other label set for export to training.
        lambda df: df.drop('cluster_id', 'label'),
        lambda df: df.repartition(200, 'wikiid', 'query'),
    ])
Example #11
Source File: make_folds.py From search-MjoLniR with MIT License
def convert_svmrank_to_xgboost(df: DataFrame) -> DataFrame:
    def convert_one(row: Row) -> Row:
        # For now place the .xgb right next to the svmrank files. Naming/path
        # options could be added if needed later.
        out_path = row.path + '.xgb'
        _convert_xgboost_remote(row.path, out_path)
        return Row(**dict(
            row.asDict(),
            vec_format='xgboost',
            path=out_path))

    # Each row represents potentially gigabytes, convince spark
    # to create a partition per row.
    rdd_xgb = mt.partition_per_row(df.rdd).map(convert_one)
    df_xgb = df.sql_ctx.createDataFrame(rdd_xgb, df.schema)  # type: ignore
    # Return both the xgb and svmrank datasets since
    # we aren't purging the related files. df is safe to reuse since
    # svmrank conversion returns a new dataframe with no lineage.
    return df.union(df_xgb)
Example #12
Source File: sys_exec.py From cadCAD with MIT License
def to_spark_df(rdd: RDD, spark: SparkSession, init_condition: dict = None):
    # Typefull
    if init_condition is not None:
        return to_spark(rdd, init_condition)
    # Typeless
    else:
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
        warnings.simplefilter(action='ignore', category=UserWarning)
        pdf_from_rdd: DataFrame = to_pandas(rdd)
        result = spark.createDataFrame(pdf_from_rdd)
        del pdf_from_rdd
        return result
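For reference, the same pandas-to-Spark conversion path can be exercised directly with Arrow enabled; this standalone sketch uses illustrative data rather than cadCAD's simulation output.

import pandas as pd
from pyspark.sql import SparkSession

# Standalone sketch of converting a pandas DataFrame to Spark with Arrow enabled.
spark = SparkSession.builder.master("local[2]").getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

pdf = pd.DataFrame({"run": [0, 1], "timestep": [10, 10]})
sdf = spark.createDataFrame(pdf)
sdf.show()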
Example #13
Source File: clustering.py From LearningApacheSpark with MIT License
def assignClusters(self, dataset):
    """
    Run the PIC algorithm and returns a cluster assignment for each input vertex.

    :param dataset:
        A dataset with columns src, dst, weight representing the affinity matrix,
        which is the matrix A in the PIC paper. Suppose the src column value is i,
        the dst column value is j, the weight column value is similarity s,,ij,,
        which must be nonnegative. This is a symmetric matrix and hence
        s,,ij,, = s,,ji,,. For any (i, j) with nonzero similarity, there should be
        either (i, j, s,,ij,,) or (j, i, s,,ji,,) in the input. Rows with i = j are
        ignored, because we assume s,,ij,, = 0.0.
    :return:
        A dataset that contains columns of vertex id and the corresponding cluster for
        the id. The schema of it will be:
        - id: Long
        - cluster: Int

    .. versionadded:: 2.4.0
    """
    self._transfer_params_to_java()
    jdf = self._java_obj.assignClusters(dataset._jdf)
    return DataFrame(jdf, dataset.sql_ctx)
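A hedged usage sketch of PowerIterationClustering.assignClusters (Spark 2.4+); the tiny affinity matrix below is illustrative, and an existing SparkSession named spark is assumed.

from pyspark.ml.clustering import PowerIterationClustering

# Toy affinity matrix with (src, dst, weight) rows; spark is an existing SparkSession.
affinity = spark.createDataFrame(
    [(0, 1, 1.0), (0, 2, 1.0), (1, 2, 1.0), (3, 4, 1.0), (4, 0, 0.1)],
    ["src", "dst", "weight"])
pic = PowerIterationClustering(k=2, weightCol="weight")
assignments = pic.assignClusters(affinity)  # DataFrame with id and cluster columns
assignments.show()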
Example #14
Source File: common.py From LearningApacheSpark with MIT License
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
    return obj
Example #15
Source File: recommendation.py From LearningApacheSpark with MIT License
def _prepare(cls, ratings):
    if isinstance(ratings, RDD):
        pass
    elif isinstance(ratings, DataFrame):
        ratings = ratings.rdd
    else:
        raise TypeError("Ratings should be represented by either an RDD or a DataFrame, "
                        "but got %s." % type(ratings))
    first = ratings.first()
    if isinstance(first, Rating):
        pass
    elif isinstance(first, (tuple, list)):
        ratings = ratings.map(lambda x: Rating(*x))
    else:
        raise TypeError("Expect a Rating or a tuple/list, but got %s." % type(first))
    return ratings
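Illustrative inputs that _prepare is meant to normalize for pyspark.mllib's ALS: either an RDD of Rating objects (or tuples), or a DataFrame whose rows look like (user, product, rating). The sketch assumes existing sc and spark handles; treat it as an approximation, not project-verified usage.

from pyspark.mllib.recommendation import ALS, Rating

# Two equivalent input shapes; sc and spark are assumed to exist.
ratings_rdd = sc.parallelize([Rating(1, 10, 4.0), Rating(2, 10, 5.0)])
ratings_df = spark.createDataFrame([(1, 10, 4.0), (1, 20, 3.0)], ["user", "product", "rating"])

model_from_rdd = ALS.train(ratings_rdd, rank=5, iterations=5)
model_from_df = ALS.train(ratings_df, rank=5, iterations=5)  # goes through _prepare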
Example #16
Source File: common.py From LearningApacheSpark with MIT License
def _py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(data)
    return obj
Example #17
Source File: tests.py From LearningApacheSpark with MIT License
def test_gaussian_mixture_summary(self):
    data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
            (Vectors.sparse(1, [], []),)]
    df = self.spark.createDataFrame(data, ["features"])
    gmm = GaussianMixture(k=2)
    model = gmm.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertTrue(isinstance(s.probability, DataFrame))
    self.assertEqual(s.featuresCol, "features")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.cluster, DataFrame))
    self.assertEqual(len(s.clusterSizes), 2)
    self.assertEqual(s.k, 2)
    self.assertEqual(s.numIter, 3)
Example #18
Source File: helpers.py From SMV with Apache License 2.0
def smvTopNRecs(self, maxElems, *cols):
    """For each group, return the top N records according to a given ordering

        Example:
            >>> df.smvGroupBy("id").smvTopNRecs(3, col("amt").desc())

            This will keep the 3 largest amt records for each id

        Args:
            maxElems (int): maximum number of records per group
            cols (\*str): columns defining the ordering

        Returns:
            (DataFrame): result of taking top records from groups
    """
    return DataFrame(self.sgd.smvTopNRecs(maxElems, smv_copy_array(self.df._sc, *cols)), self.df.sql_ctx)
Example #19
Source File: feature_engineering.py From search-MjoLniR with MIT License
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features}))
Example #20
Source File: helpers.py From SMV with Apache License 2.0
def smvPivotSum(self, pivotCols, valueCols, baseOutput):
    """Perform SmvPivot, then sum the results.

        Please refer smvPivot's document for context and details of the SmvPivot operation.

        Args:
            pivotCols (list(list(str))): list of lists of column names to pivot
            valueCols (list(string)): names of value columns to sum
            baseOutput (list(str)): expected names pivoted column

        Examples:
            For example, given a DataFrame df that represents the table

            +-----+-------+---------+-------+
            | id  | month | product | count |
            +=====+=======+=========+=======+
            | 1   | 5/14  | A       | 100   |
            +-----+-------+---------+-------+
            | 1   | 6/14  | B       | 200   |
            +-----+-------+---------+-------+
            | 1   | 5/14  | B       | 300   |
            +-----+-------+---------+-------+

            we can use

            >>> df.smvGroupBy("id").smvPivotSum([["month", "product"]], ["count"], ["5_14_A", "5_14_B", "6_14_A", "6_14_B"])

            to produce the following output

            +-----+--------------+--------------+--------------+--------------+
            | id  | count_5_14_A | count_5_14_B | count_6_14_A | count_6_14_B |
            +=====+==============+==============+==============+==============+
            | 1   | 100          | 300          | NULL         | 200          |
            +-----+--------------+--------------+--------------+--------------+

        Returns:
            (DataFrame): result of pivot sum
    """
    return DataFrame(self.sgd.smvPivotSum(smv_copy_array(self.df._sc, *pivotCols),
                                          smv_copy_array(self.df._sc, *valueCols),
                                          smv_copy_array(self.df._sc, *baseOutput)),
                     self.df.sql_ctx)
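For comparison, here is a rough plain-PySpark equivalent of the pivot-and-sum in the docstring, using groupBy().pivot(); the resulting column names differ from SMV's count_5_14_A convention, the data mirrors the docstring's table, and spark is an assumed existing SparkSession.

from pyspark.sql import functions as F

# Approximate plain-PySpark version of the docstring's example (column naming differs).
df = spark.createDataFrame(
    [(1, "5/14", "A", 100), (1, "6/14", "B", 200), (1, "5/14", "B", 300)],
    ["id", "month", "product", "count"])
(df.withColumn("pivot_key", F.concat_ws("_", "month", "product"))
   .groupBy("id").pivot("pivot_key").sum("count")
   .show())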
Example #21
Source File: helpers.py From SMV with Apache License 2.0
def smvRePartition(self, numParts):
    """Repartition SmvGroupedData using specified partitioner on the keys. A
        HashPartitioner with the specified number of partitions will be used.

        This method is used in the cases that the key-space is very large. In the
        current Spark DF's groupBy method, the entire key-space is actually loaded
        into executor's memory, which is very dangerous when the key space is big.
        The regular DF's repartition function doesn't solve this issue since a random
        repartition will not guaranteed to reduce the key-space on each executor.
        In that case we need to use this function to linearly reduce the key-space.

        Example:
            >>> df.smvGroupBy("k1", "k2").smvRePartition(32).agg(sum("v") as "v")
    """
    jgdadp = self.sgd.smvRePartition(numParts)
    df = DataFrame(jgdadp.toDF(), self.df.sql_ctx)
    return SmvGroupedData(df, self.keys, jgdadp)
Example #22
Source File: helpers.py From SMV with Apache License 2.0
def smvPercentRank(self, value_cols, ignoreNull=True):
    """Compute the percent rank of a sequence of columns within a group in a given DataFrame.

        Used Spark's `percentRank` window function. The precent rank is defined as
        `R/(N-1)`, where `R` is the base 0 rank, and `N` is the population size. Under
        this definition, min value (R=0) has percent rank `0.0`, and max value has
        percent rank `1.0`.

        For each column for which the percent rank is computed (e.g. "v"), an additional
        column is added to the output, `v_pctrnk`

        All other columns in the input are untouched and propagated to the output.

        Args:
            value_cols (list(str)): columns to calculate percentRank on
            ignoreNull (boolean): if true, null values's percent ranks will be nulls, otherwise,
                as Spark sort considers null smaller than any value, nulls percent ranks will
                be zeros. Default true.

        Example:
            >>> df.smvGroupBy('g, 'g2).smvPercentRank(["v1", "v2", "v3"])
    """
    return DataFrame(self.sgd.smvPercentRank(smv_copy_array(self.df._sc, *value_cols), ignoreNull),
                     self.df.sql_ctx)
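The underlying Spark primitive is percent_rank over a window; a minimal sketch for a single column follows, with the null handling omitted and the DataFrame and column names assumed.

from pyspark.sql import Window, functions as F

# Percent rank of v1 within each (g, g2) group; df and column names are assumptions.
w = Window.partitionBy("g", "g2").orderBy("v1")
df_ranked = df.withColumn("v1_pctrnk", F.percent_rank().over(w))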
Example #23
Source File: clustering.py From LearningApacheSpark with MIT License
def cluster(self):
    """
    DataFrame of predicted cluster centers for each training data point.
    """
    return self._call_java("cluster")
Example #24
Source File: clustering.py From LearningApacheSpark with MIT License
def predictions(self):
    """
    DataFrame produced by the model's `transform` method.
    """
    return self._call_java("predictions")
Example #25
Source File: regression.py From LearningApacheSpark with MIT License
def evaluate(self, dataset):
    """
    Evaluates the model on a test dataset.

    :param dataset:
        Test dataset to evaluate model on, where dataset is an
        instance of :py:class:`pyspark.sql.DataFrame`
    """
    if not isinstance(dataset, DataFrame):
        raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
    java_lr_summary = self._call_java("evaluate", dataset)
    return LinearRegressionSummary(java_lr_summary)
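A hedged sketch of calling evaluate on a fitted LinearRegressionModel; train_df and test_df are assumed to be DataFrames with features and label columns.

from pyspark.ml.regression import LinearRegression

# Fit on a training DataFrame, then evaluate on a held-out test DataFrame.
lr = LinearRegression(featuresCol="features", labelCol="label")
model = lr.fit(train_df)
summary = model.evaluate(test_df)
print(summary.rootMeanSquaredError, summary.r2)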
Example #26
Source File: regression.py From LearningApacheSpark with MIT License
def evaluateEachIteration(self, dataset, loss):
    """
    Method to compute error or loss for every iteration of gradient boosting.

    :param dataset:
        Test dataset to evaluate model on, where dataset is an
        instance of :py:class:`pyspark.sql.DataFrame`
    :param loss:
        The loss function used to compute error.
        Supported options: squared, absolute
    """
    return self._call_java("evaluateEachIteration", dataset, loss)
Example #27
Source File: regression.py From LearningApacheSpark with MIT License
def evaluate(self, dataset):
    """
    Evaluates the model on a test dataset.

    :param dataset:
        Test dataset to evaluate model on, where dataset is an
        instance of :py:class:`pyspark.sql.DataFrame`
    """
    if not isinstance(dataset, DataFrame):
        raise ValueError("dataset must be a DataFrame but got %s." % type(dataset))
    java_glr_summary = self._call_java("evaluate", dataset)
    return GeneralizedLinearRegressionSummary(java_glr_summary)
Example #28
Source File: regression.py From LearningApacheSpark with MIT License
def numInstances(self):
    """
    Number of instances in DataFrame predictions.
    """
    return self._call_java("numInstances")
Example #29
Source File: util.py From LearningApacheSpark with MIT License
def convertMatrixColumnsFromML(dataset, *cols):
    """
    Converts matrix columns in an input DataFrame to the
    :py:class:`pyspark.mllib.linalg.Matrix` type from the new
    :py:class:`pyspark.ml.linalg.Matrix` type under the `spark.ml` package.

    :param dataset:
        input dataset
    :param cols:
        a list of matrix columns to be converted.
        Old matrix columns will be ignored. If unspecified, all new
        matrix columns will be converted except nested ones.
    :return:
        the input dataset with new matrix columns converted to the
        old matrix type

    >>> import pyspark
    >>> from pyspark.ml.linalg import Matrices
    >>> from pyspark.mllib.util import MLUtils
    >>> df = spark.createDataFrame(
    ...     [(0, Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]),
    ...     Matrices.dense(2, 2, range(4)))], ["id", "x", "y"])
    >>> r1 = MLUtils.convertMatrixColumnsFromML(df).first()
    >>> isinstance(r1.x, pyspark.mllib.linalg.SparseMatrix)
    True
    >>> isinstance(r1.y, pyspark.mllib.linalg.DenseMatrix)
    True
    >>> r2 = MLUtils.convertMatrixColumnsFromML(df, "x").first()
    >>> isinstance(r2.x, pyspark.mllib.linalg.SparseMatrix)
    True
    >>> isinstance(r2.y, pyspark.ml.linalg.DenseMatrix)
    True
    """
    if not isinstance(dataset, DataFrame):
        raise TypeError("Input dataset must be a DataFrame but got {}.".format(type(dataset)))
    return callMLlibFunc("convertMatrixColumnsFromML", dataset, list(cols))
Example #30
Source File: helpers.py From search-MjoLniR with MIT License
def _wiki_features(df: DataFrame, wiki: str) -> List[str]:
    meta = df.schema['features'].metadata
    if 'wiki_features' in meta:
        return meta['wiki_features'][wiki]
    else:
        return meta['features']