Python pyspark.sql.types.Row() Examples

The following are 14 code examples of pyspark.sql.types.Row(). You can vote up the examples you find useful or vote down the ones you don't, and follow the links above each example to go to the original project or source file. You may also want to check out all available functions/classes of the module pyspark.sql.types, or try the search function.
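As a quick orientation before the project examples, here is a minimal sketch of how pyspark.sql.types.Row behaves on its own (constructing Rows needs no cluster; the DataFrame step assumes a local SparkSession, created in the snippet):

from pyspark.sql import Row, SparkSession

# A Row is a tuple-like record whose fields can be read by name or by key.
person = Row(name="Alice", age=30)
print(person.name)       # 'Alice'
print(person["age"])     # 30
print(person.asDict())   # {'name': 'Alice', 'age': 30}

# Row can also act as a reusable record "class" with fixed field names.
Person = Row("name", "age")
people = [Person("Alice", 30), Person("Bob", 25)]

# Building a DataFrame from a list of Rows.
spark = SparkSession.builder.master("local[1]").getOrCreate()
spark.createDataFrame(people).show()
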
Example #1
Source File: test_keras_estimators.py    From spark-deep-learning with Apache License 2.0    7 votes
def _create_train_image_uris_and_labels(self, repeat_factor=1, cardinality=100, dense=True):
        image_uris = getSampleImagePaths() * repeat_factor
        # Create image categorical labels (integer IDs)
        local_rows = []
        for uri in image_uris:
            label = np.random.randint(low=0, high=cardinality, size=1)[0]
            if dense:
                label_inds = np.zeros(cardinality)
                label_inds[label] = 1.0
                label_inds = label_inds.ravel()
                assert label_inds.shape[0] == cardinality, label_inds.shape
                one_hot_vec = spla.Vectors.dense(label_inds.tolist())
            else:   # sparse
                one_hot_vec = spla.Vectors.sparse(cardinality, {label: 1})
            _row_struct = {self.input_col: uri, self.one_hot_col: one_hot_vec,
                           self.one_hot_label_col: float(label)}
            row = sptyp.Row(**_row_struct)
            local_rows.append(row)

        image_uri_df = self.session.createDataFrame(local_rows)
        return image_uri_df 
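
The test above relies on project-specific pieces (getSampleImagePaths, self.session, and the column-name attributes). A trimmed-down sketch of the same pattern, pairing a string column with a one-hot pyspark.ml.linalg vector in each Row, assuming a local SparkSession and the illustrative column names uri, label_vec and label:

import numpy as np
from pyspark.ml import linalg as spla
from pyspark.sql import types as sptyp
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
cardinality = 5
uris = ["img_0.png", "img_1.png", "img_2.png"]

local_rows = []
for uri in uris:
    label = int(np.random.randint(0, cardinality))
    one_hot_vec = spla.Vectors.sparse(cardinality, {label: 1.0})
    local_rows.append(sptyp.Row(uri=uri, label_vec=one_hot_vec, label=float(label)))

df = spark.createDataFrame(local_rows)
df.printSchema()
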
Example #2
Source File: context.py    From LearningApacheSpark with MIT License    6 votes
def range(self, start, end=None, step=1, numPartitions=None):
        """
        Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named
        ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with
        step value ``step``.

        :param start: the start value
        :param end: the end value (exclusive)
        :param step: the incremental step (default: 1)
        :param numPartitions: the number of partitions of the DataFrame
        :return: :class:`DataFrame`

        >>> sqlContext.range(1, 7, 2).collect()
        [Row(id=1), Row(id=3), Row(id=5)]

        If only one argument is specified, it will be used as the end value.

        >>> sqlContext.range(3).collect()
        [Row(id=0), Row(id=1), Row(id=2)]
        """
        return self.sparkSession.range(start, end, step, numPartitions) 
Example #3
Source File: context.py    From LearningApacheSpark with MIT License    6 votes
def tables(self, dbName=None):
        """Returns a :class:`DataFrame` containing names of tables in the given database.

        If ``dbName`` is not specified, the current database will be used.

        The returned DataFrame has two columns: ``tableName`` and ``isTemporary``
        (a column with :class:`BooleanType` indicating if a table is a temporary one or not).

        :param dbName: string, name of the database to use.
        :return: :class:`DataFrame`

        >>> sqlContext.registerDataFrameAsTable(df, "table1")
        >>> df2 = sqlContext.tables()
        >>> df2.filter("tableName = 'table1'").first()
        Row(database=u'', tableName=u'table1', isTemporary=True)
        """
        if dbName is None:
            return DataFrame(self._ssql_ctx.tables(), self)
        else:
            return DataFrame(self._ssql_ctx.tables(dbName), self) 
Example #4
Source File: image.py    From LearningApacheSpark with MIT License    5 votes
def toNDArray(self, image):
        """
        Converts an image to an array with metadata.

        :param `Row` image: A row that contains the image to be converted. It should
            have the attributes specified in `ImageSchema.imageSchema`.
        :return: a `numpy.ndarray` that is an image.

        .. versionadded:: 2.3.0
        """

        if not isinstance(image, Row):
            raise TypeError(
                "image argument should be pyspark.sql.types.Row; however, "
                "it got [%s]." % type(image))

        if any(not hasattr(image, f) for f in self.imageFields):
            raise ValueError(
                "image argument should have attributes specified in "
                "ImageSchema.imageSchema [%s]." % ", ".join(self.imageFields))

        height = image.height
        width = image.width
        nChannels = image.nChannels
        return np.ndarray(
            shape=(height, width, nChannels),
            dtype=np.uint8,
            buffer=image.data,
            strides=(width * nChannels, nChannels, 1)) 
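
A hedged usage sketch for the method above: in Spark builds that ship pyspark.ml.image (2.3+), it is reachable through the ImageSchema singleton, and the built-in "image" data source (Spark 2.4+) produces Rows with the expected fields. The directory path is a placeholder:

from pyspark.ml.image import ImageSchema
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Each row of the "image" data source has a single struct column named "image".
df = spark.read.format("image").load("/path/to/images")
first_image = df.select("image.*").first()   # Row(origin, height, width, nChannels, mode, data)

arr = ImageSchema.toNDArray(first_image)
print(arr.shape)   # (height, width, nChannels)
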
Example #5
Source File: image.py    From LearningApacheSpark with MIT License    5 votes
def toImage(self, array, origin=""):
        """
        Converts an array with metadata to a two-dimensional image.

        :param `numpy.ndarray` array: The array to convert to image.
        :param str origin: Path to the image, optional.
        :return: a :class:`Row` that is a two dimensional image.

        .. versionadded:: 2.3.0
        """

        if not isinstance(array, np.ndarray):
            raise TypeError(
                "array argument should be numpy.ndarray; however, it got [%s]." % type(array))

        if array.ndim != 3:
            raise ValueError("Invalid array shape")

        height, width, nChannels = array.shape
        ocvTypes = ImageSchema.ocvTypes
        if nChannels == 1:
            mode = ocvTypes["CV_8UC1"]
        elif nChannels == 3:
            mode = ocvTypes["CV_8UC3"]
        elif nChannels == 4:
            mode = ocvTypes["CV_8UC4"]
        else:
            raise ValueError("Invalid number of channels")

        # Running `bytearray(numpy.array([1]))` fails in specific Python versions
        # with a specific NumPy version, for example Python 3.6.0 with NumPy 1.13.3.
        # We avoid this by converting the array to bytes first.
        data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())

        # Creating new Row with _create_row(), because Row(name = value, ... )
        # orders fields by name, which conflicts with expected schema order
        # when the new DataFrame is created by UDF
        return _create_row(self.imageFields,
                           [origin, height, width, nChannels, mode, data]) 
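
A small round-trip sketch of toImage together with toNDArray, assuming pyspark.ml.image is available (ImageSchema is the module-level singleton instance of this class):

import numpy as np
from pyspark.ml.image import ImageSchema

# A fake 4x6 three-channel (BGR) image as a uint8 array.
arr = np.random.randint(0, 256, size=(4, 6, 3), dtype=np.uint8)

row = ImageSchema.toImage(arr, origin="memory://fake")
print(row.height, row.width, row.nChannels)   # 4 6 3

# Converting back should reproduce the original pixels.
assert np.array_equal(ImageSchema.toNDArray(row), arr)
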
Example #6
Source File: context.py    From LearningApacheSpark with MIT License    5 votes
def __init__(self, sparkContext, sparkSession=None, jsqlContext=None):
        """Creates a new SQLContext.

        >>> from datetime import datetime
        >>> sqlContext = SQLContext(sc)
        >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
        ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),
        ...     time=datetime(2014, 8, 1, 14, 1, 5))])
        >>> df = allTypes.toDF()
        >>> df.createOrReplaceTempView("allTypes")
        >>> sqlContext.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
        ...            'from allTypes where b and i > 0').collect()
        [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
            dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
        >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
        [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
        """
        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm
        if sparkSession is None:
            sparkSession = SparkSession.builder.getOrCreate()
        if jsqlContext is None:
            jsqlContext = sparkSession._jwrapped
        self.sparkSession = sparkSession
        self._jsqlContext = jsqlContext
        _monkey_patch_RDD(self.sparkSession)
        install_exception_handler()
        if SQLContext._instantiatedContext is None:
            SQLContext._instantiatedContext = self 
Example #7
Source File: context.py    From LearningApacheSpark with MIT License    5 votes
def _inferSchema(self, rdd, samplingRatio=None):
        """
        Infer schema from an RDD of Row or tuple.

        :param rdd: an RDD of Row or tuple
        :param samplingRatio: sampling ratio, or no sampling (default)
        :return: :class:`pyspark.sql.types.StructType`
        """
        return self.sparkSession._inferSchema(rdd, samplingRatio) 
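
In user code this inference is usually triggered indirectly: passing an RDD (or list) of Rows to createDataFrame without an explicit schema infers the StructType from the Row fields. A minimal sketch, assuming a local SparkSession named spark:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
rdd = spark.sparkContext.parallelize(
    [Row(field1=1, field2="row1"), Row(field1=2, field2="row2")])

# No schema argument: field names and types are inferred from the Rows.
df = spark.createDataFrame(rdd)
df.printSchema()
# root
#  |-- field1: long (nullable = true)
#  |-- field2: string (nullable = true)
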
Example #8
Source File: context.py    From LearningApacheSpark with MIT License    5 votes
def _test():
    import os
    import sys
    import doctest
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")]
    )
    globs['df'] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1) 
Example #9
Source File: score_images_spark.py    From models with Apache License 2.0    5 votes
def read_images(spark, filenames):
    filenames_rdd = spark.sparkContext.parallelize(filenames)
    schema = StructType(
        [StructField("filename", StringType(), True), StructField("image", StringType(), True)])
    return filenames_rdd.map(lambda x: Row(filename=x,
                                           image=read_image_bytes_base64(x))).toDF(schema=schema) 
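
The helper read_image_bytes_base64 is not part of this excerpt. A hypothetical implementation, assuming it simply reads the file and base64-encodes its raw bytes so they fit the StringType column, could look like this:

import base64

def read_image_bytes_base64(path):
    # Hypothetical helper: return the image file's bytes as a base64 string.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
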
Example #10
Source File: graph.py    From apollo with GNU General Public License v3.0    5 votes
def evaluate_communities(args):
    log = logging.getLogger("evalcc")
    model = CommunitiesModel().load(args.input)
    configure(args)
    spark = create_spark("evalcc-%s" % uuid4(), **filter_kwargs(args.__dict__, create_spark))
    log.info("Preparing the communities' RDD")
    items = []
    for i, c in progress_bar(enumerate(model.communities), log,
                             expected_size=len(model.communities)):
        for m in c:
            if m < len(model.id_to_element):
                items.append(Row(sha1=model.id_to_element[m], community=i))
    log.info("Running")
    items_in_spark = spark.sparkContext.parallelize(items).toDF()
    bags = spark \
        .read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=args.tables["bags"], keyspace=args.keyspace) \
        .load()
    log.info("Loaded the bags, calculating the vocabulary")
    vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect()
    vocabulary = {v: i for i, v in enumerate(vocabulary)}
    log.info("Vocabulary size: %d", len(vocabulary))
    element_to_id = {e: i for i, e in enumerate(model.id_to_element)}
    metrics = items_in_spark.join(bags, "sha1").rdd \
        .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \
        .groupByKey() \
        .map(CommunityEvaluator(args.threshold, len(vocabulary))) \
        .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)])
    log.info("Total misses: %d", metrics[0])
    log.info("Average normalized misses: %f", metrics[1] / len(model.communities))
    log.info("Total loss: %f", metrics[2])
    log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities))) 
Example #11
Source File: hasher.py    From apollo with GNU General Public License v3.0    5 votes
def __call__(self, record):
        key, wmh = record
        for hti in range(self.htnum):
            yield Row(sha1=key, hashtable=hti,
                      value=bytearray(wmh[hti * self.band_size:(hti + 1) * self.band_size].data)) 
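
Because __call__ yields several Rows per input record (one per hash table), an object like this is applied with flatMap rather than map before building a DataFrame. A hypothetical, self-contained sketch of the same pattern (the class name, signatures and sizes are illustrative, not taken from the apollo project):

import numpy as np
from pyspark.sql import Row, SparkSession

class BandRowGenerator:
    # Stand-in for the hasher above: split each signature into htnum bands
    # of band_size values and emit one Row per band.
    def __init__(self, htnum, band_size):
        self.htnum = htnum
        self.band_size = band_size

    def __call__(self, record):
        key, wmh = record
        for hti in range(self.htnum):
            band = wmh[hti * self.band_size:(hti + 1) * self.band_size]
            yield Row(sha1=key, hashtable=hti, value=bytearray(band.data))

spark = SparkSession.builder.master("local[1]").getOrCreate()
signatures = spark.sparkContext.parallelize(
    [("a" * 40, np.arange(8, dtype=np.uint32)),
     ("b" * 40, np.arange(8, 16, dtype=np.uint32))])

# flatMap, not map: each input record expands into htnum Rows.
df = signatures.flatMap(BandRowGenerator(htnum=4, band_size=2)).toDF()
df.show(truncate=False)
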
Example #12
Source File: bags.py    From apollo with GNU General Public License v3.0    5 votes
def __call__(self, head):
        rows = head.map(lambda row: Row(sha1=row.document,
                                        item=row.token,
                                        value=float(row.value)))
        if self.explained:
            self._log.info("toDebugString():\n%s", rows.toDebugString().decode())
        rows.toDF() \
            .write \
            .format("org.apache.spark.sql.cassandra") \
            .mode("append") \
            .options(table=self.table, keyspace=self.keyspace) \
            .save()
        return head 
Example #13
Source File: bags.py    From apollo with GNU General Public License v3.0    5 votes
def __call__(self, head):
        rows = head.map(lambda x: Row(
            sha1=x.blob_id, repo=x.repository_id, commit=x.commit_hash, path=x.path))
        if self.explained:
            self._log.info("toDebugString():\n%s", rows.toDebugString().decode())
        rows.toDF() \
            .write \
            .format("org.apache.spark.sql.cassandra") \
            .mode("append") \
            .options(table=self.table, keyspace=self.keyspace) \
            .save() 
Example #14
Source File: score_images_spark.py    From mlflow with Apache License 2.0    5 votes
def read_images(spark, filenames):
    filenames_rdd = spark.sparkContext.parallelize(filenames)
    schema = StructType(
        [StructField("filename", StringType(), True), StructField("image", StringType(), True)])
    return filenames_rdd.map(lambda x: Row(filename=x,
                                           image=read_image_bytes_base64(x))).toDF(schema=schema)