Python pyspark.sql.types.Row() Examples
The following are 14 code examples of pyspark.sql.types.Row(), collected from open-source projects. You can go to the original project or source file by following the link above each example, or browse the other functions and classes of the pyspark.sql.types module.
Example #1
Source File: test_keras_estimators.py From spark-deep-learning with Apache License 2.0
def _create_train_image_uris_and_labels(self, repeat_factor=1, cardinality=100, dense=True):
    image_uris = getSampleImagePaths() * repeat_factor

    # Create image categorical labels (integer IDs)
    local_rows = []
    for uri in image_uris:
        label = np.random.randint(low=0, high=cardinality, size=1)[0]
        if dense:
            label_inds = np.zeros(cardinality)
            label_inds[label] = 1.0
            label_inds = label_inds.ravel()
            assert label_inds.shape[0] == cardinality, label_inds.shape
            one_hot_vec = spla.Vectors.dense(label_inds.tolist())
        else:   # sparse
            one_hot_vec = spla.Vectors.sparse(cardinality, {label: 1})
        _row_struct = {self.input_col: uri, self.one_hot_col: one_hot_vec,
                       self.one_hot_label_col: float(label)}
        row = sptyp.Row(**_row_struct)
        local_rows.append(row)

    image_uri_df = self.session.createDataFrame(local_rows)
    return image_uri_df
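The example above builds keyword Rows from a dict with Row(**d) and hands the list to createDataFrame. Below is a minimal, self-contained sketch of the same idea, assuming only a local SparkSession; the column names are illustrative, not the ones used by spark-deep-learning.

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()

local_rows = []
for i in range(5):
    # Row(**kwargs) builds a Row from keyword arguments; note that in Spark
    # versions before 3.0 the fields are sorted by name.
    local_rows.append(Row(**{"uri": "image_%d.png" % i, "label": float(i % 3)}))

df = spark.createDataFrame(local_rows)
df.show()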
Example #2
Source File: context.py From LearningApacheSpark with MIT License
def range(self, start, end=None, step=1, numPartitions=None):
    """
    Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named
    ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with
    step value ``step``.

    :param start: the start value
    :param end: the end value (exclusive)
    :param step: the incremental step (default: 1)
    :param numPartitions: the number of partitions of the DataFrame
    :return: :class:`DataFrame`

    >>> sqlContext.range(1, 7, 2).collect()
    [Row(id=1), Row(id=3), Row(id=5)]

    If only one argument is specified, it will be used as the end value.

    >>> sqlContext.range(3).collect()
    [Row(id=0), Row(id=1), Row(id=2)]
    """
    return self.sparkSession.range(start, end, step, numPartitions)
Example #3
Source File: context.py From LearningApacheSpark with MIT License
def tables(self, dbName=None):
    """Returns a :class:`DataFrame` containing names of tables in the given database.

    If ``dbName`` is not specified, the current database will be used.

    The returned DataFrame has two columns: ``tableName`` and ``isTemporary``
    (a column with :class:`BooleanType` indicating if a table is a temporary one or not).

    :param dbName: string, name of the database to use.
    :return: :class:`DataFrame`

    >>> sqlContext.registerDataFrameAsTable(df, "table1")
    >>> df2 = sqlContext.tables()
    >>> df2.filter("tableName = 'table1'").first()
    Row(database=u'', tableName=u'table1', isTemporary=True)
    """
    if dbName is None:
        return DataFrame(self._ssql_ctx.tables(), self)
    else:
        return DataFrame(self._ssql_ctx.tables(dbName), self)
Example #4
Source File: image.py From LearningApacheSpark with MIT License
def toNDArray(self, image):
    """
    Converts an image to an array with metadata.

    :param `Row` image: A row that contains the image to be converted. It should
        have the attributes specified in `ImageSchema.imageSchema`.
    :return: a `numpy.ndarray` that is an image.

    .. versionadded:: 2.3.0
    """
    if not isinstance(image, Row):
        raise TypeError(
            "image argument should be pyspark.sql.types.Row; however, "
            "it got [%s]." % type(image))

    if any(not hasattr(image, f) for f in self.imageFields):
        raise ValueError(
            "image argument should have attributes specified in "
            "ImageSchema.imageSchema [%s]." % ", ".join(self.imageFields))

    height = image.height
    width = image.width
    nChannels = image.nChannels
    return np.ndarray(
        shape=(height, width, nChannels),
        dtype=np.uint8,
        buffer=image.data,
        strides=(width * nChannels, nChannels, 1))
Example #5
Source File: image.py From LearningApacheSpark with MIT License
def toImage(self, array, origin=""):
    """
    Converts an array with metadata to a two-dimensional image.

    :param `numpy.ndarray` array: The array to convert to image.
    :param str origin: Path to the image, optional.
    :return: a :class:`Row` that is a two dimensional image.

    .. versionadded:: 2.3.0
    """
    if not isinstance(array, np.ndarray):
        raise TypeError(
            "array argument should be numpy.ndarray; however, it got [%s]." % type(array))

    if array.ndim != 3:
        raise ValueError("Invalid array shape")

    height, width, nChannels = array.shape
    ocvTypes = ImageSchema.ocvTypes
    if nChannels == 1:
        mode = ocvTypes["CV_8UC1"]
    elif nChannels == 3:
        mode = ocvTypes["CV_8UC3"]
    elif nChannels == 4:
        mode = ocvTypes["CV_8UC4"]
    else:
        raise ValueError("Invalid number of channels")

    # Running `bytearray(numpy.array([1]))` fails in specific Python versions
    # with a specific Numpy version, for example in Python 3.6.0 and NumPy 1.13.3.
    # Here, it avoids it by converting it to bytes.
    data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())

    # Creating new Row with _create_row(), because Row(name = value, ...)
    # orders fields by name, which conflicts with expected schema order
    # when the new DataFrame is created by UDF
    return _create_row(self.imageFields,
                       [origin, height, width, nChannels, mode, data])
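As a round-trip check of the two methods above, the sketch below converts a NumPy array to an image Row and back. It assumes pyspark.ml.image.ImageSchema is available (Spark 2.3+) and that a SparkSession is running, since ImageSchema talks to the JVM; the array contents and origin string are arbitrary.

import numpy as np
from pyspark.ml.image import ImageSchema
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()  # ImageSchema needs an active JVM

arr = np.zeros((4, 6, 3), dtype=np.uint8)          # height=4, width=6, 3 channels (CV_8UC3)
image_row = ImageSchema.toImage(arr, origin="memory://example")
restored = ImageSchema.toNDArray(image_row)
assert restored.shape == (4, 6, 3)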
Example #6
Source File: context.py From LearningApacheSpark with MIT License
def __init__(self, sparkContext, sparkSession=None, jsqlContext=None):
    """Creates a new SQLContext.

    >>> from datetime import datetime
    >>> sqlContext = SQLContext(sc)
    >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
    ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),
    ...     time=datetime(2014, 8, 1, 14, 1, 5))])
    >>> df = allTypes.toDF()
    >>> df.createOrReplaceTempView("allTypes")
    >>> sqlContext.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
    ...     'from allTypes where b and i > 0').collect()
    [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
        dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
    >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
    [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
    """
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    if sparkSession is None:
        sparkSession = SparkSession.builder.getOrCreate()
    if jsqlContext is None:
        jsqlContext = sparkSession._jwrapped
    self.sparkSession = sparkSession
    self._jsqlContext = jsqlContext
    _monkey_patch_RDD(self.sparkSession)
    install_exception_handler()
    if SQLContext._instantiatedContext is None:
        SQLContext._instantiatedContext = self
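Independently of SQLContext, the nested Row built in the doctest above can be inspected directly; a tiny sketch that needs no SparkContext at all:

from pyspark.sql import Row

outer = Row(i=1, row=Row(a=1), list=[1, 2, 3])
print(outer.row.a)     # nested attribute access -> 1
print(outer.list[1])   # -> 2
print(outer.asDict())  # convert the Row back to a plain dict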
Example #7
Source File: context.py From LearningApacheSpark with MIT License
def _inferSchema(self, rdd, samplingRatio=None):
    """
    Infer schema from an RDD of Row or tuple.

    :param rdd: an RDD of Row or tuple
    :param samplingRatio: sampling ratio, or no sampling (default)
    :return: :class:`pyspark.sql.types.StructType`
    """
    return self.sparkSession._inferSchema(rdd, samplingRatio)
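The user-facing counterpart of this private helper is schema inference in createDataFrame. A small sketch, assuming a local SparkSession:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
rdd = spark.sparkContext.parallelize([Row(field1=1, field2="row1"),
                                      Row(field1=2, field2="row2")])
df = spark.createDataFrame(rdd)  # schema is inferred by sampling the Rows
df.printSchema()                 # field1: long, field2: string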
Example #8
Source File: context.py From LearningApacheSpark with MIT License
def _test():
    import os
    import doctest
    import tempfile
    from pyspark.context import SparkContext
    from pyspark.sql import Row, SQLContext
    import pyspark.sql.context

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.context.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['sqlContext'] = SQLContext(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")]
    )
    globs['df'] = rdd.toDF()
    jsonStrings = [
        '{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
        '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
        '"field6":[{"field7": "row2"}]}',
        '{"field1" : null, "field2": "row3", '
        '"field3":{"field4":33, "field5": []}}'
    ]
    globs['jsonStrings'] = jsonStrings
    globs['json'] = sc.parallelize(jsonStrings)
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.context, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Example #9
Source File: score_images_spark.py From models with Apache License 2.0
def read_images(spark, filenames):
    filenames_rdd = spark.sparkContext.parallelize(filenames)
    schema = StructType(
        [StructField("filename", StringType(), True),
         StructField("image", StringType(), True)])
    return filenames_rdd.map(
        lambda x: Row(filename=x, image=read_image_bytes_base64(x))).toDF(schema=schema)
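Below is a toy variant of the same pattern with the image helper replaced by a placeholder string, so the sketch runs on its own; the column names mirror the ones above, but the payload is fake.

from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StringType, StructField, StructType

spark = SparkSession.builder.master("local[2]").getOrCreate()
schema = StructType([StructField("filename", StringType(), True),
                     StructField("image", StringType(), True)])
filenames = ["a.png", "b.png"]
df = (spark.sparkContext.parallelize(filenames)
      .map(lambda x: Row(filename=x, image="<base64 bytes>"))  # placeholder payload
      .toDF(schema=schema))
df.show(truncate=False)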
Example #10
Source File: graph.py From apollo with GNU General Public License v3.0
def evaluate_communities(args):
    log = logging.getLogger("evalcc")
    model = CommunitiesModel().load(args.input)
    configure(args)
    spark = create_spark("evalcc-%s" % uuid4(), **filter_kwargs(args.__dict__, create_spark))
    log.info("Preparing the communities' RDD")
    items = []
    for i, c in progress_bar(enumerate(model.communities), log,
                             expected_size=len(model.communities)):
        for m in c:
            if m < len(model.id_to_element):
                items.append(Row(sha1=model.id_to_element[m], community=i))
    log.info("Running")
    items_in_spark = spark.sparkContext.parallelize(items).toDF()
    bags = spark \
        .read \
        .format("org.apache.spark.sql.cassandra") \
        .options(table=args.tables["bags"], keyspace=args.keyspace) \
        .load()
    log.info("Loaded the bags, calculating the vocabulary")
    vocabulary = bags.drop("sha1", "value").distinct().rdd.map(lambda x: x.item).collect()
    vocabulary = {v: i for i, v in enumerate(vocabulary)}
    log.info("Vocabulary size: %d", len(vocabulary))
    element_to_id = {e: i for i, e in enumerate(model.id_to_element)}
    metrics = items_in_spark.join(bags, "sha1").rdd \
        .map(lambda r: (r.community, (element_to_id[r.sha1], vocabulary[r.item], r.value))) \
        .groupByKey() \
        .map(CommunityEvaluator(args.threshold, len(vocabulary))) \
        .reduce(lambda v1, v2: [v1[i] + v2[i] for i in range(4)])
    log.info("Total misses: %d", metrics[0])
    log.info("Average normalized misses: %f", metrics[1] / len(model.communities))
    log.info("Total loss: %f", metrics[2])
    log.info("Average normalized loss: %f", numpy.sqrt(metrics[3] / len(model.communities)))
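In miniature, the Row-related core of the pipeline above is: build Row objects in a driver-side loop, parallelize them into a DataFrame, and join on sha1. A self-contained sketch with toy data standing in for the model and the Cassandra table:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
items = [Row(sha1="aaaa", community=0), Row(sha1="bbbb", community=1)]
communities = spark.sparkContext.parallelize(items).toDF()
bags = spark.createDataFrame([Row(sha1="aaaa", item="token", value=2.0)])
communities.join(bags, "sha1").show()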
Example #11
Source File: hasher.py From apollo with GNU General Public License v3.0
def __call__(self, record):
    key, wmh = record
    for hti in range(self.htnum):
        yield Row(sha1=key, hashtable=hti,
                  value=bytearray(wmh[hti * self.band_size:(hti + 1) * self.band_size].data))
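A callable like this is typically applied with flatMap, so each (key, signature) record expands into one Row per hash table. The sketch below uses a simplified stand-in class (not apollo's Hasher) with made-up band sizes and toy data:

import numpy as np
from pyspark.sql import Row, SparkSession

class HashExploder:                  # simplified stand-in, hypothetical parameters
    def __init__(self, htnum, band_size):
        self.htnum = htnum
        self.band_size = band_size

    def __call__(self, record):
        key, wmh = record
        for hti in range(self.htnum):
            band = wmh[hti * self.band_size:(hti + 1) * self.band_size]
            yield Row(sha1=key, hashtable=hti, value=bytearray(band.tobytes()))

spark = SparkSession.builder.master("local[2]").getOrCreate()
data = [("deadbeef", np.arange(8, dtype=np.uint8))]
df = spark.sparkContext.parallelize(data).flatMap(HashExploder(htnum=2, band_size=4)).toDF()
df.show()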
Example #12
Source File: bags.py From apollo with GNU General Public License v3.0
def __call__(self, head):
    rows = head.map(lambda row: Row(
        sha1=row.document, item=row.token, value=float(row.value)))
    if self.explained:
        self._log.info("toDebugString():\n%s", rows.toDebugString().decode())
    rows.toDF() \
        .write \
        .format("org.apache.spark.sql.cassandra") \
        .mode("append") \
        .options(table=self.table, keyspace=self.keyspace) \
        .save()
    return head
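The same map-to-Row-then-write pattern works with any DataFrame sink. The sketch below swaps the Cassandra connector for Parquet so it runs without extra packages, and uses made-up input tuples in place of the bags RDD:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
head = spark.sparkContext.parallelize([("doc1", "tok1", 0.5), ("doc1", "tok2", 1.0)])
rows = head.map(lambda t: Row(sha1=t[0], item=t[1], value=float(t[2])))
rows.toDF() \
    .write \
    .mode("append") \
    .parquet("/tmp/bags_example")    # local path, illustration only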
Example #13
Source File: bags.py From apollo with GNU General Public License v3.0
def __call__(self, head):
    rows = head.map(lambda x: Row(
        sha1=x.blob_id, repo=x.repository_id, commit=x.commit_hash, path=x.path))
    if self.explained:
        self._log.info("toDebugString():\n%s", rows.toDebugString().decode())
    rows.toDF() \
        .write \
        .format("org.apache.spark.sql.cassandra") \
        .mode("append") \
        .options(table=self.table, keyspace=self.keyspace) \
        .save()
Example #14
Source File: score_images_spark.py From mlflow with Apache License 2.0
def read_images(spark, filenames):
    filenames_rdd = spark.sparkContext.parallelize(filenames)
    schema = StructType(
        [StructField("filename", StringType(), True),
         StructField("image", StringType(), True)])
    return filenames_rdd.map(
        lambda x: Row(filename=x, image=read_image_bytes_base64(x))).toDF(schema=schema)