Python pyspark.sql.types.StructField() Examples
The following are 30 code examples of pyspark.sql.types.StructField(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.types, or try the search function.
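Before diving into the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of how StructField and StructType are typically combined to declare a DataFrame schema; the column names and sample rows are invented for illustration.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.master("local[*]").getOrCreate()

# A StructField is (name, dataType, nullable); a StructType is an ordered list of fields.
schema = StructType([
    StructField("name", StringType(), nullable=False),
    StructField("age", IntegerType(), nullable=True),
])

df = spark.createDataFrame([("alice", 34), ("bob", None)], schema=schema)
df.printSchema()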
Example #1
Source File: imageIO.py From spark-deep-learning with Apache License 2.0 | 8 votes |
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
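A possible call site for the function above, assuming a SparkContext named sc; the directory path and partition count are placeholders.

# Hypothetical usage: load all files under a directory into (filePath, fileData) rows.
image_df = filesToDF(sc, "/data/images", numPartitions=16)
image_df.printSchema()  # filePath: string, fileData: binary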
Example #2
Source File: transform.py From search-MjoLniR with MIT License | 6 votes |
def _merge_schemas(*schemas: T.StructType):
    """Merge one or more spark schemas into a new schema"""
    fields = cast(Dict[str, T.StructField], {})
    errors = []
    for schema in schemas:
        for field in schema:
            if field.name not in fields:
                fields[field.name] = field
            elif field != fields[field.name]:
                errors.append('Incompatible fields: {} != {}'.format(field, fields[field.name]))
    if errors:
        raise Exception('\n'.join(errors))
    return T.StructType(list(fields.values()))


# Primary input schema from which most everything else is derived
Example #3
Source File: df_naive.py From example_dataproc_twitter with MIT License | 6 votes |
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
                      returnType=stypes.ArrayType(stypes.StructType(fields=[
                          stypes.StructField('sku0', stypes.StringType()),
                          stypes.StructField('sku1', stypes.StringType()),
                          stypes.StructField('cor', stypes.FloatType())])))
Example #4
Source File: unischema.py From petastorm with Apache License 2.0 | 6 votes |
def as_spark_schema(self):
    """Returns an object derived from the unischema as spark schema.

    Example:

    >>> spark.createDataFrame(dataset_rows,
    >>>                       SomeSchema.as_spark_schema())
    """
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark.sql.types as sql_types

    schema_entries = []
    for field in self._fields.values():
        spark_type = _field_spark_dtype(field)
        schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))
    return sql_types.StructType(schema_entries)
Example #5
Source File: df_naive.py From example_dataproc_twitter with MIT License | 6 votes |
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
                      returnType=stypes.ArrayType(stypes.StructType(fields=[
                          stypes.StructField('sku0', stypes.StringType()),
                          stypes.StructField('sku1', stypes.StringType()),
                          stypes.StructField('cor', stypes.FloatType())])))
Example #6
Source File: test_base.py From example_dataproc_twitter with MIT License | 5 votes |
def test_load_neighbor_schema(self):
    klass = self.get_target_klass()()
    result = klass.load_neighbor_schema()
    expected = stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
    self.assertEqual(expected, result)
Example #7
Source File: typehints.py From koalas with Apache License 2.0 | 5 votes |
def __init__(self, tpe, names=None):
    if names is None:
        # Default names `c0, c1, ... cn`.
        self.tpe = types.StructType(
            [types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))]
        )  # type: types.StructType
    else:
        self.tpe = types.StructType(
            [types.StructField(n, t) for n, t in zip(names, tpe)]
        )  # type: types.StructType
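For reference, when names is None the constructor above generates default column names c0, c1, ...; a small illustration of the schema it would build for a two-element tpe (types chosen arbitrarily):

from pyspark.sql import types

# Roughly what the names=None branch produces for tpe=[LongType(), StringType()].
types.StructType([
    types.StructField("c0", types.LongType()),
    types.StructField("c1", types.StringType()),
])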
Example #8
Source File: criteo.py From azure-python-labs with MIT License | 5 votes |
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
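The returned schema can be handed straight to Spark's CSV reader; a sketch assuming a tab-separated Criteo sample file at a hypothetical path:

# Hypothetical usage: the Criteo sample data is assumed to be tab-separated with no header row.
schema = get_spark_schema()
raw = spark.read.csv("dac_sample.txt", schema=schema, sep="\t", header=False)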
Example #9
Source File: criteo.py From azure-python-labs with MIT License | 5 votes |
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
Example #10
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_users_schema():
    """Loads schema with data type [user, [(sku, score), (sku, score)]]

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema specification for user -> (sku, score) data.
    """
    return stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
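A short sketch of building a DataFrame against this schema; the user and interaction rows below are invented.

# Hypothetical rows shaped as [user, [(item, score), ...]]
rows = [("user_1", [("sku_a", 0.9), ("sku_b", 0.4)]),
        ("user_2", [("sku_c", 1.0)])]
df = spark.createDataFrame(rows, schema=load_users_schema())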
Example #11
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_neighbor_schema(self):
    """Loads neighborhood schema for similarity matrix

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema of type ["key", [("key", "value")]]
    """
    return stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
Example #12
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_users_schema():
    """Loads schema with data type [user, [(sku, score), (sku, score)]]

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema specification for user -> (sku, score) data.
    """
    return stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
Example #13
Source File: test_base.py From example_dataproc_twitter with MIT License | 5 votes |
def test_load_users_schema(self):
    klass = self.get_target_klass()()
    expected = stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
    result = klass.load_users_schema()
    self.assertEqual(result, expected)
Example #14
Source File: test_upload.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def test_process_json_listens(self, mock_save, mock_read):
    fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
    ListenbrainzDataUploader().process_json_listens('/2020/1.json', '/fakedir',
                                                    'fakehdfspath', fakeschema)
    mock_read.assert_called_once_with('fakehdfspath', schema=fakeschema)
    mock_save.assert_called_once_with(mock_read.return_value, '/fakedir/2020/1.parquet')
Example #15
Source File: reader.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 5 votes |
def read(self, file_path, spark_session, indexcol=0, schema=None):
    """
    Creates a dataframe from the csv file

    :param indexcol: if 1, create a tuple id column as auto increment
    :param schema: optional schema of file if known
    :param spark_session: The spark_session we created in Holoclean object
    :param file_path: The path to the file

    :return: dataframe
    """
    if schema is None:
        df = spark_session.read.csv(file_path, header=True)
    else:
        df = spark_session.read.csv(file_path, header=True, schema=schema)

    if indexcol == 0:
        return df

    index_name = GlobalVariables.index_name
    new_cols = df.schema.names + [index_name]
    list_schema = []
    for index_attribute in range(len(df.schema.names)):
        list_schema.append(StructField("_" + str(index_attribute),
                                       df.schema[index_attribute].dataType, True))
    list_schema.append(
        StructField("_" + str(len(new_cols)), LongType(), True))
    schema = StructType(list_schema)

    # Note: this project targets Python 2 (tuple unpacking in the lambda and xrange below).
    ix_df = df.rdd.zipWithIndex().map(
        lambda (row, ix): row + (ix + 1,)).toDF(schema)
    tmp_cols = ix_df.schema.names
    new_df = reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx],
                                                             new_cols[idx]),
                    xrange(len(tmp_cols)), ix_df)
    new_df = self.checking_string_size(new_df)
    return new_df
Example #16
Source File: sampler.py From python_mozetl with MIT License | 5 votes |
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df
Example #17
Source File: transform.py From python_mozetl with MIT License | 5 votes |
def toStructType(self):
    return StructType(
        [StructField(col.name, col.struct_type, True) for col in self.columns]
    )
Example #18
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 5 votes |
def _initialize_results(self, scaffolds):
    data = [ps.Row(smiles=scaffold, scaffold=scaffold, decorations={}, count=1)
            for scaffold in scaffolds]
    data_schema = pst.StructType([
        pst.StructField("smiles", pst.StringType()),
        pst.StructField("scaffold", pst.StringType()),
        pst.StructField("decorations", pst.MapType(pst.IntegerType(), pst.StringType())),
        pst.StructField("count", pst.IntegerType())
    ])
    return SPARK.createDataFrame(data, schema=data_schema)
Example #19
Source File: schema.py From mlflow with Apache License 2.0 | 5 votes |
def as_spark_schema(self):
    """Convert to Spark schema. If this schema is a single unnamed column, it is converted
    directly to the corresponding spark data type, otherwise it's returned as a struct
    (missing column names are filled with an integer sequence).
    """
    if len(self.columns) == 1 and self.columns[0].name is None:
        return self.columns[0].type.to_spark()
    from pyspark.sql.types import StructType, StructField
    return StructType([StructField(name=col.name or str(i), dataType=col.type.to_spark())
                       for i, col in enumerate(self.columns)])
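A hedged usage sketch for the method above, assuming mlflow's Schema and ColSpec types; the column names are invented.

from mlflow.types import Schema, ColSpec

# Named columns become a StructType; a single unnamed column maps to a bare Spark type instead.
schema = Schema([ColSpec("double", "price"), ColSpec("string", "color")])
spark_schema = schema.as_spark_schema()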
Example #20
Source File: test_schema.py From mlflow with Apache License 2.0 | 5 votes |
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType(
        [StructField(t.name, _parse_datatype_string(t.name), True)
         for t in schema.column_types()])
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
Example #21
Source File: test_schema.py From mlflow with Apache License 2.0 | 5 votes |
def test_spark_type_mapping(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import BooleanType, IntegerType, LongType, FloatType, DoubleType, \
        StringType, BinaryType
    from pyspark.sql.types import StructField, StructType

    assert isinstance(DataType.boolean.to_spark(), BooleanType)
    assert isinstance(DataType.integer.to_spark(), IntegerType)
    assert isinstance(DataType.long.to_spark(), LongType)
    assert isinstance(DataType.float.to_spark(), FloatType)
    assert isinstance(DataType.double.to_spark(), DoubleType)
    assert isinstance(DataType.string.to_spark(), StringType)
    assert isinstance(DataType.binary.to_spark(), BinaryType)

    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType(
        [StructField(t.name, t.to_spark(), True) for t in schema.column_types()])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=actual_spark_schema)
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2

    # test unnamed columns
    schema = Schema([ColSpec(col.type) for col in schema.columns])
    expected_spark_schema = StructType(
        [StructField(str(i), t.to_spark(), True)
         for i, t in enumerate(schema.column_types())])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # test single unnamed column is mapped to just a single spark type
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType)
Example #22
Source File: dfutil.py From TensorFlowOnSpark with Apache License 2.0 | 5 votes |
def infer_schema(example, binary_features=[]):
    """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a
    "hint" from the caller in the ``binary_features`` argument.

    Args:
      :example: a tf.train.Example
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A DataFrame StructType schema
    """
    def _infer_sql_type(k, v):
        # special handling for binary features
        if k in binary_features:
            return BinaryType()

        if v.int64_list.value:
            result = v.int64_list.value
            sql_type = LongType()
        elif v.float_list.value:
            result = v.float_list.value
            sql_type = DoubleType()
        else:
            result = v.bytes_list.value
            sql_type = StringType()

        if len(result) > 1:
            # represent multi-item tensors as Spark SQL ArrayType() of base types
            return ArrayType(sql_type)
        else:
            # represent everything else as base types (and empty tensors as StringType())
            return sql_type

    return StructType([StructField(k, _infer_sql_type(k, v), True)
                       for k, v in sorted(example.features.feature.items())])
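A sketch of calling the helper with a hand-built tf.train.Example; the feature names and the binary_features hint are illustrative.

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
    "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"\x00\x01"])),
}))
schema = infer_schema(example, binary_features=["image"])
# "label" maps to LongType; "image" maps to BinaryType because of the hint.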
Example #23
Source File: smvschema.py From SMV with Apache License 2.0 | 5 votes |
def _scala_to_python_field_type(self, scala_field_type):
    """create a python FieldType from the scala field type"""
    col_name = str(scala_field_type.name())
    col_type_name = str(scala_field_type.dataType())
    # map string "IntegerType" to actual class IntegerType
    col_type_class = getattr(sql_types, col_type_name)
    return sql_types.StructField(col_name, col_type_class())
Example #24
Source File: es_hits.py From search-MjoLniR with MIT License | 5 votes |
def transform(df, url_list=None, brokers=None, **kwargs):
    if brokers and url_list:
        raise ValueError('cannot specify brokers and url_list')
    if brokers:
        rdd = transform_from_kafka(df, brokers, **kwargs)
    else:
        rdd = transform_from_elasticsearch(df, url_list, **kwargs)
    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'],
        df.schema['query'],
        df.schema['norm_query'],
        T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
    ]))
Example #25
Source File: test_dataset.py From python_moztelemetry with Mozilla Public License 2.0 | 5 votes |
def test_dataframe_with_schema(dataset, spark):
    schema = StructType([StructField("foo", IntegerType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')

    assert type(df) == DataFrame
    assert df.columns == ['foo']
    assert df.orderBy(["foo"]).collect() == [Row(foo=1), Row(foo=2)]
Example #26
Source File: test_dataset.py From python_moztelemetry with Mozilla Public License 2.0 | 5 votes |
def test_dataframe_bad_schema(dataset, spark):
    spark.catalog.dropTempView('bar')
    schema = StructType([StructField("name", StringType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')

    assert type(df) == DataFrame
    assert df.collect() == [Row(name=None), Row(name=None)]
Example #27
Source File: datatypes.py From ibis with Apache License 2.0 | 5 votes |
def ibis_struct_dtype_to_spark_dtype(ibis_dtype_obj):
    fields = [
        pt.StructField(n, spark_dtype(t), t.nullable)
        for n, t in zip(ibis_dtype_obj.names, ibis_dtype_obj.types)
    ]
    return pt.StructType(fields)
Example #28
Source File: ml_model.py From elephas with MIT License | 5 votes |
def _transform(self, df):
    """Private transform method of a Transformer. This serves as a batch-prediction method
    for our purposes.
    """
    output_col = self.getOutputCol()
    label_col = self.getLabelCol()
    new_schema = copy.deepcopy(df.schema)
    new_schema.add(StructField(output_col, StringType(), True))

    rdd = df.rdd.coalesce(1)
    features = np.asarray(
        rdd.map(lambda x: from_vector(x.features)).collect())
    # Note that we collect, since executing this on the rdd would require model serialization once again
    model = model_from_yaml(self.get_keras_model_config())
    model.set_weights(self.weights.value)
    predictions = rdd.ctx.parallelize(
        model.predict_classes(features)).coalesce(1)
    predictions = predictions.map(lambda x: tuple(str(x)))

    results_rdd = rdd.zip(predictions).map(lambda x: x[0] + x[1])
    results_df = df.sql_ctx.createDataFrame(results_rdd, new_schema)
    results_df = results_df.withColumn(
        output_col, results_df[output_col].cast(DoubleType()))
    results_df = results_df.withColumn(
        label_col, results_df[label_col].cast(DoubleType()))
    return results_df
Example #29
Source File: base.py From LearningApacheSpark with MIT License | 5 votes |
def transformSchema(self, schema):
    inputType = schema[self.getInputCol()].dataType
    self.validateInputType(inputType)
    if self.getOutputCol() in schema.names:
        raise ValueError("Output column %s already exists." % self.getOutputCol())
    outputFields = copy.copy(schema.fields)
    outputFields.append(StructField(self.getOutputCol(),
                                    self.outputDataType(),
                                    nullable=False))
    return StructType(outputFields)
Example #30
Source File: test_upload.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def test_process_json(self, mock_save, mock_read):
    fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
    ListenbrainzDataUploader().process_json('fakename', '/fakedestpath', '/fakehdfspath', fakeschema)
    mock_read.assert_called_once_with('/fakehdfspath', schema=fakeschema)
    mock_save.assert_called_once_with(mock_read.return_value, '/fakedestpath')