Python pyspark.sql.types.StructType() Examples
The following are 27 code examples of pyspark.sql.types.StructType(), collected from open-source projects. You can go to the original project or source file by following the link above each example, or browse the other available functions and classes of the pyspark.sql.types module.
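Before the project examples, here is a minimal orientation sketch (not taken from any of the projects below) showing the pattern most of them follow: build a StructType from StructField(name, dataType, nullable) entries and pass it as the schema when creating or reading a DataFrame.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.appName("structtype-demo").getOrCreate()

# A StructType is an ordered collection of StructField(name, dataType, nullable) entries.
schema = StructType([
    StructField("name", StringType(), False),
    StructField("age", IntegerType(), True),
])

df = spark.createDataFrame([("Ada", 36), ("Grace", 45)], schema=schema)
df.printSchema()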
Example #1
Source File: temp_range_sql.py From Hanhan-Spark-Python with MIT License | 12 votes |
def main():
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')
    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime) + ' ' + str(r.StationID) + ' ' + str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output)
Example #2
Source File: imageIO.py From spark-deep-learning with Apache License 2.0 | 8 votes |
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
Example #3
Source File: transform.py From search-MjoLniR with MIT License | 6 votes |
def _simplify_data_type(data_type: T.DataType) -> Tuple:
    """Simplify datatype into a tuple of equality information we care about

    Most notably this ignores nullability concerns due to hive not being
    able to represent not null in its schemas.
    """
    try:
        # Normalize UDT into its sql form. Allows comparison of schemas
        # from hive and spark.
        sql_type = data_type.sqlType()  # type: ignore
    except AttributeError:
        sql_type = data_type

    if isinstance(sql_type, T.StructType):
        return ('StructType', [(field.name, _simplify_data_type(field.dataType)) for field in sql_type])
    elif isinstance(sql_type, T.ArrayType):
        return ('ArrayType', _simplify_data_type(sql_type.elementType))
    else:
        return (type(sql_type).__name__,)
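A hypothetical call (not part of search-MjoLniR) illustrating what the helper above returns: structs and arrays are collapsed into nested tuples and nullability is dropped.

import pyspark.sql.types as T

# Assumes _simplify_data_type from the example above is in scope.
schema = T.StructType([
    T.StructField("id", T.LongType(), False),
    T.StructField("tags", T.ArrayType(T.StringType()), True),
])

print(_simplify_data_type(schema))
# ('StructType', [('id', ('LongType',)), ('tags', ('ArrayType', ('StringType',)))])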
Example #4
Source File: transform.py From search-MjoLniR with MIT License | 6 votes |
def _verify_schema_compatability(expect: T.StructType, have: T.StructType) -> List[str]:
    """Verify all expected fields and types are present

    Allows additional columns in the `have` schema. Additionally
    allows relaxing nullability.
    """
    errors = []
    for expect_field in expect:
        try:
            have_field = have[expect_field.name]
        except KeyError:
            errors.append('Field {} missing. Have: {}'.format(expect_field.name, ','.join(have.names)))
            continue
        expect_type = _simplify_data_type(expect_field.dataType)
        have_type = _simplify_data_type(have_field.dataType)
        if expect_type != have_type:
            errors.append('Field {} has incompatible data types: expect {} != have {}'.format(
                expect_field.name, expect_type, have_type))
    return errors
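A small usage sketch (again not from the project), assuming both helpers above are in scope: extra columns pass, mismatched types are reported.

import pyspark.sql.types as T

# Assumes _verify_schema_compatability and _simplify_data_type above are in scope.
expect = T.StructType([T.StructField("id", T.LongType(), False)])
have = T.StructType([
    T.StructField("id", T.IntegerType(), True),    # incompatible type
    T.StructField("extra", T.StringType(), True),  # extra columns are allowed
])

for error in _verify_schema_compatability(expect, have):
    print(error)
# Field id has incompatible data types: expect ('LongType',) != have ('IntegerType',)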
Example #5
Source File: utils.py From mlflow with Apache License 2.0 | 6 votes |
def format_to_file_path(spark_session):
    rows = [
        Row(8, 32, "bat"),
        Row(64, 40, "mouse"),
        Row(-27, 55, "horse")
    ]
    schema = StructType([
        StructField("number2", IntegerType()),
        StructField("number1", IntegerType()),
        StructField("word", StringType())
    ])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    res = {}
    tempdir = tempfile.mkdtemp()
    for data_format in ["csv", "parquet", "json"]:
        res[data_format] = os.path.join(tempdir, "test-data-%s" % data_format)
    for data_format, file_path in res.items():
        df.write.option("header", "true").format(data_format).save(file_path)
    yield res
    shutil.rmtree(tempdir)
Example #6
Source File: reddit_average_sql.py From Hanhan-Spark-Python with MIT License | 6 votes |
def main():
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])
    inputs = sqlContext.read.json(inputs1, schema=schema)

    # Uncomment this when the schema is not added
    # inputs = sqlContext.read.json(inputs1)

    # Uncomment these when there are 2 input dirs
    # comments_input1 = sqlContext.read.json(inputs1, schema=schema)
    # comments_input2 = sqlContext.read.json(inputs2, schema=schema)
    # inputs = comments_input1.unionAll(comments_input2)

    df = get_avg(inputs)
    df.write.save(output, format='json', mode='overwrite')
Example #7
Source File: transform.py From search-MjoLniR with MIT License | 6 votes |
def _merge_schemas(*schemas: T.StructType):
    """Merge one or more spark schemas into a new schema"""
    fields = cast(Dict[str, T.StructField], {})
    errors = []
    for schema in schemas:
        for field in schema:
            if field.name not in fields:
                fields[field.name] = field
            elif field != fields[field.name]:
                errors.append('Incompatible fields: {} != {}'.format(field, fields[field.name]))
    if errors:
        raise Exception('\n'.join(errors))
    return T.StructType(list(fields.values()))


# Primary input schema from which most everything else is derived
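A hypothetical illustration of the merge, assuming _merge_schemas above is in scope: identical fields are deduplicated, new fields are appended, and conflicting definitions raise.

import pyspark.sql.types as T

# Assumes _merge_schemas from the example above is in scope.
pages = T.StructType([T.StructField("page_id", T.LongType(), False)])
clicks = T.StructType([
    T.StructField("page_id", T.LongType(), False),
    T.StructField("clicks", T.IntegerType(), True),
])

merged = _merge_schemas(pages, clicks)
print(merged.fieldNames())  # ['page_id', 'clicks']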
Example #8
Source File: unischema.py From petastorm with Apache License 2.0 | 6 votes |
def as_spark_schema(self):
    """Returns an object derived from the unischema as spark schema.

    Example:

    >>> spark.createDataFrame(dataset_rows,
    >>>                       SomeSchema.as_spark_schema())
    """
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark.sql.types as sql_types

    schema_entries = []
    for field in self._fields.values():
        spark_type = _field_spark_dtype(field)
        schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))
    return sql_types.StructType(schema_entries)
Example #9
Source File: accuracy.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 6 votes |
def read_groundtruth(self):
    """
    Create a dataframe from the ground truth csv file

    Takes as argument the full path name of the csv file
    and the spark_session
    """
    filereader = Reader(self.spark_session)

    groundtruth_schema = StructType([
        StructField("tid", IntegerType(), False),
        StructField("attr_name", StringType(), False),
        StructField("attr_val", StringType(), False)])

    self.ground_truth_flat = filereader.read(self.path_to_grand_truth, 0,
                                             groundtruth_schema).\
        drop(GlobalVariables.index_name)

    self.dataengine.add_db_table(
        'Groundtruth', self.ground_truth_flat, self.dataset)
Example #10
Source File: udf.py From ibis with Apache License 2.0 | 6 votes |
def validate_func_and_types(self, func):
    if isinstance(self.spark_output_type, (pt.MapType, pt.StructType)):
        raise com.IbisTypeError(
            'Spark does not support MapType or StructType output for '
            'Pandas UDFs'
        )
    if not self.input_type:
        raise com.UnsupportedArgumentError(
            'Spark does not support 0-arg pandas UDFs. Instead, create '
            'a 1-arg pandas UDF and ignore the arg in your function'
        )
    super().validate_func_and_types(func)
Example #11
Source File: criteo.py From azure-python-labs with MIT License | 5 votes |
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
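A hypothetical driver for the helper above. DEFAULT_HEADER is defined elsewhere in criteo.py, so a stand-in header is constructed here: one label column, 13 integer features, and 26 categorical features, matching the Criteo layout the loops assume.

# Assumes get_spark_schema from the example above is in scope.
header = (["label"]
          + ["int%02d" % i for i in range(13)]
          + ["cat%02d" % i for i in range(26)])

schema = get_spark_schema(header)
print(len(schema.fields))      # 40 fields: label + 13 ints as IntegerType, 26 categoricals as StringType
print(schema.fieldNames()[0])  # label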
Example #12
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_neighbor_schema(self):
    """Loads neighborhood schema for similarity matrix

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema of type ["key", [("key", "value")]]
    """
    return stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
Example #13
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_users_schema():
    """Loads schema with data type [user, [(sku, score), (sku, score)]]

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema specification for user -> (sku, score) data.
    """
    return stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
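A hedged usage sketch (not from the project) that builds a tiny DataFrame against the schema returned above; it assumes load_users_schema and a SparkSession are available.

from pyspark.sql import SparkSession

# Assumes load_users_schema from the example above is importable/in scope.
spark = SparkSession.builder.getOrCreate()
rows = [("user_1", [("sku_9", 0.7), ("sku_3", 0.2)])]
df = spark.createDataFrame(rows, schema=load_users_schema())
df.printSchema()  # user: string, interactions: array<struct<item:string, score:float>>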
Example #14
Source File: __init__.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def upload_test_playcounts(cls):
    schema = StructType(
        [
            StructField("user_id", IntegerType()),
            StructField("recording_id", IntegerType()),
            StructField("count", IntegerType())
        ]
    )
    test_playcounts = []
    for i in range(1, PLAYCOUNTS_COUNT // 2 + 1):
        test_playcounts.append([1, 1, 1])
    for i in range(PLAYCOUNTS_COUNT // 2 + 1, PLAYCOUNTS_COUNT + 1):
        test_playcounts.append([2, 2, 1])
    test_playcounts_df = listenbrainz_spark.session.createDataFrame(test_playcounts, schema=schema)
    utils.save_parquet(test_playcounts_df, TEST_PLAYCOUNTS_PATH)
Example #15
Source File: typehints.py From koalas with Apache License 2.0 | 5 votes |
def __init__(self, tpe, names=None):
    if names is None:
        # Default names `c0, c1, ... cn`.
        self.tpe = types.StructType(
            [types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))]
        )  # type: types.StructType
    else:
        self.tpe = types.StructType(
            [types.StructField(n, t) for n, t in zip(names, tpe)]
        )  # type: types.StructType
Example #16
Source File: test_upload.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def test_process_json_listens(self, mock_save, mock_read):
    fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
    ListenbrainzDataUploader().process_json_listens('/2020/1.json', '/fakedir', 'fakehdfspath', fakeschema)
    mock_read.assert_called_once_with('fakehdfspath', schema=fakeschema)
    mock_save.assert_called_once_with(mock_read.return_value, '/fakedir/2020/1.parquet')
Example #17
Source File: test_upload.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def test_process_json(self, mock_save, mock_read):
    fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
    ListenbrainzDataUploader().process_json('fakename', '/fakedestpath', '/fakehdfspath', fakeschema)
    mock_read.assert_called_once_with('/fakehdfspath', schema=fakeschema)
    mock_save.assert_called_once_with(mock_read.return_value, '/fakedestpath')
Example #18
Source File: test_pyspark.py From dagster with Apache License 2.0 | 5 votes |
def make_df_solid(context):
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='John', age=19), Row(name='Jennifer', age=29), Row(name='Henry', age=50)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema)
Example #19
Source File: test_base.py From example_dataproc_twitter with MIT License | 5 votes |
def test_load_users_schema(self):
    klass = self.get_target_klass()()
    expected = stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
    result = klass.load_users_schema()
    self.assertEqual(result, expected)
Example #20
Source File: reader.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 5 votes |
def read(self, file_path, spark_session, indexcol=0, schema=None):
    """
    Creates a dataframe from the csv file

    :param indexcol: if 1, create a tuple id column as auto increment
    :param schema: optional schema of file if known
    :param spark_session: The spark_session we created in Holoclean object
    :param file_path: The path to the file

    :return: dataframe
    """
    if schema is None:
        df = spark_session.read.csv(file_path, header=True)
    else:
        df = spark_session.read.csv(file_path, header=True, schema=schema)

    if indexcol == 0:
        return df

    index_name = GlobalVariables.index_name
    new_cols = df.schema.names + [index_name]
    list_schema = []
    for index_attribute in range(len(df.schema.names)):
        list_schema.append(StructField("_" + str(index_attribute),
                                       df.schema[index_attribute].dataType, True))
    list_schema.append(
        StructField("_" + str(len(new_cols)), LongType(), True))
    schema = StructType(list_schema)

    ix_df = df.rdd.zipWithIndex().map(
        lambda (row, ix): row + (ix + 1,)).toDF(schema)
    tmp_cols = ix_df.schema.names
    new_df = reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx],
                                                             new_cols[idx]),
                    xrange(len(tmp_cols)), ix_df)
    new_df = self.checking_string_size(new_df)
    return new_df
Example #21
Source File: sampler.py From python_mozetl with MIT License | 5 votes |
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df
Example #22
Source File: transform.py From python_mozetl with MIT License | 5 votes |
def toStructType(self):
    return StructType(
        [StructField(col.name, col.struct_type, True) for col in self.columns]
    )
Example #23
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 5 votes |
def _initialize_results(self, scaffolds):
    data = [ps.Row(smiles=scaffold, scaffold=scaffold, decorations={}, count=1)
            for scaffold in scaffolds]
    data_schema = pst.StructType([
        pst.StructField("smiles", pst.StringType()),
        pst.StructField("scaffold", pst.StringType()),
        pst.StructField("decorations", pst.MapType(pst.IntegerType(), pst.StringType())),
        pst.StructField("count", pst.IntegerType())
    ])
    return SPARK.createDataFrame(data, schema=data_schema)
Example #24
Source File: schema.py From mlflow with Apache License 2.0 | 5 votes |
def as_spark_schema(self):
    """Convert to Spark schema. If this schema is a single unnamed column, it is converted
    directly to the corresponding spark data type, otherwise it's returned as a struct
    (missing column names are filled with an integer sequence).
    """
    if len(self.columns) == 1 and self.columns[0].name is None:
        return self.columns[0].type.to_spark()
    from pyspark.sql.types import StructType, StructField
    return StructType([StructField(name=col.name or str(i), dataType=col.type.to_spark())
                       for i, col in enumerate(self.columns)])
Example #25
Source File: test_schema.py From mlflow with Apache License 2.0 | 5 votes |
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType(
        [StructField(t.name, _parse_datatype_string(t.name), True)
         for t in schema.column_types()])
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
Example #26
Source File: test_schema.py From mlflow with Apache License 2.0 | 5 votes |
def test_spark_type_mapping(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import BooleanType, IntegerType, LongType, FloatType, DoubleType, \
        StringType, BinaryType
    from pyspark.sql.types import StructField, StructType

    assert isinstance(DataType.boolean.to_spark(), BooleanType)
    assert isinstance(DataType.integer.to_spark(), IntegerType)
    assert isinstance(DataType.long.to_spark(), LongType)
    assert isinstance(DataType.float.to_spark(), FloatType)
    assert isinstance(DataType.double.to_spark(), DoubleType)
    assert isinstance(DataType.string.to_spark(), StringType)
    assert isinstance(DataType.binary.to_spark(), BinaryType)
    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType(
        [StructField(t.name, t.to_spark(), True) for t in schema.column_types()])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=actual_spark_schema)
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2
    # test unnamed columns
    schema = Schema([ColSpec(col.type) for col in schema.columns])
    expected_spark_schema = StructType(
        [StructField(str(i), t.to_spark(), True) for i, t in enumerate(schema.column_types())])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()
    # test single unnamed column is mapped to just a single spark type
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType)
Example #27
Source File: hostlinks_to_graph.py From cc-pyspark with MIT License | 5 votes |
def vertices_assign_ids(self, sc, sqlc, edges):
    source = edges.select(edges.s.alias('name'))
    target = edges.select(edges.t.alias('name'))
    ids = source.union(target) \
        .distinct()
    if self.args.validate_host_names:
        is_valid = sqlf.udf(HostLinksToGraph.reverse_host_is_valid, BooleanType())
        ids = ids.filter(is_valid(ids.name))

    if self.args.vertex_partitions == 1:
        ids = ids \
            .coalesce(1) \
            .sort('name') \
            .withColumn('id', sqlf.monotonically_increasing_id())
    else:
        id_rdd = ids.select(ids.name).rdd \
            .map(lambda row: tuple(row)[0]) \
            .sortBy(lambda x: x, True, self.args.vertex_partitions) \
            .zipWithIndex()
        id_schema = StructType([
            StructField("name", StringType(), True),
            StructField("id", LongType(), True)
        ])
        ids = sqlc.createDataFrame(id_rdd, schema=id_schema)

    if self.args.save_as_text is not None:
        ids = ids.persist()
        ids.select(sqlf.concat_ws('\t', ids.id, ids.name)) \
            .write \
            .text(os.path.join(self.args.save_as_text, "vertices"),
                  compression="gzip")
    ids.write \
        .format(self.args.output_format) \
        .option("compression", self.args.output_compression) \
        .saveAsTable(self.args.output + '_vertices')
    return ids