Python pyspark.sql.types.DataType() Examples
The following are 11 code examples of pyspark.sql.types.DataType().
Example #1
Source File: transform.py From search-MjoLniR with MIT License
def _simplify_data_type(data_type: T.DataType) -> Tuple:
    """Simplify datatype into a tuple of equality information we care about

    Most notably this ignores nullability concerns due to hive not being
    able to represent not null in its schemas.
    """
    try:
        # Normalize UDT into its sql form. Allows comparison of schemas
        # from hive and spark.
        sql_type = data_type.sqlType()  # type: ignore
    except AttributeError:
        sql_type = data_type

    if isinstance(sql_type, T.StructType):
        return ('StructType', [(field.name, _simplify_data_type(field.dataType)) for field in sql_type])
    elif isinstance(sql_type, T.ArrayType):
        return ('ArrayType', _simplify_data_type(sql_type.elementType))
    else:
        return (type(sql_type).__name__,)
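A minimal usage sketch (the schema below is made up for illustration; it assumes pyspark.sql.types is imported as T, as in the source file): nested struct and array types collapse into nested tuples, with nullability discarded.

schema = T.StructType([
    T.StructField("id", T.LongType()),
    T.StructField("tags", T.ArrayType(T.StringType())),
])
_simplify_data_type(schema)
# ('StructType', [('id', ('LongType',)), ('tags', ('ArrayType', ('StringType',)))])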
Example #2
Source File: typehints.py From koalas with Apache License 2.0
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    """
    dt = s.dtype
    if dt == np.dtype("object"):
        if len(s) == 0 or s.isnull().all():
            raise ValueError("can not infer schema from empty or null dataset")
        elif hasattr(s[0], "__UDT__"):
            return s[0].__UDT__
        else:
            return from_arrow_type(pa.Array.from_pandas(s).type)
    elif is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
        return types.TimestampType()
    else:
        return from_arrow_type(pa.from_numpy_dtype(dt))
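A hedged illustration of the branches above (series contents are made up; assumes pandas is imported as pd, as in the source module):

infer_pd_series_spark_type(pd.Series([1.0, 2.5]))        # DoubleType, via pa.from_numpy_dtype
infer_pd_series_spark_type(pd.Series(["a", "b"]))        # StringType, via pa.Array.from_pandas
infer_pd_series_spark_type(pd.Series(pd.to_datetime(["2020-01-01"])))  # TimestampType
infer_pd_series_spark_type(pd.Series([], dtype=object))  # raises ValueError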
Example #3
Source File: datatypes.py From ibis with Apache License 2.0
def default(value, **kwargs) -> pt.DataType:
    raise com.IbisTypeError('Value {!r} is not a valid datatype'.format(value))
Example #4
Source File: datatypes.py From ibis with Apache License 2.0
def from_spark_dtype(value: pt.DataType) -> pt.DataType:
    return value
Example #5
Source File: helpers.py From SMV with Apache License 2.0
def smvArrayFlatten(self, elemType):
    """smvArrayFlatten helper applies flatten operation on an Array of Array column.

        Example:
            >>> df.select(col('arrayOfArrayOfStr').smvArrayFlatten(StringType()))

        Args:
            elemType (DataType or DataFrame): array element's data type, in object form
                or the DataFrame to infer the element data type
    """
    if isinstance(elemType, DataType):
        elemTypeJson = elemType.json()
    elif isinstance(elemType, DataFrame):
        elemTypeJson = elemType.select(self.col)\
            .schema.fields[0].dataType.elementType.elementType.json()
    else:
        raise SmvRuntimeError("smvArrayFlatten does not support type: {}".format(type(elemType)))

    jc = self._jColumnHelper.smvArrayFlatten(elemTypeJson)
    return Column(jc)

# Initialize DataFrame and Column with helper methods. Called by SmvApp.
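Besides the docstring example, the DataFrame branch reads the element type from the frame's own schema instead of requiring it to be spelled out. A sketch under that assumption (the column name arrayOfArrayOfStr is illustrative):

# element type given explicitly as a DataType
df.select(col('arrayOfArrayOfStr').smvArrayFlatten(StringType()))
# element type inferred from the DataFrame's schema for the same column
df.select(col('arrayOfArrayOfStr').smvArrayFlatten(df))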
Example #6
Source File: generic.py From koalas with Apache License 2.0
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
    # Special handle floating point types because Spark's count treats nan as a valid value,
    # whereas pandas count doesn't include nan.
    if isinstance(spark_type, (FloatType, DoubleType)):
        return F.count(F.nanvl(col, F.lit(None)))
    else:
        return F.count(col)
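A small sketch of the difference this makes (assumes an active SparkSession named spark; the data is illustrative): nanvl turns NaN into null first, so the count excludes it the way pandas does.

sdf = spark.createDataFrame([(1.0,), (float("nan"),), (None,)], "x double")
sdf.select(F.count("x")).first()[0]                           # 2 -- Spark counts the NaN
sdf.select(_count_expr(F.col("x"), DoubleType())).first()[0]  # 1 -- NaN excluded, like pandas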
Example #7
Source File: typehints.py From koalas with Apache License 2.0
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
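A few illustrative calls, matching the branches above:

as_spark_type(int)            # IntegerType
as_spark_type("bigint")       # LongType
as_spark_type(float)          # FloatType (python float maps to FloatType here, not DoubleType)
as_spark_type(datetime.date)  # DateType
as_spark_type(np.ndarray)     # ArrayType(StringType) -- child type fixed to string for now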
Example #8
Source File: typehints.py From koalas with Apache License 2.0
def spark_type_to_pandas_dtype(spark_type):
    """Return the pandas dtype corresponding to the given Spark DataType."""
    if isinstance(spark_type, (types.DateType, types.UserDefinedType)):
        return np.dtype("object")
    elif isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    else:
        return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
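Illustrative calls (assumes pyspark.sql.types is imported as types, as in the source module):

spark_type_to_pandas_dtype(types.LongType())       # dtype('int64'), via pyarrow
spark_type_to_pandas_dtype(types.TimestampType())  # dtype('<M8[ns]')
spark_type_to_pandas_dtype(types.DateType())       # dtype('O') -- dates stay as objects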
Example #9
Source File: mlflow.py From koalas with Apache License 2.0
def _return_type(self) -> DataType:
    hint = self._return_type_hint
    # The logic is simple for now, because it corresponds to the default
    # case: continuous predictions
    # TODO: do something smarter, for example when there is a sklearn.Classifier (it should
    # return an integer or a categorical)
    # We can do the same for pytorch/tensorflow/keras models by looking at the output types.
    # However, this is probably better done in mlflow than here.
    if hint == "infer" or not hint:
        hint = np.float64
    return as_spark_type(hint)
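Sketch of the default path: when the hint is "infer" or unset, the method falls back to np.float64, which as_spark_type (Example #7) maps to a Spark double, so predictions come back as a double column.

as_spark_type(np.float64)   # DoubleType()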
Example #10
Source File: functions.py From LearningApacheSpark with MIT License
def from_json(col, schema, options={}):
    """
    Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType`
    as keys type, :class:`StructType` or :class:`ArrayType` with
    the specified schema. Returns `null`, in the case of an unparseable string.

    :param col: string column in json format
    :param schema: a StructType or ArrayType of StructType to use when parsing the json column.
    :param options: options to control parsing. accepts the same options as the json datasource

    .. note:: Since Spark 2.3, the DDL-formatted string or a JSON format string is also
              supported for ``schema``.

    >>> from pyspark.sql.types import *
    >>> data = [(1, '''{"a": 1}''')]
    >>> schema = StructType([StructField("a", IntegerType())])
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=Row(a=1))]
    >>> df.select(from_json(df.value, "a INT").alias("json")).collect()
    [Row(json=Row(a=1))]
    >>> df.select(from_json(df.value, "MAP<STRING,INT>").alias("json")).collect()
    [Row(json={u'a': 1})]
    >>> data = [(1, '''[{"a": 1}]''')]
    >>> schema = ArrayType(StructType([StructField("a", IntegerType())]))
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=[Row(a=1)])]
    >>> schema = schema_of_json(lit('''{"a": 0}'''))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=Row(a=1))]
    >>> data = [(1, '''[1, 2, 3]''')]
    >>> schema = ArrayType(IntegerType())
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=[1, 2, 3])]
    """
    sc = SparkContext._active_spark_context
    if isinstance(schema, DataType):
        schema = schema.json()
    elif isinstance(schema, Column):
        schema = _to_java_column(schema)
    jc = sc._jvm.functions.from_json(_to_java_column(col), schema, options)
    return Column(jc)
Example #11
Source File: functions.py From LearningApacheSpark with MIT License
def udf(f=None, returnType=StringType()):
    """Creates a user defined function (UDF).

    .. note:: The user-defined functions are considered deterministic by default. Due to
        optimization, duplicate invocations may be eliminated or the function may even be invoked
        more times than it is present in the query. If your function is not deterministic, call
        `asNondeterministic` on the user defined function. E.g.:

    >>> from pyspark.sql.types import IntegerType
    >>> import random
    >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()

    .. note:: The user-defined functions do not support conditional expressions or short
        circuiting in boolean expressions and it ends up with being executed all internally.
        If the functions can fail on special rows, the workaround is to incorporate the
        condition into the functions.

    .. note:: The user-defined functions do not take keyword arguments on the calling side.

    :param f: python function if used as a standalone function
    :param returnType: the return type of the user-defined function. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

    >>> from pyspark.sql.types import IntegerType
    >>> slen = udf(lambda s: len(s), IntegerType())
    >>> @udf
    ... def to_upper(s):
    ...     if s is not None:
    ...         return s.upper()
    ...
    >>> @udf(returnType=IntegerType())
    ... def add_one(x):
    ...     if x is not None:
    ...         return x + 1
    ...
    >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
    >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show()
    +----------+--------------+------------+
    |slen(name)|to_upper(name)|add_one(age)|
    +----------+--------------+------------+
    |         8|      JOHN DOE|          22|
    +----------+--------------+------------+
    """
    # decorator @udf, @udf(), @udf(dataType())
    if f is None or isinstance(f, (str, DataType)):
        # If DataType has been passed as a positional argument
        # for decorator use it as a returnType
        return_type = f or returnType
        return functools.partial(_create_udf, returnType=return_type,
                                 evalType=PythonEvalType.SQL_BATCHED_UDF)
    else:
        return _create_udf(f=f, returnType=returnType,
                           evalType=PythonEvalType.SQL_BATCHED_UDF)