Python pyspark.sql.types.FloatType() Examples
The following are 17 code examples of pyspark.sql.types.FloatType(), drawn from open-source projects. The original project, source file, and license are noted above each example.
You may also want to check out the other available functions and classes of the pyspark.sql.types module.
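Before the project examples, here is a minimal orientation sketch of where FloatType() typically appears: as the data type of a column in an explicit DataFrame schema. The column names and sample rows are illustrative only, not taken from any of the projects below.

from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, StringType, StructField, StructType

spark = SparkSession.builder.master("local[1]").appName("floattype-demo").getOrCreate()

# FloatType() declares a 32-bit single-precision column; use DoubleType() for 64-bit values.
schema = StructType([
    StructField("item", StringType(), nullable=False),
    StructField("score", FloatType(), nullable=True),
])

df = spark.createDataFrame([("sku_a", 0.75), ("sku_b", 0.25)], schema=schema)
df.printSchema()  # score: float (nullable = true)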
Example #1
Source File: spark_dataset_converter.py From petastorm with Apache License 2.0 | 6 votes |
def _convert_precision(df, dtype):
    if dtype is None:
        return df

    if dtype != "float32" and dtype != "float64":
        raise ValueError("dtype {} is not supported. "
                         "Use 'float32' or 'float64'".format(dtype))

    source_type, target_type = (DoubleType, FloatType) \
        if dtype == "float32" else (FloatType, DoubleType)

    logger.warning("Converting floating-point columns to %s", dtype)
    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(target_type()))
        elif isinstance(field.dataType, ArrayType) and \
                isinstance(field.dataType.elementType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(ArrayType(target_type())))
    return df
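A hypothetical call, assuming df is an existing Spark DataFrame whose floating-point columns are DoubleType; the variable names are illustrative:

# Downcast every DoubleType column (and array<double> column) to FloatType,
# e.g. to shrink the data before converting it to float32 tensors.
df_float32 = _convert_precision(df, "float32")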
Example #2
Source File: df_naive.py From example_dataproc_twitter with MIT License | 6 votes |
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
        returnType=stypes.ArrayType(stypes.StructType(fields=[
            stypes.StructField('sku0', stypes.StringType()),
            stypes.StructField('sku1', stypes.StringType()),
            stypes.StructField('cor', stypes.FloatType())])))
Example #3
Source File: named_image.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def _decodeOutputAsPredictions(self, df):
    # If we start having different weights than imagenet, we'll need to
    # move this logic to individual model building in NamedImageTransformer.
    # Also, we could put the computation directly in the main computation
    # graph or use a scala UDF for potentially better performance.
    topK = self.getOrDefault(self.topK)

    def decode(predictions):
        pred_arr = np.expand_dims(np.array(predictions), axis=0)
        decoded = decode_predictions(pred_arr, top=topK)[0]
        # convert numpy dtypes to python native types
        return [(t[0], t[1], t[2].item()) for t in decoded]

    decodedSchema = ArrayType(
        StructType([
            StructField("class", StringType(), False),
            StructField("description", StringType(), False),
            StructField("probability", FloatType(), False)
        ]))
    decodeUDF = udf(decode, decodedSchema)
    interim_output = self._getIntermediateOutputCol()
    return df \
        .withColumn(self.getOutputCol(), decodeUDF(df[interim_output])) \
        .drop(interim_output)
Example #4
Source File: df_naive.py From example_dataproc_twitter with MIT License | 6 votes |
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
        returnType=stypes.ArrayType(stypes.StructType(fields=[
            stypes.StructField('sku0', stypes.StringType()),
            stypes.StructField('sku1', stypes.StringType()),
            stypes.StructField('cor', stypes.FloatType())])))
Example #5
Source File: typehints.py From koalas with Apache License 2.0 | 5 votes |
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
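A few illustrative lookups that follow the branches above; note that in this mapping the built-in float resolves to FloatType, while np.float64 resolves to DoubleType:

as_spark_type(float)       # -> FloatType()
as_spark_type("float")     # -> FloatType()
as_spark_type(np.float64)  # -> DoubleType()
as_spark_type(np.ndarray)  # -> ArrayType(StringType())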
Example #6
Source File: test_spark.py From mlflow with Apache License 2.0 | 5 votes |
def test_spark_udf(spark, model_path):
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )
    reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)

    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types
    type_map = {"float": (FloatType(), np.number),
                "int": (IntegerType(), np.int32),
                "double": (DoubleType(), np.number),
                "long": (LongType(), np.int),
                "string": (StringType(), None)}

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)

            expected = [list(row[1]) if is_array else row[1][0] for row in expected.iterrows()]

            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()['prediction'])
            assert expected == actual

            if not is_array:
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()['prediction'])
                assert expected == actual
Example #7
Source File: test_base.py From example_dataproc_twitter with MIT License | 5 votes |
def test_load_neighbor_schema(self):
    klass = self.get_target_klass()()
    result = klass.load_neighbor_schema()
    expected = stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
    self.assertEqual(expected, result)
Example #8
Source File: test_base.py From example_dataproc_twitter with MIT License | 5 votes |
def test_load_users_schema(self):
    klass = self.get_target_klass()()
    expected = stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
    result = klass.load_users_schema()
    self.assertEqual(result, expected)
Example #9
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_users_schema():
    """Loads schema with data type [user, [(sku, score), (sku, score)]]

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema specification for user -> (sku, score) data.
    """
    return stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
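A hypothetical use of the returned schema, assuming an active SparkSession named spark and calling the function as defined above; the sample rows are illustrative:

rows = [("user_1", [("sku_a", 0.9), ("sku_b", 0.1)]),
        ("user_2", [("sku_c", 1.0)])]
users_df = spark.createDataFrame(rows, schema=load_users_schema())
users_df.printSchema()  # interactions: array<struct<item:string,score:float>>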
Example #10
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_neighbor_schema(self):
    """Loads neighborhood schema for similarity matrix

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema of type ["key", [("key", "value")]]
    """
    return stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
Example #11
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_users_schema():
    """Loads schema with data type [user, [(sku, score), (sku, score)]]

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema specification for user -> (sku, score) data.
    """
    return stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
Example #12
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def isnull(self):
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or numpy.NaN, get mapped to True values.
    Everything else gets mapped to False values. Characters such as empty
    strings '' or numpy.inf are not considered NA values
    (unless you set pandas.options.mode.use_inf_as_na = True).

    Returns
    -------
    Series : Mask of bool values for each element in Series
        that indicates whether an element is an NA value.

    Examples
    --------
    >>> ser = ks.Series([5, 6, np.NaN])
    >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
    0    False
    1    False
    2     True
    Name: 0, dtype: bool

    >>> ser.rename("a").to_frame().set_index("a").index.isna()
    Index([False, False, True], dtype='object', name='a')
    """
    from databricks.koalas.indexes import MultiIndex

    if isinstance(self, MultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    if isinstance(self.spark.data_type, (FloatType, DoubleType)):
        return self._with_new_scol(
            self.spark.column.isNull() | F.isnan(self.spark.column)
        ).rename(self.name)
    else:
        return self._with_new_scol(self.spark.column.isNull()).rename(self.name)
Example #13
Source File: generic.py From koalas with Apache License 2.0 | 5 votes |
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
    # Special-case floating point types because Spark's count treats NaN as a valid value,
    # whereas pandas' count doesn't include NaN.
    if isinstance(spark_type, (FloatType, DoubleType)):
        return F.count(F.nanvl(col, F.lit(None)))
    else:
        return F.count(col)
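A small illustration of the difference the nanvl wrapper makes, assuming an active SparkSession named spark; the column name and rows are illustrative:

from pyspark.sql import functions as F

sdf = spark.createDataFrame([(1.0,), (float("nan"),), (None,)], "x double")
sdf.select(F.count(F.col("x"))).first()[0]                        # 2 -- Spark's count includes NaN
sdf.select(F.count(F.nanvl(F.col("x"), F.lit(None)))).first()[0]  # 1 -- NaN treated as missing, as in pandas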
Example #14
Source File: testSmvSchema.py From SMV with Apache License 2.0 | 5 votes |
def test_fromFile(self):
    f = os.path.join(SmvSchemaTest.resourceTestDir(), "data/a.schema")
    s = SmvSchema.fromFile(f)
    fields = s.spark_schema.fields
    assert(len(fields) == 2)
    assert(fields[0] == st.StructField('a', st.StringType()))
    assert(fields[1] == st.StructField('b', st.FloatType()))
Example #15
Source File: codecs.py From petastorm with Apache License 2.0 | 5 votes |
def encode(self, unischema_field, value):
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader). We should move all pyspark related code
    # into a separate module.
    import pyspark.sql.types as sql_types

    # We treat ndarrays with shape=() as scalars
    unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()

    # Validate the input to be a scalar (or an unsized numpy array)
    if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
        raise TypeError('Expected a scalar as a value for field \'{}\'. '
                        'Got a non-numpy type \'{}\''.format(unischema_field.name, type(value)))
    if unischema_field.shape:
        raise ValueError('The shape field of unischema_field \'%s\' must be an empty tuple (i.e. \'()\') '
                         'to indicate a scalar. However, the actual shape is %s',
                         unischema_field.name, unischema_field.shape)
    if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType,
                                     sql_types.IntegerType, sql_types.LongType)):
        return int(value)
    if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
        return float(value)
    if isinstance(self._spark_type, sql_types.BooleanType):
        return bool(value)
    if isinstance(self._spark_type, sql_types.StringType):
        if not isinstance(value, str):
            raise ValueError(
                'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
        return str(value)

    return value
Example #16
Source File: unischema.py From petastorm with Apache License 2.0 | 5 votes |
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary in order
    to avoid instantiation of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable
    # instead of a 'dot' notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T
        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to
# the schema being pickled next to the dataset on disk
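A hypothetical lookup against the cached mapping; the keys are numpy types and the values are instantiated Spark types:

mapping = _numpy_to_spark_mapping()
mapping[np.float32]  # -> FloatType()
mapping[np.float64]  # -> DoubleType()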
Example #17
Source File: schema_utils.py From eva with Apache License 2.0 | 5 votes |
def get_petastorm_column(df_column):
    column_type = df_column.type
    column_name = df_column.name
    column_is_nullable = df_column.is_nullable
    column_array_dimensions = df_column.array_dimensions

    # Reference:
    # https://github.com/uber/petastorm/blob/master/petastorm/tests/test_common.py
    petastorm_column = None
    if column_type == ColumnType.INTEGER:
        petastorm_column = UnischemaField(column_name, np.int32, (),
                                          ScalarCodec(IntegerType()),
                                          column_is_nullable)
    elif column_type == ColumnType.FLOAT:
        petastorm_column = UnischemaField(column_name, np.float64, (),
                                          ScalarCodec(FloatType()),
                                          column_is_nullable)
    elif column_type == ColumnType.TEXT:
        petastorm_column = UnischemaField(column_name, np.string_, (),
                                          ScalarCodec(StringType()),
                                          column_is_nullable)
    elif column_type == ColumnType.NDARRAY:
        petastorm_column = UnischemaField(column_name, np.uint8,
                                          column_array_dimensions,
                                          NdarrayCodec(),
                                          column_is_nullable)
    else:
        LoggingManager().log("Invalid column type: " + str(column_type),
                             LoggingLevel.ERROR)

    return petastorm_column