Python pyspark.sql.types.DoubleType() Examples
The following are 22 code examples of pyspark.sql.types.DoubleType().
The examples are taken from open-source projects; the original project and source file are noted above each example.
You may also want to check out all available functions/classes of the module pyspark.sql.types, or try the search function.
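Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the column names are made up) showing the two most common uses of DoubleType(): declaring a double column in an explicit schema, and casting another column to double.

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder.getOrCreate()

# Declare a double column explicitly in a schema.
# DoubleType() is Spark SQL's 8-byte, double-precision floating-point type.
schema = StructType([
    StructField('station', StringType(), True),
    StructField('reading', DoubleType(), True),
])
df = spark.createDataFrame([('a', 1.5), ('b', 2.0)], schema=schema)

# Cast a column to double, another very common use of DoubleType().
df2 = df.withColumn('reading_str', F.col('reading').cast(StringType())) \
        .withColumn('reading_back', F.col('reading_str').cast(DoubleType()))
df2.printSchema()
# 'reading' and 'reading_back' are reported as double; 'reading_str' as string.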
Example #1
Source File: temp_range_sql.py From Hanhan-Spark-Python with MIT License
def main():
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])

    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')
    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime) + ' ' + str(r.StationID) + ' ' + str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output)
Example #2
Source File: spark_dataset_converter.py From petastorm with Apache License 2.0
def _convert_precision(df, dtype):
    if dtype is None:
        return df

    if dtype != "float32" and dtype != "float64":
        raise ValueError("dtype {} is not supported. Use 'float32' or 'float64'".format(dtype))

    source_type, target_type = (DoubleType, FloatType) \
        if dtype == "float32" else (FloatType, DoubleType)

    logger.warning("Converting floating-point columns to %s", dtype)

    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(target_type()))
        elif isinstance(field.dataType, ArrayType) and \
                isinstance(field.dataType.elementType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(ArrayType(target_type())))
    return df
Example #3
Source File: test_decode_row.py From petastorm with Apache License 2.0
def test_decode_numpy_scalar_with_explicit_scalar_codec():
    """Decoding a row that has a field with the codec set explicitly"""
    MatrixSchema = Unischema('TestSchema',
                             [UnischemaField('scalar', np.float64, (), ScalarCodec(DoubleType()), False)])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Example #4
Source File: torch_distributed.py From sparktorch with MIT License
def _transform(self, dataset):
    inp = self.getOrDefault(self.inputCol)
    out = self.getOrDefault(self.predictionCol)
    mod_str = self.getOrDefault(self.modStr)
    use_vector_out = self.getOrDefault(self.useVectorOut)

    model = dill.loads(codecs.decode(mod_str.encode(), "base64"))
    model_broadcast = dataset._sc.broadcast(model)

    def predict_vec(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        return Vectors.dense(model(x_data).detach().numpy().flatten())

    def predict_float(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        raw_prediction = model(x_data).detach().numpy().flatten()
        if len(raw_prediction) > 1:
            return float(np.argmax(raw_prediction))
        return float(raw_prediction[0])

    if use_vector_out:
        udfGenerateCode = F.udf(predict_vec, VectorUDT())
    else:
        udfGenerateCode = F.udf(predict_float, DoubleType())

    return dataset.withColumn(out, udfGenerateCode(inp))
Example #5
Source File: dfutil.py From TensorFlowOnSpark with Apache License 2.0
def infer_schema(example, binary_features=[]):
    """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a
    "hint" from the caller in the ``binary_features`` argument.

    Args:
      :example: a tf.train.Example
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A DataFrame StructType schema
    """
    def _infer_sql_type(k, v):
        # special handling for binary features
        if k in binary_features:
            return BinaryType()

        if v.int64_list.value:
            result = v.int64_list.value
            sql_type = LongType()
        elif v.float_list.value:
            result = v.float_list.value
            sql_type = DoubleType()
        else:
            result = v.bytes_list.value
            sql_type = StringType()

        if len(result) > 1:
            # represent multi-item tensors as Spark SQL ArrayType() of base types
            return ArrayType(sql_type)
        else:
            # represent everything else as base types (and empty tensors as StringType())
            return sql_type

    return StructType([StructField(k, _infer_sql_type(k, v), True)
                       for k, v in sorted(example.features.feature.items())])
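As a hypothetical usage sketch (not part of the TensorFlowOnSpark source), a single-value float feature maps to a scalar DoubleType(), while a feature hinted via binary_features maps to BinaryType():

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    'temp': tf.train.Feature(float_list=tf.train.FloatList(value=[21.5])),
    'img': tf.train.Feature(bytes_list=tf.train.BytesList(value=[b'\x00\x01'])),
}))

# 'img' is hinted as binary; 'temp' has a single float value, so it becomes a scalar DoubleType().
schema = infer_schema(example, binary_features=['img'])
# StructType([StructField('img', BinaryType(), True), StructField('temp', DoubleType(), True)])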
Example #6
Source File: test_spark.py From mlflow with Apache License 2.0
def test_spark_udf(spark, model_path):
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )
    reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)

    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types
    type_map = {"float": (FloatType(), np.number),
                "int": (IntegerType(), np.int32),
                "double": (DoubleType(), np.number),
                "long": (LongType(), np.int),
                "string": (StringType(), None)}

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)

            expected = [list(row[1]) if is_array else row[1][0] for row in expected.iterrows()]

            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()['prediction'])
            assert expected == actual

            if not is_array:
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()['prediction'])
                assert expected == actual
Example #7
Source File: typehints.py From koalas with Apache License 2.0
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
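For illustration (not part of the Koalas file), the branches above imply:

as_spark_type(np.float64)   # -> DoubleType()
as_spark_type("double")     # -> DoubleType()
as_spark_type(float)        # -> FloatType(), not DoubleType(), in this mapping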
Example #8
Source File: base.py From koalas with Apache License 2.0
def isnull(self):
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or numpy.NaN, get mapped to True values.
    Everything else gets mapped to False values. Characters such as empty strings ''
    or numpy.inf are not considered NA values
    (unless you set pandas.options.mode.use_inf_as_na = True).

    Returns
    -------
    Series : Mask of bool values for each element in Series
        that indicates whether an element is an NA value.

    Examples
    --------
    >>> ser = ks.Series([5, 6, np.NaN])
    >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
    0    False
    1    False
    2     True
    Name: 0, dtype: bool

    >>> ser.rename("a").to_frame().set_index("a").index.isna()
    Index([False, False, True], dtype='object', name='a')
    """
    from databricks.koalas.indexes import MultiIndex

    if isinstance(self, MultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    if isinstance(self.spark.data_type, (FloatType, DoubleType)):
        return self._with_new_scol(
            self.spark.column.isNull() | F.isnan(self.spark.column)
        ).rename(self.name)
    else:
        return self._with_new_scol(self.spark.column.isNull()).rename(self.name)
Example #9
Source File: generic.py From koalas with Apache License 2.0
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
    # Special-case floating-point types because Spark's count treats NaN as a valid value,
    # whereas pandas' count doesn't include NaN.
    if isinstance(spark_type, (FloatType, DoubleType)):
        return F.count(F.nanvl(col, F.lit(None)))
    else:
        return F.count(col)
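To see why the nanvl wrapper matters, here is an illustrative snippet (not from the Koalas source, assuming an active SparkSession): count() includes NaN, but once NaN is replaced with null via nanvl it is excluded, matching pandas.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (float('nan'),), (None,)], 'x double')

df.select(
    F.count('x').alias('spark_count'),                               # 2: NaN is counted, null is not
    F.count(F.nanvl('x', F.lit(None))).alias('pandas_like_count'),   # 1: NaN treated as null
).show()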
Example #10
Source File: analyze_run.py From pipelines with Apache License 2.0
def load_schema(schema_file):
    type_map = {
        'KEY': StringType(),
        'NUMBER': DoubleType(),
        'CATEGORY': StringType(),
        'TEXT': StringType(),
        'IMAGE_URL': StringType()
    }
    schema_json = json.loads(file_io.read_file_to_string(schema_file))
    fields = [StructField(x['name'], type_map[x['type']]) for x in schema_json]
    return schema_json, StructType(fields)
Example #11
Source File: transform_run.py From pipelines with Apache License 2.0
def load_schema(analysis_path):
    type_map = {
        'KEY': StringType(),
        'NUMBER': DoubleType(),
        'CATEGORY': StringType(),
        'TEXT': StringType(),
        'IMAGE_URL': StringType()
    }
    schema_file = os.path.join(analysis_path, 'schema.json')
    schema_json = json.loads(file_io.read_file_to_string(schema_file))
    fields = [StructField(x['name'], type_map[x['type']]) for x in schema_json]
    return schema_json, StructType(fields)
Example #12
Source File: testSmvSchema.py From SMV with Apache License 2.0
def test_fromString(self):
    s = SmvSchema.fromString("a:string; b:double")
    fields = s.spark_schema.fields
    assert(len(fields) == 2)
    assert(fields[0] == st.StructField('a', st.StringType()))
    assert(fields[1] == st.StructField('b', st.DoubleType()))
Example #13
Source File: evaluation.py From LearningApacheSpark with MIT License
def __init__(self, predictionAndLabels):
    sc = predictionAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)]))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
    java_model = java_class(df._jdf)
    super(MulticlassMetrics, self).__init__(java_model)
Example #14
Source File: evaluation.py From LearningApacheSpark with MIT License
def __init__(self, scoreAndLabels):
    sc = scoreAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    df = sql_ctx.createDataFrame(scoreAndLabels, schema=StructType([
        StructField("score", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)]))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    java_model = java_class(df._jdf)
    super(BinaryClassificationMetrics, self).__init__(java_model)
Example #15
Source File: tests.py From LearningApacheSpark with MIT License
def test_unary_transformer_validate_input_type(self):
    shiftVal = 3
    transformer = MockUnaryTransformer(shiftVal=shiftVal)\
        .setInputCol("input").setOutputCol("output")

    # should not raise any errors
    transformer.validateInputType(DoubleType())

    with self.assertRaises(TypeError):
        # passing the wrong input type should raise an error
        transformer.validateInputType(IntegerType())
Example #16
Source File: tests.py From LearningApacheSpark with MIT License
def validateInputType(self, inputType):
    if inputType != DoubleType():
        raise TypeError("Bad input type: {}. ".format(inputType) +
                        "Requires Double.")
Example #17
Source File: tests.py From LearningApacheSpark with MIT License
def outputDataType(self):
    return DoubleType()
Example #18
Source File: codecs.py From petastorm with Apache License 2.0
def encode(self, unischema_field, value):
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader). We should move all pyspark related code
    # into a separate module.
    import pyspark.sql.types as sql_types

    # We treat ndarrays with shape=() as scalars
    unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()

    # Validate the input to be a scalar (or an unsized numpy array)
    if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
        raise TypeError('Expected a scalar as a value for field \'{}\'. '
                        'Got a non-numpy type \'{}\''.format(unischema_field.name, type(value)))
    if unischema_field.shape:
        raise ValueError('The shape field of unischema_field \'%s\' must be an empty tuple (i.e. \'()\') '
                         'to indicate a scalar. However, the actual shape is %s'
                         % (unischema_field.name, unischema_field.shape))

    if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType,
                                     sql_types.IntegerType, sql_types.LongType)):
        return int(value)
    if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
        return float(value)
    if isinstance(self._spark_type, sql_types.BooleanType):
        return bool(value)
    if isinstance(self._spark_type, sql_types.StringType):
        if not isinstance(value, str):
            raise ValueError(
                'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
        return str(value)

    return value
Example #19
Source File: unischema.py From petastorm with Apache License 2.0
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary in order
    to avoid instantiation of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable
    # instead of a 'dot' notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema
# being pickled next to the dataset on disk
Example #20
Source File: ml_model.py From elephas with MIT License
def _transform(self, df):
    """Private transform method of a Transformer. This serves as a batch-prediction method
    for our purposes.
    """
    output_col = self.getOutputCol()
    label_col = self.getLabelCol()
    new_schema = copy.deepcopy(df.schema)
    new_schema.add(StructField(output_col, StringType(), True))

    rdd = df.rdd.coalesce(1)
    features = np.asarray(
        rdd.map(lambda x: from_vector(x.features)).collect())
    # Note that we collect, since executing this on the rdd would require model serialization once again
    model = model_from_yaml(self.get_keras_model_config())
    model.set_weights(self.weights.value)
    predictions = rdd.ctx.parallelize(
        model.predict_classes(features)).coalesce(1)
    predictions = predictions.map(lambda x: tuple(str(x)))

    results_rdd = rdd.zip(predictions).map(lambda x: x[0] + x[1])
    results_df = df.sql_ctx.createDataFrame(results_rdd, new_schema)
    results_df = results_df.withColumn(
        output_col, results_df[output_col].cast(DoubleType()))
    results_df = results_df.withColumn(
        label_col, results_df[label_col].cast(DoubleType()))

    return results_df
Example #21
Source File: tf_tensor.py From spark-deep-learning with Apache License 2.0
def _transform(self, dataset):
    if any([field.dataType == DoubleType() for field in dataset.schema]):
        logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                       "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                       "fed to input tensors of type tf.float64. To feed dataframe data to "
                       "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                       "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

    graph_def = self._optimize_for_inference()
    input_mapping = self.getInputMapping()
    output_mapping = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed_df = tfs.analyze(dataset)

        out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
        # Load graph
        tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)

        # Feed dict maps from placeholder name to DF column name
        feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
        fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]

        out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)

        # We still have to rename output columns
        for tnsr_name, new_colname in output_mapping:
            old_colname = tfx.op_name(tnsr_name, graph)
            if old_colname != new_colname:
                out_df = out_df.withColumnRenamed(old_colname, new_colname)

    return out_df
Example #22
Source File: classification.py From LearningApacheSpark with MIT License
def _transform(self, dataset):
    # determine the input columns: these need to be passed through
    origCols = dataset.columns

    # add an accumulator column to store predictions of all the models
    accColName = "mbc$acc" + str(uuid.uuid4())
    initUDF = udf(lambda _: [], ArrayType(DoubleType()))
    newDataset = dataset.withColumn(accColName, initUDF(dataset[origCols[0]]))

    # persist if underlying dataset is not persistent.
    handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False)
    if handlePersistence:
        newDataset.persist(StorageLevel.MEMORY_AND_DISK)

    # update the accumulator column with the result of prediction of models
    aggregatedDataset = newDataset
    for index, model in enumerate(self.models):
        rawPredictionCol = model._call_java("getRawPredictionCol")
        columns = origCols + [rawPredictionCol, accColName]

        # add temporary column to store intermediate scores and update
        tmpColName = "mbc$tmp" + str(uuid.uuid4())
        updateUDF = udf(
            lambda predictions, prediction: predictions + [prediction.tolist()[1]],
            ArrayType(DoubleType()))
        transformedDataset = model.transform(aggregatedDataset).select(*columns)
        updatedDataset = transformedDataset.withColumn(
            tmpColName,
            updateUDF(transformedDataset[accColName], transformedDataset[rawPredictionCol]))
        newColumns = origCols + [tmpColName]

        # switch out the intermediate column with the accumulator column
        aggregatedDataset = updatedDataset\
            .select(*newColumns).withColumnRenamed(tmpColName, accColName)

    if handlePersistence:
        newDataset.unpersist()

    # output the index of the classifier with highest confidence as prediction
    labelUDF = udf(
        lambda predictions: float(max(enumerate(predictions), key=operator.itemgetter(1))[0]),
        DoubleType())

    # output label and label metadata as prediction
    return aggregatedDataset.withColumn(
        self.getPredictionCol(), labelUDF(aggregatedDataset[accColName])).drop(accColName)