Python pyspark.sql.types.DoubleType() Examples
The following are 22 code examples of pyspark.sql.types.DoubleType().
The examples are taken from open-source projects; the original project and source file are noted above each example.
You may also want to check out all available functions/classes of the module pyspark.sql.types, or try the search function.
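Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the column names are made up) showing the two most common uses of DoubleType(): declaring a double column in an explicit schema, and casting another column to double.

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder.getOrCreate()

# Declare a double column explicitly in a schema.
# DoubleType() is Spark SQL's 8-byte, double-precision floating-point type.
schema = StructType([
    StructField('station', StringType(), True),
    StructField('reading', DoubleType(), True),
])
df = spark.createDataFrame([('a', 1.5), ('b', 2.0)], schema=schema)

# Cast a column to double, another very common use of DoubleType().
df2 = df.withColumn('reading_str', F.col('reading').cast(StringType())) \
        .withColumn('reading_back', F.col('reading_str').cast(DoubleType()))
df2.printSchema()
# 'reading' and 'reading_back' are reported as double; 'reading_str' as string.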
Example #1
Source File: temp_range_sql.py From Hanhan-Spark-Python with MIT License
def main():
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])

    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')
    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime) + ' ' + str(r.StationID) + ' ' + str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output)
Example #2
Source File: spark_dataset_converter.py From petastorm with Apache License 2.0
def _convert_precision(df, dtype):
    if dtype is None:
        return df

    if dtype != "float32" and dtype != "float64":
        raise ValueError("dtype {} is not supported. Use 'float32' or 'float64'".format(dtype))

    source_type, target_type = (DoubleType, FloatType) \
        if dtype == "float32" else (FloatType, DoubleType)

    logger.warning("Converting floating-point columns to %s", dtype)

    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(target_type()))
        elif isinstance(field.dataType, ArrayType) and \
                isinstance(field.dataType.elementType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(ArrayType(target_type())))
    return df
Example #3
Source File: test_decode_row.py From petastorm with Apache License 2.0
def test_decode_numpy_scalar_with_explicit_scalar_codec():
    """Decoding a row that has a field with the codec set explicitly"""
    MatrixSchema = Unischema('TestSchema',
                             [UnischemaField('scalar', np.float64, (), ScalarCodec(DoubleType()), False)])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Example #4
Source File: torch_distributed.py From sparktorch with MIT License
def _transform(self, dataset):
    inp = self.getOrDefault(self.inputCol)
    out = self.getOrDefault(self.predictionCol)
    mod_str = self.getOrDefault(self.modStr)
    use_vector_out = self.getOrDefault(self.useVectorOut)

    model = dill.loads(codecs.decode(mod_str.encode(), "base64"))
    model_broadcast = dataset._sc.broadcast(model)

    def predict_vec(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        return Vectors.dense(model(x_data).detach().numpy().flatten())

    def predict_float(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        raw_prediction = model(x_data).detach().numpy().flatten()
        if len(raw_prediction) > 1:
            return float(np.argmax(raw_prediction))
        return float(raw_prediction[0])

    if use_vector_out:
        udfGenerateCode = F.udf(predict_vec, VectorUDT())
    else:
        udfGenerateCode = F.udf(predict_float, DoubleType())

    return dataset.withColumn(out, udfGenerateCode(inp))
Example #5
Source File: dfutil.py From TensorFlowOnSpark with Apache License 2.0
def infer_schema(example, binary_features=[]):
    """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a
    "hint" from the caller in the ``binary_features`` argument.

    Args:
      :example: a tf.train.Example
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A DataFrame StructType schema
    """
    def _infer_sql_type(k, v):
        # special handling for binary features
        if k in binary_features:
            return BinaryType()

        if v.int64_list.value:
            result = v.int64_list.value
            sql_type = LongType()
        elif v.float_list.value:
            result = v.float_list.value
            sql_type = DoubleType()
        else:
            result = v.bytes_list.value
            sql_type = StringType()

        if len(result) > 1:
            # represent multi-item tensors as Spark SQL ArrayType() of base types
            return ArrayType(sql_type)
        else:
            # represent everything else as base types (and empty tensors as StringType())
            return sql_type

    return StructType([StructField(k, _infer_sql_type(k, v), True)
                       for k, v in sorted(example.features.feature.items())])
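As a hypothetical usage sketch (not part of the TensorFlowOnSpark source), a single-value float feature maps to a scalar DoubleType(), while a feature hinted via binary_features maps to BinaryType():

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    'temp': tf.train.Feature(float_list=tf.train.FloatList(value=[21.5])),
    'img': tf.train.Feature(bytes_list=tf.train.BytesList(value=[b'\x00\x01'])),
}))

# 'img' is hinted as binary; 'temp' has a single float value, so it becomes a scalar DoubleType().
schema = infer_schema(example, binary_features=['img'])
# StructType([StructField('img', BinaryType(), True), StructField('temp', DoubleType(), True)])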
Example #6
Source File: test_spark.py From mlflow with Apache License 2.0
def test_spark_udf(spark, model_path):
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )
    reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)

    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types
    type_map = {"float": (FloatType(), np.number),
                "int": (IntegerType(), np.int32),
                "double": (DoubleType(), np.number),
                "long": (LongType(), np.int),
                "string": (StringType(), None)}

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)

            expected = [list(row[1]) if is_array else row[1][0] for row in expected.iterrows()]

            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()['prediction'])
            assert expected == actual

            if not is_array:
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()['prediction'])
                assert expected == actual
Example #7
Source File: typehints.py From koalas with Apache License 2.0
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
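For illustration (not part of the Koalas file), the branches above imply:

as_spark_type(np.float64)   # -> DoubleType()
as_spark_type("double")     # -> DoubleType()
as_spark_type(float)        # -> FloatType(), not DoubleType(), in this mapping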
Example #8
Source File: base.py From koalas with Apache License 2.0
def isnull(self):
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or numpy.NaN, get mapped to True values.
    Everything else gets mapped to False values. Characters such as empty strings ''
    or numpy.inf are not considered NA values
    (unless you set pandas.options.mode.use_inf_as_na = True).

    Returns
    -------
    Series : Mask of bool values for each element in Series
        that indicates whether an element is an NA value.

    Examples
    --------
    >>> ser = ks.Series([5, 6, np.NaN])
    >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
    0    False
    1    False
    2     True
    Name: 0, dtype: bool

    >>> ser.rename("a").to_frame().set_index("a").index.isna()
    Index([False, False, True], dtype='object', name='a')
    """
    from databricks.koalas.indexes import MultiIndex

    if isinstance(self, MultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    if isinstance(self.spark.data_type, (FloatType, DoubleType)):
        return self._with_new_scol(
            self.spark.column.isNull() | F.isnan(self.spark.column)
        ).rename(self.name)
    else:
        return self._with_new_scol(self.spark.column.isNull()).rename(self.name)
Example #9
Source File: generic.py From koalas with Apache License 2.0
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
    # Special-case floating-point types because Spark's count treats NaN as a valid value,
    # whereas pandas' count doesn't include NaN.
    if isinstance(spark_type, (FloatType, DoubleType)):
        return F.count(F.nanvl(col, F.lit(None)))
    else:
        return F.count(col)
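To see why the nanvl wrapper matters, here is an illustrative snippet (not from the Koalas source, assuming an active SparkSession): count() includes NaN, but once NaN is replaced with null via nanvl it is excluded, matching pandas.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (float('nan'),), (None,)], 'x double')

df.select(
    F.count('x').alias('spark_count'),                               # 2: NaN is counted, null is not
    F.count(F.nanvl('x', F.lit(None))).alias('pandas_like_count'),   # 1: NaN treated as null
).show()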
Example #10
Source File: analyze_run.py From pipelines with Apache License 2.0
def load_schema(schema_file):
    type_map = {
        'KEY': StringType(),
        'NUMBER': DoubleType(),
        'CATEGORY': StringType(),
        'TEXT': StringType(),
        'IMAGE_URL': StringType()
    }
    schema_json = json.loads(file_io.read_file_to_string(schema_file))
    fields = [StructField(x['name'], type_map[x['type']]) for x in schema_json]
    return schema_json, StructType(fields)
Example #11
Source File: transform_run.py From pipelines with Apache License 2.0
def load_schema(analysis_path):
    type_map = {
        'KEY': StringType(),
        'NUMBER': DoubleType(),
        'CATEGORY': StringType(),
        'TEXT': StringType(),
        'IMAGE_URL': StringType()
    }
    schema_file = os.path.join(analysis_path, 'schema.json')
    schema_json = json.loads(file_io.read_file_to_string(schema_file))
    fields = [StructField(x['name'], type_map[x['type']]) for x in schema_json]
    return schema_json, StructType(fields)
Example #12
Source File: testSmvSchema.py From SMV with Apache License 2.0
def test_fromString(self):
    s = SmvSchema.fromString("a:string; b:double")
    fields = s.spark_schema.fields
    assert(len(fields) == 2)
    assert(fields[0] == st.StructField('a', st.StringType()))
    assert(fields[1] == st.StructField('b', st.DoubleType()))
Example #13
Source File: evaluation.py From LearningApacheSpark with MIT License
def __init__(self, predictionAndLabels):
    sc = predictionAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)]))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
    java_model = java_class(df._jdf)
    super(MulticlassMetrics, self).__init__(java_model)
Example #14
Source File: evaluation.py From LearningApacheSpark with MIT License
def __init__(self, scoreAndLabels):
    sc = scoreAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    df = sql_ctx.createDataFrame(scoreAndLabels, schema=StructType([
        StructField("score", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)]))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    java_model = java_class(df._jdf)
    super(BinaryClassificationMetrics, self).__init__(java_model)
Example #15
Source File: tests.py From LearningApacheSpark with MIT License
def test_unary_transformer_validate_input_type(self):
    shiftVal = 3
    transformer = MockUnaryTransformer(shiftVal=shiftVal)\
        .setInputCol("input").setOutputCol("output")

    # should not raise any errors
    transformer.validateInputType(DoubleType())

    with self.assertRaises(TypeError):
        # passing the wrong input type should raise an error
        transformer.validateInputType(IntegerType())
Example #16
Source File: tests.py From LearningApacheSpark with MIT License
def validateInputType(self, inputType):
    if inputType != DoubleType():
        raise TypeError("Bad input type: {}. ".format(inputType) +
                        "Requires Double.")
Example #17
Source File: tests.py From LearningApacheSpark with MIT License
def outputDataType(self):
    return DoubleType()
Example #18
Source File: codecs.py From petastorm with Apache License 2.0
def encode(self, unischema_field, value):
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader). We should move all pyspark related code
    # into a separate module.
    import pyspark.sql.types as sql_types

    # We treat ndarrays with shape=() as scalars
    unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()

    # Validate the input to be a scalar (or an unsized numpy array)
    if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
        raise TypeError('Expected a scalar as a value for field \'{}\'. '
                        'Got a non-numpy type \'{}\''.format(unischema_field.name, type(value)))
    if unischema_field.shape:
        raise ValueError('The shape field of unischema_field \'%s\' must be an empty tuple (i.e. \'()\') '
                         'to indicate a scalar. However, the actual shape is %s'
                         % (unischema_field.name, unischema_field.shape))

    if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType,
                                     sql_types.IntegerType, sql_types.LongType)):
        return int(value)
    if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
        return float(value)
    if isinstance(self._spark_type, sql_types.BooleanType):
        return bool(value)
    if isinstance(self._spark_type, sql_types.StringType):
        if not isinstance(value, str):
            raise ValueError(
                'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
        return str(value)

    return value
Example #19
Source File: unischema.py From petastorm with Apache License 2.0
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary in order
    to avoid instantiation of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable
    # instead of a 'dot' notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T

        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema
# being pickled next to the dataset on disk
Example #20
Source File: ml_model.py From elephas with MIT License
def _transform(self, df):
    """Private transform method of a Transformer. This serves as a batch-prediction method
    for our purposes.
    """
    output_col = self.getOutputCol()
    label_col = self.getLabelCol()
    new_schema = copy.deepcopy(df.schema)
    new_schema.add(StructField(output_col, StringType(), True))

    rdd = df.rdd.coalesce(1)
    features = np.asarray(
        rdd.map(lambda x: from_vector(x.features)).collect())
    # Note that we collect, since executing this on the rdd would require model serialization once again
    model = model_from_yaml(self.get_keras_model_config())
    model.set_weights(self.weights.value)
    predictions = rdd.ctx.parallelize(
        model.predict_classes(features)).coalesce(1)
    predictions = predictions.map(lambda x: tuple(str(x)))

    results_rdd = rdd.zip(predictions).map(lambda x: x[0] + x[1])
    results_df = df.sql_ctx.createDataFrame(results_rdd, new_schema)
    results_df = results_df.withColumn(
        output_col, results_df[output_col].cast(DoubleType()))
    results_df = results_df.withColumn(
        label_col, results_df[label_col].cast(DoubleType()))

    return results_df
Example #21
Source File: tf_tensor.py From spark-deep-learning with Apache License 2.0
def _transform(self, dataset):
    if any([field.dataType == DoubleType() for field in dataset.schema]):
        logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                       "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                       "fed to input tensors of type tf.float64. To feed dataframe data to "
                       "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                       "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

    graph_def = self._optimize_for_inference()
    input_mapping = self.getInputMapping()
    output_mapping = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed_df = tfs.analyze(dataset)

        out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
        # Load graph
        tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)

        # Feed dict maps from placeholder name to DF column name
        feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
        fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]

        out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)

        # We still have to rename output columns
        for tnsr_name, new_colname in output_mapping:
            old_colname = tfx.op_name(tnsr_name, graph)
            if old_colname != new_colname:
                out_df = out_df.withColumnRenamed(old_colname, new_colname)

    return out_df
Example #22
Source File: classification.py From LearningApacheSpark with MIT License
def _transform(self, dataset):
    # determine the input columns: these need to be passed through
    origCols = dataset.columns

    # add an accumulator column to store predictions of all the models
    accColName = "mbc$acc" + str(uuid.uuid4())
    initUDF = udf(lambda _: [], ArrayType(DoubleType()))
    newDataset = dataset.withColumn(accColName, initUDF(dataset[origCols[0]]))

    # persist if underlying dataset is not persistent.
    handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False)
    if handlePersistence:
        newDataset.persist(StorageLevel.MEMORY_AND_DISK)

    # update the accumulator column with the result of prediction of models
    aggregatedDataset = newDataset
    for index, model in enumerate(self.models):
        rawPredictionCol = model._call_java("getRawPredictionCol")
        columns = origCols + [rawPredictionCol, accColName]

        # add temporary column to store intermediate scores and update
        tmpColName = "mbc$tmp" + str(uuid.uuid4())
        updateUDF = udf(
            lambda predictions, prediction: predictions + [prediction.tolist()[1]],
            ArrayType(DoubleType()))
        transformedDataset = model.transform(aggregatedDataset).select(*columns)
        updatedDataset = transformedDataset.withColumn(
            tmpColName,
            updateUDF(transformedDataset[accColName], transformedDataset[rawPredictionCol]))
        newColumns = origCols + [tmpColName]

        # switch out the intermediate column with the accumulator column
        aggregatedDataset = updatedDataset\
            .select(*newColumns).withColumnRenamed(tmpColName, accColName)

    if handlePersistence:
        newDataset.unpersist()

    # output the index of the classifier with highest confidence as prediction
    labelUDF = udf(
        lambda predictions: float(max(enumerate(predictions), key=operator.itemgetter(1))[0]),
        DoubleType())

    # output label and label metadata as prediction
    return aggregatedDataset.withColumn(
        self.getPredictionCol(), labelUDF(aggregatedDataset[accColName])).drop(accColName)