Python pyspark.sql.types.StringType() Examples
The following are 30 code examples of pyspark.sql.types.StringType().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module
pyspark.sql.types, or try the search function.
Example #1
Source File: temp_range_sql.py From Hanhan-Spark-Python with MIT License | 12 votes |
def main():
    """Load GHCN-style weather CSV data, compute per-day temperature ranges,
    and save the formatted station/date maxima as a single text file.

    Depends on module-level names: sqlContext, inputs1, output, get_range.
    """
    # (name, type, nullable) triples keep the schema definition compact.
    column_specs = [
        ('StationID', StringType(), False),
        ('DateTime', StringType(), False),
        ('Observation', StringType(), False),
        ('DataValue', DoubleType(), False),
        ('MFlag', StringType(), True),
        ('QFlag', StringType(), True),
        ('SFlag', StringType(), True),
        ('OBSTime', StringType(), True),
    ]
    temp_schema = StructType(
        [StructField(name, dtype, nullable) for name, dtype, nullable in column_specs])

    raw = (sqlContext.read.format('com.databricks.spark.csv')
           .options(header='false')
           .load(inputs1, schema=temp_schema))
    # Keep only rows that passed quality control (empty QFlag).
    clean = raw.filter(raw.QFlag == '')

    ranges = get_range(clean)
    formatted = ranges.rdd.map(
        lambda r: str(r.DateTime) + ' ' + str(r.StationID) + ' ' + str(r.MaxRange))
    formatted.sortBy(lambda r: r[0]).coalesce(1).saveAsTextFile(output)
Example #2
Source File: imageIO.py From spark-deep-learning with Apache License 2.0 | 8 votes |
def filesToDF(sc, path, numPartitions=None):
    """Read files from a directory into a DataFrame of (path, bytes) rows.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files;
        defaults to ``sc.defaultParallelism`` when falsy.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    if not numPartitions:
        numPartitions = sc.defaultParallelism
    schema = StructType([
        StructField("filePath", StringType(), False),
        StructField("fileData", BinaryType(), False),
    ])
    binary_rdd = sc.binaryFiles(path, minPartitions=numPartitions).repartition(numPartitions)
    # binaryFiles yields (path, bytes); convert payload to bytearray for Spark.
    rows = binary_rdd.map(lambda kv: (kv[0], bytearray(kv[1])))
    return rows.toDF(schema)
Example #3
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 6 votes |
def _join_results_multi(self, scaffolds_df, sampled_df):
    """Join sampled decorations back onto their scaffolds (multi-attachment mode).

    :param scaffolds_df: DataFrame of scaffold rows, joinable on "id".
    :param sampled_df: DataFrame of sampled decorations, joinable on "id".
    :return: DataFrame with columns (smiles, decorations, scaffold).
    """
    def _join_scaffold(scaff, dec):
        # Returns None (SQL NULL) when the chemistry join fails.
        mol = usc.join(scaff, dec)
        if mol:
            return usc.to_smiles(mol)

    def _format_attachment_point(smi, num):
        smi = usc.add_first_attachment_point_number(smi, num)
        return usc.to_smiles(uc.to_mol(smi))  # canonicalize

    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())
    format_attachment_point_udf = psf.udf(_format_attachment_point, pst.StringType())
    # Renumber the decoration with the first attachment point, join it into the
    # scaffold SMILES, and accumulate it into the running "decorations" map.
    return sampled_df.join(scaffolds_df, on="id")\
        .withColumn("decoration", format_attachment_point_udf("decoration_smi", psf.col("attachment_points")[0]))\
        .select(
            join_scaffold_udf("smiles", "decoration").alias("smiles"),
            psf.map_concat(
                psf.create_map(psf.col("attachment_points")[0],
                               SampleScaffolds.cleanup_decoration_udf("decoration")),
                "decorations",
            ).alias("decorations"),
            "scaffold")
Example #4
Source File: functions.py From LearningApacheSpark with MIT License | 6 votes |
def locate(substr, str, pos=1):
    """
    Locate the position of the first occurrence of substr in a string column,
    after position pos.

    .. note:: The position is not zero based, but 1 based index. Returns 0 if substr
        could not be found in str.

    :param substr: a string
    :param str: a Column of :class:`pyspark.sql.types.StringType`
    :param pos: start position (1 based)

    >>> df = spark.createDataFrame([('abcd',)], ['s',])
    >>> df.select(locate('b', df.s, 1).alias('s')).collect()
    [Row(s=2)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.locate(substr, _to_java_column(str), pos))
Example #5
Source File: df_naive.py From example_dataproc_twitter with MIT License | 6 votes |
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    # Return types are arrays of structs; named here for readability.
    squared_type = stypes.ArrayType(stypes.StructType(fields=[
        stypes.StructField('sku0', stypes.StringType()),
        stypes.StructField('norm', stypes.FloatType()),
    ]))
    intersections_type = stypes.ArrayType(stypes.StructType(fields=[
        stypes.StructField('sku0', stypes.StringType()),
        stypes.StructField('sku1', stypes.StringType()),
        stypes.StructField('cor', stypes.FloatType()),
    ]))
    sess.udf.register("SQUARED", self.squared, returnType=squared_type)
    sess.udf.register('INTERSECTIONS', self.process_intersections,
                      returnType=intersections_type)
Example #6
Source File: functions.py From LearningApacheSpark with MIT License | 6 votes |
def to_timestamp(col, format=None):
    """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or
    :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType`
    using the optionally specified format. Specify formats according to
    `SimpleDateFormats <http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html>`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType`
    if the format is omitted (equivalent to ``col.cast("timestamp")``).

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t).alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).collect()
    [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]
    """
    sc = SparkContext._active_spark_context
    # Only pass the format argument through when the caller supplied one.
    args = [_to_java_column(col)] if format is None else [_to_java_column(col), format]
    return Column(sc._jvm.functions.to_timestamp(*args))
Example #7
Source File: accuracy.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 6 votes |
def read_groundtruth(self):
    """Load the ground-truth CSV into a flat dataframe and register it.

    Reads the file at ``self.path_to_grand_truth`` with a fixed
    (tid, attr_name, attr_val) schema, drops the index column, stores the
    result on ``self.ground_truth_flat``, and adds it to the data engine
    as the 'Groundtruth' table.
    """
    schema = StructType([
        StructField("tid", IntegerType(), False),
        StructField("attr_name", StringType(), False),
        StructField("attr_val", StringType(), False),
    ])
    reader = Reader(self.spark_session)
    flat = reader.read(self.path_to_grand_truth, 0, schema)
    self.ground_truth_flat = flat.drop(GlobalVariables.index_name)
    self.dataengine.add_db_table(
        'Groundtruth', self.ground_truth_flat, self.dataset)
Example #8
Source File: testColumnHelper.py From SMV with Apache License 2.0 | 6 votes |
def test_smvArrayFlatten(self):
    """smvArrayFlatten flattens an array-of-arrays column, accepting either an
    element DataType or a DataFrame as its argument."""
    # Three string columns; empty CSV fields become nulls.
    df = self.createDF('a:String;b:String;c:String', ',,;1,2,;2,3,4')
    # Nest the columns into an array of arrays (first inner array starts with null).
    df1 = df.select(F.array(
        F.array(F.lit(None), F.col('a')),
        F.array(F.col('a'), F.col('b'), F.col('c'))
    ).alias('aa'))

    # Variant 1: pass the element DataType explicitly.
    res1 = df1.select(F.col('aa').smvArrayFlatten(StringType()).alias('a'))\
        .select(SF.smvArrayCat('|', F.col('a')).alias('k'))
    exp = self.createDF("k: String", """||||; |1|1|2|; |2|2|3|4""")

    # Variant 2: pass a DataFrame instead — presumably used only to infer the
    # element type/context; verify against SMV docs.
    res2 = df1.select(F.col('aa').smvArrayFlatten(df1).alias('a'))\
        .select(SF.smvArrayCat('|', F.col('a')).alias('k'))

    # Both variants must produce the same flattened result.
    self.should_be_same(res1, exp)
    self.should_be_same(res2, exp)
Example #9
Source File: functions.py From LearningApacheSpark with MIT License | 6 votes |
def to_date(col, format=None):
    """Converts a :class:`Column` of :class:`pyspark.sql.types.StringType` or
    :class:`pyspark.sql.types.TimestampType` into :class:`pyspark.sql.types.DateType`
    using the optionally specified format. Specify formats according to
    `SimpleDateFormats <http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html>`_.
    By default, it follows casting rules to :class:`pyspark.sql.types.DateType`
    if the format is omitted (equivalent to ``col.cast("date")``).

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t).alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]

    >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
    >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
    [Row(date=datetime.date(1997, 2, 28))]
    """
    sc = SparkContext._active_spark_context
    jcol = _to_java_column(col)
    if format is None:
        jc = sc._jvm.functions.to_date(jcol)
    else:
        jc = sc._jvm.functions.to_date(jcol, format)
    return Column(jc)
Example #10
Source File: df_naive.py From example_dataproc_twitter with MIT License | 6 votes |
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    def struct_of(*name_type_pairs):
        # Small helper: array of structs from (name, type) pairs.
        return stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField(n, t) for n, t in name_type_pairs]))

    sess.udf.register(
        "SQUARED", self.squared,
        returnType=struct_of(('sku0', stypes.StringType()),
                             ('norm', stypes.FloatType())))
    sess.udf.register(
        'INTERSECTIONS', self.process_intersections,
        returnType=struct_of(('sku0', stypes.StringType()),
                             ('sku1', stypes.StringType()),
                             ('cor', stypes.FloatType())))
Example #11
Source File: test_unischema.py From petastorm with Apache License 2.0 | 6 votes |
def test_as_spark_schema():
    """Try using 'as_spark_schema' function"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
        UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('string_field_implicit', np.string_, ()),
    ])

    spark_schema = TestSchema.as_spark_schema()
    # Field order and names must be preserved by the conversion.
    expected_names = ['int_field', 'string_field', 'string_field_implicit']
    for field, expected_name in zip(spark_schema.fields, expected_names):
        assert field.name == expected_name
    # Both explicit and implicit string fields map to Spark StringType.
    assert spark_schema.fields[1].dataType == StringType()
    assert spark_schema.fields[2].dataType == StringType()

    assert TestSchema.fields['int_field'].name == 'int_field'
    assert TestSchema.fields['string_field'].name == 'string_field'
Example #12
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 6 votes |
def _join_results_single(self, scaffolds_df, sampled_df):
    """Join sampled decorations back onto scaffolds (single-pass mode, all
    attachment points decorated at once).

    :param scaffolds_df: DataFrame of scaffold rows, joinable on "id".
    :param sampled_df: DataFrame of sampled decorations, joinable on "id".
    :return: DataFrame with columns (smiles, decorations, scaffold).
    """
    def _join_scaffold(scaff, decs):
        # Returns None (SQL NULL) when the chemistry join fails.
        mol = usc.join_joined_attachments(scaff, decs)
        if mol:
            return usc.to_smiles(mol)
    join_scaffold_udf = psf.udf(_join_scaffold, pst.StringType())

    def _create_decorations_map(decorations_smi, attachment_points):
        # Split the joined decoration SMILES and pair each piece with its
        # attachment-point index.
        decorations = decorations_smi.split(usc.ATTACHMENT_SEPARATOR_TOKEN)
        return {idx: _cleanup_decoration(dec) for dec, idx in zip(decorations, attachment_points)}
    create_decorations_map_udf = psf.udf(_create_decorations_map, pst.MapType(pst.IntegerType(), pst.StringType()))

    return sampled_df.join(scaffolds_df, on="id")\
        .select(
            join_scaffold_udf("randomized_scaffold", "decoration_smi").alias("smiles"),
            create_decorations_map_udf("decoration_smi", "attachment_points").alias("decorations"),
            "scaffold")
Example #13
Source File: utils.py From mlflow with Apache License 2.0 | 6 votes |
def format_to_file_path(spark_session):
    """Pytest fixture: write one small 3-row DataFrame in csv, parquet and json.

    Yields a dict mapping format name -> saved file path. The temporary
    directory is removed in a ``finally`` block so it is cleaned up even when
    the consuming test raises (the original leaked it on failure).
    """
    rows = [
        Row(8, 32, "bat"),
        Row(64, 40, "mouse"),
        Row(-27, 55, "horse")
    ]
    schema = StructType([
        StructField("number2", IntegerType()),
        StructField("number1", IntegerType()),
        StructField("word", StringType())
    ])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    res = {}
    tempdir = tempfile.mkdtemp()
    try:
        for data_format in ["csv", "parquet", "json"]:
            res[data_format] = os.path.join(tempdir, "test-data-%s" % data_format)
        for data_format, file_path in res.items():
            df.write.option("header", "true").format(data_format).save(file_path)
        yield res
    finally:
        # Guaranteed cleanup, regardless of test outcome.
        shutil.rmtree(tempdir)
Example #14
Source File: reddit_average_sql.py From Hanhan-Spark-Python with MIT License | 6 votes |
def main():
    """Read reddit-comment JSON with a fixed (subreddit, score) schema, compute
    the per-subreddit average via ``get_avg``, and write the result as JSON.

    Depends on module-level names: sqlContext, inputs1, output, get_avg.
    """
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])
    inputs = sqlContext.read.json(inputs1, schema=schema)

    # Uncomment this to let Spark infer the schema instead of applying one.
    # inputs = sqlContext.read.json(inputs1)

    # Uncomment these when there are 2 input dirs.
    # comments_input1 = sqlContext.read.json(inputs1, schema=schema)
    # comments_input2 = sqlContext.read.json(inputs2, schema=schema)
    # inputs = comments_input1.unionAll(comments_input2)

    df = get_avg(inputs)
    df.write.save(output, format='json', mode='overwrite')
Example #15
Source File: named_image.py From spark-deep-learning with Apache License 2.0 | 6 votes |
def _decodeOutputAsPredictions(self, df):
    """Replace the intermediate model-output column with decoded top-K ImageNet
    predictions as an array of (class, description, probability) structs.

    :param df: DataFrame containing the intermediate output column.
    :return: DataFrame with the decoded output column; the intermediate
        column is dropped.
    """
    # If we start having different weights than imagenet, we'll need to
    # move this logic to individual model building in NamedImageTransformer.
    # Also, we could put the computation directly in the main computation
    # graph or use a scala UDF for potentially better performance.
    topK = self.getOrDefault(self.topK)

    def decode(predictions):
        # decode_predictions expects a batch dimension, hence expand_dims.
        pred_arr = np.expand_dims(np.array(predictions), axis=0)
        decoded = decode_predictions(pred_arr, top=topK)[0]
        # convert numpy dtypes to python native types
        return [(t[0], t[1], t[2].item()) for t in decoded]

    decodedSchema = ArrayType(
        StructType([
            StructField("class", StringType(), False),
            StructField("description", StringType(), False),
            StructField("probability", FloatType(), False)
        ]))
    decodeUDF = udf(decode, decodedSchema)
    interim_output = self._getIntermediateOutputCol()
    return df \
        .withColumn(self.getOutputCol(), decodeUDF(df[interim_output])) \
        .drop(interim_output)
Example #16
Source File: sampler.py From python_mozetl with MIT License | 5 votes |
def transform(landfill, n_documents=1000):
    """Sample up to ``n_documents`` documents per
    (namespace, doc_type, doc_version) partition from a landfill RDD.

    :param landfill: RDD of raw pings, processed via the module-level
        ``_process`` function.
    :param n_documents: max rows kept per partition key.
    :return: DataFrame of sampled documents.
    """
    meta_fields = [StructField(key, StringType(), True) for key in META_WHITELIST]
    meta_schema = StructType(meta_fields)
    schema = StructType([
        StructField("namespace", StringType(), False),
        StructField("doc_type", StringType(), False),
        StructField("doc_version", StringType(), True),
        StructField("doc_id", StringType(), True),
        StructField("meta", meta_schema, False),
        StructField("content", StringType(), False),
    ])

    processed = landfill.map(_process)
    # Drop rows missing namespace, doc_type, meta, or content.
    valid = processed.filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
    documents = valid.toDF(schema)

    window_spec = (
        Window.partitionBy("namespace", "doc_type", "doc_version").orderBy("doc_id")
    )

    ranked = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
    )
    return ranked.where(col("row_id") <= n_documents).drop("row_id")
Example #17
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 5 votes |
def _initialize_results(self, scaffolds):
    """Build the initial results DataFrame: one row per scaffold with itself
    as the SMILES, an empty decorations map, and a count of 1."""
    rows = [ps.Row(smiles=scaffold, scaffold=scaffold, decorations={}, count=1)
            for scaffold in scaffolds]
    fields = [
        pst.StructField("smiles", pst.StringType()),
        pst.StructField("scaffold", pst.StringType()),
        pst.StructField("decorations", pst.MapType(pst.IntegerType(), pst.StringType())),
        pst.StructField("count", pst.IntegerType()),
    ]
    return SPARK.createDataFrame(rows, schema=pst.StructType(fields))
Example #18
Source File: test_spark.py From mlflow with Apache License 2.0 | 5 votes |
def test_spark_udf(spark, model_path):
    """Round-trip a pyfunc model through spark_udf for every supported result
    type (scalar and array variants) and compare against direct predict()."""
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )
    reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)

    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types
    # Maps result-type name -> (Spark type, numpy dtype selector for expected values).
    type_map = {"float": (FloatType(), np.number),
                "int": (IntegerType(), np.int32),
                "double": (DoubleType(), np.number),
                "long": (LongType(), np.int),
                "string": (StringType(), None)}

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)

            # Array results keep the whole row; scalar results keep column 0.
            expected = [list(row[1]) if is_array else row[1][0]
                        for row in expected.iterrows()]
            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()['prediction'])
            assert expected == actual
            if not is_array:
                # Scalar types must also be addressable by their string alias.
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()['prediction'])
                assert expected == actual
Example #19
Source File: test_spark.py From mlflow with Apache License 2.0 | 5 votes |
def test_spark_udf_autofills_column_names_with_schema(spark):
    """When a model has a signature, spark_udf must select the signature's
    columns: too few columns fail, extras beyond the signature are ignored."""
    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # Echo back the input column names, one row per input row.
            return [model_input.columns] * len(model_input)

    signature = ModelSignature(
        inputs=Schema([
            ColSpec("long", "a"),
            ColSpec("long", "b"),
            ColSpec("long", "c"),
        ]),
        outputs=Schema([ColSpec("integer")])
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        udf = mlflow.pyfunc.spark_udf(spark, "runs:/{}/model".format(run.info.run_id),
                                      result_type=ArrayType(StringType()))
        data = spark.createDataFrame(pd.DataFrame(
            columns=["a", "b", "c", "d"],
            data={"a": [1], "b": [2], "c": [3], "d": [4]}))
        # Fewer columns than the signature requires -> JVM-side failure.
        with pytest.raises(Py4JJavaError):
            res = data.withColumn("res1", udf("a", "b")).select("res1").toPandas()

        # Exactly the signature's columns works.
        res = data.withColumn("res2", udf("a", "b", "c")).select("res2").toPandas()
        assert res["res2"][0] == ["a", "b", "c"]
        # Extra column "d" is dropped; only the signature's columns reach predict.
        res = data.withColumn("res4", udf("a", "b", "c", "d")).select("res4").toPandas()
        assert res["res4"][0] == ["a", "b", "c"]
Example #20
Source File: analyze_run.py From pipelines with Apache License 2.0 | 5 votes |
def load_schema(schema_file):
    """Parse a JSON schema description into a Spark schema.

    :param schema_file: path to a JSON file listing {name, type} entries.
    :return: (schema_json, StructType) — the parsed JSON plus the Spark schema.
    """
    # Map the declared column kind to a Spark type; only NUMBER is numeric.
    spark_type_for = {
        'KEY': StringType(),
        'NUMBER': DoubleType(),
        'CATEGORY': StringType(),
        'TEXT': StringType(),
        'IMAGE_URL': StringType(),
    }
    raw = file_io.read_file_to_string(schema_file)
    schema_json = json.loads(raw)
    fields = []
    for entry in schema_json:
        fields.append(StructField(entry['name'], spark_type_for[entry['type']]))
    return schema_json, StructType(fields)
Example #21
Source File: criteo.py From azure-python-labs with MIT License | 5 votes |
def get_spark_schema(header=DEFAULT_HEADER):
    """Build the Criteo schema: label + 13 integer features, then 26
    categorical (string) features, named from ``header``."""
    n_ints = 14  # label column + 13 integer feature columns
    # Index explicitly so a too-short header raises IndexError, as before.
    int_names = [header[i] for i in range(n_ints)]
    cat_names = [header[n_ints + i] for i in range(26)]

    schema = StructType()
    for name in int_names:
        schema.add(StructField(name, IntegerType()))
    for name in cat_names:
        schema.add(StructField(name, StringType()))
    return schema
Example #22
Source File: criteo.py From azure-python-labs with MIT License | 5 votes |
def get_spark_schema(header=DEFAULT_HEADER):
    """Assemble the Criteo dataset schema from ``header``: one label plus 13
    integer columns, followed by 26 categorical string columns."""
    n_ints = 14  # label + 13 integer features
    schema = StructType()
    # Explicit indexing keeps the original IndexError on short headers.
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    for offset in range(26):
        schema.add(StructField(header[offset + n_ints], StringType()))
    return schema
Example #23
Source File: strings.py From koalas with Apache License 2.0 | 5 votes |
def __init__(self, series: "ks.Series"):
    """Validate that *series* holds string-like data and store it."""
    dtype = series.spark.data_type
    supported = (StringType, BinaryType, ArrayType)
    if not isinstance(dtype, supported):
        raise ValueError("Cannot call StringMethods on type {}".format(dtype))
    self._data = series
    self.name = self._data.name

# Methods
Example #24
Source File: typehints.py From koalas with Apache License 2.0 | 5 votes |
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system

    :raises TypeError: if *tpe* matches none of the known mappings.
    """
    # Membership tests compare by equality, so each branch matches both the
    # type object itself and its accepted string aliases.
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    # NOTE(review): plain ``float`` maps to FloatType (32-bit) while np.float64
    # maps to DoubleType — surprising but intentional here; do not "fix" silently.
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example #25
Source File: indexes.py From koalas with Apache License 2.0 | 5 votes |
def _comparator_for_monotonic_decreasing(data_type):
    """Choose the null-ordering comparator for *data_type* when checking a
    monotonically decreasing index; unknown types fall back to nulls-first."""
    # Ordered dispatch: first matching Spark type wins.
    dispatch = (
        (StringType, compare_disallow_null),
        (BooleanType, compare_allow_null),
        (NumericType, compare_null_last),
    )
    for spark_type, comparator in dispatch:
        if isinstance(data_type, spark_type):
            return comparator
    return compare_null_first
Example #26
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def __radd__(self, other):
    """Reflected addition; supports 'literal' + string-column concatenation."""
    is_string_col = isinstance(self.spark.data_type, StringType)
    if is_string_col and isinstance(other, str):
        # Prepend the string literal to the column.
        return self._with_new_scol(F.concat(F.lit(other), self.spark.column))
    return column_op(Column.__radd__)(self, other)
Example #27
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def __add__(self, other):
    """Addition; string columns concatenate, all other types defer to Spark."""
    if not isinstance(self.spark.data_type, StringType):
        return column_op(Column.__add__)(self, other)
    # String column: only another string series or a str literal may be added.
    other_is_string_series = (
        isinstance(other, IndexOpsMixin)
        and isinstance(other.spark.data_type, StringType)
    )
    if other_is_string_series:
        return column_op(F.concat)(self, other)
    if isinstance(other, str):
        return column_op(F.concat)(self, F.lit(other))
    raise TypeError("string addition can only be applied to string series or literals.")
Example #28
Source File: test_upload.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def test_process_json_listens(self, mock_save, mock_read):
    """process_json_listens should read from HDFS with the given schema and
    save the result as parquet under the destination directory."""
    schema = StructType([StructField('xxxxx', StringType(), nullable=True)])
    uploader = ListenbrainzDataUploader()
    uploader.process_json_listens('/2020/1.json', '/fakedir', 'fakehdfspath', schema)
    mock_read.assert_called_once_with('fakehdfspath', schema=schema)
    mock_save.assert_called_once_with(mock_read.return_value, '/fakedir/2020/1.parquet')
Example #29
Source File: test_upload.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def test_process_json(self, mock_save, mock_read):
    """process_json should read the HDFS path with the supplied schema and
    save the result to the destination path unchanged."""
    schema = StructType([StructField('xxxxx', StringType(), nullable=True)])
    uploader = ListenbrainzDataUploader()
    uploader.process_json('fakename', '/fakedestpath', '/fakehdfspath', schema)
    mock_read.assert_called_once_with('/fakehdfspath', schema=schema)
    mock_save.assert_called_once_with(mock_read.return_value, '/fakedestpath')
Example #30
Source File: image_utils.py From spark-deep-learning with Apache License 2.0 | 5 votes |
def getSampleImagePathsDF(sqlContext, colName):
    """Return a single-column DataFrame of sample image paths, named *colName*."""
    paths = getSampleImagePaths()
    df = sqlContext.createDataFrame(paths, StringType())
    return df.toDF(colName)

# Methods for making comparisons between outputs of using different frameworks.
# For ImageNet.