Python pyspark.sql.types.StructField() Examples
The following are 30 code examples of pyspark.sql.types.StructField(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.types, or try the search function.
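Before diving into the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of how StructField and StructType are typically combined to declare a DataFrame schema; the column names and sample rows are invented for illustration.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.master("local[*]").getOrCreate()

# A StructField is (name, dataType, nullable); a StructType is an ordered list of fields.
schema = StructType([
    StructField("name", StringType(), nullable=False),
    StructField("age", IntegerType(), nullable=True),
])

df = spark.createDataFrame([("alice", 34), ("bob", None)], schema=schema)
df.printSchema()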
Example #1
Source File: imageIO.py From spark-deep-learning with Apache License 2.0 | 8 votes |
def filesToDF(sc, path, numPartitions=None):
    """
    Read files from a directory to a DataFrame.

    :param sc: SparkContext.
    :param path: str, path to files.
    :param numPartitions: int, number of partitions to use for reading files.
    :return: DataFrame, with columns: (filePath: str, fileData: BinaryType)
    """
    numPartitions = numPartitions or sc.defaultParallelism
    schema = StructType([StructField("filePath", StringType(), False),
                         StructField("fileData", BinaryType(), False)])
    rdd = sc.binaryFiles(
        path, minPartitions=numPartitions).repartition(numPartitions)
    rdd = rdd.map(lambda x: (x[0], bytearray(x[1])))
    return rdd.toDF(schema)
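A possible call site for the function above, assuming a SparkContext named sc; the directory path and partition count are placeholders.

# Hypothetical usage: load all files under a directory into (filePath, fileData) rows.
image_df = filesToDF(sc, "/data/images", numPartitions=16)
image_df.printSchema()  # filePath: string, fileData: binary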
Example #2
Source File: transform.py From search-MjoLniR with MIT License | 6 votes |
def _merge_schemas(*schemas: T.StructType):
    """Merge one or more spark schemas into a new schema"""
    fields = cast(Dict[str, T.StructField], {})
    errors = []
    for schema in schemas:
        for field in schema:
            if field.name not in fields:
                fields[field.name] = field
            elif field != fields[field.name]:
                errors.append('Incompatible fields: {} != {}'.format(field, fields[field.name]))
    if errors:
        raise Exception('\n'.join(errors))
    return T.StructType(list(fields.values()))


# Primary input schema from which most everything else is derived
Example #3
Source File: df_naive.py From example_dataproc_twitter with MIT License | 6 votes |
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
                      returnType=stypes.ArrayType(stypes.StructType(fields=[
                          stypes.StructField('sku0', stypes.StringType()),
                          stypes.StructField('sku1', stypes.StringType()),
                          stypes.StructField('cor', stypes.FloatType())])))
Example #4
Source File: unischema.py From petastorm with Apache License 2.0 | 6 votes |
def as_spark_schema(self):
    """Returns an object derived from the unischema as spark schema.

    Example:

    >>> spark.createDataFrame(dataset_rows,
    >>>                       SomeSchema.as_spark_schema())
    """
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader)
    import pyspark.sql.types as sql_types

    schema_entries = []
    for field in self._fields.values():
        spark_type = _field_spark_dtype(field)
        schema_entries.append(sql_types.StructField(field.name, spark_type, field.nullable))
    return sql_types.StructType(schema_entries)
Example #5
Source File: df_naive.py From example_dataproc_twitter with MIT License | 6 votes |
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
                      returnType=stypes.ArrayType(stypes.StructType(fields=[
                          stypes.StructField('sku0', stypes.StringType()),
                          stypes.StructField('sku1', stypes.StringType()),
                          stypes.StructField('cor', stypes.FloatType())])))
Example #6
Source File: test_base.py From example_dataproc_twitter with MIT License | 5 votes |
def test_load_neighbor_schema(self):
    klass = self.get_target_klass()()
    result = klass.load_neighbor_schema()
    expected = stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
    self.assertEqual(expected, result)
Example #7
Source File: typehints.py From koalas with Apache License 2.0 | 5 votes |
def __init__(self, tpe, names=None):
    if names is None:
        # Default names `c0, c1, ... cn`.
        self.tpe = types.StructType(
            [types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))]
        )  # type: types.StructType
    else:
        self.tpe = types.StructType(
            [types.StructField(n, t) for n, t in zip(names, tpe)]
        )  # type: types.StructType
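For reference, when names is None the constructor above generates default column names c0, c1, ...; a small illustration of the schema it would build for a two-element tpe (types chosen arbitrarily):

from pyspark.sql import types

# Roughly what the names=None branch produces for tpe=[LongType(), StringType()].
types.StructType([
    types.StructField("c0", types.LongType()),
    types.StructField("c1", types.StringType()),
])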
Example #8
Source File: criteo.py From azure-python-labs with MIT License | 5 votes |
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
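The returned schema can be handed straight to Spark's CSV reader; a sketch assuming a tab-separated Criteo sample file at a hypothetical path:

# Hypothetical usage: the Criteo sample data is assumed to be tab-separated with no header row.
schema = get_spark_schema()
raw = spark.read.csv("dac_sample.txt", schema=schema, sep="\t", header=False)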
Example #9
Source File: criteo.py From azure-python-labs with MIT License | 5 votes |
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
Example #10
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_users_schema():
    """Loads schema with data type [user, [(sku, score), (sku, score)]]

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema specification for user -> (sku, score) data.
    """
    return stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
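A short sketch of building a DataFrame against this schema; the user and interaction rows below are invented.

# Hypothetical rows shaped as [user, [(item, score), ...]]
rows = [("user_1", [("sku_a", 0.9), ("sku_b", 0.4)]),
        ("user_2", [("sku_c", 1.0)])]
df = spark.createDataFrame(rows, schema=load_users_schema())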
Example #11
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_neighbor_schema(self):
    """Loads neighborhood schema for similarity matrix

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema of type ["key", [("key", "value")]]
    """
    return stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
Example #12
Source File: base.py From example_dataproc_twitter with MIT License | 5 votes |
def load_users_schema():
    """Loads schema with data type [user, [(sku, score), (sku, score)]]

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema specification for user -> (sku, score) data.
    """
    return stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
Example #13
Source File: test_base.py From example_dataproc_twitter with MIT License | 5 votes |
def test_load_users_schema(self):
    klass = self.get_target_klass()()
    expected = stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
    result = klass.load_users_schema()
    self.assertEqual(result, expected)
Example #14
Source File: test_upload.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def test_process_json_listens(self, mock_save, mock_read):
    fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
    ListenbrainzDataUploader().process_json_listens('/2020/1.json', '/fakedir',
                                                    'fakehdfspath', fakeschema)
    mock_read.assert_called_once_with('fakehdfspath', schema=fakeschema)
    mock_save.assert_called_once_with(mock_read.return_value, '/fakedir/2020/1.parquet')
Example #15
Source File: reader.py From HoloClean-Legacy-deprecated with Apache License 2.0 | 5 votes |
def read(self, file_path, spark_session, indexcol=0, schema=None):
    """
    Creates a dataframe from the csv file

    :param indexcol: if 1, create a tuple id column as auto increment
    :param schema: optional schema of file if known
    :param spark_session: The spark_session we created in Holoclean object
    :param file_path: The path to the file

    :return: dataframe
    """
    if schema is None:
        df = spark_session.read.csv(file_path, header=True)
    else:
        df = spark_session.read.csv(file_path, header=True, schema=schema)

    if indexcol == 0:
        return df

    index_name = GlobalVariables.index_name
    new_cols = df.schema.names + [index_name]
    list_schema = []
    for index_attribute in range(len(df.schema.names)):
        list_schema.append(StructField("_" + str(index_attribute),
                                       df.schema[index_attribute].dataType, True))
    list_schema.append(
        StructField("_" + str(len(new_cols)), LongType(), True))
    schema = StructType(list_schema)

    # Note: this project targets Python 2 (tuple unpacking in the lambda and xrange below).
    ix_df = df.rdd.zipWithIndex().map(
        lambda (row, ix): row + (ix + 1,)).toDF(schema)
    tmp_cols = ix_df.schema.names
    new_df = reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx],
                                                             new_cols[idx]),
                    xrange(len(tmp_cols)), ix_df)
    new_df = self.checking_string_size(new_df)
    return new_df
Example #16
Source File: sampler.py From python_mozetl with MIT License | 5 votes |
def transform(landfill, n_documents=1000):
    meta_schema = StructType(
        [StructField(k, StringType(), True) for k in META_WHITELIST]
    )

    schema = StructType(
        [
            StructField("namespace", StringType(), False),
            StructField("doc_type", StringType(), False),
            StructField("doc_version", StringType(), True),
            StructField("doc_id", StringType(), True),
            StructField("meta", meta_schema, False),
            StructField("content", StringType(), False),
        ]
    )

    documents = (
        landfill.map(_process)
        .filter(lambda x: x[0] and x[1] and x[-2] and x[-1])
        .toDF(schema)
    )

    window_spec = Window.partitionBy("namespace", "doc_type", "doc_version").orderBy(
        "doc_id"
    )

    df = (
        documents.fillna("0", "doc_version")
        .withColumn("row_id", row_number().over(window_spec))
        .where(col("row_id") <= n_documents)
        .drop("row_id")
    )

    return df
Example #17
Source File: transform.py From python_mozetl with MIT License | 5 votes |
def toStructType(self):
    return StructType(
        [StructField(col.name, col.struct_type, True) for col in self.columns]
    )
Example #18
Source File: sample_scaffolds.py From reinvent-scaffold-decorator with MIT License | 5 votes |
def _initialize_results(self, scaffolds):
    data = [ps.Row(smiles=scaffold, scaffold=scaffold, decorations={}, count=1)
            for scaffold in scaffolds]
    data_schema = pst.StructType([
        pst.StructField("smiles", pst.StringType()),
        pst.StructField("scaffold", pst.StringType()),
        pst.StructField("decorations", pst.MapType(pst.IntegerType(), pst.StringType())),
        pst.StructField("count", pst.IntegerType())
    ])
    return SPARK.createDataFrame(data, schema=data_schema)
Example #19
Source File: schema.py From mlflow with Apache License 2.0 | 5 votes |
def as_spark_schema(self):
    """Convert to Spark schema. If this schema is a single unnamed column, it is converted
    directly to the corresponding spark data type, otherwise it's returned as a struct
    (missing column names are filled with an integer sequence).
    """
    if len(self.columns) == 1 and self.columns[0].name is None:
        return self.columns[0].type.to_spark()
    from pyspark.sql.types import StructType, StructField
    return StructType([StructField(name=col.name or str(i), dataType=col.type.to_spark())
                       for i, col in enumerate(self.columns)])
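A hedged usage sketch for the method above, assuming mlflow's Schema and ColSpec types; the column names are invented.

from mlflow.types import Schema, ColSpec

# Named columns become a StructType; a single unnamed column maps to a bare Spark type instead.
schema = Schema([ColSpec("double", "price"), ColSpec("string", "color")])
spark_schema = schema.as_spark_schema()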
Example #20
Source File: test_schema.py From mlflow with Apache License 2.0 | 5 votes |
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType

    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType(
        [StructField(t.name, _parse_datatype_string(t.name), True)
         for t in schema.column_types()])
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
Example #21
Source File: test_schema.py From mlflow with Apache License 2.0 | 5 votes |
def test_spark_type_mapping(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import BooleanType, IntegerType, LongType, FloatType, DoubleType, \
        StringType, BinaryType
    from pyspark.sql.types import StructField, StructType

    assert isinstance(DataType.boolean.to_spark(), BooleanType)
    assert isinstance(DataType.integer.to_spark(), IntegerType)
    assert isinstance(DataType.long.to_spark(), LongType)
    assert isinstance(DataType.float.to_spark(), FloatType)
    assert isinstance(DataType.double.to_spark(), DoubleType)
    assert isinstance(DataType.string.to_spark(), StringType)
    assert isinstance(DataType.binary.to_spark(), BinaryType)

    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType(
        [StructField(t.name, t.to_spark(), True) for t in schema.column_types()])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=actual_spark_schema)
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2

    # test unnamed columns
    schema = Schema([ColSpec(col.type) for col in schema.columns])
    expected_spark_schema = StructType(
        [StructField(str(i), t.to_spark(), True)
         for i, t in enumerate(schema.column_types())])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # test single unnamed column is mapped to just a single spark type
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType)
Example #22
Source File: dfutil.py From TensorFlowOnSpark with Apache License 2.0 | 5 votes |
def infer_schema(example, binary_features=[]):
    """Given a tf.train.Example, infer the Spark DataFrame schema (StructFields).

    Note: TensorFlow represents both strings and binary types as tf.train.BytesList, and we need to
    disambiguate these types for Spark DataFrames DTypes (StringType and BinaryType), so we require a
    "hint" from the caller in the ``binary_features`` argument.

    Args:
      :example: a tf.train.Example
      :binary_features: a list of tf.train.Example features which are expected to be binary/bytearrays.

    Returns:
      A DataFrame StructType schema
    """
    def _infer_sql_type(k, v):
        # special handling for binary features
        if k in binary_features:
            return BinaryType()

        if v.int64_list.value:
            result = v.int64_list.value
            sql_type = LongType()
        elif v.float_list.value:
            result = v.float_list.value
            sql_type = DoubleType()
        else:
            result = v.bytes_list.value
            sql_type = StringType()

        if len(result) > 1:
            # represent multi-item tensors as Spark SQL ArrayType() of base types
            return ArrayType(sql_type)
        else:
            # represent everything else as base types (and empty tensors as StringType())
            return sql_type

    return StructType([StructField(k, _infer_sql_type(k, v), True)
                       for k, v in sorted(example.features.feature.items())])
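A sketch of calling the helper with a hand-built tf.train.Example; the feature names and the binary_features hint are illustrative.

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
    "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"\x00\x01"])),
}))
schema = infer_schema(example, binary_features=["image"])
# "label" maps to LongType; "image" maps to BinaryType because of the hint.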
Example #23
Source File: smvschema.py From SMV with Apache License 2.0 | 5 votes |
def _scala_to_python_field_type(self, scala_field_type):
    """create a python FieldType from the scala field type"""
    col_name = str(scala_field_type.name())
    col_type_name = str(scala_field_type.dataType())
    # map string "IntegerType" to actual class IntegerType
    col_type_class = getattr(sql_types, col_type_name)
    return sql_types.StructField(col_name, col_type_class())
Example #24
Source File: es_hits.py From search-MjoLniR with MIT License | 5 votes |
def transform(df, url_list=None, brokers=None, **kwargs):
    if brokers and url_list:
        raise ValueError('cannot specify brokers and url_list')
    if brokers:
        rdd = transform_from_kafka(df, brokers, **kwargs)
    else:
        rdd = transform_from_elasticsearch(df, url_list, **kwargs)
    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'],
        df.schema['query'],
        df.schema['norm_query'],
        T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
    ]))
Example #25
Source File: test_dataset.py From python_moztelemetry with Mozilla Public License 2.0 | 5 votes |
def test_dataframe_with_schema(dataset, spark):
    schema = StructType([StructField("foo", IntegerType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')

    assert type(df) == DataFrame
    assert df.columns == ['foo']
    assert df.orderBy(["foo"]).collect() == [Row(foo=1), Row(foo=2)]
Example #26
Source File: test_dataset.py From python_moztelemetry with Mozilla Public License 2.0 | 5 votes |
def test_dataframe_bad_schema(dataset, spark):
    spark.catalog.dropTempView('bar')
    schema = StructType([StructField("name", StringType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')

    assert type(df) == DataFrame
    assert df.collect() == [Row(name=None), Row(name=None)]
Example #27
Source File: datatypes.py From ibis with Apache License 2.0 | 5 votes |
def ibis_struct_dtype_to_spark_dtype(ibis_dtype_obj):
    fields = [
        pt.StructField(n, spark_dtype(t), t.nullable)
        for n, t in zip(ibis_dtype_obj.names, ibis_dtype_obj.types)
    ]
    return pt.StructType(fields)
Example #28
Source File: ml_model.py From elephas with MIT License | 5 votes |
def _transform(self, df):
    """Private transform method of a Transformer. This serves as a batch-prediction method
    for our purposes.
    """
    output_col = self.getOutputCol()
    label_col = self.getLabelCol()
    new_schema = copy.deepcopy(df.schema)
    new_schema.add(StructField(output_col, StringType(), True))

    rdd = df.rdd.coalesce(1)
    features = np.asarray(
        rdd.map(lambda x: from_vector(x.features)).collect())
    # Note that we collect, since executing this on the rdd would require model serialization once again
    model = model_from_yaml(self.get_keras_model_config())
    model.set_weights(self.weights.value)
    predictions = rdd.ctx.parallelize(
        model.predict_classes(features)).coalesce(1)
    predictions = predictions.map(lambda x: tuple(str(x)))

    results_rdd = rdd.zip(predictions).map(lambda x: x[0] + x[1])
    results_df = df.sql_ctx.createDataFrame(results_rdd, new_schema)
    results_df = results_df.withColumn(
        output_col, results_df[output_col].cast(DoubleType()))
    results_df = results_df.withColumn(
        label_col, results_df[label_col].cast(DoubleType()))
    return results_df
Example #29
Source File: base.py From LearningApacheSpark with MIT License | 5 votes |
def transformSchema(self, schema):
    inputType = schema[self.getInputCol()].dataType
    self.validateInputType(inputType)
    if self.getOutputCol() in schema.names:
        raise ValueError("Output column %s already exists." % self.getOutputCol())
    outputFields = copy.copy(schema.fields)
    outputFields.append(StructField(self.getOutputCol(),
                                    self.outputDataType(),
                                    nullable=False))
    return StructType(outputFields)
Example #30
Source File: test_upload.py From listenbrainz-server with GNU General Public License v2.0 | 5 votes |
def test_process_json(self, mock_save, mock_read):
    fakeschema = StructType([StructField('xxxxx', StringType(), nullable=True)])
    ListenbrainzDataUploader().process_json('fakename', '/fakedestpath', '/fakehdfspath', fakeschema)
    mock_read.assert_called_once_with('/fakehdfspath', schema=fakeschema)
    mock_save.assert_called_once_with(mock_read.return_value, '/fakedestpath')