Python pyspark.sql.types.TimestampType() Examples
The following are 11 code examples of pyspark.sql.types.TimestampType(), taken from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyspark.sql.types.
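Before the examples, here is a minimal, hypothetical sketch of how TimestampType() is typically used when declaring an explicit DataFrame schema; the column names and sample values are illustrative only.

import datetime

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

spark = SparkSession.builder.getOrCreate()

# Declare an explicit schema with a timestamp column
schema = StructType([
    StructField("event", StringType(), True),
    StructField("created_at", TimestampType(), True),
])

df = spark.createDataFrame(
    [("login", datetime.datetime(2019, 1, 1, 12, 30, 0))], schema=schema
)
df.printSchema()  # created_at is reported as timestamp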
Example #1
Source File: base.py From koalas with Apache License 2.0 | 5 votes |
def __sub__(self, other):
    # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
    # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
    if isinstance(other, IndexOpsMixin) and isinstance(self.spark.data_type, TimestampType):
        if not isinstance(other.spark.data_type, TimestampType):
            raise TypeError("datetime subtraction can only be applied to datetime series.")
        return self.astype("bigint") - other.astype("bigint")
    elif isinstance(other, IndexOpsMixin) and isinstance(self.spark.data_type, DateType):
        if not isinstance(other.spark.data_type, DateType):
            raise TypeError("date subtraction can only be applied to date series.")
        return column_op(F.datediff)(self, other)
    else:
        return column_op(Column.__sub__)(self, other)
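A hedged usage sketch of the behavior above: subtracting two timestamp-typed Koalas Series goes through the first branch, casting both sides to bigint, so the result is an integer difference rather than a timedelta. The data is illustrative, and the expected value assumes Spark's timestamp-to-bigint cast yields whole epoch seconds.

import datetime
import databricks.koalas as ks

kdf = ks.DataFrame({
    "start": [datetime.datetime(2019, 1, 1, 0, 0, 0)],
    "end": [datetime.datetime(2019, 1, 1, 0, 1, 30)],
})

# Timestamp - timestamp dispatches to __sub__ above and returns a bigint difference.
diff = kdf["end"] - kdf["start"]  # expected: 90 (seconds), under the cast assumption above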
Example #2
Source File: indexes.py From koalas with Apache License 2.0 | 5 votes |
def is_all_dates(self):
    """
    Return if all data types of the index are datetime.

    Remember that since Koalas does not support multiple data types in an index,
    it returns True if any type of data is datetime.

    Examples
    --------
    >>> from datetime import datetime

    >>> idx = ks.Index([datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 2, 3, 0, 0, 0)])
    >>> idx
    DatetimeIndex(['2019-01-01', '2019-02-03'], dtype='datetime64[ns]', freq=None)

    >>> idx.is_all_dates
    True

    >>> idx = ks.Index([datetime(2019, 1, 1, 0, 0, 0), None])
    >>> idx
    DatetimeIndex(['2019-01-01', 'NaT'], dtype='datetime64[ns]', freq=None)

    >>> idx.is_all_dates
    True

    >>> idx = ks.Index([0, 1, 2])
    >>> idx
    Int64Index([0, 1, 2], dtype='int64')

    >>> idx.is_all_dates
    False
    """
    return isinstance(self.spark.data_type, TimestampType)
Example #3
Source File: typehints.py From koalas with Apache License 2.0 | 5 votes |
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
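For illustration, a few calls against the mapping above; datetime inputs resolve to TimestampType. The import path is assumed from the "typehints.py" attribution in this example and may differ across Koalas versions.

import datetime
import numpy as np
from databricks.koalas.typehints import as_spark_type  # path assumed, see note above

as_spark_type(datetime.datetime)  # TimestampType()
as_spark_type(np.datetime64)      # TimestampType()
as_spark_type(datetime.date)      # DateType()
as_spark_type("bigint")           # LongType()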
Example #4
Source File: typehints.py From koalas with Apache License 2.0 | 5 votes |
def spark_type_to_pandas_dtype(spark_type):
    """Return the pandas dtype corresponding to the given Spark DataType."""
    if isinstance(spark_type, (types.DateType, types.UserDefinedType)):
        return np.dtype("object")
    elif isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    else:
        return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
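A short, hypothetical round of calls against the helper above (assuming it is imported from the same Koalas typehints module as the previous example): TimestampType maps to NumPy's datetime64[ns], while DateType falls back to object.

from pyspark.sql import types

spark_type_to_pandas_dtype(types.TimestampType())  # dtype('datetime64[ns]')
spark_type_to_pandas_dtype(types.DateType())       # dtype('O'), i.e. object
spark_type_to_pandas_dtype(types.LongType())       # dtype('int64'), via to_arrow_type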
Example #5
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 4 votes |
def _convert_from_pandas(self, pdf, schema, timezone):
    """
    Convert a pandas.DataFrame to list of records that can be used to make a DataFrame

    :return list of records
    """
    if timezone is not None:
        from pyspark.sql.types import _check_series_convert_timestamps_tz_local
        copied = False
        if isinstance(schema, StructType):
            for field in schema:
                # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                if isinstance(field.dataType, TimestampType):
                    s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone)
                    if s is not pdf[field.name]:
                        if not copied:
                            # Copy once if the series is modified to prevent the original
                            # Pandas DataFrame from being updated
                            pdf = pdf.copy()
                            copied = True
                        pdf[field.name] = s
        else:
            for column, series in pdf.iteritems():
                s = _check_series_convert_timestamps_tz_local(series, timezone)
                if s is not series:
                    if not copied:
                        # Copy once if the series is modified to prevent the original
                        # Pandas DataFrame from being updated
                        pdf = pdf.copy()
                        copied = True
                    pdf[column] = s

    # Convert pandas.DataFrame to list of numpy records
    np_records = pdf.to_records(index=False)

    # Check if any columns need to be fixed for Spark to infer properly
    if len(np_records) > 0:
        record_dtype = self._get_numpy_record_dtype(np_records[0])
        if record_dtype is not None:
            return [r.astype(record_dtype).tolist() for r in np_records]

    # Convert list of numpy records to python lists
    return [r.tolist() for r in np_records]
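This is an internal SparkSession helper; the user-facing path that reaches it is createDataFrame on a pandas DataFrame containing timestamp columns. A minimal sketch, assuming Arrow conversion is disabled so the plain pandas path is taken, and assuming the Spark 2.x-era config name used by this codebase.

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Keep the non-Arrow path so _convert_from_pandas performs the conversion
# (config name assumed for Spark 2.x builds like the one in this example).
spark.conf.set("spark.sql.execution.arrow.enabled", "false")

pdf = pd.DataFrame({"ts": pd.to_datetime(["2019-01-01 00:00:00", "2019-02-03 12:00:00"])})

# Timestamp series are localized to the session time zone while records are built.
df = spark.createDataFrame(pdf)
df.printSchema()  # ts: timestamp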
Example #6
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 4 votes |
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.
    """
    from pyspark.serializers import ArrowStreamSerializer, _create_batch
    from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create Arrow record batches
    batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                             timezone)
               for pdf_slice in pdf_slices]

    # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
    if isinstance(schema, (list, tuple)):
        struct = from_arrow_schema(batches[0].schema)
        for i, name in enumerate(schema):
            struct.fields[i].name = name
            struct.names[i] = name
        schema = struct

    jsqlContext = self._wrapped._jsqlContext

    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create Spark DataFrame from Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func, create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
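Again an internal helper: it is exercised when Arrow-based conversion is switched on before calling createDataFrame with a pandas DataFrame, at which point datetime64[ns] columns are coerced to Arrow timestamps and shipped to the JVM in batches. A minimal sketch, assuming pyarrow is installed and the Spark 2.x config name shown.

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Spark 2.x name; later releases use spark.sql.execution.arrow.pyspark.enabled
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

pdf = pd.DataFrame({"ts": pd.to_datetime(["2019-01-01", "2019-02-03"])})

# The datetime64[ns] column is coerced to an Arrow timestamp during batch creation.
df = spark.createDataFrame(pdf)
df.show()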
Example #7
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 4 votes |
def _convert_from_pandas(self, pdf, schema, timezone):
    """
    Convert a pandas.DataFrame to list of records that can be used to make a DataFrame

    :return list of records
    """
    if timezone is not None:
        from pyspark.sql.types import _check_series_convert_timestamps_tz_local
        copied = False
        if isinstance(schema, StructType):
            for field in schema:
                # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                if isinstance(field.dataType, TimestampType):
                    s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone)
                    if s is not pdf[field.name]:
                        if not copied:
                            # Copy once if the series is modified to prevent the original
                            # Pandas DataFrame from being updated
                            pdf = pdf.copy()
                            copied = True
                        pdf[field.name] = s
        else:
            for column, series in pdf.iteritems():
                s = _check_series_convert_timestamps_tz_local(series, timezone)
                if s is not series:
                    if not copied:
                        # Copy once if the series is modified to prevent the original
                        # Pandas DataFrame from being updated
                        pdf = pdf.copy()
                        copied = True
                    pdf[column] = s

    # Convert pandas.DataFrame to list of numpy records
    np_records = pdf.to_records(index=False)

    # Check if any columns need to be fixed for Spark to infer properly
    if len(np_records) > 0:
        record_dtype = self._get_numpy_record_dtype(np_records[0])
        if record_dtype is not None:
            return [r.astype(record_dtype).tolist() for r in np_records]

    # Convert list of numpy records to python lists
    return [r.tolist() for r in np_records]
Example #8
Source File: session.py From tidb-docker-compose with Apache License 2.0 | 4 votes |
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.
    """
    from pyspark.serializers import ArrowSerializer, _create_batch
    from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create Arrow record batches
    batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                             timezone)
               for pdf_slice in pdf_slices]

    # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
    if isinstance(schema, (list, tuple)):
        struct = from_arrow_schema(batches[0].schema)
        for i, name in enumerate(schema):
            struct.fields[i].name = name
            struct.names[i] = name
        schema = struct

    # Create the Spark DataFrame directly from the Arrow data and schema
    jrdd = self._sc._serialize_to_jvm(batches, len(batches), ArrowSerializer())
    jdf = self._jvm.PythonSQLUtils.arrowPayloadToDataFrame(
        jrdd, schema.json(), self._wrapped._jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
Example #9
Source File: session.py From LearningApacheSpark with MIT License | 4 votes |
def _convert_from_pandas(self, pdf, schema, timezone):
    """
    Convert a pandas.DataFrame to list of records that can be used to make a DataFrame

    :return list of records
    """
    if timezone is not None:
        from pyspark.sql.types import _check_series_convert_timestamps_tz_local
        copied = False
        if isinstance(schema, StructType):
            for field in schema:
                # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                if isinstance(field.dataType, TimestampType):
                    s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone)
                    if s is not pdf[field.name]:
                        if not copied:
                            # Copy once if the series is modified to prevent the original
                            # Pandas DataFrame from being updated
                            pdf = pdf.copy()
                            copied = True
                        pdf[field.name] = s
        else:
            for column, series in pdf.iteritems():
                s = _check_series_convert_timestamps_tz_local(series, timezone)
                if s is not series:
                    if not copied:
                        # Copy once if the series is modified to prevent the original
                        # Pandas DataFrame from being updated
                        pdf = pdf.copy()
                        copied = True
                    pdf[column] = s

    # Convert pandas.DataFrame to list of numpy records
    np_records = pdf.to_records(index=False)

    # Check if any columns need to be fixed for Spark to infer properly
    if len(np_records) > 0:
        record_dtype = self._get_numpy_record_dtype(np_records[0])
        if record_dtype is not None:
            return [r.astype(record_dtype).tolist() for r in np_records]

    # Convert list of numpy records to python lists
    return [r.tolist() for r in np_records]
Example #10
Source File: session.py From LearningApacheSpark with MIT License | 4 votes |
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.
    """
    from pyspark.serializers import ArrowStreamSerializer, _create_batch
    from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create Arrow record batches
    batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                             timezone)
               for pdf_slice in pdf_slices]

    # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
    if isinstance(schema, (list, tuple)):
        struct = from_arrow_schema(batches[0].schema)
        for i, name in enumerate(schema):
            struct.fields[i].name = name
            struct.names[i] = name
        schema = struct

    jsqlContext = self._wrapped._jsqlContext

    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create Spark DataFrame from Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func, create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
Example #11
Source File: datetimes.py From koalas with Apache License 2.0 | 4 votes |
def __init__(self, series: "ks.Series"):
    if not isinstance(series.spark.data_type, (DateType, TimestampType)):
        raise ValueError(
            "Cannot call DatetimeMethods on type {}".format(series.spark.data_type)
        )
    self._data = series

# Properties
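DatetimeMethods backs the .dt accessor on Koalas Series, and the __init__ above is what rejects non-datetime data. A hedged usage sketch with illustrative values, assuming a running Spark session:

import datetime
import databricks.koalas as ks

s = ks.Series([
    datetime.datetime(2019, 1, 1, 12, 0, 0),
    datetime.datetime(2019, 2, 3, 8, 30, 0),
])

# Accessing .dt constructs DatetimeMethods, which checks the Spark data type in __init__.
s.dt.year  # 2019, 2019
s.dt.hour  # 12, 8

ks.Series([1, 2, 3]).dt  # expected to raise ValueError: Cannot call DatetimeMethods on type LongType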