Python pyspark.__version__ Examples
The following are 27 code examples of pyspark.__version__, the version string exposed by the pyspark package (note that it is a plain attribute, not a callable). The examples are taken from open source projects; the line above each example identifies the source file, the project it comes from, and its license.
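Because pyspark.__version__ is a plain string, most of the examples below parse it (with distutils' LooseVersion, packaging.version, or pyspark.util.VersionUtils) before comparing. A minimal sketch of the common pattern, assuming a local pyspark installation; the "2.4" threshold is only illustrative:

from distutils.version import LooseVersion

import pyspark

# Parsing avoids the lexicographic pitfalls of raw string comparison,
# e.g. "2.10" < "2.4" is True for strings but False once parsed.
if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
    print("PySpark {} is older than 2.4".format(pyspark.__version__))
else:
    print("PySpark {} is 2.4 or newer".format(pyspark.__version__))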
Example #1
Source File: __init__.py From koalas with Apache License 2.0
def assert_pyspark_version():
    import logging

    pyspark_ver = None
    try:
        import pyspark
    except ImportError:
        raise ImportError(
            "Unable to import pyspark - consider doing a pip install with [spark] "
            "extra to install pyspark with pip"
        )
    else:
        pyspark_ver = getattr(pyspark, "__version__")
        if pyspark_ver is None or pyspark_ver < "2.4":
            logging.warning(
                'Found pyspark version "{}" installed. pyspark>=2.4.0 is recommended.'.format(
                    pyspark_ver if pyspark_ver is not None else "<unknown version>"
                )
            )
Example #2
Source File: spark.py From mlflow with Apache License 2.0
def get_default_conda_env():
    """
    :return: The default Conda environment for MLflow Models produced by calls to
             :func:`save_model()` and :func:`log_model()`. This Conda environment contains the
             current version of PySpark that is installed on the caller's system. ``dev`` versions
             of PySpark are replaced with stable versions in the resulting Conda environment
             (e.g., if you are running PySpark version ``2.4.5.dev0``, invoking this method
             produces a Conda environment with a dependency on PySpark version ``2.4.5``).
    """
    import pyspark

    # Strip the suffix from `dev` versions of PySpark, which are not
    # available for installation from Anaconda or PyPI
    pyspark_version = re.sub(r"(\.?)dev.*", "", pyspark.__version__)

    return _mlflow_conda_env(
        additional_conda_deps=[
            "pyspark={}".format(pyspark_version),
        ],
        additional_pip_deps=None,
        additional_conda_channels=None)
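The re.sub call above strips the suffix from dev builds before pinning the conda dependency. A small standalone sketch of that pattern, using illustrative version strings:

import re

for raw in ["2.4.5.dev0", "3.0.0dev", "2.4.5"]:
    # r"(\.?)dev.*" removes an optional dot, "dev", and everything after it
    print(raw, "->", re.sub(r"(\.?)dev.*", "", raw))
# Prints: 2.4.5.dev0 -> 2.4.5, 3.0.0dev -> 3.0.0, 2.4.5 -> 2.4.5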
Example #3
Source File: backend.py From joblib-spark with Apache License 2.0
def apply_async(self, func, callback=None):
    # Note the `func` args is a batch here. (BatchedCalls type)
    # See joblib.parallel.Parallel._dispatch
    def run_on_worker_and_fetch_result():
        # TODO: handle possible spark exception here. # pylint: disable=fixme
        rdd = self._spark.sparkContext.parallelize([0], 1) \
            .map(lambda _: cloudpickle.dumps(func()))
        if VersionUtils.majorMinorVersion(pyspark.__version__)[0] < 3:
            ser_res = rdd.collect()[0]
        else:
            ser_res = rdd.collectWithJobGroup(self._job_group, "joblib spark jobs")[0]
        return cloudpickle.loads(ser_res)

    return self._get_pool().apply_async(
        SafeFunction(run_on_worker_and_fetch_result),
        callback=callback
    )
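VersionUtils here comes from pyspark.util and returns the major and minor components as integers, which is what the branch above keys on. A quick sketch of what the helper returns, assuming a pyspark installation is available and using illustrative version strings:

from pyspark.util import VersionUtils

# majorMinorVersion parses a Spark version string into an (int, int) tuple
print(VersionUtils.majorMinorVersion("2.4.5"))  # (2, 4)
print(VersionUtils.majorMinorVersion("3.0.1"))  # (3, 0)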
Example #4
Source File: test_dataframe.py From koalas with Apache License 2.0
def test_rfloordiv(self):
    pdf = pd.DataFrame(
        {"angles": [0, 3, 4], "degrees": [360, 180, 360]},
        index=["circle", "triangle", "rectangle"],
        columns=["angles", "degrees"],
    )
    kdf = ks.from_pandas(pdf)

    if LooseVersion(pd.__version__) < LooseVersion("1.0.0") and LooseVersion(
        pd.__version__
    ) >= LooseVersion("0.24.0"):
        expected_result = pd.DataFrame(
            {"angles": [np.inf, 3.0, 2.0], "degrees": [0.0, 0.0, 0.0]},
            index=["circle", "triangle", "rectangle"],
            columns=["angles", "degrees"],
        )
    else:
        expected_result = pdf.rfloordiv(10)

    self.assert_eq(kdf.rfloordiv(10), expected_result)
Example #5
Source File: test_series.py From koalas with Apache License 2.0
def test_div_zero_and_nan(self):
    pser = pd.Series([100, None, -300, None, 500, -700, np.inf, -np.inf], name="Koalas")
    kser = ks.from_pandas(pser)

    self.assert_eq(repr(pser.div(0)), repr(kser.div(0)))
    self.assert_eq(repr(pser.truediv(0)), repr(kser.truediv(0)))
    self.assert_eq(repr(pser / 0), repr(kser / 0))
    self.assert_eq(repr(pser.div(np.nan)), repr(kser.div(np.nan)))
    self.assert_eq(repr(pser.truediv(np.nan)), repr(kser.truediv(np.nan)))
    self.assert_eq(repr(pser / np.nan), repr(kser / np.nan))

    # floordiv has different behavior in pandas > 1.0.0 when divide by 0
    if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
        self.assert_eq(repr(pser.floordiv(0)), repr(kser.floordiv(0)))
        self.assert_eq(repr(pser // 0), repr(kser // 0))
    else:
        result = pd.Series(
            [np.inf, np.nan, -np.inf, np.nan, np.inf, -np.inf, np.inf, -np.inf], name="Koalas"
        )
        self.assert_eq(repr(kser.floordiv(0)), repr(result))
        self.assert_eq(repr(kser // 0), repr(result))

    self.assert_eq(repr(pser.floordiv(np.nan)), repr(kser.floordiv(np.nan)))
Example #6
Source File: test_series.py From koalas with Apache License 2.0
def test_repeat(self):
    pser = pd.Series(["a", "b", "c"], name="0", index=np.random.rand(3))
    kser = ks.from_pandas(pser)

    self.assert_eq(kser.repeat(3).sort_index(), pser.repeat(3).sort_index())
    self.assert_eq(kser.repeat(0).sort_index(), pser.repeat(0).sort_index())

    self.assertRaises(ValueError, lambda: kser.repeat(-1))
    self.assertRaises(ValueError, lambda: kser.repeat("abc"))

    pdf = pd.DataFrame({"a": ["a", "b", "c"], "rep": [10, 20, 30]}, index=np.random.rand(3))
    kdf = ks.from_pandas(pdf)

    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        self.assertRaises(ValueError, lambda: kdf.a.repeat(kdf.rep))
    else:
        self.assert_eq(kdf.a.repeat(kdf.rep).sort_index(), pdf.a.repeat(pdf.rep).sort_index())
Example #7
Source File: test_series.py From koalas with Apache License 2.0
def test_udt(self):
    sparse_values = {0: 0.1, 1: 1.1}
    sparse_vector = SparseVector(len(sparse_values), sparse_values)
    pser = pd.Series([sparse_vector])

    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
            kser = ks.from_pandas(pser)
            self.assert_eq(kser, pser)
    else:
        kser = ks.from_pandas(pser)
        self.assert_eq(kser, pser)
Example #8
Source File: TFSparkNode.py From TensorFlowOnSpark with Apache License 2.0
def _has_spark_resource_api():
    """Returns true if Spark 3+ resource API is available"""
    import pyspark
    return version.parse(pyspark.__version__).base_version >= version.parse("3.0.0").base_version
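The check above relies on packaging.version: base_version strips development and pre-release suffixes, and the comparison is then made between plain release strings (sufficient for the 2.x/3.x Spark versions seen in practice). A minimal sketch with illustrative version strings:

from packaging import version

# base_version drops dev/rc markers, leaving only the release segment
print(version.parse("3.0.0.dev0").base_version)  # 3.0.0
print(version.parse("2.4.5").base_version)       # 2.4.5
print(version.parse("3.0.1").base_version >= version.parse("3.0.0").base_version)  # True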
Example #9
Source File: spark.py From mlflow with Apache License 2.0
def _save_model_metadata(dst_dir, spark_model, mlflow_model, sample_input, conda_env,
                         signature=None, input_example=None):
    """
    Saves model metadata into the passed-in directory. The persisted metadata assumes that a
    model can be loaded from a relative path to the metadata file (currently hard-coded to
    "sparkml").
    """
    import pyspark

    if sample_input is not None:
        mleap.add_to_model(mlflow_model=mlflow_model, path=dst_dir, spark_model=spark_model,
                           sample_input=sample_input)
    if signature is not None:
        mlflow_model.signature = signature
    if input_example is not None:
        _save_example(mlflow_model, input_example, dst_dir)

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_conda_env()
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(dst_dir, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    mlflow_model.add_flavor(FLAVOR_NAME, pyspark_version=pyspark.__version__,
                            model_data=_SPARK_MODEL_PATH_SUB)
    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.spark", data=_SPARK_MODEL_PATH_SUB,
                        env=conda_env_subpath)
    mlflow_model.save(os.path.join(dst_dir, "MLmodel"))
Example #10
Source File: backend.py From joblib-spark with Apache License 2.0
def _cancel_all_jobs(self):
    if VersionUtils.majorMinorVersion(pyspark.__version__)[0] < 3:
        # Note: There's bug existing in `sparkContext.cancelJobGroup`.
        # See https://issues.apache.org/jira/browse/SPARK-31549
        warnings.warn("For spark version < 3, pyspark cancelling job API has bugs, "
                      "so we could not terminate running spark jobs correctly. "
                      "See https://issues.apache.org/jira/browse/SPARK-31549 for reference.")
    else:
        self._spark.sparkContext.cancelJobGroup(self._job_group)
Example #11
Source File: backend.py From joblib-spark with Apache License 2.0
def register():
    """
    Register joblib spark backend.
    """
    try:
        import sklearn  # pylint: disable=C0415
        if LooseVersion(sklearn.__version__) < LooseVersion('0.21'):
            warnings.warn("Your sklearn version is < 0.21, but joblib-spark only support "
                          "sklearn >=0.21 . You can upgrade sklearn to version >= 0.21 to "
                          "make sklearn use spark backend.")
    except ImportError:
        pass
    register_parallel_backend('spark', SparkDistributedBackend)
Example #12
Source File: conftest.py From koalas with Apache License 2.0
def add_pa(doctest_namespace):
    if os.getenv("PYARROW_VERSION", None) is not None:
        assert pa.__version__ == os.getenv("PYARROW_VERSION")
    doctest_namespace["pa"] = pa
Example #13
Source File: conftest.py From koalas with Apache License 2.0
def add_pd(doctest_namespace):
    if os.getenv("PANDAS_VERSION", None) is not None:
        assert pd.__version__ == os.getenv("PANDAS_VERSION")
    doctest_namespace["pd"] = pd
Example #14
Source File: test_ops_on_diff_frames.py From koalas with Apache License 2.0
def test_series_repeat(self):
    pser1 = pd.Series(["a", "b", "c"], name="a")
    pser2 = pd.Series([10, 20, 30], name="rep")
    kser1 = ks.from_pandas(pser1)
    kser2 = ks.from_pandas(pser2)

    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        self.assertRaises(ValueError, lambda: kser1.repeat(kser2))
    else:
        self.assert_eq(kser1.repeat(kser2).sort_index(), pser1.repeat(pser2).sort_index())
Example #15
Source File: test_series.py From koalas with Apache License 2.0
def test_rdivmod(self):
    pser = pd.Series([100, None, 300, None, 500], name="Koalas")
    kser = ks.from_pandas(pser)

    if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
        self.assert_eq(repr(kser.rdivmod(-100)), repr(pser.rdivmod(-100)))
        self.assert_eq(repr(kser.rdivmod(100)), repr(pser.rdivmod(100)))
    elif LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
        expected_result = repr((pser.rfloordiv(-100), pser.rmod(-100)))
        self.assert_eq(repr(kser.rdivmod(-100)), expected_result)
        expected_result = repr((pser.rfloordiv(100), pser.rmod(100)))
        self.assert_eq(repr(kser.rdivmod(100)), expected_result)
Example #16
Source File: test_series.py From koalas with Apache License 2.0
def test_divmod(self):
    pser = pd.Series([100, None, 300, None, 500], name="Koalas")
    kser = ks.from_pandas(pser)

    if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
        self.assert_eq(repr(kser.divmod(-100)), repr(pser.divmod(-100)))
        self.assert_eq(repr(kser.divmod(100)), repr(pser.divmod(100)))
    elif LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
        expected_result = repr((pser.floordiv(-100), pser.mod(-100)))
        self.assert_eq(repr(kser.divmod(-100)), expected_result)
        expected_result = repr((pser.floordiv(100), pser.mod(100)))
        self.assert_eq(repr(kser.divmod(100)), expected_result)
Example #17
Source File: client.py From ibis with Apache License 2.0
def version(self):
    return parse_version(ps.__version__)
Example #18
Source File: test_series.py From koalas with Apache License 2.0
def test_to_list(self):
    if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"):
        self.assertEqual(self.kser.to_list(), self.pser.to_list())
Example #19
Source File: test_series.py From koalas with Apache License 2.0
def test_value_counts(self):
    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
            self._test_value_counts()
        self.assertRaises(
            RuntimeError,
            lambda: ks.MultiIndex.from_tuples([("x", "a"), ("x", "b")]).value_counts(),
        )
    else:
        self._test_value_counts()
Example #20
Source File: test_indexes.py From koalas with Apache License 2.0
def test_multi_index_names(self):
    arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
    idx = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
    pdf = pd.DataFrame(np.random.randn(4, 5), idx)
    kdf = ks.from_pandas(pdf)

    self.assertEqual(kdf.index.names, pdf.index.names)

    pidx = pdf.index
    kidx = kdf.index
    pidx.names = ["renamed_number", "renamed_color"]
    kidx.names = ["renamed_number", "renamed_color"]
    self.assertEqual(kidx.names, pidx.names)

    pidx.names = ["renamed_number", None]
    kidx.names = ["renamed_number", None]
    self.assertEqual(kidx.names, pidx.names)

    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        # PySpark < 2.4 does not support struct type with arrow enabled.
        with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
            self.assert_eq(kidx, pidx)
    else:
        self.assert_eq(kidx, pidx)

    with self.assertRaises(PandasNotImplementedError):
        kidx.name
    with self.assertRaises(PandasNotImplementedError):
        kidx.name = "renamed"
Example #21
Source File: test_indexes.py From koalas with Apache License 2.0
def test_to_frame(self):
    pidx = self.pdf.index
    kidx = self.kdf.index

    self.assert_eq(repr(kidx.to_frame()), repr(pidx.to_frame()))
    self.assert_eq(repr(kidx.to_frame(index=False)), repr(pidx.to_frame(index=False)))

    pidx.name = "a"
    kidx.name = "a"

    self.assert_eq(repr(kidx.to_frame()), repr(pidx.to_frame()))
    self.assert_eq(repr(kidx.to_frame(index=False)), repr(pidx.to_frame(index=False)))

    if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
        # The `name` argument is added in pandas 0.24.
        self.assert_eq(repr(kidx.to_frame(name="x")), repr(pidx.to_frame(name="x")))
        self.assert_eq(
            repr(kidx.to_frame(index=False, name="x")),
            repr(pidx.to_frame(index=False, name="x")),
        )

    pidx = self.pdf.set_index("b", append=True).index
    kidx = self.kdf.set_index("b", append=True).index

    self.assert_eq(repr(kidx.to_frame()), repr(pidx.to_frame()))
    self.assert_eq(repr(kidx.to_frame(index=False)), repr(pidx.to_frame(index=False)))

    if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
        # The `name` argument is added in pandas 0.24.
        self.assert_eq(
            repr(kidx.to_frame(name=["x", "y"])), repr(pidx.to_frame(name=["x", "y"]))
        )
        self.assert_eq(
            repr(kidx.to_frame(index=False, name=["x", "y"])),
            repr(pidx.to_frame(index=False, name=["x", "y"])),
        )
Example #22
Source File: test_dataframe.py From koalas with Apache License 2.0
def test_udt(self):
    sparse_values = {0: 0.1, 1: 1.1}
    sparse_vector = SparseVector(len(sparse_values), sparse_values)
    pdf = pd.DataFrame({"a": [sparse_vector], "b": [10]})

    if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
        with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
            kdf = ks.from_pandas(pdf)
            self.assert_eq(kdf, pdf)
    else:
        kdf = ks.from_pandas(pdf)
        self.assert_eq(kdf, pdf)
Example #23
Source File: indexes.py From koalas with Apache License 2.0
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if (
        LooseVersion(pyspark.__version__) < LooseVersion("2.4")
        and default_session().conf.get("spark.sql.execution.arrow.enabled") == "true"
        and isinstance(self, MultiIndex)
    ):
        raise RuntimeError(
            "if you're using pyspark < 2.4, set conf "
            "'spark.sql.execution.arrow.enabled' to 'false' "
            "for using this function with MultiIndex"
        )
    return super(MultiIndex, self).value_counts(
        normalize=normalize, sort=sort, ascending=ascending, bins=bins, dropna=dropna
    )
Example #24
Source File: utils.py From koalas with Apache License 2.0
def default_session(conf=None):
    if conf is None:
        conf = dict()
    should_use_legacy_ipc = False
    if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") and LooseVersion(
        pyspark.__version__
    ) < LooseVersion("3.0"):
        conf["spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1"
        conf["spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1"
        conf["spark.mesos.driverEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1"
        conf["spark.kubernetes.driverEnv.ARROW_PRE_0_15_IPC_FORMAT"] = "1"
        should_use_legacy_ipc = True

    builder = spark.SparkSession.builder.appName("Koalas")
    for key, value in conf.items():
        builder = builder.config(key, value)
    # Currently, Koalas is dependent on such join due to 'compute.ops_on_diff_frames'
    # configuration. This is needed with Spark 3.0+.
    builder.config("spark.sql.analyzer.failAmbiguousSelfJoin", False)
    session = builder.getOrCreate()

    if not should_use_legacy_ipc:
        is_legacy_ipc_set = any(
            v == "1"
            for v in [
                session.conf.get("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", None),
                session.conf.get("spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT", None),
                session.conf.get("spark.mesos.driverEnv.ARROW_PRE_0_15_IPC_FORMAT", None),
                session.conf.get("spark.kubernetes.driverEnv.ARROW_PRE_0_15_IPC_FORMAT", None),
            ]
        )
        if is_legacy_ipc_set:
            raise RuntimeError(
                "Please explicitly unset 'ARROW_PRE_0_15_IPC_FORMAT' environment variable in "
                "both driver and executor sides. Check your spark.executorEnv.*, "
                "spark.yarn.appMasterEnv.*, spark.mesos.driverEnv.* and "
                "spark.kubernetes.driverEnv.* configurations. It is required to set this "
                "environment variable only when you use pyarrow>=0.15 and pyspark<3.0."
            )
    return session
Example #25
Source File: spark_dataset_converter.py From petastorm with Apache License 2.0
def __enter__(self):
    # import locally to avoid importing tensorflow globally.
    from petastorm.tf_utils import make_petastorm_dataset
    import tensorflow.compat.v1 as tf  # pylint: disable=import-error

    _wait_file_available(self.parquet_file_url_list)
    self.reader = make_batch_reader(self.parquet_file_url_list, **self.petastorm_reader_kwargs)

    # unroll dataset
    dataset = make_petastorm_dataset(self.reader).flat_map(
        tf.data.Dataset.from_tensor_slices)

    # TODO: auto tune best batch size in default case.
    batch_size = self.batch_size or 32
    dataset = dataset.batch(batch_size=batch_size)

    prefetch = self.prefetch

    if prefetch is None:
        if LooseVersion(tf.__version__) >= LooseVersion('1.14'):
            # We can make prefetch optimization
            prefetch = tf.data.experimental.AUTOTUNE
        else:
            prefetch = 1

    dataset = dataset.prefetch(prefetch)

    return dataset
Example #26
Source File: utils.py From koalas with Apache License 2.0
def validate_arguments_and_invoke_function(
    pobj: Union[pd.DataFrame, pd.Series],
    koalas_func: Callable,
    pandas_func: Callable,
    input_args: Dict,
):
    """
    Invokes a pandas function.

    This is created because different versions of pandas support different parameters, and as a
    result when we code against the latest version, our users might get a confusing
    "got an unexpected keyword argument" error if they are using an older version of pandas.

    This function validates all the arguments, removes the ones that are not supported if they
    are simply the default value (i.e. most likely the user didn't explicitly specify it). It
    throws a TypeError if the user explicitly specify an argument that is not supported by the
    pandas version available.

    For example usage, look at DataFrame.to_html().

    :param pobj: the pandas DataFrame or Series to operate on
    :param koalas_func: Koalas function, used to get default parameter values
    :param pandas_func: pandas function, used to check whether pandas supports all the arguments
    :param input_args: arguments to pass to the pandas function, often created by using locals().
                       Make sure locals() call is at the top of the function so it captures only
                       input parameters, rather than local variables.
    :return: whatever pandas_func returns
    """
    import inspect

    # Makes a copy since whatever passed in is likely created by locals(), and we can't delete
    # 'self' key from that.
    args = input_args.copy()
    del args["self"]

    if "kwargs" in args:
        # explode kwargs
        kwargs = args["kwargs"]
        del args["kwargs"]
        args = {**args, **kwargs}

    koalas_params = inspect.signature(koalas_func).parameters
    pandas_params = inspect.signature(pandas_func).parameters

    for param in koalas_params.values():
        if param.name not in pandas_params:
            if args[param.name] == param.default:
                del args[param.name]
            else:
                raise TypeError(
                    (
                        "The pandas version [%s] available does not support parameter '%s' "
                        + "for function '%s'."
                    )
                    % (pd.__version__, param.name, pandas_func.__name__)
                )

    args["self"] = pobj
    return pandas_func(**args)
Example #27
Source File: dataset_metadata.py From petastorm with Apache License 2.0
def _init_spark(spark, current_spark_config, row_group_size_mb=None, use_summary_metadata=False):
    """
    Initializes spark and hdfs config with necessary options for petastorm datasets
    before running the spark job.
    """
    # It's important to keep pyspark import local because when placed at the top level it somehow
    # messes up with namedtuple serialization code and we end up getting UnischemaFields objects
    # depickled without overriden __eq__ and __hash__ methods.
    import pyspark
    _PYSPARK_BEFORE_24 = version.parse(pyspark.__version__) < version.parse('2.4')

    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()

    # Store current values so we can restore them later
    current_spark_config['parquet.summary.metadata.level'] = \
        hadoop_config.get('parquet.summary.metadata.level')
    current_spark_config['parquet.enable.summary-metadata'] = \
        hadoop_config.get('parquet.enable.summary-metadata')
    current_spark_config['parquet.summary.metadata.propagate-errors'] = \
        hadoop_config.get('parquet.summary.metadata.propagate-errors')
    current_spark_config['parquet.block.size.row.check.min'] = \
        hadoop_config.get('parquet.block.size.row.check.min')
    current_spark_config['parquet.row-group.size.row.check.min'] = \
        hadoop_config.get('parquet.row-group.size.row.check.min')
    current_spark_config['parquet.block.size'] = \
        hadoop_config.get('parquet.block.size')

    if _PYSPARK_BEFORE_24:
        hadoop_config.setBoolean("parquet.enable.summary-metadata", use_summary_metadata)
    else:
        hadoop_config.set('parquet.summary.metadata.level',
                          "ALL" if use_summary_metadata else "NONE")

    # Our atg fork includes https://github.com/apache/parquet-mr/pull/502 which creates this
    # option. This forces a job to fail if the summary metadata files cannot be created
    # instead of just having them fail to be created silently
    hadoop_config.setBoolean('parquet.summary.metadata.propagate-errors', True)
    # In our atg fork this config is called parquet.block.size.row.check.min however in newer
    # parquet versions it will be renamed to parquet.row-group.size.row.check.min
    # We use both for backwards compatibility
    hadoop_config.setInt('parquet.block.size.row.check.min', 3)
    hadoop_config.setInt('parquet.row-group.size.row.check.min', 3)
    if row_group_size_mb:
        hadoop_config.setInt('parquet.block.size', row_group_size_mb * 1024 * 1024)