Python dask.distributed() Examples
The following are 19 code examples of dask.distributed(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module dask, or try the search function.
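As a quick orientation before the project-specific examples, here is a minimal, hedged sketch of the basic dask.distributed workflow (start a local Client, submit tasks, gather results). The square helper is just an illustrative placeholder, not part of any example below.

from dask.distributed import Client

def square(x):
    # Trivial function used only to demonstrate submitting work to the cluster
    return x * x

if __name__ == "__main__":
    # Client() with no arguments starts a local cluster on the current machine
    client = Client()

    # Submit tasks to the workers and gather the results back to the caller
    futures = [client.submit(square, i) for i in range(10)]
    results = client.gather(futures)
    print(results)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

    client.close()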
Example #1
Source File: FEMSolver.py From florence with MIT License | 9 votes |
def LaunchDaskDistributedClient(self, scheduler_ip=None, scheduler_port=None):

    if self.parallel and self.parallel_model == "dask" and self.is_dask_scheduler_initialised is False:

        from multiprocessing.pool import ThreadPool
        try:
            import dask
            from dask.distributed import Client, LocalCluster
        except ImportError:
            raise ImportError("dask is not installed. Install it using 'pip install dask[complete]'")

        dask.config.set(pool=ThreadPool(self.no_of_cpu_cores))

        # INITIALISE CLUSTER
        if scheduler_ip is None:
            cluster = LocalCluster(n_workers=self.no_of_cpu_cores, processes=False, threads_per_worker=None)
            client = Client(cluster)
        else:
            client = Client(scheduler_ip)

        self.dask_client = client
        self.is_dask_scheduler_initialised = True
Example #2
Source File: utils.py From verde with BSD 3-Clause "New" or "Revised" License | 7 votes |
def dispatch(function, delayed=False, client=None):
    """
    Decide how to wrap a function for Dask depending on the options given.

    Parameters
    ----------
    function : callable
        The function that will be called.
    delayed : bool
        If True, will wrap the function in :func:`dask.delayed`.
    client : None or dask.distributed Client
        If *delayed* is False and *client* is not None, will return a partial
        execution of the ``client.submit`` with the function as first argument.

    Returns
    -------
    function : callable
        The function wrapped in Dask.

    """
    if delayed:
        return dask.delayed(function)
    if client is not None:
        return functools.partial(client.submit, function)
    return function
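A possible usage sketch for the dispatch helper above, assuming a running dask.distributed Client; the mean workload and its arguments are hypothetical.

import numpy as np
from dask.distributed import Client

def mean(values):
    # Example workload to be dispatched
    return np.mean(values)

client = Client()  # local cluster

# Eager execution: dispatch returns the function unchanged
result = dispatch(mean)([1, 2, 3])

# Lazy execution: returns a dask.delayed object; compute() runs it
result_lazy = dispatch(mean, delayed=True)([1, 2, 3]).compute()

# Client execution: returns a Future; result() blocks until the value is ready
result_future = dispatch(mean, client=client)([1, 2, 3]).result()

client.close()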
Example #3
Source File: map_processing.py From PyXRF with BSD 3-Clause "New" or "Revised" License | 6 votes |
def dask_client_create(**kwargs):
    """
    Create Dask client object. The function is trivial and introduced so that
    Dask client is created in uniform way throughout the program.

    Parameters
    ----------
    kwargs: dict, optional
        kwargs will be passed to the Dask client constructor

    Returns
    -------
    client: dask.distributed.Client
        Dask client object
    """
    _kwargs = {"processes": True, "silence_logs": logging.ERROR}
    _kwargs.update(kwargs)
    client = Client(**_kwargs)

    dask.config.set(shuffle="disk")
    path_dask_data = os.path.expanduser("~/.dask")
    dask.config.set({"temporary_directory": path_dask_data})

    return client
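A brief usage sketch for dask_client_create, assuming the PyXRF module above is importable; the n_workers override is a hypothetical keyword that is simply forwarded to the Client constructor.

# Default client: process-based workers with reduced log noise
client = dask_client_create()

# Keyword arguments are passed through to dask.distributed.Client,
# e.g. to control the number of workers in the local cluster
client_small = dask_client_create(n_workers=2)

client.close()
client_small.close()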
Example #4
Source File: automate.py From aospy with Apache License 2.0 | 5 votes |
def _exec_calcs(calcs, parallelize=False, client=None, **compute_kwargs):
    """Execute the given calculations.

    Parameters
    ----------
    calcs : Sequence of ``aospy.Calc`` objects
    parallelize : bool, default False
        Whether to submit the calculations in parallel or not
    client : distributed.Client or None
        The distributed Client used if parallelize is set to True; if None
        a distributed LocalCluster is used.
    compute_kwargs : dict of keyword arguments passed to ``Calc.compute``

    Returns
    -------
    A list of the values returned by each Calc object that was executed.
    """
    if parallelize:
        def func(calc):
            """Wrap _compute_or_skip_on_error to require only the calc
            argument"""
            if 'write_to_tar' in compute_kwargs:
                compute_kwargs['write_to_tar'] = False
            return _compute_or_skip_on_error(calc, compute_kwargs)

        if client is None:
            n_workers = _n_workers_for_local_cluster(calcs)
            with distributed.LocalCluster(n_workers=n_workers) as cluster:
                with distributed.Client(cluster) as client:
                    result = _submit_calcs_on_client(calcs, client, func)
        else:
            result = _submit_calcs_on_client(calcs, client, func)

        if compute_kwargs['write_to_tar']:
            _serial_write_to_tar(calcs)
        return result
    else:
        return [_compute_or_skip_on_error(calc, compute_kwargs)
                for calc in calcs]
Example #5
Source File: test_dask_env.py From pangeo-stacks with BSD 3-Clause "New" or "Revised" License | 5 votes |
def client():
    from dask.distributed import Client

    with Client(n_workers=4) as dask_client:
        yield dask_client
Example #6
Source File: test_dask_env.py From pangeo-stacks with BSD 3-Clause "New" or "Revised" License | 5 votes |
def client():
    from dask.distributed import Client

    with Client(n_workers=4) as dask_client:
        yield dask_client
Example #7
Source File: test_async.py From dask-kubernetes with BSD 3-Clause "New" or "Revised" License | 5 votes |
async def test_diagnostics_link_env_variable(pod_spec, ns):
    pytest.importorskip("bokeh")
    with dask.config.set({"distributed.dashboard.link": "foo-{USER}-{port}"}):
        async with KubeCluster(pod_spec, namespace=ns, asynchronous=True) as cluster:
            port = cluster.scheduler_info["services"]["dashboard"]

            assert (
                "foo-" + getpass.getuser() + "-" + str(port) in cluster.dashboard_link
            )
Example #8
Source File: test_async.py From dask-kubernetes with BSD 3-Clause "New" or "Revised" License | 5 votes |
async def test_logs(remote_cluster):
    cluster = remote_cluster
    cluster.scale(2)
    await cluster

    start = time()
    while len(cluster.scheduler_info["workers"]) < 2:
        await asyncio.sleep(0.1)
        assert time() < start + 20

    logs = await cluster.logs()
    assert len(logs) == 3
    for _, log in logs.items():
        assert "distributed.scheduler" in log or "distributed.worker" in log
Example #9
Source File: core.py From dask-kubernetes with BSD 3-Clause "New" or "Revised" License | 5 votes |
def scale(self, n):
    # A shim to maintain backward compatibility
    # https://github.com/dask/distributed/issues/3054
    maximum = dask.config.get("kubernetes.count.max")
    if maximum is not None and maximum < n:
        logger.info(
            "Tried to scale beyond maximum number of workers %d > %d", n, maximum
        )
        n = maximum
    return super().scale(n)
Example #10
Source File: executor.py From dagster with Apache License 2.0 | 5 votes |
def build_dict(self, pipeline_name):
    '''Returns a dict we can use for kwargs passed to dask client instantiation.

    Intended to be used like:

        with dask.distributed.Client(**cfg.build_dict()) as client:
            << use client here >>

    '''
    if self.cluster_type in ['yarn', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube']:
        dask_cfg = {'name': pipeline_name}
    else:
        dask_cfg = {}

    if self.cluster_configuration:
        for k, v in self.cluster_configuration.items():
            dask_cfg[k] = v

    # if address is set, don't add LocalCluster args
    # context: https://github.com/dask/distributed/issues/3313
    if (self.cluster_type == 'local') and ('address' not in dask_cfg):
        # We set threads_per_worker because Dagster is not thread-safe. Even though
        # processes=True by default, there is a clever piece of machinery
        # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution
        # multithreaded by default when the number of available cores is greater than 4.
        # See: https://github.com/dagster-io/dagster/issues/2181
        # We may want to try to figure out a way to enforce this on remote Dask clusters against
        # which users run Dagster workloads.
        dask_cfg['threads_per_worker'] = 1

    return dask_cfg
Example #11
Source File: test_dask.py From filesystem_spec with BSD 3-Clause "New" or "Revised" License | 5 votes |
def cli(tmpdir):
    import dask.distributed

    client = dask.distributed.Client(n_workers=1)

    def setup():
        m = fsspec.filesystem("memory")
        with m.open("afile", "wb") as f:
            f.write(b"data")

    client.run(setup)
    try:
        yield client
    finally:
        client.close()
Example #12
Source File: automate.py From aospy with Apache License 2.0 | 5 votes |
def _submit_calcs_on_client(calcs, client, func):
    """Submit calculations via dask.bag and a distributed client"""
    logging.info('Connected to client: {}'.format(client))
    if LooseVersion(dask.__version__) < '0.18':
        dask_option_setter = dask.set_options
    else:
        dask_option_setter = dask.config.set
    with dask_option_setter(get=client.get):
        return db.from_sequence(calcs).map(func).compute()
Example #13
Source File: executor.py From dagster with Apache License 2.0 | 4 votes |
def dask_executor(init_context):
    '''Dask-based executor.

    If the Dask executor is used without providing executor-specific config, a local Dask cluster
    will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`
    without specifying the scheduler address).

    The Dask executor optionally takes the following config:

    .. code-block:: none

        cluster:
            {
                local?:  # The cluster type, one of the following
                         # ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').
                    {
                        address?: '127.0.0.1:8786',  # The address of a Dask scheduler
                        timeout?: 5,  # Timeout duration for initial connection to the scheduler
                        scheduler_file?: '/path/to/file'  # Path to a file with scheduler information
                        # Whether to connect directly to the workers, or ask the scheduler to serve as
                        # intermediary
                        direct_to_workers?: False,
                        heartbeat_interval?: 1000,  # Time in milliseconds between heartbeats to scheduler
                    }
            }

    If you'd like to configure a dask executor in addition to the
    :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a
    :py:class:`~dagster.ModeDefinition` as follows:

    .. code-block:: python

        from dagster import ModeDefinition, default_executors, pipeline
        from dagster_dask import dask_executor

        @pipeline(mode_defs=[ModeDefinition(executor_defs=default_executors + [dask_executor])])
        def dask_enabled_pipeline():
            pass
    '''
    check_cross_process_constraints(init_context)
    ((cluster_type, cluster_configuration),) = init_context.executor_config['cluster'].items()
    return DaskExecutor(cluster_type, cluster_configuration)
Example #14
Source File: run_rj_neq.py From perses with MIT License | 4 votes |
def createSystemFromIUPAC(iupac_name):
    """
    Create an openmm system out of an oemol

    Parameters
    ----------
    iupac_name : str
        IUPAC name

    Returns
    -------
    molecule : openeye.OEMol
        OEMol molecule
    system : openmm.System object
        OpenMM system
    positions : [n,3] np.array of floats
        Positions
    topology : openmm.app.Topology object
        Topology
    """
    from perses.utils.data import get_data_filename
    from perses.utils.openeye import extractPositionsFromOEMol

    # Create OEMol
    molecule = iupac_to_oemol(iupac_name)

    # Generate a topology.
    from openmoltools.forcefield_generators import generateTopologyFromOEMol
    topology = generateTopologyFromOEMol(molecule)

    # Initialize a forcefield with GAFF.
    # TODO: Fix path for `gaff.xml` since it is not yet distributed with OpenMM
    from simtk.openmm.app import ForceField
    gaff_xml_filename = get_data_filename('data/gaff.xml')
    forcefield = ForceField(gaff_xml_filename)

    # Generate template and parameters.
    from openmoltools.forcefield_generators import generateResidueTemplate
    [template, ffxml] = generateResidueTemplate(molecule)

    # Register the template.
    forcefield.registerResidueTemplate(template)

    # Add the parameters.
    forcefield.loadFile(StringIO(ffxml))

    # Create the system.
    system = forcefield.createSystem(topology, removeCMMotion=False)

    # Extract positions
    positions = extractPositionsFromOEMol(molecule)

    return (molecule, system, positions, topology)
Example #15
Source File: utils.py From HistomicsTK with Apache License 2.0 | 4 votes |
def create_dask_client(args):
    """Create and install a Dask distributed client using args from a
    Namespace, supporting the following attributes:

    - .scheduler: Address of the distributed scheduler, or the empty string to
      start one locally
    """
    import dask

    scheduler = getattr(args, 'scheduler', None)
    num_workers = getattr(args, 'num_workers', 0)
    num_threads_per_worker = getattr(args, 'num_threads_per_worker', 0)

    if scheduler == 'multithreading':
        import dask.threaded
        from multiprocessing.pool import ThreadPool

        if num_threads_per_worker <= 0:
            num_workers = max(
                1, psutil.cpu_count(logical=False) + num_threads_per_worker)
        else:
            num_workers = num_threads_per_worker
        print('Starting dask thread pool with %d thread(s)' % num_workers)
        dask.config.set(pool=ThreadPool(num_workers))
        dask.config.set(scheduler='threads')
        return

    if scheduler == 'multiprocessing':
        import dask.multiprocessing
        import multiprocessing

        dask.config.set(scheduler='processes')
        if num_workers <= 0:
            num_workers = max(
                1, psutil.cpu_count(logical=False) + num_workers)

        print('Starting dask multiprocessing pool with %d worker(s)' % num_workers)
        dask.config.set(pool=multiprocessing.Pool(
            num_workers, initializer=dask.multiprocessing.initialize_worker_process))
        return

    import dask.distributed
    if not scheduler:
        if num_workers <= 0:
            num_workers = max(
                1, psutil.cpu_count(logical=False) + num_workers)
        num_threads_per_worker = (
            num_threads_per_worker if num_threads_per_worker >= 1 else None)

        print('Creating dask LocalCluster with %d worker(s), %r thread(s) per '
              'worker' % (num_workers, num_threads_per_worker))
        scheduler = dask.distributed.LocalCluster(
            ip='0.0.0.0',  # Allow reaching the diagnostics port externally
            scheduler_port=0,  # Don't expose the scheduler port
            n_workers=num_workers,
            memory_limit=0,
            threads_per_worker=num_threads_per_worker,
            silence_logs=False
        )
    return dask.distributed.Client(scheduler)
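A possible way to call create_dask_client from a script, assuming a distributed version compatible with the LocalCluster arguments used above; the Namespace fields mirror the attributes the function reads.

from argparse import Namespace

# Empty scheduler address -> start a LocalCluster with 2 workers, 1 thread each
args = Namespace(scheduler='', num_workers=2, num_threads_per_worker=1)
client = create_dask_client(args)

# Pointing at an existing scheduler instead:
# client = create_dask_client(Namespace(scheduler='tcp://10.0.0.5:8786'))

client.close()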
Example #16
Source File: core.py From dask-kubernetes with BSD 3-Clause "New" or "Revised" License | 4 votes |
def __init__(
    self,
    pod_template=None,
    name=None,
    namespace=None,
    n_workers=None,
    host=None,
    port=None,
    env=None,
    auth=ClusterAuth.DEFAULT,
    idle_timeout=None,
    deploy_mode=None,
    interface=None,
    protocol=None,
    dashboard_address=None,
    security=None,
    scheduler_service_wait_timeout=None,
    scheduler_pod_template=None,
    **kwargs
):
    self.pod_template = pod_template
    self.scheduler_pod_template = scheduler_pod_template
    self._generate_name = name
    self._namespace = namespace
    self._n_workers = n_workers
    self._idle_timeout = idle_timeout
    self._deploy_mode = deploy_mode
    self._protocol = protocol
    self._interface = interface
    self._dashboard_address = dashboard_address
    self._scheduler_service_wait_timeout = scheduler_service_wait_timeout
    self.security = security
    if self.security and not isinstance(
        self.security, distributed.security.Security
    ):
        raise RuntimeError(
            "Security object is not a valid distributed.security.Security object"
        )
    self.host = host
    self.port = port
    self.env = env
    self.auth = auth
    self.kwargs = kwargs
    super().__init__(**self.kwargs)
Example #17
Source File: map_processing.py From PyXRF with BSD 3-Clause "New" or "Revised" License | 4 votes |
def _fit_xrf_block(data, data_sel_indices, matv, snip_param, use_snip):
    """
    Spectrum fitting for a block of XRF dataset. The function is intended to be
    called using `map_blocks` function for parallel processing using Dask distributed package.

    Parameters
    ----------
    data : ndarray
        block of an XRF dataset. Shape=(ny, nx, ne).
    data_sel_indices: tuple
        tuple `(n_start, n_end)` which defines the indices along axis 2 of `data` array
        that are used for fitting. Note that `ne` (in `data`) and `ne_model` (in `matv`)
        are not equal. But `n_end - n_start` MUST be equal to `ne_model`!
        Indexes `n_start .. n_end - 1` will be selected from each pixel.
    matv: ndarray
        Matrix of spectra of the selected elements (emission lines). Shape=(ne_model, n_lines)
    snip_param: dict
        Dictionary of parameters forwarded to 'snip' method for background removal.
        Keys: `e_offset`, `e_linear`, `e_quadratic` (parameters of the energy axis
        approximation), `b_width` (width of the window that defines resolution of
        the snip algorithm).
    use_snip: bool, optional
        enable/disable background removal using snip algorithm

    Returns
    -------
    data_out: ndarray
        array with fitting results. Shape: `(ny, nx, ne_model + 4)`. For each pixel
        the output data contains: `ne_model` values that represent area under the
        emission line spectra; background area (only in the selected energy range),
        error (R-factor), total count in the selected energy range, total count of
        the full experimental spectrum.
    """
    spec = data
    spec_sel = spec[:, :, data_sel_indices[0]: data_sel_indices[1]]

    if use_snip:
        bg_sel = np.apply_along_axis(snip_method_numba, 2, spec_sel,
                                     snip_param['e_offset'],
                                     snip_param['e_linear'],
                                     snip_param['e_quadratic'],
                                     width=snip_param['b_width'])

        y = spec_sel - bg_sel
        bg_sum = np.sum(bg_sel, axis=2)

    else:
        y = spec_sel
        bg_sum = np.zeros(shape=data.shape[0:2])

    weights, rfactor, _ = fit_spectrum(y, matv, axis=2, method="nnls")

    total_cnt = np.sum(spec, axis=2)
    sel_cnt = np.sum(spec_sel, axis=2)

    # Stack depth-wise (along axis 2)
    data_out = np.dstack((weights, bg_sum, rfactor, sel_cnt, total_cnt))

    return data_out
Example #18
Source File: map_processing.py From PyXRF with BSD 3-Clause "New" or "Revised" License | 4 votes |
def _chunk_numpy_array(data, chunk_size):
    """
    Convert a numpy array into Dask array with chunks of given size. The function
    splits the array into chunks along axes 0 and 1. If the array has more than 2 dimensions,
    then the remaining dimensions are not chunked. Note, that
    `dask_array = da.array(data, chunks=...)` will set the chunk size, but not split the
    data into chunks, therefore the array can not be loaded block by block by workers
    controlled by a distributed scheduler.

    Parameters
    ----------
    data: ndarray(float), 2 or more dimensions
        XRF map of the shape `(ny, nx, ne)`, where `ny` and `nx` represent the image size
        and `ne` is the number of points in spectra
    chunk_size: tuple(int, int) or list(int, int)
        Chunk size for axis 0 and 1: `(chunk_y, chunk_x)`. The function will accept
        chunk size values that are larger than the respective `data` array dimensions.

    Returns
    -------
    data_dask: dask.array
        Dask array with the given chunk size
    """
    chunk_y, chunk_x = chunk_size
    ny, nx = data.shape[0:2]
    chunk_y, chunk_x = min(chunk_y, ny), min(chunk_x, nx)

    def _get_slice(n1, n2):
        data_slice = data[slice(n1 * chunk_y, min(n1 * chunk_y + chunk_y, ny)),
                          slice(n2 * chunk_x, min(n2 * chunk_x + chunk_x, nx))]
        # Wrap the slice into a list with appropriate dimensions
        for _ in range(2, data.ndim):
            data_slice = [data_slice]
        return data_slice

    # Chunk the numpy array and assemble it as a dask array
    data_dask = da.block([
        [_get_slice(_1, _2) for _2 in range(int(math.ceil(nx / chunk_x)))]
        for _1 in range(int(math.ceil(ny / chunk_y)))
    ])

    return data_dask
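A quick sketch of how _chunk_numpy_array might be exercised, assuming numpy and dask.array are available; the array shape and chunk size are arbitrary illustrative values.

import numpy as np

# A small synthetic "XRF map": 100 x 120 pixels, 10 spectrum points per pixel
data = np.random.rand(100, 120, 10)

# Split into 25 x 40 blocks along the image axes; the spectral axis stays whole
data_dask = _chunk_numpy_array(data, (25, 40))

print(data_dask.chunks)  # ((25, 25, 25, 25), (40, 40, 40), (10,))
print(np.allclose(data_dask.compute(), data))  # True: chunking does not change values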
Example #19
Source File: map_processing.py From PyXRF with BSD 3-Clause "New" or "Revised" License | 4 votes |
def wait_and_display_progress(fut, progress_bar=None):
    """
    Wait for the future to complete and display the progress bar.
    This method may be used to drive any custom progress bar, which
    displays progress in percent from 0 to 100.

    Parameters
    ----------
    fut: dask future
        future object for the batch of tasks submitted to the distributed client.
    progress_bar: callable or None
        callable function or callable object with methods `start()`, `__call__(float)`
        and `finish()`. The methods `start()` and `finish()` are optional. For example,
        this could be a reference to an instance of the object `TerminalProgressBar`

    Examples
    --------
    .. code-block::

        client = Client()
        data = da.random.random(size=(100, 100), chunks=(10, 10))
        sm_fut = da.sum(data, axis=0).persist(scheduler=client)

        # Call the progress monitor
        wait_and_display_progress(sm_fut, TerminalProgressBar("Monitoring progress: "))

        sm = sm_fut.compute(scheduler=client)
        client.close()
    """

    # If there is no progress bar, then just return without waiting for the future
    if progress_bar is None:
        return

    if hasattr(progress_bar, "start"):
        progress_bar.start()

    progress_bar(1.0)
    while True:
        done, not_done = wait(fut, return_when='FIRST_COMPLETED')
        n_completed, n_pending = len(done), len(not_done)
        n_total = n_completed + n_pending
        percent_completed = n_completed / n_total * 100.0 if n_total > 0 else 100.0
        # It is guaranteed that 'progress_bar' is called for 100% completion
        progress_bar(percent_completed)
        if not n_pending:
            break
        ttime.sleep(0.5)

    if hasattr(progress_bar, "finish"):
        progress_bar.finish()