Python dask.distributed Examples

The following are 19 code examples of the dask.distributed module, extracted from open source projects. The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the module dask.
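
Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them build on: creating a dask.distributed.Client, either backed by an in-process LocalCluster or connected to an existing scheduler address. The worker count and the address below are illustrative placeholders, not values taken from any of the examples.

from dask.distributed import Client, LocalCluster

# Start a local cluster and attach a client to it.
cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

# Alternatively, connect to an already running scheduler by address
# (placeholder address).
# client = Client("tcp://127.0.0.1:8786")

print(client)   # prints a summary of workers, threads and memory

client.close()
cluster.close()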
Example #1
Source File: FEMSolver.py    From florence with MIT License 9 votes
def LaunchDaskDistributedClient(self, scheduler_ip=None, scheduler_port=None):

        if self.parallel and self.parallel_model == "dask" and self.is_dask_scheduler_initialised is False:

            from multiprocessing.pool import ThreadPool
            try:
                import dask
                from dask.distributed import Client, LocalCluster
            except ImportError:
                raise ImportError("dask is not installed. Install it using 'pip install dask[complete]'")

            dask.config.set(pool=ThreadPool(self.no_of_cpu_cores))
            # INITIALISE CLUSTER
            if scheduler_ip is None:
                cluster = LocalCluster(n_workers=self.no_of_cpu_cores, processes=False, threads_per_worker=None)
                client = Client(cluster)
            else:
                client = Client(scheduler_ip)

            self.dask_client = client

            self.is_dask_scheduler_initialised = True 
Example #2
Source File: utils.py    From verde with BSD 3-Clause "New" or "Revised" License 7 votes
def dispatch(function, delayed=False, client=None):
    """
    Decide how to wrap a function for Dask depending on the options given.

    Parameters
    ----------
    function : callable
        The function that will be called.
    delayed : bool
        If True, will wrap the function in :func:`dask.delayed`.
    client : None or dask.distributed Client
        If *delayed* is False and *client* is not None, will return a partial
        application of ``client.submit`` with the function as the first argument.

    Returns
    -------
    function : callable
        The function wrapped in Dask.

    """
    if delayed:
        return dask.delayed(function)
    if client is not None:
        return functools.partial(client.submit, function)
    return function 
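
A short usage sketch of dispatch, under the assumption that the surrounding module imports dask and functools (the excerpt above does not show its imports); the toy add function and the in-process client are illustrative.

import dask
from dask.distributed import Client

def add(a, b):
    return a + b

# No options: the function is returned unchanged.
assert dispatch(add)(1, 2) == 3

# delayed=True: the call builds a lazy dask.delayed object.
lazy = dispatch(add, delayed=True)(1, 2)
assert lazy.compute() == 3

# client: the call is submitted and returns a future.
client = Client(processes=False)
future = dispatch(add, client=client)(1, 2)
assert future.result() == 3
client.close()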
Example #3
Source File: map_processing.py    From PyXRF with BSD 3-Clause "New" or "Revised" License 6 votes
def dask_client_create(**kwargs):
    """
    Create a Dask client object. The function is trivial and is introduced so that
    the Dask client is created in a uniform way throughout the program.

    Parameters
    ----------
    kwargs: dict, optional
        kwargs will be passed to the Dask client constructor

    Returns
    -------
    client: dask.distributed.Client
        Dask client object
    """
    _kwargs = {"processes": True, "silence_logs": logging.ERROR}
    _kwargs.update(kwargs)
    client = Client(**_kwargs)
    dask.config.set(shuffle="disk")
    path_dask_data = os.path.expanduser("~/.dask")
    dask.config.set({"temporary_directory": path_dask_data})
    return client 
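
A hedged usage sketch of dask_client_create, assuming the module-level imports from the PyXRF source (Client from dask.distributed, dask, logging and os) are in place; the overriding keyword arguments are illustrative.

# Create a client with the helper's defaults: process-based workers,
# reduced log verbosity, disk-based shuffle and temporary data under ~/.dask.
client = dask_client_create()
print(client)
client.close()

# Any keyword argument is forwarded to dask.distributed.Client,
# so the defaults can be overridden per call.
client = dask_client_create(n_workers=2, threads_per_worker=1)
client.close()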
Example #4
Source File: automate.py    From aospy with Apache License 2.0 5 votes
def _exec_calcs(calcs, parallelize=False, client=None, **compute_kwargs):
    """Execute the given calculations.

    Parameters
    ----------
    calcs : Sequence of ``aospy.Calc`` objects
    parallelize : bool, default False
        Whether to submit the calculations in parallel or not
    client : distributed.Client or None
        The distributed Client used if parallelize is set to True; if None
        a distributed LocalCluster is used.
    compute_kwargs : dict of keyword arguments passed to ``Calc.compute``

    Returns
    -------
    A list of the values returned by each Calc object that was executed.
    """
    if parallelize:
        def func(calc):
            """Wrap _compute_or_skip_on_error to require only the calc
            argument"""
            if 'write_to_tar' in compute_kwargs:
                compute_kwargs['write_to_tar'] = False
            return _compute_or_skip_on_error(calc, compute_kwargs)

        if client is None:
            n_workers = _n_workers_for_local_cluster(calcs)
            with distributed.LocalCluster(n_workers=n_workers) as cluster:
                with distributed.Client(cluster) as client:
                    result = _submit_calcs_on_client(calcs, client, func)
        else:
            result = _submit_calcs_on_client(calcs, client, func)
        if compute_kwargs['write_to_tar']:
            _serial_write_to_tar(calcs)
        return result
    else:
        return [_compute_or_skip_on_error(calc, compute_kwargs)
                for calc in calcs] 
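
The fallback branch above uses a common pattern worth isolating: a temporary LocalCluster and Client opened as context managers so that both are shut down once the computation finishes. A self-contained sketch of that pattern follows; the worker count and the toy work function are illustrative.

import distributed

def process(x):
    return x * x

items = list(range(10))

with distributed.LocalCluster(n_workers=2) as cluster:
    with distributed.Client(cluster) as client:
        futures = client.map(process, items)
        results = client.gather(futures)

print(results)   # [0, 1, 4, ..., 81]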
Example #5
Source File: test_dask_env.py    From pangeo-stacks with BSD 3-Clause "New" or "Revised" License 5 votes
def client():
    from dask.distributed import Client
    with Client(n_workers=4) as dask_client:
        yield dask_client 
Example #6
Source File: test_dask_env.py    From pangeo-stacks with BSD 3-Clause "New" or "Revised" License 5 votes
def client():
    from dask.distributed import Client
    with Client(n_workers=4) as dask_client:
        yield dask_client 
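
For context, a hedged sketch of a test that could consume such a fixture (in the original source the function is presumably decorated with @pytest.fixture, which the excerpt does not show); the dask.array computation is a toy example.

import dask.array as da

def test_simple_sum(client):
    # 'client' is injected by the fixture; because the Client is active,
    # .compute() runs on the distributed scheduler by default.
    x = da.ones((1000, 1000), chunks=(100, 100))
    assert x.sum().compute() == 1000 * 1000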
Example #7
Source File: test_async.py    From dask-kubernetes with BSD 3-Clause "New" or "Revised" License 5 votes
async def test_diagnostics_link_env_variable(pod_spec, ns):
    pytest.importorskip("bokeh")
    with dask.config.set({"distributed.dashboard.link": "foo-{USER}-{port}"}):
        async with KubeCluster(pod_spec, namespace=ns, asynchronous=True) as cluster:
            port = cluster.scheduler_info["services"]["dashboard"]

            assert (
                "foo-" + getpass.getuser() + "-" + str(port) in cluster.dashboard_link
            ) 
Example #8
Source File: test_async.py    From dask-kubernetes with BSD 3-Clause "New" or "Revised" License 5 votes
async def test_logs(remote_cluster):
    cluster = remote_cluster
    cluster.scale(2)
    await cluster

    start = time()
    while len(cluster.scheduler_info["workers"]) < 2:
        await asyncio.sleep(0.1)
        assert time() < start + 20

    logs = await cluster.logs()
    assert len(logs) == 3
    for _, log in logs.items():
        assert "distributed.scheduler" in log or "distributed.worker" in log 
Example #9
Source File: core.py    From dask-kubernetes with BSD 3-Clause "New" or "Revised" License 5 votes
def scale(self, n):
        # A shim to maintain backward compatibility
        # https://github.com/dask/distributed/issues/3054
        maximum = dask.config.get("kubernetes.count.max")
        if maximum is not None and maximum < n:
            logger.info(
                "Tried to scale beyond maximum number of workers %d > %d", n, maximum
            )
            n = maximum
        return super().scale(n) 
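
A small self-contained sketch of the capping logic above, supplying the kubernetes.count.max value explicitly through dask.config.set (dask-kubernetes normally ships a default for this key); the numbers are illustrative.

import dask

with dask.config.set({"kubernetes.count.max": 10}):
    maximum = dask.config.get("kubernetes.count.max")
    requested = 25
    n = min(requested, maximum) if maximum is not None else requested
    print(n)   # 10: the request is clipped to the configured maximum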
Example #10
Source File: executor.py    From dagster with Apache License 2.0 5 votes
def build_dict(self, pipeline_name):
        '''Returns a dict we can use for kwargs passed to dask client instantiation.

        Intended to be used like:

        with dask.distributed.Client(**cfg.build_dict()) as client:
            << use client here >>

        '''
        if self.cluster_type in ['yarn', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube']:
            dask_cfg = {'name': pipeline_name}
        else:
            dask_cfg = {}

        if self.cluster_configuration:
            for k, v in self.cluster_configuration.items():
                dask_cfg[k] = v

        # if address is set, don't add LocalCluster args
        # context: https://github.com/dask/distributed/issues/3313
        if (self.cluster_type == 'local') and ('address' not in dask_cfg):
            # We set threads_per_worker because Dagster is not thread-safe. Even though
            # environments=True by default, there is a clever piece of machinery
            # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution
            # multithreaded by default when the number of available cores is greater than 4.
            # See: https://github.com/dagster-io/dagster/issues/2181
            # We may want to try to figure out a way to enforce this on remote Dask clusters against
            # which users run Dagster workloads.
            dask_cfg['threads_per_worker'] = 1

        return dask_cfg 
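
For the 'local' cluster type with no explicit address, the dict built above boils down to local-cluster kwargs with one thread per worker. A hedged standalone sketch of the resulting call (the extra n_workers value is illustrative and not something build_dict produces):

import dask.distributed

dask_cfg = {'threads_per_worker': 1, 'n_workers': 2}

with dask.distributed.Client(**dask_cfg) as client:
    print(len(client.scheduler_info()['workers']))   # 2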
Example #11
Source File: test_dask.py    From filesystem_spec with BSD 3-Clause "New" or "Revised" License 5 votes
def cli(tmpdir):
    import dask.distributed

    client = dask.distributed.Client(n_workers=1)

    def setup():
        m = fsspec.filesystem("memory")
        with m.open("afile", "wb") as f:
            f.write(b"data")

    client.run(setup)
    try:
        yield client
    finally:
        client.close() 
Example #12
Source File: automate.py    From aospy with Apache License 2.0 5 votes
def _submit_calcs_on_client(calcs, client, func):
    """Submit calculations via dask.bag and a distributed client"""
    logging.info('Connected to client: {}'.format(client))
    if LooseVersion(dask.__version__) < '0.18':
        dask_option_setter = dask.set_options
    else:
        dask_option_setter = dask.config.set
    with dask_option_setter(get=client.get):
        return db.from_sequence(calcs).map(func).compute() 
Example #13
Source File: executor.py    From dagster with Apache License 2.0 4 votes
def dask_executor(init_context):
    '''Dask-based executor.

    If the Dask executor is used without providing executor-specific config, a local Dask cluster
    will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`
    without specifying the scheduler address).

    The Dask executor optionally takes the following config:

    .. code-block:: none

        cluster:
            {
                local?:  # The cluster type, one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').
                    {
                        address?: '127.0.0.1:8786',  # The address of a Dask scheduler
                        timeout?: 5,  # Timeout duration for initial connection to the scheduler
                        scheduler_file?: '/path/to/file'  # Path to a file with scheduler information
                        # Whether to connect directly to the workers, or ask the scheduler to serve as
                        # intermediary
                        direct_to_workers?: False,
                        heartbeat_interval?: 1000,  # Time in milliseconds between heartbeats to scheduler
                    }
            }

    If you'd like to configure a dask executor in addition to the
    :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a
    :py:class:`~dagster.ModeDefinition` as follows:

    .. code-block:: python

        from dagster import ModeDefinition, default_executors, pipeline
        from dagster_dask import dask_executor

        @pipeline(mode_defs=[ModeDefinition(executor_defs=default_executors + [dask_executor])])
        def dask_enabled_pipeline():
            pass

    '''
    check_cross_process_constraints(init_context)
    ((cluster_type, cluster_configuration),) = init_context.executor_config['cluster'].items()
    return DaskExecutor(cluster_type, cluster_configuration) 
Example #14
Source File: run_rj_neq.py    From perses with MIT License 4 votes
def createSystemFromIUPAC(iupac_name):
    """
    Create an OpenMM system from an IUPAC name

    Parameters
    ----------
    iupac_name : str
        IUPAC name

    Returns
    -------
    molecule : openeye.OEMol
        OEMol molecule
    system : openmm.System object
        OpenMM system
    positions : [n,3] np.array of floats
        Positions
    topology : openmm.app.Topology object
        Topology
    """
    from perses.utils.data import get_data_filename
    from perses.utils.openeye import extractPositionsFromOEMol
    # Create OEMol
    molecule = iupac_to_oemol(iupac_name)

    # Generate a topology.
    from openmoltools.forcefield_generators import generateTopologyFromOEMol
    topology = generateTopologyFromOEMol(molecule)

    # Initialize a forcefield with GAFF.
    # TODO: Fix path for `gaff.xml` since it is not yet distributed with OpenMM
    from simtk.openmm.app import ForceField
    gaff_xml_filename = get_data_filename('data/gaff.xml')
    forcefield = ForceField(gaff_xml_filename)

    # Generate template and parameters.
    from openmoltools.forcefield_generators import generateResidueTemplate
    [template, ffxml] = generateResidueTemplate(molecule)

    # Register the template.
    forcefield.registerResidueTemplate(template)

    # Add the parameters.
    forcefield.loadFile(StringIO(ffxml))

    # Create the system.
    system = forcefield.createSystem(topology, removeCMMotion=False)

    # Extract positions
    positions = extractPositionsFromOEMol(molecule)

    return (molecule, system, positions, topology) 
Example #15
Source File: utils.py    From HistomicsTK with Apache License 2.0 4 votes
def create_dask_client(args):
    """Create and install a Dask distributed client using args from a
    Namespace, supporting the following attributes:

    - .scheduler: Address of the distributed scheduler, or the
      empty string to start one locally

    """
    import dask
    scheduler = getattr(args, 'scheduler', None)
    num_workers = getattr(args, 'num_workers', 0)
    num_threads_per_worker = getattr(args, 'num_threads_per_worker', 0)

    if scheduler == 'multithreading':
        import dask.threaded
        from multiprocessing.pool import ThreadPool

        if num_threads_per_worker <= 0:
            num_workers = max(1, psutil.cpu_count(logical=False) + num_threads_per_worker)
        else:
            num_workers = num_threads_per_worker
        print('Starting dask thread pool with %d thread(s)' % num_workers)
        dask.config.set(pool=ThreadPool(num_workers))
        dask.config.set(scheduler='threads')
        return

    if scheduler == 'multiprocessing':
        import dask.multiprocessing
        import multiprocessing

        dask.config.set(scheduler='processes')
        if num_workers <= 0:
            num_workers = max(1, psutil.cpu_count(logical=False) + num_workers)

        print('Starting dask multiprocessing pool with %d worker(s)' % num_workers)
        dask.config.set(pool=multiprocessing.Pool(
            num_workers, initializer=dask.multiprocessing.initialize_worker_process))
        return

    import dask.distributed
    if not scheduler:

        if num_workers <= 0:
            num_workers = max(1, psutil.cpu_count(logical=False) + num_workers)
        num_threads_per_worker = (num_threads_per_worker if num_threads_per_worker >= 1 else None)

        print('Creating dask LocalCluster with %d worker(s), %r thread(s) per '
              'worker' % (num_workers, num_threads_per_worker))
        scheduler = dask.distributed.LocalCluster(
            ip='0.0.0.0',  # Allow reaching the diagnostics port externally
            scheduler_port=0,  # Don't expose the scheduler port
            n_workers=num_workers,
            memory_limit=0,
            threads_per_worker=num_threads_per_worker,
            silence_logs=False
        )

    return dask.distributed.Client(scheduler) 
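
A usage sketch for create_dask_client, assuming psutil and the rest of the module-level imports from the HistomicsTK source are available; the Namespace fields mirror the attributes the helper reads, and the values are illustrative.

import argparse

# An empty scheduler string makes the helper start a LocalCluster.
args = argparse.Namespace(scheduler='', num_workers=2, num_threads_per_worker=1)
client = create_dask_client(args)
print(client)
client.close()

# A scheduler address connects to an existing cluster (placeholder address).
# args = argparse.Namespace(scheduler='tcp://10.0.0.5:8786',
#                           num_workers=0, num_threads_per_worker=0)
# client = create_dask_client(args)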
Example #16
Source File: core.py    From dask-kubernetes with BSD 3-Clause "New" or "Revised" License 4 votes
def __init__(
        self,
        pod_template=None,
        name=None,
        namespace=None,
        n_workers=None,
        host=None,
        port=None,
        env=None,
        auth=ClusterAuth.DEFAULT,
        idle_timeout=None,
        deploy_mode=None,
        interface=None,
        protocol=None,
        dashboard_address=None,
        security=None,
        scheduler_service_wait_timeout=None,
        scheduler_pod_template=None,
        **kwargs
    ):
        self.pod_template = pod_template
        self.scheduler_pod_template = scheduler_pod_template
        self._generate_name = name
        self._namespace = namespace
        self._n_workers = n_workers
        self._idle_timeout = idle_timeout
        self._deploy_mode = deploy_mode
        self._protocol = protocol
        self._interface = interface
        self._dashboard_address = dashboard_address
        self._scheduler_service_wait_timeout = scheduler_service_wait_timeout
        self.security = security
        if self.security and not isinstance(
            self.security, distributed.security.Security
        ):
            raise RuntimeError(
                "Security object is not a valid distributed.security.Security object"
            )
        self.host = host
        self.port = port
        self.env = env
        self.auth = auth
        self.kwargs = kwargs
        super().__init__(**self.kwargs) 
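
A hedged usage sketch for the classic KubeCluster API shown above, assuming a Kubernetes cluster is reachable from the environment and that make_pod_spec is available in the installed dask_kubernetes version; the image name, resource limits and worker count are placeholders.

from dask.distributed import Client
from dask_kubernetes import KubeCluster, make_pod_spec

# Build a worker pod spec (image and resource limits are placeholders).
pod_spec = make_pod_spec(image='daskdev/dask:latest',
                         memory_limit='2G', memory_request='2G',
                         cpu_limit=1, cpu_request=1)

cluster = KubeCluster(pod_spec, n_workers=2)
client = Client(cluster)
# ... submit work through the client ...
client.close()
cluster.close()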
Example #17
Source File: map_processing.py    From PyXRF with BSD 3-Clause "New" or "Revised" License 4 votes
def _fit_xrf_block(data, data_sel_indices,
                   matv, snip_param, use_snip):
    """
    Spectrum fitting for a block of XRF dataset. The function is intended to be
    called using `map_blocks` function for parallel processing using Dask distributed
    package.

    Parameters
    ----------
    data : ndarray
        block of an XRF dataset. Shape=(ny, nx, ne).
    data_sel_indices: tuple
        tuple `(n_start, n_end)` which defines the indices along axis 2 of `data` array
        that are used for fitting. Note that `ne` (in `data`) and `ne_model` (in `matv`)
        are not equal. But `n_end - n_start` MUST be equal to `ne_model`! Indexes
        `n_start .. n_end - 1` will be selected from each pixel.
    matv: ndarray
        Matrix of spectra of the selected elements (emission lines). Shape=(ne_model, n_lines)
    snip_param: dict
        Dictionary of parameters forwarded to 'snip' method for background removal.
        Keys: `e_offset`, `e_linear`, `e_quadratic` (parameters of the energy axis approximation),
        `b_width` (width of the window that defines resolution of the snip algorithm).
    use_snip: bool, optional
        enable/disable background removal using snip algorithm

    Returns
    -------
    data_out: ndarray
        array with fitting results. Shape: `(ny, nx, ne_model + 4)`. For each pixel
        the output data contains: `ne_model` values that represent area under the emission
        line spectra; background area (only in the selected energy range), error (R-factor),
        total count in the selected energy range, total count of the full experimental spectrum.
    """
    spec = data
    spec_sel = spec[:, :, data_sel_indices[0]: data_sel_indices[1]]

    if use_snip:
        bg_sel = np.apply_along_axis(snip_method_numba, 2, spec_sel,
                                     snip_param['e_offset'],
                                     snip_param['e_linear'],
                                     snip_param['e_quadratic'],
                                     width=snip_param['b_width'])

        y = spec_sel - bg_sel
        bg_sum = np.sum(bg_sel, axis=2)

    else:
        y = spec_sel
        bg_sum = np.zeros(shape=data.shape[0:2])

    weights, rfactor, _ = fit_spectrum(y, matv, axis=2, method="nnls")

    total_cnt = np.sum(spec, axis=2)
    sel_cnt = np.sum(spec_sel, axis=2)

    # Stack depth-wise (along axis 2)
    data_out = np.dstack((weights, bg_sum, rfactor, sel_cnt, total_cnt))

    return data_out 
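
A hedged sketch of how a per-block function like this is typically driven through dask.array.map_blocks on a distributed client; the stand-in block function, array shape and chunking are illustrative and do not reproduce the PyXRF fitting pipeline.

import dask.array as da
from dask.distributed import Client

def toy_block_fit(block):
    # Stand-in for a per-block fitting routine: collapse the spectral axis
    # so each (ny, nx, ne) block maps to a (ny, nx, 1) result.
    return block.sum(axis=2, keepdims=True)

client = Client(processes=False)

data = da.random.random((40, 40, 256), chunks=(10, 10, 256))
result = data.map_blocks(toy_block_fit, dtype=float, chunks=(10, 10, 1))
print(result.compute().shape)   # (40, 40, 1)

client.close()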
Example #18
Source File: map_processing.py    From PyXRF with BSD 3-Clause "New" or "Revised" License 4 votes
def _chunk_numpy_array(data, chunk_size):
    """
    Convert a numpy array into Dask array with chunks of given size. The function
    splits the array into chunks along axes 0 and 1. If the array has more than 2 dimensions,
    then the remaining dimensions are not chunked. Note that
    `dask_array = da.array(data, chunks=...)` sets the chunk size but does not split the
    data into chunks, so the array cannot be loaded block by block by workers
    controlled by a distributed scheduler.

    Parameters
    ----------
    data: ndarray(float), 2 or more dimensions
        XRF map of the shape `(ny, nx, ne)`, where `ny` and `nx` represent the image size
        and `ne` is the number of points in spectra
    chunk_size: tuple(int, int) or list(int, int)
         Chunk size for axes 0 and 1: `(chunk_y, chunk_x)`. The function will accept
         chunk size values that are larger than the respective `data` array dimensions.

    Returns
    -------
    data_dask: dask.array
        Dask array with the given chunk size
    """

    chunk_y, chunk_x = chunk_size
    ny, nx = data.shape[0:2]
    chunk_y, chunk_x = min(chunk_y, ny), min(chunk_x, nx)

    def _get_slice(n1, n2):
        data_slice = data[slice(n1 * chunk_y, min(n1 * chunk_y + chunk_y, ny)),
                          slice(n2 * chunk_x, min(n2 * chunk_x + chunk_x, nx))]
        # Wrap the slice into a list with appropriate dimensions
        for _ in range(2, data.ndim):
            data_slice = [data_slice]
        return data_slice

    # Chunk the numpy array and assemble it as a dask array
    data_dask = da.block([
        [
            _get_slice(_1, _2)
            for _2 in range(int(math.ceil(nx / chunk_x)))
        ]
        for _1 in range(int(math.ceil(ny / chunk_y)))
    ])

    return data_dask 
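
A short usage sketch of _chunk_numpy_array, assuming numpy as np, dask.array as da and math are imported as in the PyXRF module; the array size and chunk size are illustrative.

import numpy as np

data = np.random.rand(9, 11, 50)              # (ny, nx, ne)
data_dask = _chunk_numpy_array(data, (4, 4))

print(data_dask.shape)        # (9, 11, 50)
print(data_dask.chunksize)    # (4, 4, 50) -- largest chunk along each axis
print(np.allclose(data_dask.compute(), data))   # True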
Example #19
Source File: map_processing.py    From PyXRF with BSD 3-Clause "New" or "Revised" License 4 votes
def wait_and_display_progress(fut, progress_bar=None):
    """
    Wait for the future to complete and display the progress bar.
    This method may be used to drive any custom progress bar, which
    displays progress in percent from 0 to 100.

    Parameters
    ----------
    fut: dask future
        future object for the batch of tasks submitted to the distributed
        client.
    progress_bar: callable or None
        callable function or callable object with methods `start()`,
        `__call__(float)` and `finish()`. The methods `start()` and
        `finish()` are optional. For example, this could be a reference
        to an instance of the object `TerminalProgressBar`

    Examples
    --------

    .. code-block::

        client = Client()
        data = da.random.random(size=(100, 100), chunks=(10, 10))
        sm_fut = da.sum(data, axis=0).persist(scheduler=client)

        # Call the progress monitor
        wait_and_display_progress(sm_fut, TerminalProgressBar("Monitoring progress: "))

        sm = sm_fut.compute(scheduler=client)
        client.close()
    """

    # If there is no progress bar, then just return without waiting for the future
    if progress_bar is None:
        return

    if hasattr(progress_bar, "start"):
        progress_bar.start()

    progress_bar(1.0)
    while True:
        done, not_done = wait(fut, return_when='FIRST_COMPLETED')
        n_completed, n_pending = len(done), len(not_done)
        n_total = n_completed + n_pending
        percent_completed = n_completed / n_total * 100.0 if n_total > 0 else 100.0

        # It is guaranteed that 'progress_bar' is called for 100% completion
        progress_bar(percent_completed)

        if not n_pending:
            break
        ttime.sleep(0.5)

    if hasattr(progress_bar, "finish"):
        progress_bar.finish()