Python dask.distributed Examples

The following are 19 code examples of the dask.distributed module, extracted from open source projects. The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the module dask.
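
Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them build on: creating a dask.distributed.Client, either backed by an in-process LocalCluster or connected to an existing scheduler address. The worker count and the address below are illustrative placeholders, not values taken from any of the examples.

from dask.distributed import Client, LocalCluster

# Start a local cluster and attach a client to it.
cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)

# Alternatively, connect to an already running scheduler by address
# (placeholder address).
# client = Client("tcp://127.0.0.1:8786")

print(client)   # prints a summary of workers, threads and memory

client.close()
cluster.close()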
Example #1
Source File: FEMSolver.py    From florence with MIT License 9 votes
def LaunchDaskDistributedClient(self, scheduler_ip=None, scheduler_port=None):

        if self.parallel and self.parallel_model == "dask" and self.is_dask_scheduler_initialised is False:

            from multiprocessing.pool import ThreadPool
            try:
                import dask
                from dask.distributed import Client, LocalCluster
            except ImportError:
                raise ImportError("dask is not installed. Install it using 'pip install dask[complete]'")

            dask.config.set(pool=ThreadPool(self.no_of_cpu_cores))
            # INITIALISE CLUSTER
            if scheduler_ip is None:
                cluster = LocalCluster(n_workers=self.no_of_cpu_cores, processes=False, threads_per_worker=None)
                client = Client(cluster)
            else:
                client = Client(scheduler_ip)

            self.dask_client = client

            self.is_dask_scheduler_initialised = True 
Example #2
Source File: utils.py    From verde with BSD 3-Clause "New" or "Revised" License 7 votes
def dispatch(function, delayed=False, client=None):
    """
    Decide how to wrap a function for Dask depending on the options given.

    Parameters
    ----------
    function : callable
        The function that will be called.
    delayed : bool
        If True, will wrap the function in :func:`dask.delayed`.
    client : None or dask.distributed Client
        If *delayed* is False and *client* is not None, will return a partial
        application of ``client.submit`` with the function as the first argument.

    Returns
    -------
    function : callable
        The function wrapped in Dask.

    """
    if delayed:
        return dask.delayed(function)
    if client is not None:
        return functools.partial(client.submit, function)
    return function 
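
A short usage sketch of dispatch, under the assumption that the surrounding module imports dask and functools (the excerpt above does not show its imports); the toy add function and the in-process client are illustrative.

import dask
from dask.distributed import Client

def add(a, b):
    return a + b

# No options: the function is returned unchanged.
assert dispatch(add)(1, 2) == 3

# delayed=True: the call builds a lazy dask.delayed object.
lazy = dispatch(add, delayed=True)(1, 2)
assert lazy.compute() == 3

# client: the call is submitted and returns a future.
client = Client(processes=False)
future = dispatch(add, client=client)(1, 2)
assert future.result() == 3
client.close()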
Example #3
Source File: map_processing.py    From PyXRF with BSD 3-Clause "New" or "Revised" License 6 votes
def dask_client_create(**kwargs):
    """
    Create a Dask client object. The function is trivial and is introduced so that
    the Dask client is created in a uniform way throughout the program.

    Parameters
    ----------
    kwargs: dict, optional
        kwargs will be passed to the Dask client constructor

    Returns
    -------
    client: dask.distributed.Client
        Dask client object
    """
    _kwargs = {"processes": True, "silence_logs": logging.ERROR}
    _kwargs.update(kwargs)
    client = Client(**_kwargs)
    dask.config.set(shuffle="disk")
    path_dask_data = os.path.expanduser("~/.dask")
    dask.config.set({"temporary_directory": path_dask_data})
    return client 
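
A hedged usage sketch of dask_client_create, assuming the module-level imports from the PyXRF source (Client from dask.distributed, dask, logging and os) are in place; the overriding keyword arguments are illustrative.

# Create a client with the helper's defaults: process-based workers,
# reduced log verbosity, disk-based shuffle and temporary data under ~/.dask.
client = dask_client_create()
print(client)
client.close()

# Any keyword argument is forwarded to dask.distributed.Client,
# so the defaults can be overridden per call.
client = dask_client_create(n_workers=2, threads_per_worker=1)
client.close()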
Example #4
Source File: automate.py    From aospy with Apache License 2.0 5 votes
def _exec_calcs(calcs, parallelize=False, client=None, **compute_kwargs):
    """Execute the given calculations.

    Parameters
    ----------
    calcs : Sequence of ``aospy.Calc`` objects
    parallelize : bool, default False
        Whether to submit the calculations in parallel or not
    client : distributed.Client or None
        The distributed Client used if parallelize is set to True; if None
        a distributed LocalCluster is used.
    compute_kwargs : dict of keyword arguments passed to ``Calc.compute``

    Returns
    -------
    A list of the values returned by each Calc object that was executed.
    """
    if parallelize:
        def func(calc):
            """Wrap _compute_or_skip_on_error to require only the calc
            argument"""
            if 'write_to_tar' in compute_kwargs:
                compute_kwargs['write_to_tar'] = False
            return _compute_or_skip_on_error(calc, compute_kwargs)

        if client is None:
            n_workers = _n_workers_for_local_cluster(calcs)
            with distributed.LocalCluster(n_workers=n_workers) as cluster:
                with distributed.Client(cluster) as client:
                    result = _submit_calcs_on_client(calcs, client, func)
        else:
            result = _submit_calcs_on_client(calcs, client, func)
        if compute_kwargs['write_to_tar']:
            _serial_write_to_tar(calcs)
        return result
    else:
        return [_compute_or_skip_on_error(calc, compute_kwargs)
                for calc in calcs] 
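
The fallback branch above uses a common pattern worth isolating: a temporary LocalCluster and Client opened as context managers so that both are shut down once the computation finishes. A self-contained sketch of that pattern follows; the worker count and the toy work function are illustrative.

import distributed

def process(x):
    return x * x

items = list(range(10))

with distributed.LocalCluster(n_workers=2) as cluster:
    with distributed.Client(cluster) as client:
        futures = client.map(process, items)
        results = client.gather(futures)

print(results)   # [0, 1, 4, ..., 81]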
Example #5
Source File: test_dask_env.py    From pangeo-stacks with BSD 3-Clause "New" or "Revised" License 5 votes
def client():
    from dask.distributed import Client
    with Client(n_workers=4) as dask_client:
        yield dask_client 
Example #6
Source File: test_dask_env.py    From pangeo-stacks with BSD 3-Clause "New" or "Revised" License 5 votes
def client():
    from dask.distributed import Client
    with Client(n_workers=4) as dask_client:
        yield dask_client 
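
For context, a hedged sketch of a test that could consume such a fixture (in the original source the function is presumably decorated with @pytest.fixture, which the excerpt does not show); the dask.array computation is a toy example.

import dask.array as da

def test_simple_sum(client):
    # 'client' is injected by the fixture; because the Client is active,
    # .compute() runs on the distributed scheduler by default.
    x = da.ones((1000, 1000), chunks=(100, 100))
    assert x.sum().compute() == 1000 * 1000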
Example #7
Source File: test_async.py    From dask-kubernetes with BSD 3-Clause "New" or "Revised" License 5 votes
async def test_diagnostics_link_env_variable(pod_spec, ns):
    pytest.importorskip("bokeh")
    with dask.config.set({"distributed.dashboard.link": "foo-{USER}-{port}"}):
        async with KubeCluster(pod_spec, namespace=ns, asynchronous=True) as cluster:
            port = cluster.scheduler_info["services"]["dashboard"]

            assert (
                "foo-" + getpass.getuser() + "-" + str(port) in cluster.dashboard_link
            ) 
Example #8
Source File: test_async.py    From dask-kubernetes with BSD 3-Clause "New" or "Revised" License 5 votes
async def test_logs(remote_cluster):
    cluster = remote_cluster
    cluster.scale(2)
    await cluster

    start = time()
    while len(cluster.scheduler_info["workers"]) < 2:
        await asyncio.sleep(0.1)
        assert time() < start + 20

    logs = await cluster.logs()
    assert len(logs) == 3
    for _, log in logs.items():
        assert "distributed.scheduler" in log or "distributed.worker" in log 
Example #9
Source File: core.py    From dask-kubernetes with BSD 3-Clause "New" or "Revised" License 5 votes
def scale(self, n):
        # A shim to maintain backward compatibility
        # https://github.com/dask/distributed/issues/3054
        maximum = dask.config.get("kubernetes.count.max")
        if maximum is not None and maximum < n:
            logger.info(
                "Tried to scale beyond maximum number of workers %d > %d", n, maximum
            )
            n = maximum
        return super().scale(n) 
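
A small self-contained sketch of the capping logic above, supplying the kubernetes.count.max value explicitly through dask.config.set (dask-kubernetes normally ships a default for this key); the numbers are illustrative.

import dask

with dask.config.set({"kubernetes.count.max": 10}):
    maximum = dask.config.get("kubernetes.count.max")
    requested = 25
    n = min(requested, maximum) if maximum is not None else requested
    print(n)   # 10: the request is clipped to the configured maximum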
Example #10
Source File: executor.py    From dagster with Apache License 2.0 5 votes
def build_dict(self, pipeline_name):
        '''Returns a dict we can use for kwargs passed to dask client instantiation.

        Intended to be used like:

        with dask.distributed.Client(**cfg.build_dict()) as client:
            << use client here >>

        '''
        if self.cluster_type in ['yarn', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube']:
            dask_cfg = {'name': pipeline_name}
        else:
            dask_cfg = {}

        if self.cluster_configuration:
            for k, v in self.cluster_configuration.items():
                dask_cfg[k] = v

        # if address is set, don't add LocalCluster args
        # context: https://github.com/dask/distributed/issues/3313
        if (self.cluster_type == 'local') and ('address' not in dask_cfg):
            # We set threads_per_worker because Dagster is not thread-safe. Even though
            # environments=True by default, there is a clever piece of machinery
            # (dask.distributed.deploy.local.nprocesses_nthreads) that automagically makes execution
            # multithreaded by default when the number of available cores is greater than 4.
            # See: https://github.com/dagster-io/dagster/issues/2181
            # We may want to try to figure out a way to enforce this on remote Dask clusters against
            # which users run Dagster workloads.
            dask_cfg['threads_per_worker'] = 1

        return dask_cfg 
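
For the 'local' cluster type with no explicit address, the dict built above boils down to local-cluster kwargs with one thread per worker. A hedged standalone sketch of the resulting call (the extra n_workers value is illustrative and not something build_dict produces):

import dask.distributed

dask_cfg = {'threads_per_worker': 1, 'n_workers': 2}

with dask.distributed.Client(**dask_cfg) as client:
    print(len(client.scheduler_info()['workers']))   # 2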
Example #11
Source File: test_dask.py    From filesystem_spec with BSD 3-Clause "New" or "Revised" License 5 votes
def cli(tmpdir):
    import dask.distributed

    client = dask.distributed.Client(n_workers=1)

    def setup():
        m = fsspec.filesystem("memory")
        with m.open("afile", "wb") as f:
            f.write(b"data")

    client.run(setup)
    try:
        yield client
    finally:
        client.close() 
Example #12
Source File: automate.py    From aospy with Apache License 2.0 5 votes
def _submit_calcs_on_client(calcs, client, func):
    """Submit calculations via dask.bag and a distributed client"""
    logging.info('Connected to client: {}'.format(client))
    if LooseVersion(dask.__version__) < '0.18':
        dask_option_setter = dask.set_options
    else:
        dask_option_setter = dask.config.set
    with dask_option_setter(get=client.get):
        return db.from_sequence(calcs).map(func).compute() 
Example #13
Source File: executor.py    From dagster with Apache License 2.0 4 votes
def dask_executor(init_context):
    '''Dask-based executor.

    If the Dask executor is used without providing executor-specific config, a local Dask cluster
    will be created (as when calling :py:class:`dask.distributed.Client() <dask:distributed.Client>`
    without specifying the scheduler address).

    The Dask executor optionally takes the following config:

    .. code-block:: none

        cluster:
            {
                local?:  # The cluster type, one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube').
                    {
                        address?: '127.0.0.1:8786',  # The address of a Dask scheduler
                        timeout?: 5,  # Timeout duration for initial connection to the scheduler
                        scheduler_file?: '/path/to/file'  # Path to a file with scheduler information
                        # Whether to connect directly to the workers, or ask the scheduler to serve as
                        # intermediary
                        direct_to_workers?: False,
                        heartbeat_interval?: 1000,  # Time in milliseconds between heartbeats to scheduler
                    }
            }

    If you'd like to configure a dask executor in addition to the
    :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a
    :py:class:`~dagster.ModeDefinition` as follows:

    .. code-block:: python

        from dagster import ModeDefinition, default_executors, pipeline
        from dagster_dask import dask_executor

        @pipeline(mode_defs=[ModeDefinition(executor_defs=default_executors + [dask_executor])])
        def dask_enabled_pipeline():
            pass

    '''
    check_cross_process_constraints(init_context)
    ((cluster_type, cluster_configuration),) = init_context.executor_config['cluster'].items()
    return DaskExecutor(cluster_type, cluster_configuration) 
Example #14
Source File: run_rj_neq.py    From perses with MIT License 4 votes
def createSystemFromIUPAC(iupac_name):
    """
    Create an OpenMM system from an IUPAC name

    Parameters
    ----------
    iupac_name : str
        IUPAC name

    Returns
    -------
    molecule : openeye.OEMol
        OEMol molecule
    system : openmm.System object
        OpenMM system
    positions : [n,3] np.array of floats
        Positions
    topology : openmm.app.Topology object
        Topology
    """
    from perses.utils.data import get_data_filename
    from perses.utils.openeye import extractPositionsFromOEMol
    # Create OEMol
    molecule = iupac_to_oemol(iupac_name)

    # Generate a topology.
    from openmoltools.forcefield_generators import generateTopologyFromOEMol
    topology = generateTopologyFromOEMol(molecule)

    # Initialize a forcefield with GAFF.
    # TODO: Fix path for `gaff.xml` since it is not yet distributed with OpenMM
    from simtk.openmm.app import ForceField
    gaff_xml_filename = get_data_filename('data/gaff.xml')
    forcefield = ForceField(gaff_xml_filename)

    # Generate template and parameters.
    from openmoltools.forcefield_generators import generateResidueTemplate
    [template, ffxml] = generateResidueTemplate(molecule)

    # Register the template.
    forcefield.registerResidueTemplate(template)

    # Add the parameters.
    forcefield.loadFile(StringIO(ffxml))

    # Create the system.
    system = forcefield.createSystem(topology, removeCMMotion=False)

    # Extract positions
    positions = extractPositionsFromOEMol(molecule)

    return (molecule, system, positions, topology) 
Example #15
Source File: utils.py    From HistomicsTK with Apache License 2.0 4 votes
def create_dask_client(args):
    """Create and install a Dask distributed client using args from a
    Namespace, supporting the following attributes:

    - .scheduler: Address of the distributed scheduler, or the
      empty string to start one locally

    """
    import dask
    scheduler = getattr(args, 'scheduler', None)
    num_workers = getattr(args, 'num_workers', 0)
    num_threads_per_worker = getattr(args, 'num_threads_per_worker', 0)

    if scheduler == 'multithreading':
        import dask.threaded
        from multiprocessing.pool import ThreadPool

        if num_threads_per_worker <= 0:
            num_workers = max(1, psutil.cpu_count(logical=False) + num_threads_per_worker)
        else:
            num_workers = num_threads_per_worker
        print('Starting dask thread pool with %d thread(s)' % num_workers)
        dask.config.set(pool=ThreadPool(num_workers))
        dask.config.set(scheduler='threads')
        return

    if scheduler == 'multiprocessing':
        import dask.multiprocessing
        import multiprocessing

        dask.config.set(scheduler='processes')
        if num_workers <= 0:
            num_workers = max(1, psutil.cpu_count(logical=False) + num_workers)

        print('Starting dask multiprocessing pool with %d worker(s)' % num_workers)
        dask.config.set(pool=multiprocessing.Pool(
            num_workers, initializer=dask.multiprocessing.initialize_worker_process))
        return

    import dask.distributed
    if not scheduler:

        if num_workers <= 0:
            num_workers = max(1, psutil.cpu_count(logical=False) + num_workers)
        num_threads_per_worker = (num_threads_per_worker if num_threads_per_worker >= 1 else None)

        print('Creating dask LocalCluster with %d worker(s), %r thread(s) per '
              'worker' % (num_workers, num_threads_per_worker))
        scheduler = dask.distributed.LocalCluster(
            ip='0.0.0.0',  # Allow reaching the diagnostics port externally
            scheduler_port=0,  # Don't expose the scheduler port
            n_workers=num_workers,
            memory_limit=0,
            threads_per_worker=num_threads_per_worker,
            silence_logs=False
        )

    return dask.distributed.Client(scheduler) 
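
A usage sketch for create_dask_client, assuming psutil and the rest of the module-level imports from the HistomicsTK source are available; the Namespace fields mirror the attributes the helper reads, and the values are illustrative.

import argparse

# An empty scheduler string makes the helper start a LocalCluster.
args = argparse.Namespace(scheduler='', num_workers=2, num_threads_per_worker=1)
client = create_dask_client(args)
print(client)
client.close()

# A scheduler address connects to an existing cluster (placeholder address).
# args = argparse.Namespace(scheduler='tcp://10.0.0.5:8786',
#                           num_workers=0, num_threads_per_worker=0)
# client = create_dask_client(args)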
Example #16
Source File: core.py    From dask-kubernetes with BSD 3-Clause "New" or "Revised" License 4 votes
def __init__(
        self,
        pod_template=None,
        name=None,
        namespace=None,
        n_workers=None,
        host=None,
        port=None,
        env=None,
        auth=ClusterAuth.DEFAULT,
        idle_timeout=None,
        deploy_mode=None,
        interface=None,
        protocol=None,
        dashboard_address=None,
        security=None,
        scheduler_service_wait_timeout=None,
        scheduler_pod_template=None,
        **kwargs
    ):
        self.pod_template = pod_template
        self.scheduler_pod_template = scheduler_pod_template
        self._generate_name = name
        self._namespace = namespace
        self._n_workers = n_workers
        self._idle_timeout = idle_timeout
        self._deploy_mode = deploy_mode
        self._protocol = protocol
        self._interface = interface
        self._dashboard_address = dashboard_address
        self._scheduler_service_wait_timeout = scheduler_service_wait_timeout
        self.security = security
        if self.security and not isinstance(
            self.security, distributed.security.Security
        ):
            raise RuntimeError(
                "Security object is not a valid distributed.security.Security object"
            )
        self.host = host
        self.port = port
        self.env = env
        self.auth = auth
        self.kwargs = kwargs
        super().__init__(**self.kwargs) 
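
A hedged usage sketch for the classic KubeCluster API shown above, assuming a Kubernetes cluster is reachable from the environment and that make_pod_spec is available in the installed dask_kubernetes version; the image name, resource limits and worker count are placeholders.

from dask.distributed import Client
from dask_kubernetes import KubeCluster, make_pod_spec

# Build a worker pod spec (image and resource limits are placeholders).
pod_spec = make_pod_spec(image='daskdev/dask:latest',
                         memory_limit='2G', memory_request='2G',
                         cpu_limit=1, cpu_request=1)

cluster = KubeCluster(pod_spec, n_workers=2)
client = Client(cluster)
# ... submit work through the client ...
client.close()
cluster.close()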
Example #17
Source File: map_processing.py    From PyXRF with BSD 3-Clause "New" or "Revised" License 4 votes
def _fit_xrf_block(data, data_sel_indices,
                   matv, snip_param, use_snip):
    """
    Spectrum fitting for a block of XRF dataset. The function is intended to be
    called using `map_blocks` function for parallel processing using Dask distributed
    package.

    Parameters
    ----------
    data : ndarray
        block of an XRF dataset. Shape=(ny, nx, ne).
    data_sel_indices: tuple
        tuple `(n_start, n_end)` which defines the indices along axis 2 of `data` array
        that are used for fitting. Note that `ne` (in `data`) and `ne_model` (in `matv`)
        are not equal. But `n_end - n_start` MUST be equal to `ne_model`! Indexes
        `n_start .. n_end - 1` will be selected from each pixel.
    matv: ndarray
        Matrix of spectra of the selected elements (emission lines). Shape=(ne_model, n_lines)
    snip_param: dict
        Dictionary of parameters forwarded to 'snip' method for background removal.
        Keys: `e_offset`, `e_linear`, `e_quadratic` (parameters of the energy axis approximation),
        `b_width` (width of the window that defines resolution of the snip algorithm).
    use_snip: bool, optional
        enable/disable background removal using snip algorithm

    Returns
    -------
    data_out: ndarray
        array with fitting results. Shape: `(ny, nx, ne_model + 4)`. For each pixel
        the output data contains: `ne_model` values that represent area under the emission
        line spectra; background area (only in the selected energy range), error (R-factor),
        total count in the selected energy range, total count of the full experimental spectrum.
    """
    spec = data
    spec_sel = spec[:, :, data_sel_indices[0]: data_sel_indices[1]]

    if use_snip:
        bg_sel = np.apply_along_axis(snip_method_numba, 2, spec_sel,
                                     snip_param['e_offset'],
                                     snip_param['e_linear'],
                                     snip_param['e_quadratic'],
                                     width=snip_param['b_width'])

        y = spec_sel - bg_sel
        bg_sum = np.sum(bg_sel, axis=2)

    else:
        y = spec_sel
        bg_sum = np.zeros(shape=data.shape[0:2])

    weights, rfactor, _ = fit_spectrum(y, matv, axis=2, method="nnls")

    total_cnt = np.sum(spec, axis=2)
    sel_cnt = np.sum(spec_sel, axis=2)

    # Stack depth-wise (along axis 2)
    data_out = np.dstack((weights, bg_sum, rfactor, sel_cnt, total_cnt))

    return data_out 
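
A hedged sketch of how a per-block function like this is typically driven through dask.array.map_blocks on a distributed client; the stand-in block function, array shape and chunking are illustrative and do not reproduce the PyXRF fitting pipeline.

import dask.array as da
from dask.distributed import Client

def toy_block_fit(block):
    # Stand-in for a per-block fitting routine: collapse the spectral axis
    # so each (ny, nx, ne) block maps to a (ny, nx, 1) result.
    return block.sum(axis=2, keepdims=True)

client = Client(processes=False)

data = da.random.random((40, 40, 256), chunks=(10, 10, 256))
result = data.map_blocks(toy_block_fit, dtype=float, chunks=(10, 10, 1))
print(result.compute().shape)   # (40, 40, 1)

client.close()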
Example #18
Source File: map_processing.py    From PyXRF with BSD 3-Clause "New" or "Revised" License 4 votes
def _chunk_numpy_array(data, chunk_size):
    """
    Convert a numpy array into Dask array with chunks of given size. The function
    splits the array into chunks along axes 0 and 1. If the array has more than 2 dimensions,
    then the remaining dimensions are not chunked. Note that
    `dask_array = da.array(data, chunks=...)` sets the chunk size but does not split the
    data into chunks, so the array cannot be loaded block by block by workers
    controlled by a distributed scheduler.

    Parameters
    ----------
    data: ndarray(float), 2 or more dimensions
        XRF map of the shape `(ny, nx, ne)`, where `ny` and `nx` represent the image size
        and `ne` is the number of points in spectra
    chunk_size: tuple(int, int) or list(int, int)
         Chunk size for axes 0 and 1: `(chunk_y, chunk_x)`. The function will accept
         chunk size values that are larger than the respective `data` array dimensions.

    Returns
    -------
    data_dask: dask.array
        Dask array with the given chunk size
    """

    chunk_y, chunk_x = chunk_size
    ny, nx = data.shape[0:2]
    chunk_y, chunk_x = min(chunk_y, ny), min(chunk_x, nx)

    def _get_slice(n1, n2):
        data_slice = data[slice(n1 * chunk_y, min(n1 * chunk_y + chunk_y, ny)),
                          slice(n2 * chunk_x, min(n2 * chunk_x + chunk_x, nx))]
        # Wrap the slice into a list with appropriate dimensions
        for _ in range(2, data.ndim):
            data_slice = [data_slice]
        return data_slice

    # Chunk the numpy array and assemble it as a dask array
    data_dask = da.block([
        [
            _get_slice(_1, _2)
            for _2 in range(int(math.ceil(nx / chunk_x)))
        ]
        for _1 in range(int(math.ceil(ny / chunk_y)))
    ])

    return data_dask 
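
A short usage sketch of _chunk_numpy_array, assuming numpy as np, dask.array as da and math are imported as in the PyXRF module; the array size and chunk size are illustrative.

import numpy as np

data = np.random.rand(9, 11, 50)              # (ny, nx, ne)
data_dask = _chunk_numpy_array(data, (4, 4))

print(data_dask.shape)        # (9, 11, 50)
print(data_dask.chunksize)    # (4, 4, 50) -- largest chunk along each axis
print(np.allclose(data_dask.compute(), data))   # True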
Example #19
Source File: map_processing.py    From PyXRF with BSD 3-Clause "New" or "Revised" License 4 votes
def wait_and_display_progress(fut, progress_bar=None):
    """
    Wait for the future to complete and display the progress bar.
    This method may be used to drive any custom progress bar, which
    displays progress in percent from 0 to 100.

    Parameters
    ----------
    fut: dask future
        future object for the batch of tasks submitted to the distributed
        client.
    progress_bar: callable or None
        callable function or callable object with methods `start()`,
        `__call__(float)` and `finish()`. The methods `start()` and
        `finish()` are optional. For example, this could be a reference
        to an instance of the object `TerminalProgressBar`

    Examples
    --------

    .. code-block::

        client = Client()
        data = da.random.random(size=(100, 100), chunks=(10, 10))
        sm_fut = da.sum(data, axis=0).persist(scheduler=client)

        # Call the progress monitor
        wait_and_display_progress(sm_fut, TerminalProgressBar("Monitoring progress: "))

        sm = sm_fut.compute(scheduler=client)
        client.close()
    """

    # If there is no progress bar, then just return without waiting for the future
    if progress_bar is None:
        return

    if hasattr(progress_bar, "start"):
        progress_bar.start()

    progress_bar(1.0)
    while True:
        done, not_done = wait(fut, return_when='FIRST_COMPLETED')
        n_completed, n_pending = len(done), len(not_done)
        n_total = n_completed + n_pending
        percent_completed = n_completed / n_total * 100.0 if n_total > 0 else 100.0

        # It is guaranteed that 'progress_bar' is called for 100% completion
        progress_bar(percent_completed)

        if not n_pending:
            break
        ttime.sleep(0.5)

    if hasattr(progress_bar, "finish"):
        progress_bar.finish()