Python Examples of ray.wait

Source File: ray_container.py From adeptRL with GNU General Public License v3.0

7 votes

def step(self):
        """Run one learner step and share gradients with the other learners.

        Waits for every handle in ``self.exp_handles``, sleeps for a random
        whole number of seconds between 0 and 3 (inclusive), then calls
        ``dist.barrier`` and ``dist.all_reduce`` on ``self.learner_group`` for
        each entry of ``self.network_grads``.

        Returns:
            bool: Always ``True``.
        """
        print(f"learner {self.rank} step")

        # Every outstanding experience handle must finish before continuing.
        for pending in self.exp_handles:
            pending.wait()

        # The original labels this step "batch together exp"; the visible
        # code only sleeps for a random 0-3 seconds.
        delay_s = random.randint(0, 3)
        time.sleep(delay_s)

        # Line up with the other learners, then reduce each entry in turn.
        dist.barrier(self.learner_group)
        for grad in self.network_grads:
            dist.all_reduce(grad, group=self.learner_group)
        print(f"learner {self.rank} shared gradients")
        return True

Source File: test_actor_resources.py From ray with Apache License 2.0

6 votes

def test_lifetime_and_transient_resources(ray_start_regular):
    """Check that only one of two ``num_cpus=1`` actors answers in time."""

    # Declared without resource arguments: per the original note, it only
    # acquires resources while a method is running.
    @ray.remote
    class Actor1:
        def method(self):
            pass

    # Declared with num_cpus=1: per the original note, it holds the CPU for
    # its whole lifetime.
    @ray.remote(num_cpus=1)
    class Actor2:
        def method(self):
            pass

    transient_actors = [Actor1.remote() for _ in range(10)]
    ray.get([actor.method.remote() for actor in transient_actors])

    lifetime_actors = [Actor2.remote() for _ in range(2)]
    pending = [actor.method.remote() for actor in lifetime_actors]
    ready_ids, remaining_ids = ray.wait(
        pending, timeout=5.0, num_returns=len(pending))
    # Exactly one of the two calls is expected to finish within 5 seconds.
    assert len(ready_ids) == 1

Source File: test_basic_2.py From ray with Apache License 2.0

6 votes

def test_get_with_timeout(ray_start_regular):
    """Exercise ``ray.get`` with ``timeout`` on ready and pending objects."""
    signal = ray.test_utils.SignalActor.remote()

    # A ready object: get() must come back before the 30 s timeout elapses.
    began = time.time()
    ray.get(signal.wait.remote(should_wait=False), timeout=30)
    elapsed = time.time() - began
    assert elapsed < 30

    # An object that is not ready yet: get() must raise once the short
    # timeout has expired.
    blocked_id = signal.wait.remote()
    with pytest.raises(RayTimeoutError):
        ray.get(blocked_id, timeout=0.1)

    # Once the signal is sent, a further get() on the same ID returns early.
    ray.get(signal.send.remote())
    began = time.time()
    ray.get(blocked_id, timeout=30)
    elapsed = time.time() - began
    assert elapsed < 30

Source File: test_basic_2.py From ray with Apache License 2.0

6 votes

def test_actor_large_objects(ray_start_regular):
    """Check ``object_exists`` before and after waiting on a large result."""

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            # Delay the result so the first existence check runs before it.
            time.sleep(1)
            return np.zeros(10000000)

    actor = Actor.remote()
    obj_id = actor.f.remote()
    core_worker = ray.worker.global_worker.core_worker

    assert not core_worker.object_exists(obj_id)
    ready, _ = ray.wait([obj_id])
    assert len(ready) == 1
    assert core_worker.object_exists(obj_id)
    assert isinstance(ray.get(obj_id), np.ndarray)

Source File: ray_trial_executor.py From ray with Apache License 2.0

6 votes

def get_next_available_trial(self):
        """Return the ``self._running`` entry whose result is ready first.

        Also tracks how long ``ray.wait`` blocked and logs a warning when no
        wait longer than ``NONTRIVIAL_WAIT_TIME_THRESHOLD_S`` has been seen
        for more than ``BOTTLENECK_WARN_PERIOD_S`` seconds.
        """
        # `ray.wait` by default returns the first available result, so the
        # candidate order is shuffled to make sure slower trials (i.e. trials
        # that run remotely) are reported fairly as well.
        # See https://github.com/ray-project/ray/issues/4211 for details.
        candidates = list(self._running)
        random.shuffle(candidates)

        wait_began = time.time()
        ready, _ = ray.wait(candidates)
        [result_id] = ready
        waited = time.time() - wait_began

        if waited > NONTRIVIAL_WAIT_TIME_THRESHOLD_S:
            self._last_nontrivial_wait = time.time()

        since_nontrivial = time.time() - self._last_nontrivial_wait
        if since_nontrivial > BOTTLENECK_WARN_PERIOD_S:
            message = (
                "Over the last {} seconds, the Tune event loop has been "
                "backlogged processing new results. Consider increasing your "
                "period of result reporting to improve performance.".format(
                    BOTTLENECK_WARN_PERIOD_S))
            logger.warning(message)

            # Restart the period so the warning is not repeated immediately.
            self._last_nontrivial_wait = time.time()
        return self._running[result_id]

Source File: test_basic_2.py From ray with Apache License 2.0

6 votes

def test_actor_recursive(ray_start_regular):
    """Check a three-deep chain of delegating actors via get and wait."""

    @ray.remote
    class Actor:
        def __init__(self, delegate=None):
            self.delegate = delegate

        def f(self, x):
            # Forward to the delegate when there is one; otherwise double x.
            if self.delegate:
                return ray.get(self.delegate.f.remote(x))
            return x * 2

    innermost = Actor.remote()
    middle = Actor.remote(innermost)
    outermost = Actor.remote(middle)

    expected = [x * 2 for x in range(100)]

    values = ray.get([outermost.f.remote(i) for i in range(100)])
    assert values == expected

    ready, _ = ray.wait(
        [outermost.f.remote(i) for i in range(100)], num_returns=100)
    values = ray.get(ready)
    assert values == expected

Source File: api.py From ray with Apache License 2.0

6 votes

def wait(object_ids, num_returns=1, timeout=None):
    """Split object IDs into those that are ready and those that are not.

    Thin wrapper around `ray.wait` that additionally accepts tuples and
    ndarrays by converting them to a list first.

    Args:
        object_ids (List[ObjectID], Tuple(ObjectID), np.array(ObjectID)):
            List-like of object IDs for objects that may or may not be ready.
            Note that these IDs must be unique.
        num_returns (int): The number of object IDs that should be returned.
        timeout (float): The maximum amount of time in seconds to wait before
            returning.

    Returns:
        A list of object IDs that are ready and a list of the remaining object
            IDs.
    """
    # Tuples and ndarrays are converted to a list before being passed on;
    # every other input is forwarded unchanged.
    if isinstance(object_ids, (tuple, np.ndarray)):
        object_ids = list(object_ids)

    return ray.wait(object_ids, num_returns=num_returns, timeout=timeout)

Source File: test_basic_2.py From ray with Apache License 2.0

6 votes

def test_actor_concurrent(ray_start_regular):
    """Check that three concurrent calls on one actor all see the full batch."""

    @ray.remote
    class Batcher:
        def __init__(self):
            self.batch = []
            self.event = threading.Event()

        def add(self, x):
            self.batch.append(x)
            # The call that completes the batch of three releases the
            # others; earlier calls block on the event until then.
            if len(self.batch) >= 3:
                self.event.set()
            else:
                self.event.wait()
            return sorted(self.batch)

    batcher = Batcher.options(max_concurrency=3).remote()
    x1, x2, x3 = [batcher.add.remote(value) for value in (1, 2, 3)]
    r1, r2, r3 = [ray.get(ref) for ref in (x1, x2, x3)]
    assert r1 == [1, 2, 3]
    assert r1 == r2 == r3

Source File: test_stress.py From ray with Apache License 2.0

6 votes

def test_wait(ray_start_combination):
    """Stress ``ray.wait`` with single IDs, list suffixes and sleeping tasks."""
    num_nodes, num_workers_per_scheduler, cluster = ray_start_combination
    num_workers = num_nodes * num_workers_per_scheduler

    @ray.remote
    def f(x):
        return x

    x_ids = [f.remote(i) for i in range(100)]
    # Wait on each ID by itself ...
    for x_id in x_ids:
        ray.wait([x_id])
    # ... then on every suffix of the list that holds at least two IDs.
    for first in range(len(x_ids) - 1):
        ray.wait(x_ids[first:])

    @ray.remote
    def g(x):
        time.sleep(x)

    task_count = 2 * num_workers
    for upper in range(1, 5):
        # Each task sleeps for a random duration drawn from [0, upper).
        x_ids = [
            g.remote(np.random.uniform(0, upper)) for _ in range(task_count)
        ]
        ray.wait(x_ids, num_returns=len(x_ids))

    assert cluster.remaining_processes_alive()

Source File: worker.py From ray with Apache License 2.0

6 votes

def get_objects(self, object_ids, timeout=None):
        """Get the values in the object store associated with the IDs.

        Return the values from the local object store for object_ids. This will
        block until all the values for object_ids have been written to the
        local object store.

        Args:
            object_ids (List[object_id.ObjectID]): A list of the object IDs
                whose values should be retrieved.
            timeout (float): The maximum amount of time in seconds to wait
                before returning. ``None`` (the default) is passed to the
                core worker as ``-1``; any other value, including ``0``, is
                converted to whole milliseconds.

        Returns:
            The result of ``self.deserialize_objects`` applied to the pairs
            returned by the core worker.

        Raises:
            TypeError: If any element of ``object_ids`` is not an
                ``ObjectID``.
        """
        # Make sure that the values are object IDs.
        for object_id in object_ids:
            if not isinstance(object_id, ObjectID):
                raise TypeError(
                    "Attempting to call `get` on the value {}, "
                    "which is not an ray.ObjectID.".format(object_id))

        # Compare against None explicitly: a truthiness test would turn an
        # explicit timeout of 0 into -1 (the "no timeout given" value) rather
        # than a zero-millisecond timeout.
        timeout_ms = -1 if timeout is None else int(timeout * 1000)
        data_metadata_pairs = self.core_worker.get_objects(
            object_ids, self.current_task_id, timeout_ms)
        return self.deserialize_objects(data_metadata_pairs, object_ids)

Source File: test_advanced.py From ray with Apache License 2.0

6 votes

def test_wait_makes_object_local(ray_start_cluster):
    """Check ``object_exists`` flips after ``ray.get`` and after ``ray.wait``."""
    cluster = ray_start_cluster
    # One node without CPUs and one node with two CPUs.
    for cpus in (0, 2):
        cluster.add_node(num_cpus=cpus)
    ray.init(address=cluster.address)

    @ray.remote
    class Foo:
        def method(self):
            return np.zeros(1024 * 1024)

    actor = Foo.remote()
    core_worker = ray.worker.global_worker.core_worker

    # get() should make the object local.
    x_id = actor.method.remote()
    assert not core_worker.object_exists(x_id)
    ray.get(x_id)
    assert core_worker.object_exists(x_id)

    # wait() should make the object local as well.
    x_id = actor.method.remote()
    assert not core_worker.object_exists(x_id)
    ready, _ = ray.wait([x_id])
    assert len(ready) == 1
    assert core_worker.object_exists(x_id)

Source File: ray_two_machines.py From ncluster with MIT License

6 votes

def run_driver():
  """Time repeated worker-to-parameter-server transfers and log the rates.

  Connects to Ray at ``args.ip``, creates one ``Worker`` and one
  ``ParameterServer`` actor, runs ``args.iters`` rounds and logs the
  per-round timing followed by the min, median and mean in milliseconds.
  """
  ray.init(redis_address=args.ip)

  worker = Worker.remote()
  ps = ParameterServer.remote()
  log = util.FileLogger('out')
  log(f"Worker ip {ray.get(worker.ip.remote())}")
  log(f"PS ip {ray.get(ps.ip.remote())}")
  log(f"Driver ip {socket.gethostbyname(socket.gethostname())}")

  durations_ms = []
  for i in range(args.iters):
    began = time.perf_counter()
    # Hand the worker's gradients to the parameter server and block until
    # the receive call has finished.
    grads = worker.compute_gradients.remote()
    result = ps.receive.remote(grads)
    ray.wait([result])
    elapsed_time_ms = (time.perf_counter() - began)*1000
    durations_ms.append(elapsed_time_ms)
    rate = args.size_mb / (elapsed_time_ms/1000)
    log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second' % (i, args.iters, args.size_mb, elapsed_time_ms, rate))

  fastest_ms = np.min(durations_ms)
  median_ms = np.median(durations_ms)
  log(f"min: {fastest_ms:8.2f}, median: {median_ms:8.2f}, mean: {np.mean(durations_ms):8.2f}")

Source File: ray.py From FractalAI with GNU Affero General Public License v3.0

6 votes

def game_stream(self, examples: bool = False, full_game: bool = False):
        """Endlessly yield games, or their individual steps, from the generators.

        One remote call is started on every entry of ``self.generators``.
        Whenever a call finishes, a new one is submitted on the generator at
        the returned ``worker_id`` before the finished game is yielded.

        Args:
            examples: Use ``get_game_examples`` instead of ``recover_game``
                and yield ``(obs, action, reward, new_obs, end)`` tuples.
            full_game: Yield each game object whole instead of step by step.
        """
        if examples:
            remaining_ids = [gen.get_game_examples.remote(with_id=True) for gen in self.generators]
        else:
            remaining_ids = [gen.recover_game.remote(True) for gen in self.generators]

        while True:
            ready_ids, remaining_ids = ray.wait(remaining_ids)
            for ready_id in ready_ids:
                game, worker_id = ray.get(ready_id)

                # Resubmit on the same generator before yielding anything.
                producer = self.generators[worker_id]
                if examples:
                    remaining_ids.append(producer.get_game_examples.remote(True))
                else:
                    remaining_ids.append(producer.recover_game.remote(True))

                if full_game:
                    yield game
                    continue

                if examples:
                    _states, obs, actions, rewards, new_obs, ends = game
                    for step in range(len(rewards)):
                        yield obs[step], actions[step], rewards[step], new_obs[step], ends[step]
                else:
                    _states, observs, rewards, ends, infos, actions = game
                    for step in range(len(actions)):
                        yield _states[step], observs[step], rewards[step], ends[step], infos[step], actions[step]

Source File: test_memory_scheduling.py From ray with Apache License 2.0

6 votes

def testMemoryRequest(self):
        """Check that two actors fit into 200 MB of memory and a third does not."""
        try:
            ray.init(num_cpus=1, memory=200 * MB)

            # The first two actors fit: both pings must complete.
            first = Actor.remote()
            second = Actor.remote()
            pings = [first.ping.remote(), second.ping.remote()]
            ready, _ = ray.wait(pings, timeout=60.0, num_returns=2)
            self.assertEqual(len(ready), 2)

            # The third actor does not fit: its ping must not complete
            # within five seconds.
            third = Actor.remote()
            ready, _ = ray.wait([third.ping.remote()], timeout=5.0)
            self.assertEqual(len(ready), 0)
        finally:
            ray.shutdown()

Source File: test_advanced.py From ray with Apache License 2.0

6 votes

def test_wait_cluster(ray_start_cluster):
    """Check that ``ray.wait(timeout=0)`` reports finished remote tasks."""
    cluster = ray_start_cluster
    # Two nodes, each with one CPU and one unit of "RemoteResource".
    for _ in range(2):
        cluster.add_node(num_cpus=1, resources={"RemoteResource": 1})
    ray.init(address=cluster.address)

    @ray.remote(resources={"RemoteResource": 1})
    def f():
        return

    # Run a first batch so there are enough workers on the remote nodes to
    # execute some tasks, and time how long that batch takes.
    tasks = [f.remote() for _ in range(10)]
    start = time.time()
    ray.get(tasks)
    first_batch_s = time.time() - start

    # Submit a second batch that can only be executed on the remote nodes
    # and give it twice as long as the first batch needed.
    tasks = [f.remote() for _ in range(10)]
    time.sleep(first_batch_s * 2)
    _, unready = ray.wait(tasks, num_returns=len(tasks), timeout=0)
    # All remote tasks should have finished.
    assert len(unready) == 0

Source File: test_basic.py From ray with Apache License 2.0

6 votes

def test_fair_queueing(shutdown_only):
    """Check that 1000 nested f -> g -> h tasks finish on a single CPU."""
    internal_config = json.dumps({"fair_queueing_enabled": 1})
    ray.init(num_cpus=1, _internal_config=internal_config)

    @ray.remote
    def h():
        return 0

    @ray.remote
    def g():
        return ray.get(h.remote())

    @ray.remote
    def f():
        return ray.get(g.remote())

    # This will never finish without fair queueing of {f, g, h}:
    # https://github.com/ray-project/ray/issues/3644
    task_count = 1000
    pending = [f.remote() for _ in range(task_count)]
    ready, _ = ray.wait(pending, timeout=60.0, num_returns=task_count)
    assert len(ready) == task_count, len(ready)

Source File: test_multinode_failures_2.py From ray with Apache License 2.0

6 votes

def test_driver_lives_parallel(ray_start_regular):
    """Kill five Ray component processes at once; the driver must survive."""
    all_processes = ray.worker._global_node.all_processes

    component_types = (
        ray_constants.PROCESS_TYPE_PLASMA_STORE,
        ray_constants.PROCESS_TYPE_GCS_SERVER,
        ray_constants.PROCESS_TYPE_RAYLET,
        ray_constants.PROCESS_TYPE_LOG_MONITOR,
        ray_constants.PROCESS_TYPE_MONITOR,
    )
    process_infos = []
    for component_type in component_types:
        process_infos.extend(all_processes[component_type])
    assert len(process_infos) == 5

    # Kill all the components in parallel: ask every process to terminate
    # first, then force-kill them after a short grace period.
    for info in process_infos:
        info.process.terminate()

    time.sleep(0.1)
    for info in process_infos:
        info.process.kill()

    # Reap every process before returning.
    for info in process_infos:
        info.process.wait()

    # If the driver can reach the tearDown method, then it is still alive.

Source File: test_failure.py From ray with Apache License 2.0

6 votes

def test_actor_worker_dying(ray_start_regular):
    """Check the errors raised when an actor exits inside a method call."""

    @ray.remote
    class Actor:
        def kill(self):
            eval("exit()")

    @ray.remote
    def consume(x):
        pass

    actor = Actor.remote()
    ready, _ = ray.wait([actor.kill.remote()], timeout=5)
    [obj] = ready

    # Fetching the result directly raises an actor error ...
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(obj)
    # ... while a task that takes it as an argument raises a task error.
    with pytest.raises(ray.exceptions.RayTaskError):
        ray.get(consume.remote(obj))
    wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)

Source File: test_dynres.py From ray with Apache License 2.0

6 votes

def test_dynamic_res_infeasible_rescheduling(ray_start_regular):
    """Check that an infeasible task runs once its resource is created.

    The task is submitted while its custom resource does not exist yet; the
    resource is then created at runtime and the task is expected to be
    rescheduled and to complete.
    """
    res_name = "test_res"
    res_capacity = 1.0

    @ray.remote
    def set_res(resource_name, resource_capacity):
        ray.experimental.set_resource(resource_name, resource_capacity)

    def f():
        return 1

    needs_resource = ray.remote(resources={res_name: res_capacity})(f)
    # Infeasible at submission time: the resource has not been created yet.
    oid = needs_resource.remote()
    # Creating the resource should make the task feasible.
    ray.get(set_res.remote(res_name, res_capacity))

    available_res = ray.available_resources()
    assert available_res[res_name] == res_capacity

    successful, unsuccessful = ray.wait([oid], timeout=1)
    # A non-empty ready list means the task completed.
    assert successful

Source File: ray_pool.py From rltime with Apache License 2.0

6 votes

def _ensure_resources(self, instances):
        """Raise if Ray reports too few resources for ``instances`` workers.

        TODO (from the original): This doesn't really work with more than 1
        receiver as they create and check in parallel. In any case ray will
        not error if we create an actor without resources; it will just wait
        and not be used until it can be run.

        Args:
            instances: Multiplier applied to the per-worker requirements.

        Raises:
            RuntimeError: If any required resource exceeds what
                ``ray.available_resources()`` reports.
        """
        available = ray.available_resources()

        # Per-worker needs: the CPU count plus any custom resources.
        per_worker = {
            "CPU": self._cpus_per_worker,
            **self._custom_resources_per_worker
        }
        required = {
            name: amount * instances for name, amount in per_worker.items()
        }

        # A resource missing from `available` counts as zero.
        satisfied = [
            available.get(name, 0) >= required[name] for name in required
        ]
        if np.all(satisfied):
            return
        raise RuntimeError(
            "Not enough RAY resources to start the acting pool. "
            f"Need: {required} Available: {available}")

Source File: test_component_failures_3.py From ray with Apache License 2.0

6 votes

def test_driver_lives_parallel(ray_start_regular):
    """Kill five Ray component processes at once; the driver must survive."""
    all_processes = ray.worker._global_node.all_processes

    component_types = (
        ray_constants.PROCESS_TYPE_PLASMA_STORE,
        ray_constants.PROCESS_TYPE_GCS_SERVER,
        ray_constants.PROCESS_TYPE_RAYLET,
        ray_constants.PROCESS_TYPE_LOG_MONITOR,
        ray_constants.PROCESS_TYPE_MONITOR,
    )
    process_infos = []
    for component_type in component_types:
        process_infos.extend(all_processes[component_type])
    assert len(process_infos) == 5

    # Kill all the components in parallel: ask every process to terminate
    # first, then force-kill them after a short grace period.
    for info in process_infos:
        info.process.terminate()

    time.sleep(0.1)
    for info in process_infos:
        info.process.kill()

    # Reap every process before returning.
    for info in process_infos:
        info.process.wait()

    # If the driver can reach the tearDown method, then it is still alive.

Source File: test_advanced_3.py From ray with Apache License 2.0

6 votes

def test_wait_reconstruction(shutdown_only):
    """Check that ``ray.wait`` returns an object that no longer exists locally."""
    internal_config = json.dumps({"object_pinning_enabled": 0})
    ray.init(
        num_cpus=1,
        object_store_memory=int(10**8),
        _internal_config=internal_config)

    @ray.remote
    def f():
        # 6 * 10**7 bytes per result, against a 10**8-byte object store.
        return np.zeros(6 * 10**7, dtype=np.uint8)

    x_id = f.remote()
    ray.wait([x_id])
    # Produce a second result of the same size; the first one is expected
    # to be gone from the store afterwards.
    ray.wait([f.remote()])
    assert not ray.worker.global_worker.core_worker.object_exists(x_id)

    # Waiting on the missing object must still report it as ready.
    ready_ids, _ = ray.wait([x_id])
    assert len(ready_ids) == 1

Source File: test_actor_advanced.py From ray with Apache License 2.0

6 votes

def test_kill(ray_start_regular):
    """Check ``ray.kill`` on a hanging actor and on an invalid argument."""

    @ray.remote
    class Actor:
        def hang(self):
            # Never returns.
            while True:
                time.sleep(1)

    actor = Actor.remote()
    hanging_call = actor.hang.remote()

    # The call must still be pending after half a second.
    ready, _ = ray.wait([hanging_call], timeout=0.5)
    assert len(ready) == 0
    ray.kill(actor, no_restart=False)

    # After the kill, the pending call fails with an actor error.
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(hanging_call)

    # Anything that is not an actor handle is rejected.
    with pytest.raises(ValueError):
        ray.kill("not_an_actor_handle")


# This test verifies actor creation task failure will not
# hang the caller.

Source File: test_reference_counting.py From ray with Apache License 2.0

6 votes

def test_actor_creation_task(ray_start_regular):
    """Check an actor that waits for its resource while holding a large arg."""

    @ray.remote
    def large_object():
        # This will be spilled to plasma.
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    @ray.remote(resources={"init": 1})
    class Actor:
        def __init__(self, dependency):
            return

        def ping(self):
            return

    # The large object is passed inline, so no local name refers to it.
    actor = Actor.remote(large_object.remote())
    ping = actor.ping.remote()

    # The "init" resource has not been created yet, so the ping must still
    # be pending after one second.
    ready, unready = ray.wait([ping], timeout=1)
    assert not ready

    # Create the resource; the ping must now complete.
    ray.experimental.set_resource("init", 1)
    ray.get(ping)

Source File: test_actor_advanced.py From ray with Apache License 2.0

5 votes

def test_remote_checkpoint_distributed_handle(ray_start_cluster_2_nodes):
    """Check that a counter actor restores from a remotely taken checkpoint."""
    cluster = ray_start_cluster_2_nodes
    counter, ids = setup_counter_actor(test_checkpoint=True)

    @ray.remote
    def fork_many_incs(counter, num_incs):
        last = None
        for _ in range(num_incs):
            last = counter.inc.remote()
        # Only call ray.get() on the last task submitted.
        return ray.get(last)

    # Fork num_iters times, each fork issuing num_incs increments.
    count = ray.get(ids[-1])
    num_incs = 100
    num_iters = 10
    forks = [
        fork_many_incs.remote(counter, num_incs) for _ in range(num_iters)
    ]
    ray.wait(forks, num_returns=len(forks))
    ray.wait([counter.__ray_checkpoint__.remote()])
    expected_count = count + num_incs * num_iters

    # Kill the second plasma store to get rid of the cached objects and
    # trigger the corresponding raylet to exit.
    non_head_node = get_non_head_nodes(cluster)[0]
    non_head_node.kill_plasma_store(wait=True)

    # Check that the actor restored from a checkpoint.
    assert ray.get(counter.test_restore.remote())
    # Check that the number of inc calls since actor initialization is
    # exactly zero, since there could not have been another inc call since
    # the remote checkpoint.
    num_inc_calls = ray.get(counter.get_num_inc_calls.remote())
    assert num_inc_calls == 0
    # Check that we can submit another call on the actor and get the
    # correct counter result.
    next_value = ray.get(counter.inc.remote())
    assert next_value == expected_count + 1

Source File: test_component_failures.py From ray with Apache License 2.0

5 votes

def test_dying_driver_wait(ray_start_regular):
    """Kill a second driver blocked in ``ray.wait``; Ray must stay alive."""
    # The fixture has already started the Ray processes.
    address_info = ray_start_regular

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    x_id = sleep_forever.remote()

    # Script for a second driver that connects to the same cluster and
    # blocks in ray.wait on x_id.
    driver_script = """
import ray
ray.init("{}")
ray.wait([ray.ObjectID(ray.utils.hex_to_binary("{}"))])
""".format(address_info["redis_address"], x_id.hex())

    driver_proc = run_string_as_driver_nonblocking(driver_script)
    # Make sure the driver is running.
    time.sleep(1)
    assert driver_proc.poll() is None

    # Kill the driver process.
    driver_proc.kill()
    driver_proc.wait()
    time.sleep(0.1)

    # Make sure the original task hasn't finished.
    ready_ids, _ = ray.wait([x_id], timeout=0)
    assert len(ready_ids) == 0
    # Seal the object so the store attempts to notify the worker that the
    # wait can return.
    ray.worker.global_worker.put_object(1, x_id)
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray.services.remaining_processes_alive()

Source File: test_failure.py From ray with Apache License 2.0

5 votes

def test_get_throws_quickly_when_found_exception(ray_start_regular):
    """Check ``ray.get`` raises for a failed call while another is blocked."""

    # An actor is used instead of plain functions. With functions it is very
    # likely that two normal tasks are submitted before the first worker is
    # registered to Raylet. Since `maximum_startup_concurrency` is 1, the
    # worker pool would wait for that registration and skip starting new
    # workers, so the two tasks would run sequentially. That breaks an
    # assumption of this test case: the two tasks run in parallel.
    @ray.remote
    class Actor(object):
        def bad_func1(self):
            raise Exception("Test function intentionally failed.")

        def bad_func2(self):
            os._exit(0)

        def slow_func(self, signal):
            ray.get(signal.wait.remote())

    def expect_exception(objects, exception):
        # The raised error must be exactly `exception`, not a subclass.
        with pytest.raises(ray.exceptions.RayError) as err:
            ray.get(objects)
        assert err.type is exception

    # A raised exception surfaces as a task error.
    signal1 = SignalActor.remote()
    actor = Actor.options(max_concurrency=2).remote()
    pending = [actor.bad_func1.remote(), actor.slow_func.remote(signal1)]
    expect_exception(pending, ray.exceptions.RayTaskError)
    ray.get(signal1.send.remote())

    # A process exit surfaces as an actor error.
    signal2 = SignalActor.remote()
    actor = Actor.options(max_concurrency=2).remote()
    pending = [actor.bad_func2.remote(), actor.slow_func.remote(signal2)]
    expect_exception(pending, ray.exceptions.RayActorError)
    ray.get(signal2.send.remote())

Source File: test_basic.py From ray with Apache License 2.0

5 votes

def test_background_tasks_with_max_calls(shutdown_only):
    """Check that results of tasks spawned by ``max_calls=1`` tasks stay retrievable.

    ``f`` is declared with ``max_calls=1`` and returns object IDs of ``g``
    tasks it started; the test fetches those IDs after ``f`` has returned.
    """
    ray.init(num_cpus=2)

    @ray.remote
    def g():
        time.sleep(.1)
        return 0

    # First variant of f: returns the g object ID wrapped in a list.
    @ray.remote(max_calls=1, max_retries=0)
    def f():
        return [g.remote()]

    nested = ray.get([f.remote() for _ in range(10)])

    # Should still be able to retrieve these objects, since f's workers will
    # wait for g to finish before exiting.
    ray.get([x[0] for x in nested])

    # Second variant of f (replaces the first): also returns the PID of the
    # process that ran it.
    @ray.remote(max_calls=1, max_retries=0)
    def f():
        return os.getpid(), g.remote()

    nested = ray.get([f.remote() for _ in range(10)])
    while nested:
        # Take entries off the list one at a time so `nested` no longer
        # refers to the popped object ID.
        pid, g_id = nested.pop(0)
        ray.get(g_id)
        # Drop the last local reference before waiting for the process to
        # exit. NOTE(review): presumably the worker only exits once nothing
        # refers to g's result any more; confirm.
        del g_id
        ray.test_utils.wait_for_pid_to_exit(pid)

Source File: test_advanced_2.py From ray with Apache License 2.0

5 votes

def test_blocking_tasks(ray_start_regular):
    """Check that tasks which block on other remote tasks complete."""

    @ray.remote
    def f(i, j):
        return (i, j)

    # NOTE(review): g is defined but never invoked in this test.
    @ray.remote
    def g(i):
        # Each instance of g submits and blocks on the result of another
        # remote task.
        pending = [f.remote(i, j) for j in range(2)]
        return ray.get(pending)

    @ray.remote
    def h(i):
        # Each instance of h submits and blocks on the result of another
        # remote task using ray.wait.
        pending = [f.remote(i, j) for j in range(2)]
        return ray.wait(pending, num_returns=len(pending))

    blocked_on_wait = [h.remote(i) for i in range(4)]
    ray.get(blocked_on_wait)

    @ray.remote
    def _sleep(i):
        time.sleep(0.01)
        return i

    @ray.remote
    def sleep():
        # Each instance of sleep submits and blocks on the result of
        # another remote task, which takes some time to execute.
        ray.get([_sleep.remote(i) for i in range(10)])

    ray.get(sleep.remote())

Source File: test_component_failures.py From ray with Apache License 2.0

5 votes

def test_dying_worker_wait(ray_start_2_cpus):
    """Kill a worker blocked in ``ray.wait``; the Ray processes must survive."""

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    @ray.remote
    def get_pid():
        return os.getpid()

    x_id = sleep_forever.remote()
    # Sleep a little to make sure that sleep_forever has already started,
    # then get the PID of the worker that block_in_wait will run on.
    time.sleep(0.1)
    worker_pid = ray.get(get_pid.remote())

    @ray.remote
    def block_in_wait(object_id_in_list):
        ray.wait(object_id_in_list)

    # Have the worker block in a wait call on the never-finishing task.
    block_in_wait.remote([x_id])
    time.sleep(0.1)

    # Kill the worker.
    os.kill(worker_pid, SIGKILL)
    time.sleep(0.1)

    # Create the object the killed worker was waiting for.
    ray.worker.global_worker.put_object(1, x_id)
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray.services.remaining_processes_alive()


# This test checks that when a driver dies in the middle of a wait, the plasma
# store and raylet will not die.

Python ray.wait() Examples