Python ray.wait() Examples
The following are 30 code examples of ray.wait().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module ray, or try the search function.
Example #1
Source File: ray_container.py From adeptRL with GNU General Public License v3.0 | 7 votes |
def step(self):
    """Run one learner step and return ``True``.

    Waits on every handle in ``self.exp_handles``, sleeps for a random whole
    number of seconds between 0 and 3, then hits a barrier on
    ``self.learner_group`` and all-reduces each entry of
    ``self.network_grads`` across that group.
    """
    print(f"learner {self.rank} step")

    # make sure exp_handles are done
    for pending in self.exp_handles:
        pending.wait()

    # batch together exp
    # NOTE(review): the random sleep looks like a stand-in for real batching
    # work -- confirm.
    delay_s = random.randint(0, 3)
    time.sleep(delay_s)

    # update with other learners
    group = self.learner_group
    dist.barrier(group)
    for grad in self.network_grads:
        dist.all_reduce(grad, group=group)

    print(f"learner {self.rank} shared gradients")
    return True
Example #2
Source File: test_actor_resources.py From ray with Apache License 2.0 | 6 votes |
def test_lifetime_and_transient_resources(ray_start_regular):
    """Check that only one of two ``num_cpus=1`` actors can run a method.

    Ten actors without a resource request all complete a method call first;
    then two actors declared with ``num_cpus=1`` are created and exactly one
    of their method calls is expected to be ready within five seconds.
    """

    # This actor acquires resources only when running methods.
    @ray.remote
    class Actor1:
        def method(self):
            pass

    # This actor acquires resources for its lifetime.
    @ray.remote(num_cpus=1)
    class Actor2:
        def method(self):
            pass

    transient_actors = [Actor1.remote() for _ in range(10)]
    ray.get([actor.method.remote() for actor in transient_actors])

    lifetime_actors = [Actor2.remote() for _ in range(2)]
    pending_calls = [actor.method.remote() for actor in lifetime_actors]
    finished, _unfinished = ray.wait(
        pending_calls, num_returns=len(pending_calls), timeout=5.0)
    assert len(finished) == 1
Example #3
Source File: test_basic_2.py From ray with Apache License 2.0 | 6 votes |
def test_get_with_timeout(ray_start_regular):
    """Exercise ``ray.get`` with a timeout on ready and not-ready objects."""
    signal = ray.test_utils.SignalActor.remote()
    generous_timeout = 30

    # Check that get() returns early if object is ready.
    began = time.time()
    ray.get(signal.wait.remote(should_wait=False), timeout=generous_timeout)
    assert time.time() - began < generous_timeout

    # Check that get() raises a TimeoutError after the timeout if the object
    # is not ready yet.
    blocked_id = signal.wait.remote()
    with pytest.raises(RayTimeoutError):
        ray.get(blocked_id, timeout=0.1)

    # Check that a subsequent get() returns early.
    ray.get(signal.send.remote())
    began = time.time()
    ray.get(blocked_id, timeout=generous_timeout)
    assert time.time() - began < generous_timeout
Example #4
Source File: test_basic_2.py From ray with Apache License 2.0 | 6 votes |
def test_actor_large_objects(ray_start_regular):
    """Check that ``ray.wait`` on a large actor result makes it available.

    The object must not exist in the core worker right after submission,
    must exist once ``ray.wait`` reports it ready, and must deserialize to a
    NumPy array.
    """

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            time.sleep(1)
            return np.zeros(10**7)

    def exists_locally(ref):
        # Looked up on every call, exactly as the inline expression would be.
        return ray.worker.global_worker.core_worker.object_exists(ref)

    actor = Actor.remote()
    big_ref = actor.f.remote()
    assert not exists_locally(big_ref)

    ready, _ = ray.wait([big_ref])
    assert len(ready) == 1
    assert exists_locally(big_ref)
    assert isinstance(ray.get(big_ref), np.ndarray)
Example #5
Source File: ray_trial_executor.py From ray with Apache License 2.0 | 6 votes |
def get_next_available_trial(self):
    """Block until one running result is ready and return its trial.

    Returns:
        The value stored in ``self._running`` under the result ID that
        ``ray.wait`` reported as ready.

    Side effects:
        Updates ``self._last_nontrivial_wait`` and may log a warning when the
        last non-trivial wait is older than ``BOTTLENECK_WARN_PERIOD_S``.
    """
    # Note: We shuffle the results because `ray.wait` by default returns
    # the first available result, and we want to guarantee that slower
    # trials (i.e. trials that run remotely) also get fairly reported.
    # See https://github.com/ray-project/ray/issues/4211 for details.
    candidates = list(self._running)
    random.shuffle(candidates)

    wait_started = time.time()
    ready, _ = ray.wait(candidates)
    # Exactly one ready ID is expected; unpacking raises otherwise.
    (result_id,) = ready
    waited_for = time.time() - wait_started

    if waited_for > NONTRIVIAL_WAIT_TIME_THRESHOLD_S:
        self._last_nontrivial_wait = time.time()

    since_nontrivial = time.time() - self._last_nontrivial_wait
    if since_nontrivial > BOTTLENECK_WARN_PERIOD_S:
        logger.warning(
            "Over the last {} seconds, the Tune event loop has been "
            "backlogged processing new results. Consider increasing your "
            "period of result reporting to improve performance.".format(
                BOTTLENECK_WARN_PERIOD_S))
        # Reset so the warning is emitted at most once per period.
        self._last_nontrivial_wait = time.time()

    return self._running[result_id]
Example #6
Source File: test_basic_2.py From ray with Apache License 2.0 | 6 votes |
def test_actor_recursive(ray_start_regular):
    """Chain three actors that delegate to each other and check the results.

    Each call is forwarded down the chain until an actor without a delegate
    doubles the argument; results are fetched once with ``ray.get`` and once
    through ``ray.wait`` followed by ``ray.get``.
    """

    @ray.remote
    class Actor:
        def __init__(self, delegate=None):
            self.delegate = delegate

        def f(self, x):
            if not self.delegate:
                return x * 2
            return ray.get(self.delegate.f.remote(x))

    innermost = Actor.remote()
    middle = Actor.remote(innermost)
    outermost = Actor.remote(middle)

    expected = [x * 2 for x in range(100)]

    values = ray.get([outermost.f.remote(i) for i in range(100)])
    assert values == expected

    ready_refs, _ = ray.wait(
        [outermost.f.remote(i) for i in range(100)], num_returns=100)
    values = ray.get(ready_refs)
    assert values == expected
Example #7
Source File: api.py From ray with Apache License 2.0 | 6 votes |
def wait(object_ids, num_returns=1, timeout=None):
    """Split object IDs into those that are ready and those that are not.

    Thin wrapper around `ray.wait` that additionally accepts tuples and
    ndarrays by converting them to a list first.

    Args:
        object_ids (List[ObjectID], Tuple(ObjectID), np.array(ObjectID)):
            List like of object IDs for objects that may or may not be ready.
            Note that these IDs must be unique.
        num_returns (int): The number of object IDs that should be returned.
        timeout (float): The maximum amount of time in seconds to wait before
            returning.

    Returns:
        A list of object IDs that are ready and a list of the remaining
        object IDs.
    """
    needs_conversion = isinstance(object_ids, (tuple, np.ndarray))
    if needs_conversion:
        object_ids = list(object_ids)
    return ray.wait(object_ids, num_returns=num_returns, timeout=timeout)
Example #8
Source File: test_basic_2.py From ray with Apache License 2.0 | 6 votes |
def test_actor_concurrent(ray_start_regular):
    """Check that three ``add`` calls on one actor can run concurrently.

    The first two calls block on an event that only the third call sets, so
    all three returning the full sorted batch requires them to overlap.
    """

    @ray.remote
    class Batcher:
        def __init__(self):
            self.batch = []
            self.event = threading.Event()

        def add(self, x):
            self.batch.append(x)
            if len(self.batch) < 3:
                # Not enough items yet: park until the batch is complete.
                self.event.wait()
            else:
                self.event.set()
            return sorted(self.batch)

    batcher = Batcher.options(max_concurrency=3).remote()
    refs = [batcher.add.remote(value) for value in (1, 2, 3)]
    # Fetch one at a time, in submission order.
    first, second, third = [ray.get(ref) for ref in refs]

    assert first == [1, 2, 3]
    assert first == second == third
Example #9
Source File: test_stress.py From ray with Apache License 2.0 | 6 votes |
def test_wait(ray_start_combination):
    """Stress ``ray.wait`` with single IDs, shrinking slices and full lists."""
    num_nodes, num_workers_per_scheduler, cluster = ray_start_combination
    num_workers = num_nodes * num_workers_per_scheduler

    @ray.remote
    def f(x):
        return x

    x_ids = [f.remote(i) for i in range(100)]
    # Wait on every ID individually ...
    for x_id in x_ids:
        ray.wait([x_id])
    # ... then on every suffix of the list except the last one-element one.
    for offset in range(len(x_ids) - 1):
        ray.wait(x_ids[offset:])

    @ray.remote
    def g(x):
        time.sleep(x)

    for upper in range(1, 5):
        x_ids = [
            g.remote(np.random.uniform(0, upper))
            for _ in range(2 * num_workers)
        ]
        ray.wait(x_ids, num_returns=len(x_ids))

    assert cluster.remaining_processes_alive()
Example #10
Source File: worker.py From ray with Apache License 2.0 | 6 votes |
def get_objects(self, object_ids, timeout=None):
    """Get the values in the object store associated with the IDs.

    Return the values from the local object store for object_ids. This will
    block until all the values for object_ids have been written to the
    local object store.

    Args:
        object_ids (List[object_id.ObjectID]): A list of the object IDs
            whose values should be retrieved.
        timeout (float): The maximum amount of time in seconds to wait
            before returning. Any falsy value (``None`` or ``0``) is
            forwarded to the core worker as ``-1``.

    Returns:
        The result of ``self.deserialize_objects`` applied to the fetched
        data/metadata pairs.

    Raises:
        TypeError: If any element of ``object_ids`` is not an ``ObjectID``.
    """
    # Make sure that the values are object IDs.
    for candidate in object_ids:
        if isinstance(candidate, ObjectID):
            continue
        raise TypeError(
            "Attempting to call `get` on the value {}, "
            "which is not an ray.ObjectID.".format(candidate))

    # The core worker takes milliseconds; -1 is sent when no timeout is set.
    if timeout:
        timeout_ms = int(timeout * 1000)
    else:
        timeout_ms = -1

    raw_pairs = self.core_worker.get_objects(
        object_ids, self.current_task_id, timeout_ms)
    return self.deserialize_objects(raw_pairs, object_ids)
Example #11
Source File: test_advanced.py From ray with Apache License 2.0 | 6 votes |
def test_wait_makes_object_local(ray_start_cluster):
    """Check that both ``ray.get`` and ``ray.wait`` make an object local.

    The cluster gets one node with zero CPUs and one with two CPUs before the
    driver connects; an actor result must be absent from the core worker
    before the call and present afterwards.
    """
    cluster = ray_start_cluster
    for cpus in (0, 2):
        cluster.add_node(num_cpus=cpus)
    ray.init(address=cluster.address)

    @ray.remote
    class Foo:
        def method(self):
            return np.zeros(1024 * 1024)

    def is_local(ref):
        return ray.worker.global_worker.core_worker.object_exists(ref)

    actor = Foo.remote()

    # Test get makes the object local.
    ref = actor.method.remote()
    assert not is_local(ref)
    ray.get(ref)
    assert is_local(ref)

    # Test wait makes the object local.
    ref = actor.method.remote()
    assert not is_local(ref)
    ready, _ = ray.wait([ref])
    assert len(ready) == 1
    assert is_local(ref)
Example #12
Source File: ray_two_machines.py From ncluster with MIT License | 6 votes |
def run_driver():
    """Time gradient transfers from a worker to a parameter server.

    Connects to Ray at ``args.ip``, creates one ``Worker`` and one
    ``ParameterServer`` actor, and for ``args.iters`` iterations measures how
    long it takes for the parameter server to receive the worker's
    gradients. Per-iteration throughput and final min/median/mean timings
    are written through a ``util.FileLogger``.
    """
    ray.init(redis_address=args.ip)

    worker = Worker.remote()
    ps = ParameterServer.remote()
    log = util.FileLogger('out')
    log(f"Worker ip {ray.get(worker.ip.remote())}")
    log(f"PS ip {ray.get(ps.ip.remote())}")
    log(f"Driver ip {socket.gethostbyname(socket.gethostname())}")

    time_list = []
    for i in range(args.iters):
        start_time = time.perf_counter()
        grads = worker.compute_gradients.remote()
        result = ps.receive.remote(grads)
        # Block until the parameter server has finished receiving.
        ray.wait([result])
        elapsed_time_ms = (time.perf_counter() - start_time) * 1000
        time_list.append(elapsed_time_ms)
        rate = args.size_mb / (elapsed_time_ms / 1000)
        log('%03d/%d sent %d MBs in %.1f ms: %.2f MB/second'
            % (i, args.iters, args.size_mb, elapsed_time_ms, rate))

    # np.min raises ValueError on an empty sequence, so skip the summary
    # when no iteration was run.
    if not time_list:
        log("no iterations run; no timing statistics to report")
        return

    # Named *_ms so the builtin `min` is not shadowed.
    min_ms = np.min(time_list)
    median_ms = np.median(time_list)
    mean_ms = np.mean(time_list)
    log(f"min: {min_ms:8.2f}, median: {median_ms:8.2f}, mean: {mean_ms:8.2f}")
Example #13
Source File: ray.py From FractalAI with GNU Affero General Public License v3.0 | 6 votes |
def game_stream(self, examples: bool = False, full_game: bool = False):
    """Yield game data from the remote generators forever.

    One remote call per generator is kept in flight; whenever one finishes,
    a new call is submitted to the generator that produced it.

    Args:
        examples: When true, call ``get_game_examples`` on the generators;
            otherwise call ``recover_game``.
        full_game: When true, yield each finished game as one object;
            otherwise yield it step by step.

    Yields:
        The whole ``game`` if ``full_game`` is set. Otherwise one tuple per
        step: ``(state, observ, reward, end, info, action)`` for recovered
        games, or ``(obs, action, reward, new_obs, end)`` for examples.
    """
    if examples:
        in_flight = [
            gen.get_game_examples.remote(with_id=True)
            for gen in self.generators
        ]
    else:
        in_flight = [gen.recover_game.remote(True) for gen in self.generators]

    while True:
        finished, in_flight = ray.wait(in_flight)
        for done_id in finished:
            game, worker_id = ray.get(done_id)

            # Immediately resubmit to the generator that just finished.
            source = self.generators[worker_id]
            if examples:
                replacement = source.get_game_examples.remote(True)
            else:
                replacement = source.recover_game.remote(True)
            in_flight.append(replacement)

            if full_game:
                yield game
                continue

            if examples:
                _states, obs, actions, rewards, new_obs, ends = game
                for step in range(len(rewards)):
                    yield (obs[step], actions[step], rewards[step],
                           new_obs[step], ends[step])
            else:
                _states, observs, rewards, ends, infos, actions = game
                for step in range(len(actions)):
                    yield (_states[step], observs[step], rewards[step],
                           ends[step], infos[step], actions[step])
Example #14
Source File: test_memory_scheduling.py From ray with Apache License 2.0 | 6 votes |
def testMemoryRequest(self):
    """Check that two actors fit into 200 MB of memory but a third does not."""
    try:
        ray.init(num_cpus=1, memory=200 * MB)

        # fits first 2
        first = Actor.remote()
        second = Actor.remote()
        pings = [first.ping.remote(), second.ping.remote()]
        ready, _ = ray.wait(pings, timeout=60.0, num_returns=2)
        self.assertEqual(len(ready), 2)

        # does not fit
        third = Actor.remote()
        ready, _ = ray.wait([third.ping.remote()], timeout=5.0)
        self.assertEqual(len(ready), 0)
    finally:
        # Always tear Ray down, even when an assertion fails.
        ray.shutdown()
Example #15
Source File: test_advanced.py From ray with Apache License 2.0 | 6 votes |
def test_wait_cluster(ray_start_cluster):
    """Check that ``ray.wait`` with ``timeout=0`` sees finished remote tasks.

    A first batch of tasks is timed; a second batch is submitted, the test
    sleeps for twice the measured duration, and a zero-timeout wait is then
    expected to report nothing as unready.
    """
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=1, resources={"RemoteResource": 1})
    ray.init(address=cluster.address)

    @ray.remote(resources={"RemoteResource": 1})
    def f():
        return

    # Make sure we have enough workers on the remote nodes to execute some
    # tasks.
    warmup = [f.remote() for _ in range(10)]
    began = time.time()
    ray.get(warmup)
    warmup_duration = time.time() - began

    # Submit some more tasks that can only be executed on the remote nodes.
    batch = [f.remote() for _ in range(10)]
    # Sleep for a bit to let the tasks finish.
    time.sleep(warmup_duration * 2)
    _, unready = ray.wait(batch, num_returns=len(batch), timeout=0)
    # All remote tasks should have finished.
    assert len(unready) == 0
Example #16
Source File: test_basic.py From ray with Apache License 2.0 | 6 votes |
def test_fair_queueing(shutdown_only):
    """Check that 1000 nested f -> g -> h task chains finish on one CPU."""
    internal_config = json.dumps({
        "fair_queueing_enabled": 1
    })
    ray.init(num_cpus=1, _internal_config=internal_config)

    @ray.remote
    def h():
        return 0

    @ray.remote
    def g():
        return ray.get(h.remote())

    @ray.remote
    def f():
        return ray.get(g.remote())

    # This will never finish without fair queueing of {f, g, h}:
    # https://github.com/ray-project/ray/issues/3644
    submitted = [f.remote() for _ in range(1000)]
    ready, _ = ray.wait(submitted, timeout=60.0, num_returns=1000)
    assert len(ready) == 1000, len(ready)
Example #17
Source File: test_multinode_failures_2.py From ray with Apache License 2.0 | 6 votes |
def test_driver_lives_parallel(ray_start_regular):
    """Terminate, kill and reap all five Ray component processes at once."""
    all_processes = ray.worker._global_node.all_processes

    component_types = (
        ray_constants.PROCESS_TYPE_PLASMA_STORE,
        ray_constants.PROCESS_TYPE_GCS_SERVER,
        ray_constants.PROCESS_TYPE_RAYLET,
        ray_constants.PROCESS_TYPE_LOG_MONITOR,
        ray_constants.PROCESS_TYPE_MONITOR,
    )
    process_infos = []
    for component in component_types:
        process_infos.extend(all_processes[component])
    assert len(process_infos) == 5

    # Kill all the components in parallel.
    for info in process_infos:
        info.process.terminate()

    time.sleep(0.1)

    for info in process_infos:
        info.process.kill()

    for info in process_infos:
        info.process.wait()

    # If the driver can reach the tearDown method, then it is still alive.
Example #18
Source File: test_failure.py From ray with Apache License 2.0 | 6 votes |
def test_actor_worker_dying(ray_start_regular):
    """Check the errors raised when an actor method exits its process.

    Getting the method's result must raise ``RayActorError``, passing that
    result to another task must raise ``RayTaskError``, and one worker-died
    error is awaited via ``wait_for_errors``.
    """

    @ray.remote
    class Actor:
        def kill(self):
            eval("exit()")

    @ray.remote
    def consume(x):
        pass

    doomed = Actor.remote()
    ready, _ = ray.wait([doomed.kill.remote()], timeout=5)
    # Exactly one ready object is expected; unpacking raises otherwise.
    (dead_result,) = ready

    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(dead_result)
    with pytest.raises(ray.exceptions.RayTaskError):
        ray.get(consume.remote(dead_result))

    wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)
Example #19
Source File: test_dynres.py From ray with Apache License 2.0 | 6 votes |
def test_dynamic_res_infeasible_rescheduling(ray_start_regular):
    """Check that an infeasible task runs once its resource is created."""
    # This test launches an infeasible task and then creates a
    # resource to make the task feasible. This tests if the
    # infeasible tasks get rescheduled when resources are
    # created at runtime.
    res_name = "test_res"
    res_capacity = 1.0

    @ray.remote
    def set_res(resource_name, resource_capacity):
        ray.experimental.set_resource(resource_name, resource_capacity)

    def f():
        return 1

    requirement = {res_name: res_capacity}
    remote_task = ray.remote(resources=requirement)(f)

    # This is infeasible
    oid = remote_task.remote()

    # Now should be feasible
    ray.get(set_res.remote(res_name, res_capacity))

    cluster_resources = ray.available_resources()
    assert cluster_resources[res_name] == res_capacity

    finished, _pending = ray.wait([oid], timeout=1)
    # The task completed
    assert finished
Example #20
Source File: ray_pool.py From rltime with Apache License 2.0 | 6 votes |
def _ensure_resources(self, instances):
    """Verify that Ray reports enough free resources for `instances` workers.

    TODO: This doesn't really work with more than 1 receiver as they create
    and check in parallel. In any case ray will not error if we create an
    actor without resources it will just wait and not be used until it can
    be run

    Args:
        instances: Number of workers to be started; the per-worker CPU and
            custom resource amounts are multiplied by it.

    Raises:
        RuntimeError: If any required resource exceeds what
            ``ray.available_resources()`` reports (missing keys count as 0).
    """
    available = ray.available_resources()

    per_worker = dict(CPU=self._cpus_per_worker)
    per_worker.update(self._custom_resources_per_worker)

    required = {}
    for name, amount in per_worker.items():
        required[name] = amount * instances

    enough = all(
        available.get(name, 0) >= needed
        for name, needed in required.items())
    if enough:
        return

    raise RuntimeError(
        "Not enough RAY resources to start the acting pool. "
        f"Need: {required} Available: {available}")
Example #21
Source File: test_component_failures_3.py From ray with Apache License 2.0 | 6 votes |
def test_driver_lives_parallel(ray_start_regular):
    """Bring down every Ray component process together.

    All five component processes are sent ``terminate``, then ``kill`` after
    a short pause, and are finally waited on.
    """
    all_processes = ray.worker._global_node.all_processes
    process_infos = [
        info
        for process_type in (
            ray_constants.PROCESS_TYPE_PLASMA_STORE,
            ray_constants.PROCESS_TYPE_GCS_SERVER,
            ray_constants.PROCESS_TYPE_RAYLET,
            ray_constants.PROCESS_TYPE_LOG_MONITOR,
            ray_constants.PROCESS_TYPE_MONITOR,
        )
        for info in all_processes[process_type]
    ]
    assert len(process_infos) == 5

    # Kill all the components in parallel.
    for victim in process_infos:
        victim.process.terminate()

    time.sleep(0.1)

    for victim in process_infos:
        victim.process.kill()

    for victim in process_infos:
        victim.process.wait()

    # If the driver can reach the tearDown method, then it is still alive.
Example #22
Source File: test_advanced_3.py From ray with Apache License 2.0 | 6 votes |
def test_wait_reconstruction(shutdown_only):
    """Check that ``ray.wait`` returns an object that had left the store.

    With object pinning disabled, a second 60 MB object is created in a
    10**8-byte store; the first object is asserted to be gone, and waiting on
    it again must still report it ready.
    """
    internal_config = json.dumps({
        "object_pinning_enabled": 0
    })
    ray.init(
        num_cpus=1,
        object_store_memory=int(10**8),
        _internal_config=internal_config)

    @ray.remote
    def f():
        return np.zeros(6 * 10**7, dtype=np.uint8)

    first_id = f.remote()
    ray.wait([first_id])
    # A second object of the same size; the assert below expects the first
    # one to no longer exist afterwards.
    ray.wait([f.remote()])
    assert not ray.worker.global_worker.core_worker.object_exists(first_id)

    ready, _ = ray.wait([first_id])
    assert len(ready) == 1
Example #23
Source File: test_actor_advanced.py From ray with Apache License 2.0 | 6 votes |
def test_kill(ray_start_regular):
    """Check ``ray.kill`` on a hanging actor and on an invalid argument."""

    @ray.remote
    class Actor:
        def hang(self):
            while True:
                time.sleep(1)

    hanging_actor = Actor.remote()
    never_done = hanging_actor.hang.remote()

    # The method loops forever, so nothing can be ready yet.
    ready, _ = ray.wait([never_done], timeout=0.5)
    assert len(ready) == 0

    ray.kill(hanging_actor, no_restart=False)
    with pytest.raises(ray.exceptions.RayActorError):
        ray.get(never_done)

    # Anything that is not an actor handle is rejected.
    with pytest.raises(ValueError):
        ray.kill("not_an_actor_handle")


# This test verifies actor creation task failure will not
# hang the caller.
Example #24
Source File: test_reference_counting.py From ray with Apache License 2.0 | 6 votes |
def test_actor_creation_task(ray_start_regular):
    """Check an actor whose creation is blocked on a missing resource.

    The actor needs the ``"init"`` resource and takes a large object as a
    constructor argument; a ping is not ready before the resource exists and
    completes after it is created.
    """

    @ray.remote
    def large_object():
        # This will be spilled to plasma.
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    @ray.remote(resources={"init": 1})
    class Actor:
        def __init__(self, dependency):
            return

        def ping(self):
            return

    # The dependency is passed inline and never bound to a local name.
    actor = Actor.remote(large_object.remote())
    ping_ref = actor.ping.remote()

    ready, _unready = ray.wait([ping_ref], timeout=1)
    assert not ready

    ray.experimental.set_resource("init", 1)
    ray.get(ping_ref)
Example #25
Source File: test_actor_advanced.py From ray with Apache License 2.0 | 5 votes |
def test_remote_checkpoint_distributed_handle(ray_start_cluster_2_nodes):
    """Check that a counter actor restores from a checkpoint.

    Remote tasks increment the counter through a forwarded handle, a
    checkpoint is taken, a non-head node's plasma store is killed, and the
    restored actor must report the expected count.
    """
    cluster = ray_start_cluster_2_nodes
    counter, ids = setup_counter_actor(test_checkpoint=True)

    @ray.remote
    def fork_many_incs(counter, num_incs):
        last = None
        for _ in range(num_incs):
            last = counter.inc.remote()
        # Only call ray.get() on the last task submitted.
        return ray.get(last)

    # Fork num_iters times.
    starting_count = ray.get(ids[-1])
    num_incs = 100
    num_iters = 10
    forks = [
        fork_many_incs.remote(counter, num_incs) for _ in range(num_iters)
    ]
    ray.wait(forks, num_returns=len(forks))
    ray.wait([counter.__ray_checkpoint__.remote()])
    expected_count = starting_count + num_incs * num_iters

    # Kill the second plasma store to get rid of the cached objects and
    # trigger the corresponding raylet to exit.
    non_head_node = get_non_head_nodes(cluster)[0]
    non_head_node.kill_plasma_store(wait=True)

    # Check that the actor restored from a checkpoint.
    assert ray.get(counter.test_restore.remote())

    # Check that the number of inc calls since actor initialization is
    # exactly zero, since there could not have been another inc call since
    # the remote checkpoint.
    calls_since_restore = ray.get(counter.get_num_inc_calls.remote())
    assert calls_since_restore == 0

    # Check that we can submit another call on the actor and get the
    # correct counter result.
    after_inc = ray.get(counter.inc.remote())
    assert after_inc == expected_count + 1
Example #26
Source File: test_component_failures.py From ray with Apache License 2.0 | 5 votes |
def test_dying_driver_wait(ray_start_regular):
    """Kill a second driver while it is blocked in ``ray.wait``.

    After the driver is killed, the awaited object is put into the store and
    all remaining Ray processes are expected to still be alive.
    """
    # Start the Ray processes.
    address_info = ray_start_regular

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    x_id = sleep_forever.remote()

    # Script for the second driver: connect and block on the same object.
    driver_template = """
import ray
ray.init("{}")
ray.wait([ray.ObjectID(ray.utils.hex_to_binary("{}"))])
"""
    driver = driver_template.format(
        address_info["redis_address"], x_id.hex())
    driver_proc = run_string_as_driver_nonblocking(driver)

    # Make sure the driver is running.
    time.sleep(1)
    assert driver_proc.poll() is None

    # Kill the driver process.
    driver_proc.kill()
    driver_proc.wait()
    time.sleep(0.1)

    # Make sure the original task hasn't finished.
    ready, _ = ray.wait([x_id], timeout=0)
    assert len(ready) == 0

    # Seal the object so the store attempts to notify the worker that the
    # wait can return.
    ray.worker.global_worker.put_object(1, x_id)
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray.services.remaining_processes_alive()
Example #27
Source File: test_failure.py From ray with Apache License 2.0 | 5 votes |
def test_get_throws_quickly_when_found_exception(ray_start_regular):
    """Check that ``ray.get`` on a list raises while one entry is blocked.

    Each phase pairs a failing actor call with a call that blocks on a
    signal; ``ray.get`` on both must raise the expected error type before the
    signal is sent.
    """
    # We use an actor instead of functions here. If we use functions, it's
    # very likely that two normal tasks are submitted before the first worker
    # is registered to Raylet. Since `maximum_startup_concurrency` is 1,
    # the worker pool will wait for the registration of the first worker
    # and skip starting new workers. The result is, the two tasks will be
    # executed sequentially, which breaks an assumption of this test case -
    # the two tasks run in parallel.
    @ray.remote
    class Actor(object):
        def bad_func1(self):
            raise Exception("Test function intentionally failed.")

        def bad_func2(self):
            os._exit(0)

        def slow_func(self, signal):
            ray.get(signal.wait.remote())

    def expect_exception(objects, exception):
        with pytest.raises(ray.exceptions.RayError) as caught:
            ray.get(objects)
        assert caught.type is exception

    # Phase 1: the failing method raises an ordinary exception.
    signal1 = SignalActor.remote()
    actor = Actor.options(max_concurrency=2).remote()
    raising_pair = [
        actor.bad_func1.remote(),
        actor.slow_func.remote(signal1),
    ]
    expect_exception(raising_pair, ray.exceptions.RayTaskError)
    ray.get(signal1.send.remote())

    # Phase 2: the failing method exits the actor process.
    signal2 = SignalActor.remote()
    actor = Actor.options(max_concurrency=2).remote()
    exiting_pair = [
        actor.bad_func2.remote(),
        actor.slow_func.remote(signal2),
    ]
    expect_exception(exiting_pair, ray.exceptions.RayActorError)
    ray.get(signal2.send.remote())
Example #28
Source File: test_basic.py From ray with Apache License 2.0 | 5 votes |
def test_background_tasks_with_max_calls(shutdown_only):
    """Check that ``max_calls=1`` tasks can return nested task results.

    First the nested results are fetched directly; then each submitting
    worker's PID is returned alongside the nested ID and the worker is
    awaited to exit after its nested result has been fetched and released.
    """
    ray.init(num_cpus=2)

    @ray.remote
    def g():
        time.sleep(0.1)
        return 0

    @ray.remote(max_calls=1, max_retries=0)
    def f():
        return [g.remote()]

    wrapped_ids = ray.get([f.remote() for _ in range(10)])

    # Should still be able to retrieve these objects, since f's workers will
    # wait for g to finish before exiting.
    ray.get([wrapper[0] for wrapper in wrapped_ids])

    @ray.remote(max_calls=1, max_retries=0)
    def f():
        return os.getpid(), g.remote()

    pid_and_ids = ray.get([f.remote() for _ in range(10)])
    while pid_and_ids:
        # Pop the entry so the list no longer holds the nested ID.
        worker_pid, inner_id = pid_and_ids.pop(0)
        ray.get(inner_id)
        # Drop the local name too before waiting for the worker to exit.
        # NOTE(review): presumably the worker only exits once no reference
        # to its nested object remains -- confirm.
        del inner_id
        ray.test_utils.wait_for_pid_to_exit(worker_pid)
Example #29
Source File: test_advanced_2.py From ray with Apache License 2.0 | 5 votes |
def test_blocking_tasks(ray_start_regular):
    """Run tasks that block on other tasks via ``ray.get`` and ``ray.wait``."""

    @ray.remote
    def f(i, j):
        return (i, j)

    @ray.remote
    def g(i):
        # Each instance of g submits and blocks on the result of another
        # remote task.
        children = [f.remote(i, j) for j in range(2)]
        return ray.get(children)

    @ray.remote
    def h(i):
        # Each instance of h submits and blocks on the result of another
        # remote task using ray.wait.
        children = [f.remote(i, j) for j in range(2)]
        return ray.wait(children, num_returns=len(children))

    ray.get([h.remote(i) for i in range(4)])

    @ray.remote
    def _sleep(i):
        time.sleep(0.01)
        return i

    @ray.remote
    def sleep():
        # Each instance of sleep submits and blocks on the result of
        # another remote task, which takes some time to execute.
        ray.get([_sleep.remote(i) for i in range(10)])

    ray.get(sleep.remote())
Example #30
Source File: test_component_failures.py From ray with Apache License 2.0 | 5 votes |
def test_dying_worker_wait(ray_start_2_cpus):
    """Kill a worker that is blocked in ``ray.wait``.

    After the worker is killed, the awaited object is put into the store and
    all remaining Ray processes are expected to still be alive.
    """
    settle_s = 0.1

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    @ray.remote
    def get_pid():
        return os.getpid()

    x_id = sleep_forever.remote()
    # Get the PID of the worker that block_in_wait will run on (sleep a little
    # to make sure that sleep_forever has already started).
    time.sleep(settle_s)
    worker_pid = ray.get(get_pid.remote())

    @ray.remote
    def block_in_wait(object_id_in_list):
        ray.wait(object_id_in_list)

    # Have the worker wait in a wait call.
    block_in_wait.remote([x_id])
    time.sleep(settle_s)

    # Kill the worker.
    os.kill(worker_pid, SIGKILL)
    time.sleep(settle_s)

    # Create the object.
    ray.worker.global_worker.put_object(1, x_id)
    time.sleep(settle_s)

    # Make sure that nothing has died.
    assert ray.services.remaining_processes_alive()


# This test checks that when a driver dies in the middle of a wait, the plasma
# store and raylet will not die.