Python multiprocessing.pool.ThreadPool() Examples
The following are 30 code examples of multiprocessing.pool.ThreadPool().
You may also want to check out all available functions/classes of the module multiprocessing.pool.
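Before the project examples, here is a minimal, self-contained sketch of the typical ThreadPool workflow; the pool size and the worker function are illustrative and not taken from any project below:

from multiprocessing.pool import ThreadPool

def work(item):
    # Illustrative worker. ThreadPool runs callables in threads of the
    # current process, so closures and unpicklable objects are fine.
    return item * item

# ThreadPool mirrors the multiprocessing.Pool API (map, imap, apply_async, ...).
with ThreadPool(4) as pool:
    squares = pool.map(work, range(8))

print(squares)  # [0, 1, 4, 9, 16, 25, 36, 49]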
Example #1
Source File: FEMSolver.py From florence with MIT License
def LaunchDaskDistributedClient(self, scheduler_ip=None, scheduler_port=None):

    if self.parallel and self.parallel_model == "dask" and self.is_dask_scheduler_initialised is False:

        from multiprocessing.pool import ThreadPool
        try:
            import dask
            from dask.distributed import Client, LocalCluster
        except ImportError:
            raise ImportError("dask is not installed. Install it using 'pip install dask[complete]'")

        dask.config.set(pool=ThreadPool(self.no_of_cpu_cores))

        # INITIALISE CLUSTER
        if scheduler_ip is None:
            cluster = LocalCluster(n_workers=self.no_of_cpu_cores, processes=False, threads_per_worker=None)
            client = Client(cluster)
        else:
            client = Client(scheduler_ip)

        self.dask_client = client
        self.is_dask_scheduler_initialised = True
Example #2
Source File: diagnostics.py From universe with MIT License
def __init__(self, n, probe_key, ignore_clock_skew=False, metadata_encoding=None, disable_action_probes=False):
    # Each QR code takes about 1ms (and updates at 5fps). We do
    # our best to ensure the QR is processed in time for the next
    # step call (n/16 would put us right at the threshold).
    self.pool = pool.ThreadPool(max(int(n/4), 1))
    self.qr_pool = pool.ThreadPool(max(int(n/8), 1))
    self.lock = threading.RLock()

    self.instance_n = [None] * n
    self.ignore_clock_skew = ignore_clock_skew
    self.disable_action_probes = disable_action_probes

    self.metadata_encoding = metadata_encoding

    self.update(probe_key=probe_key, metadata_encoding=metadata_encoding)

    # only used in flashgames right now
Example #3
Source File: solver.py From dogTorch with MIT License
def save_features(model, data_loaders, args):
    model.eval()
    os.makedirs(args.features_dir, exist_ok=True)
    thread_pool = pool.ThreadPool(args.workers)
    for data_loader in data_loaders:
        data_index = 0
        for input, target, prev_absolutes, next_absolutes, _ in data_loader:
            # `async=` was renamed to `non_blocking=` (and `async` is a reserved
            # word in Python 3.7+); `volatile` is a no-op on modern PyTorch.
            input = Variable(input.cuda(non_blocking=True), volatile=True)
            features = model.feats(input).data.cpu()

            features_to_save = []
            for feature in features:
                relpath = data_loader.dataset.get_relpath(data_index)
                feature_path = os.path.join(args.features_dir, relpath + '.pytar')
                features_to_save.append((feature, feature_path))
                data_index += 1
            thread_pool.map(_save_tensor, features_to_save)
Example #4
Source File: data.py From vaegan-celebs-keras with MIT License
def celeba_loader(batch_size, normalize=True, num_child=4, seed=0, workers=8):
    rng = np.random.RandomState(seed)
    images = glob.glob(images_path)
    with Pool(workers) as p:
        while True:
            rng.shuffle(images)
            for s in range(0, len(images), batch_size):
                e = s + batch_size
                batch_names = images[s:e]
                batch_images = p.map(_load_image, batch_names)
                batch_images = np.stack(batch_images)

                if normalize:
                    batch_images = batch_images / 127.5 - 1.
                    # To be sure
                    batch_images = np.clip(batch_images, -1., 1.)

                # Yield the same batch num_child times since the images will be consumed
                # by num_child different child generators
                for i in range(num_child):
                    yield batch_images
Example #5
Source File: spark_dataset_converter.py From petastorm with Apache License 2.0
def _check_dataset_file_median_size(url_list):
    fs, path_list = get_filesystem_and_path_or_paths(url_list)
    RECOMMENDED_FILE_SIZE_BYTES = 50 * 1024 * 1024

    # TODO: also check file size for other file systems.
    if isinstance(fs, LocalFileSystem):
        pool = ThreadPool(64)
        try:
            file_size_list = pool.map(os.path.getsize, path_list)
            if len(file_size_list) > 1:
                mid_index = len(file_size_list) // 2
                median_size = sorted(file_size_list)[mid_index]  # take the larger one if tie
                if median_size < RECOMMENDED_FILE_SIZE_BYTES:
                    logger.warning('The median size %d B (< 50 MB) of the parquet files is too small. '
                                   'Total size: %d B. Increase the median file size by calling df.repartition(n) or '
                                   'df.coalesce(n), which might help improve the performance. Parquet files: %s, ...',
                                   median_size, sum(file_size_list), url_list[0])
        finally:
            pool.close()
            pool.join()
Example #6
Source File: InfrastructureInfo.py From im with GNU General Public License v3.0
def destroy_vms(self, auth):
    """
    Destroy all the VMs
    """
    delete_list = list(reversed(self.get_vm_list()))

    exceptions = []
    if Config.MAX_SIMULTANEOUS_LAUNCHES > 1:
        pool = ThreadPool(processes=Config.MAX_SIMULTANEOUS_LAUNCHES)
        pool.map(
            lambda vm: vm.delete(delete_list, auth, exceptions),
            delete_list
        )
        pool.close()
    else:
        # If IM server is the first VM, then it will be the last destroyed
        for vm in delete_list:
            vm.delete(delete_list, auth, exceptions)

    if exceptions:
        msg = ""
        for e in exceptions:
            msg += str(e) + "\n"
        raise Exception("Error destroying the infrastructure: \n%s" % msg)
Example #7
Source File: migrate.py From pyspider with Apache License 2.0
def migrate(pool, from_connection, to_connection):
    """
    Migrate tool for pyspider
    """
    f = connect_database(from_connection)
    t = connect_database(to_connection)

    if isinstance(f, ProjectDB):
        for each in f.get_all():
            each = unicode_obj(each)
            logging.info("projectdb: %s", each['name'])
            t.drop(each['name'])
            t.insert(each['name'], each)
    elif isinstance(f, TaskDB):
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t),
            f.projects)
    elif isinstance(f, ResultDB):
        pool = Pool(pool)
        pool.map(
            lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t),
            f.projects)
Example #8
Source File: ctaHistoryData.py From vnpy_crypto with MIT License
def downloadAllFuturesDailyBar(self):
    """Download daily bars of the dominant contracts for all futures products"""
    start = time()
    print(u'Starting download of daily dominant-contract bars for all futures')

    productSymbolSet = self.readFuturesProductSymbol()

    print(u'Symbol list loaded successfully, product symbols: %s' % productSymbolSet)

    # A thread pool was also tested here, but the download function involves a lot
    # of data format conversion and is CPU-heavy, so multithreading brought no
    # significant improvement in efficiency.
    #p = ThreadPool(10)
    #p.map(self.downloadFuturesDailyBar, productSymbolSet)
    #p.close()
    #p.join()

    for productSymbol in productSymbolSet:
        self.downloadFuturesDailyBar(productSymbol + '0000')

    print(u'Daily dominant-contract bars for all futures downloaded, took %s seconds' % (time() - start))

#----------------------------------------------------------------------
Example #9
Source File: multithread.py From vnpy_crypto with MIT License
def test_multithread_stringio_read_csv(self):
    # see gh-11786
    max_row_range = 10000
    num_files = 100

    bytes_to_df = [
        '\n'.join(
            ['%d,%d,%d' % (i, i, i) for i in range(max_row_range)]
        ).encode() for j in range(num_files)]
    files = [BytesIO(b) for b in bytes_to_df]

    # read all files in many threads
    pool = ThreadPool(8)

    results = pool.map(self.read_csv, files)
    first_result = results[0]

    for result in results:
        tm.assert_frame_equal(first_result, result)
Example #10
Source File: agent.py From fairseq with MIT License
def decode(self, session, low=0, high=100000, num_thread=10):
    corpus_info = session.corpus_info()
    high = min(corpus_info["num_sentences"] - 1, high)
    if low >= high:
        return

    t0 = time.time()
    if num_thread > 1:
        # use the requested number of worker threads (was hardcoded to 10)
        with Pool(num_thread) as p:
            p.map(
                partial(self._decode_one, session),
                [sent_id for sent_id in range(low, high + 1)]
            )
    else:
        for sent_id in range(low, high + 1):
            self._decode_one(session, sent_id)

    print(f'Finished {low} to {high} in {time.time() - t0}s')
Example #11
Source File: _compression.py From arctic with GNU Lesser General Public License v2.1
def set_compression_pool_size(pool_size):
    """
    Set the size of the compression workers thread pool.
    If the pool is already created, it waits until all jobs are finished, and then proceeds with setting the new size.

    Parameters
    ----------
    pool_size : `int`
        The size of the pool (must be a positive integer)

    Returns
    -------
    `None`
    """
    pool_size = int(pool_size)
    if pool_size < 1:
        raise ValueError("The compression thread pool size cannot be of size {}".format(pool_size))

    global _compress_thread_pool
    if _compress_thread_pool is not None:
        _compress_thread_pool.close()
        _compress_thread_pool.join()
    _compress_thread_pool = ThreadPool(pool_size)
Example #12
Source File: selenium_downloader.py From fetchman with Apache License 2.0
def download(self, batch):
    if self.driver_pool_size:
        pool = Pool(processes=self.driver_pool_size)
    else:
        pool = Pool(processes=default_settings.DRIVER_POOL_SIZE)
    results = []
    for request in batch:
        results.append(pool.apply_async(self.download_one, (request,)))
    pool.close()
    pool.join()
    true_responses = []
    for result in results:
        true_response = result.get()
        true_responses.append(true_response)
        FetchManLogger.logger.info(true_response)
    return true_responses
Example #13
Source File: test_multi_thread.py From recruit with Apache License 2.0
def test_multi_thread_string_io_read_csv(all_parsers):
    # see gh-11786
    parser = all_parsers
    max_row_range = 10000
    num_files = 100

    bytes_to_df = [
        "\n".join(
            ["%d,%d,%d" % (i, i, i) for i in range(max_row_range)]
        ).encode() for _ in range(num_files)]
    files = [BytesIO(b) for b in bytes_to_df]

    # Read all files in many threads.
    pool = ThreadPool(8)

    results = pool.map(parser.read_csv, files)
    first_result = results[0]

    for result in results:
        tm.assert_frame_equal(first_result, result)
Example #14
Source File: dataset.py From MONAI with Apache License 2.0
def __init__(
    self, data, transform: Callable, cache_num: int = sys.maxsize, cache_rate: float = 1.0, num_workers: int = 0
):
    """
    Args:
        data (Iterable): input data to load and transform to generate dataset for model.
        transform: transforms to execute operations on input data.
        cache_num: number of items to be cached. Default is `sys.maxsize`.
            will take the minimum of (cache_num, data_length x cache_rate, data_length).
        cache_rate: percentage of cached data in total, default is 1.0 (cache all).
            will take the minimum of (cache_num, data_length x cache_rate, data_length).
        num_workers: the number of worker threads to use.
            If 0 a single thread will be used. Default is 0.
    """
    if not isinstance(transform, Compose):
        transform = Compose(transform)
    super().__init__(data, transform)
    self.cache_num = min(cache_num, int(len(self) * cache_rate), len(self))
    if self.cache_num > 0:
        self._cache = [None] * self.cache_num
        if num_workers > 0:
            self._item_processed = 0
            self._thread_lock = threading.Lock()
            with ThreadPool(num_workers) as p:
                p.map(
                    self._load_cache_item_thread,
                    [(i, data[i], transform.transforms) for i in range(self.cache_num)],
                )
        else:
            for i in range(self.cache_num):
                self._cache[i] = self._load_cache_item(data[i], transform.transforms)
                progress_bar(i + 1, self.cache_num, "Load and cache transformed data: ")
Example #15
Source File: project.py From signac with BSD 3-Clause "New" or "Revised" License
def _update_in_memory_cache(self):
    "Update the in-memory state point cache to reflect the workspace."
    logger.debug("Updating in-memory cache...")
    start = time.time()
    job_ids = set(self._job_dirs())
    cached_ids = set(self._sp_cache)
    to_add = job_ids.difference(cached_ids)
    to_remove = cached_ids.difference(job_ids)
    if to_add or to_remove:
        for _id in to_remove:
            del self._sp_cache[_id]

        def _add(_id):
            self._sp_cache[_id] = self._get_statepoint_from_workspace(_id)

        to_add_chunks = split_and_print_progress(
            iterable=list(to_add),
            num_chunks=max(1, min(100, int(len(to_add) / 1000))),
            write=logger.info,
            desc="Read metadata: ")

        with ThreadPool() as pool:
            for chunk in to_add_chunks:
                pool.map(_add, chunk)

        delta = time.time() - start
        logger.debug("Updated in-memory cache in {:.3f} seconds.".format(delta))
        return to_add, to_remove
    else:
        logger.debug("In-memory cache is up to date.")
Example #16
Source File: test_h5store.py From signac with BSD 3-Clause "New" or "Revised" License
def test_multithreading(self):

    def set_x(x):
        self.get_h5store()['x'] = x

    with closing(ThreadPool(2)) as pool:
        pool.map(set_x, range(100))
    pool.join()

    assert self.get_h5store()['x'] in set(range(100))
Example #17
Source File: test_h5store.py From signac with BSD 3-Clause "New" or "Revised" License
def test_multithreading_with_error(self):

    def set_x(x):
        self.get_h5store()['x'] = x
        if x == 50:
            raise RuntimeError()

    with pytest.raises(RuntimeError):
        with closing(ThreadPool(2)) as pool:
            pool.map(set_x, range(100))
        pool.join()

    assert self.get_h5store()['x'] in set(range(100))
Example #18
Source File: remote.py From testplan with Apache License 2.0
def _start_thread_pool(self):
    size = len(self._instances)
    try:
        if size > 2:
            self.pool = ThreadPool(5 if size > 5 else size)
    except Exception as exc:
        if isinstance(exc, AttributeError):
            self.logger.warning(
                "Please upgrade to the suggested python interpreter."
            )
Example #19
Source File: AffineInvariantFeatures.py From DoNotSnap with GNU General Public License v3.0
def __init__(self, detector, extractor):
    self.detector = detector
    self.extractor = extractor
    self.pool = ThreadPool(processes=cv2.getNumberOfCPUs())
Example #20
Source File: batch.py From pyEX with Apache License 2.0
def bulkMinuteBars(symbol, dates, token='', version='', filter=''):
    '''fetch many dates worth of minute-bars for a given symbol'''
    _raiseIfNotStr(symbol)
    dates = [_strOrDate(date) for date in dates]
    list_orig = dates.__class__

    args = []
    for date in dates:
        args.append((symbol, '1d', date, token, version, filter))

    pool = ThreadPool(20)
    rets = pool.starmap(chart, args)
    pool.close()

    return list_orig(itertools.chain(*rets))
Example #21
Source File: utils.py From kickoff-player with GNU General Public License v3.0
def thread_pool(callback, args, flatten=True):
    pool = ThreadPool(processes=cpu_count())
    data = pool.map(callback, args)

    pool.close()
    pool.join()

    if flatten:
        data = flatten_list(data)

    return data
Example #22
Source File: utils.py From bioconda-utils with MIT License
def fetch(cls, urls, descs, cb, datas):
    """Fetch data from URLs.

    This will use asyncio to manage a pool of connections at once, speeding up
    download as compared to iterative use of ``requests`` significantly. It will
    also retry on non-permanent HTTP error codes (i.e. 429, 502, 503 and 504).

    Args:
      urls: List of URLS
      descs: Matching list of descriptions (for progress display)
      cb: As each download is completed, data is passed through this function.
          Use to e.g. offload json parsing into download loop.
      datas: Matching list of data passed through to ``cb``.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    if loop.is_running():
        logger.warning("Running AsyncRequests.fetch from within running loop")
        # Workaround the fact that asyncio's loop is marked as not-reentrant
        # (it is apparently easy to patch, but not desired by the devs)
        with ThreadPool(1) as pool:
            res = pool.apply(cls.fetch, (urls, descs, cb, datas))
        return res

    task = asyncio.ensure_future(cls.async_fetch(urls, descs, cb, datas))
    try:
        loop.run_until_complete(task)
    except KeyboardInterrupt:
        task.cancel()
        loop.run_forever()
        task.exception()
    return task.result()
Example #23
Source File: api_client.py From APIv3-python-library with MIT License
@property
def pool(self):
    # Lazily create the ThreadPool on first access (see __init__ in Example #24).
    if self._pool is None:
        self._pool = ThreadPool()
    return self._pool
Example #24
Source File: api_client.py From APIv3-python-library with MIT License
def __init__(self, configuration=None, header_name=None, header_value=None, cookie=None):
    if configuration is None:
        configuration = Configuration()
    self.configuration = configuration

    # Use the pool property to lazily initialize the ThreadPool.
    self._pool = None
    self.rest_client = rest.RESTClientObject(configuration)
    self.default_headers = {}
    if header_name is not None:
        self.default_headers[header_name] = header_value
    self.cookie = cookie
    # Set default User-Agent.
    self.user_agent = 'Swagger-Codegen/1.0.0/python'
Example #25
Source File: __init__.py From oss-ftp with MIT License
def Pool(processes=None, initializer=None, initargs=()):
    from multiprocessing.pool import ThreadPool
    return ThreadPool(processes, initializer, initargs)
Example #26
Source File: __init__.py From BinderFilter with MIT License
def Pool(processes=None, initializer=None, initargs=()):
    from multiprocessing.pool import ThreadPool
    return ThreadPool(processes, initializer, initargs)
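Examples #25 and #26 are identical and appear to be vendored copies of the factory behind the stdlib's multiprocessing.dummy module, which exposes the multiprocessing.Pool constructor signature backed by threads. A short usage sketch of that stdlib equivalent, assuming only the standard library:

from multiprocessing.dummy import Pool  # thread-backed drop-in for multiprocessing.Pool

def double(n):
    return n * 2  # runs in a worker thread, not a subprocess

pool = Pool(processes=4)
try:
    print(pool.map(double, range(5)))  # [0, 2, 4, 6, 8]
finally:
    pool.close()
    pool.join()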
Example #27
Source File: _parallel_backends.py From mlens with MIT License
def configure(self, n_jobs=1, parallel=None, **backend_args):
    """Build a process or thread pool and return the number of workers"""
    n_jobs = self.effective_n_jobs(n_jobs)
    if n_jobs == 1:
        # Avoid unnecessary overhead and use sequential backend instead.
        raise FallbackToBackend(SequentialBackend())

    self.parallel = parallel
    self._pool = ThreadPool(n_jobs)
    return n_jobs
Example #28
Source File: async_pubsub.py From monaco with MIT License
def __init__(self, connection_pool, threadpool_size=5, **kwargs):
    super(AsyncPubSub, self).__init__(connection_pool, **kwargs)
    if not hasattr(threading.current_thread(), "_children"):
        threading.current_thread()._children = WeakKeyDictionary()
    self.threadpool = ThreadPool(threadpool_size)
    self.running = []
Example #29
Source File: slave.py From monaco with MIT License
def __init__(self):
    monaco = schema.Monaco()
    self.r = redis.StrictRedis(port=config['mgmt_port'])
    # ternaries are always a bad idea. this is a mess of exceptions waiting to cascade so FIXME
    if self.r.info()['role'] == 'master':
        self.rmaster = redis.StrictRedis(port=config['mgmt_port'])
    else:
        self.rmaster = redis.StrictRedis(host=self.r.info()['master_host'], port=config['mgmt_port'],
                                         socket_connect_timeout=1, socket_timeout=1)
    monaco.refresh(self.r)
    node_id = monaco.node_ids_by_hostname[config['hostname']]
    self.node = schema.MonacoNode(node_id=node_id)
    self.health_data = {}   # dictionary of app_id -> DB health
    self.app_clients = {}   # dictionary of app_id -> redis clients
    self.rps = redis.StrictRedis(port=config['mgmt_port'])
    self.pubsub = self.rps.pubsub(ignore_subscribe_messages=True)
    self.lock = threading.Lock()
    self._subscriptions = {}
    self.logger = logging.getLogger('monaco.slave')
    self.redmanager = RedisMgmt()
    self.nutmanager = NutMgmt()

    # for slave based health-checks
    self.sched = Scheduler(daemon=True)
    self.sched.start()
    self.sched.add_interval_job(self.node_health, seconds=5)  # TODO: Tune
    self.health_check_pool = ThreadPool(10)

    atexit.register(lambda: self.sched.shutdown(wait=False))
Example #30
Source File: core.py From deplicate with MIT License
def _splitpaths(paths, followlinks):
    with closing(ThreadPool()) as pool:
        upaths = pool.imap(fsdecode, paths)
        return splitpaths(set(upaths), followlinks)