Python dask.compute() Examples
The following are 30 code examples of dask.compute().
You can go to the original project or source file by following the reference above each example.
You may also want to check out all available functions and classes of the dask module.
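Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them share: wrap ordinary functions with dask.delayed to build lazy tasks, then evaluate them all with a single dask.compute() call. The double function and its inputs are illustrative placeholders, not taken from any of the projects below.

import dask

@dask.delayed
def double(x):
    # Nothing runs here; calling a delayed function only records a task.
    return 2 * x

tasks = [double(i) for i in range(4)]
# dask.compute() evaluates every delayed object passed to it and
# returns a tuple of concrete results, one per argument.
results = dask.compute(*tasks)
print(results)  # (0, 2, 4, 6)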
Example #1
Source File: k_means.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def fit(self, X, y=None):
    X = self._check_array(X)
    labels, centroids, inertia, n_iter = k_means(
        X,
        self.n_clusters,
        oversampling_factor=self.oversampling_factor,
        random_state=self.random_state,
        init=self.init,
        return_n_iter=True,
        max_iter=self.max_iter,
        init_max_iter=self.init_max_iter,
        tol=self.tol,
    )
    self.cluster_centers_ = centroids
    self.labels_ = labels
    self.inertia_ = inertia.compute().item()
    self.n_iter_ = n_iter
    self.n_features_in_ = X.shape[1]
    return self
Example #2
Source File: cache.py From intake with BSD 2-Clause "Simplified" License
def _load(self, files_in, files_out, urlpath, meta=True):
    """Download a set of files"""
    import dask
    out = []
    outnames = []
    for file_in, file_out in zip(files_in, files_out):
        cache_path = file_out.path
        outnames.append(cache_path)

        # If `_munge_path` did not find a match we want to avoid
        # writing to the urlpath.
        if cache_path == urlpath:
            continue
        if not os.path.isfile(cache_path):
            logger.debug("Caching file: {}".format(file_in.path))
            logger.debug("Original path: {}".format(urlpath))
            logger.debug("Cached at: {}".format(cache_path))
            if meta:
                self._log_metadata(urlpath, file_in.path, cache_path)
            ddown = dask.delayed(_download)
            out.append(ddown(file_in, file_out, self.blocksize, self.output))
    dask.compute(*out)
    return outnames
Example #3
Source File: semistructured.py From intake with BSD 2-Clause "Simplified" License
def _data_to_source(b, path, encoder=None, storage_options=None, **kwargs):
    import dask.bag as db
    import posixpath
    from fsspec import open_files
    import dask
    import pickle
    import json
    from intake.source.textfiles import TextFilesSource

    encoder = {None: str, 'str': str, 'json': json.dumps,
               'pickle': pickle.dumps}.get(encoder, encoder)
    if not hasattr(b, 'to_textfiles'):
        try:
            b = db.from_sequence(b, npartitions=1)
        except TypeError:
            raise NotImplementedError

    files = open_files(posixpath.join(path, 'part.*'), mode='wt',
                       num=b.npartitions, **(storage_options or {}))
    dwrite = dask.delayed(write_file)
    out = [dwrite(part, f, encoder)
           for part, f in zip(b.to_delayed(), files)]
    dask.compute(out)
    s = TextFilesSource(posixpath.join(path, 'part.*'),
                        storage_options=storage_options)
    return s
Example #4
Source File: views.py From AutoOut with MIT License
def detect_outliers(request):
    """
    Detect outliers end point
    """
    dataset_id = int(request.GET.get("dataset_id"))

    if dataset_id is None:
        return JsonResponse({"status": "failure",
                             "message": "Dataset id is not provided"})

    dataset = Dataset.objects.get(pk=dataset_id)
    file_path = dataset.path
    delete_features = json.loads(dataset.deleted_features)

    # Create a detection experiment and start outlier detection
    process = Process.objects.get(name='Detection')
    process_status = ProcessStatus.objects.get(name='Running')
    experiment = Experiment(dataset=dataset, process=process,
                            process_status=process_status)
    experiment.save()

    results = delayed(detect_all)(os.path.join(settings.MEDIA_ROOT, file_path),
                                  experiment.id, settings.RESULTS_ROOT,
                                  delete_features)
    dask.compute(results)

    return JsonResponse(
        {'status': 'success',
         'message': 'Detection started successfully',
         'experiment_id': experiment.id})
Example #5
Source File: catalog.py From nbodykit with GNU General Public License v3.0
def persist(self, columns=None):
    """
    Return a CatalogSource, where the selected columns are
    computed and persist in memory.
    """
    import dask.array as da
    if columns is None:
        columns = self.columns

    r = {}
    for key in columns:
        r[key] = self[key]

    r = da.compute(r)[0]  # particularity of dask

    from nbodykit.source.catalog.array import ArrayCatalog
    c = ArrayCatalog(r, comm=self.comm)
    c.attrs.update(self.attrs)
    return c
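The da.compute(r)[0] line above (the "particularity of dask" noted in the source) works because compute traverses built-in containers such as dicts and returns a one-element tuple holding the computed container. A minimal sketch of that behavior using the top-level dask.compute and made-up array shapes, not taken from nbodykit:

import dask
import dask.array as da

lazy = {"x": da.ones((4,), chunks=2), "y": da.zeros((4,), chunks=2)}
# Passing a single dict yields a one-element tuple whose only item is
# the same dict with every dask array replaced by a NumPy array.
computed = dask.compute(lazy)[0]
print(computed["x"])  # [1. 1. 1. 1.]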
Example #6
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0
def calculate_stats(cls, df, target_var):
    """Calculates descriptive stats of the dataframe required for cleaning.

    Arguments:
        df : dask dataframe, The dataframe at hand
        target_var : string, Dependent variable for the analysis

    Returns:
        mean : dask series, mean of each column
        median : dask series, median of each column
        dict(zip(categorical_cols, mode)) : dict, Dictionary containing
            categorical column as keys and their modes as values
        std : dask series, standard deviation of each column
    """
    categorical_columns = [
        col for col in df.columns
        if col != target_var and df[col].dtype == 'object']
    mean_op = df.mean()
    std_op = df.std()
    median_op = df.quantile(0.5)
    mode_op = [df[col].value_counts().idxmax()
               for col in categorical_columns]
    mean, median, mode, std = dask.compute(
        mean_op, median_op, mode_op, std_op)
    return mean, median, dict(zip(categorical_columns, mode)), std
Example #7
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0
def impute(cls, df, target_var, median, mode):
    """Imputing missing values using median for continuous columns
    and mode for categorical columns.

    Arguments:
        df : dask dataframe, The dataframe at hand
        target_var : string, Dependent variable for the analysis
        median : list, median of all columns in data
        mode : list, mode of all columns in data

    Returns:
        df : dask dataframe, Dataframe without missing values
    """
    missing_stats = df.isna().sum().compute()
    cols = [col for col in df.columns if col != target_var]
    for col in cols:
        if missing_stats[col] > 0 and df[col].dtype == 'object':
            df[col] = df[col].fillna(mode[col])
        elif missing_stats[col] > 0:
            df[col] = df[col].fillna(median[col])
    return df
Example #8
Source File: input_pipeline_dask.py From professional-services with Apache License 2.0
def kmeans_input_fn(self, name, csv_path=None):
    """Input function for kmeans

    Arguments:
        name : string, Name of the data [Train or Eval]
        csv_path : The path of the csv on any storage system

    Returns:
        A batch of features
    """
    pattern = self._get_pattern(name, csv_path)
    tf.logging.info('The Pattern of files is : %s', pattern)
    df = dd.read_csv(pattern)
    vectors = dask.compute(df.values)
    return tf.train.limit_epochs(
        tf.convert_to_tensor(vectors[0], dtype=tf.float32), num_epochs=1)
Example #9
Source File: tests_input_dask.py From professional-services with Apache License 2.0
def test_clean_data(self):
    """Testing function clean_csv"""
    copyfile(CSV_PATH, '/tmp/data.csv')
    iread = self.init_inputreader()
    stats = self.init_basicstats()
    ddf, _ = iread._parse_csv()
    data, mean, std_dev, csv_defaults = stats.clean_data(
        df=ddf,
        task_type=TASK_TYPE,
        target_var=TARGET_VAR,
        name=NAME
    )
    self_computed_mean = dask.compute(ddf.mean())
    self.assertListEqual(list(mean), list(self_computed_mean[0]))
    self_computed_std_dev = dask.compute(ddf.std(axis=0, skipna=True))
    self.assertListEqual(list(std_dev), list(self_computed_std_dev[0]))
    self.assertIsInstance(data, dask.dataframe.core.DataFrame)
    self.assertIsInstance(mean, pd.core.series.Series)
    self.assertIsInstance(std_dev, pd.core.series.Series)
    self.assertIsInstance(csv_defaults, list)
Example #10
Source File: tests_input_dask.py From professional-services with Apache License 2.0
def test_calculate_stats(self):
    """Testing function calculate_stats"""
    iread = self.init_inputreader()
    stats = self.init_basicstats()
    ddf, _ = iread._parse_csv()
    mean, median, mode_dict, std_dev = stats.calculate_stats(
        df=ddf,
        target_var=TARGET_VAR
    )
    self_computed_mean = dask.compute(ddf.mean())
    self.assertListEqual(list(mean), list(self_computed_mean[0]))
    self_computed_std_dev = dask.compute(ddf.std(axis=0, skipna=True))
    self.assertListEqual(list(std_dev), list(self_computed_std_dev[0]))
    self_computed_median = dask.compute(ddf.quantile(0.5))
    self.assertListEqual(list(median), list(self_computed_median[0]))
    self.assertIsInstance(mean, pd.core.series.Series)
    self.assertIsInstance(std_dev, pd.core.series.Series)
    self.assertIsInstance(median, pd.core.series.Series)
    self.assertIsInstance(mode_dict, dict)
Example #11
Source File: tests_input_dask.py From professional-services with Apache License 2.0
def test_impute(self):
    """Testing function impute"""
    iread = self.init_inputreader()
    stats = self.init_basicstats()
    ddf, _ = iread._parse_csv()
    _, median, _, _ = stats.calculate_stats(
        df=ddf,
        target_var=TARGET_VAR
    )
    data = stats.impute(
        df=ddf,
        target_var=TARGET_VAR,
        median=median,
        mode=MODE
    )
    imputed_data = dask.compute(data.isnull().sum())
    rows = ddf.columns
    for row in rows:
        col = imputed_data[0][row]
        self.assertEqual(col, 0)
    self.assertIsInstance(data, dask.dataframe.core.DataFrame)
Example #12
Source File: cross_registration.py From minian with GNU General Public License v3.0
def calculate_centroids_old(cnmds, window, grp_dim=['animal', 'session']):
    print("computing centroids")
    cnt_list = []
    for anm, cur_anm in cnmds.groupby('animal'):
        for ss, cur_ss in cur_anm.groupby('session'):
            # cnt = centroids(cur_ss['A_shifted'], window.sel(animal=anm))
            cnt = da.delayed(centroids)(
                cur_ss['A_shifted'], window.sel(animal=anm))
            cnt_list.append(cnt)
    with ProgressBar():
        cnt_list, = da.compute(cnt_list)
    cnts_ds = pd.concat(cnt_list, ignore_index=True)
    cnts_ds.height = cnts_ds.height.astype(float)
    cnts_ds.width = cnts_ds.width.astype(float)
    cnts_ds.unit_id = cnts_ds.unit_id.astype(int)
    cnts_ds.animal = cnts_ds.animal.astype(str)
    cnts_ds.session = cnts_ds.session.astype(str)
    cnts_ds.session_id = cnts_ds.session_id.astype(str)
    return cnts_ds
Example #13
Source File: cross_registration.py From minian with GNU General Public License v3.0
def centroids_distance_old(cents, A, window, shift, hamming, corr,
                           tile=(50, 50)):
    sessions = cents['session'].unique()
    dim_h = (np.min(cents['height']), np.max(cents['height']))
    dim_w = (np.min(cents['width']), np.max(cents['width']))
    dist_list = []
    for ssA, ssB in itt.combinations(sessions, 2):
        # dist = _calc_cent_dist(ssA, ssB, cents, cnmds, window, tile, dim_h, dim_w)
        dist = da.delayed(_calc_cent_dist)(ssA, ssB, cents, A, window, tile,
                                           dim_h, dim_w, shift, hamming, corr)
        dist_list.append(dist)
    with ProgressBar():
        dist_list, = da.compute(dist_list)
    dists = pd.concat(dist_list, ignore_index=True)
    return dists
Example #14
Source File: cnmf.py From minian with GNU General Public License v3.0
def get_noise_welch(varr, noise_range=(0.25, 0.5),
                    noise_method='logmexp', compute=True):
    print("estimating noise")
    sn = xr.apply_ufunc(
        noise_welch,
        varr.chunk(dict(frame=-1)),
        input_core_dims=[['frame']],
        dask='parallelized',
        vectorize=True,
        kwargs=dict(noise_range=noise_range, noise_method=noise_method),
        output_dtypes=[varr.dtype])
    if compute:
        sn = sn.compute()
    return sn
Example #15
Source File: metsim.py From MetSim with GNU General Public License v3.0
def run(self):
    self._validate_setup()
    write_locks = {}
    for times in self._times:
        filename = self._get_output_filename(times)
        self.setup_netcdf_output(filename, times)
        write_locks[filename] = combine_locks(
            [NETCDFC_LOCK, get_write_lock(filename)])
    self.logger.info('Starting {} chunks...'.format(len(self.slices)))

    delayed_objs = [wrap_run_slice(self.params, write_locks, dslice)
                    for dslice in self.slices]
    persisted = dask.persist(delayed_objs,
                             num_workers=self.params['num_workers'])
    self.progress_bar(persisted)
    dask.compute(persisted)
    self.logger.info('Cleaning up...')
    try:
        self._client.cluster.close()
        self._client.close()
        if self.params['verbose'] == logging.DEBUG:
            print()
            print('closed dask cluster/client')
    except Exception:
        pass
Example #16
Source File: benchmark.py From SDV with MIT License
def benchmark(datasets=None, datasets_path=None, distributed=True,
              timeout=None):
    if datasets is None:
        if datasets_path is None:
            datasets = get_available_demos().name
        else:
            datasets = os.listdir(datasets_path)

    if distributed:
        import dask
        global score_dataset
        score_dataset = dask.delayed(score_dataset)

    scores = list()
    for dataset in datasets:
        scores.append(score_dataset(dataset, datasets_path, timeout))

    if distributed:
        scores = dask.compute(*scores)

    return pd.DataFrame(scores)
Example #17
Source File: _blockwise.py From dask-ml with BSD 3-Clause "New" or "Revised" License
def fit(self, X, y, **kwargs):
    X = self._check_array(X)
    estimatord = dask.delayed(self.estimator)

    Xs = X.to_delayed()
    ys = y.to_delayed()
    if isinstance(X, da.Array):
        Xs = Xs.flatten()
    if isinstance(y, da.Array):
        ys = ys.flatten()

    if len(Xs) != len(ys):
        raise ValueError(
            f"The number of blocks in X and y must match. {len(Xs)} != {len(ys)}"
        )

    estimators = [
        dask.delayed(sklearn.base.clone)(estimatord) for _ in range(len(Xs))
    ]
    results = [
        estimator_.fit(X_, y_, **kwargs)
        for estimator_, X_, y_, in zip(estimators, Xs, ys)
    ]
    results = list(dask.compute(*results))
    self.estimators_ = results
Example #18
Source File: semistructured.py From intake with BSD 2-Clause "Simplified" License
def _get_partition(self, i):
    self._load_metadata()
    return self.parts[i].compute()
Example #19
Source File: preprocessing.py From minian with GNU General Public License v3.0
def remove_background_old(varray, window=51):
    print("creating parallel schedule")
    varr_ft = varray.astype(np.float32)
    compute_list = []
    for fid in varr_ft.coords['frame'].values:
        fm = varr_ft.loc[dict(frame=fid)]
        _ = delayed(remove_background_perframe_old)(fid, fm, varr_ft, window)
        compute_list.append(_)
    with ProgressBar():
        print("removing background")
        compute(compute_list)
    print("normalizing result")
    varr_ft = scale_varr(varr_ft, (0, 255)).astype(varray.dtype, copy=False)
    print("background removal done")
    return varr_ft.rename(varray.name + "_Filtered")
Example #20
Source File: test_merge.py From kartothek with MIT License
def _merge_datasets(*args, **kwargs):
    df_list = merge_datasets_as_delayed(*args, **kwargs)
    s = pickle.dumps(df_list, pickle.HIGHEST_PROTOCOL)
    df_list = pickle.loads(s)
    return dask.compute(df_list)[0]
Example #21
Source File: initialization.py From minian with GNU General Public License v3.0
def seeds_init(varr, wnd_size=500, method='rolling', stp_size=200,
               nchunk=100, max_wnd=10, diff_thres=2):
    print("constructing chunks")
    idx_fm = varr.coords['frame']
    nfm = len(idx_fm)
    if method == 'rolling':
        nstp = np.ceil(nfm / stp_size) + 1
        centers = np.linspace(0, nfm - 1, nstp)
        hwnd = np.ceil(wnd_size / 2)
        max_idx = list(
            map(lambda c: slice(int(np.floor(c - hwnd).clip(0)),
                                int(np.ceil(c + hwnd))),
                centers))
    elif method == 'random':
        max_idx = [
            np.random.randint(0, nfm - 1, wnd_size) for _ in range(nchunk)
        ]
    res = []
    print("creating parallel scheme")
    res = [max_proj_frame(varr, cur_idx) for cur_idx in max_idx]
    max_res = xr.concat(res, 'sample').chunk(dict(sample=10))
    print("computing max projections")
    max_res = max_res.persist()
    print("calculating local maximum")
    loc_max = xr.apply_ufunc(
        local_max_roll,
        max_res.chunk(dict(height=-1, width=-1)),
        input_core_dims=[['height', 'width']],
        output_core_dims=[['height', 'width']],
        vectorize=True,
        dask='parallelized',
        output_dtypes=[np.uint8],
        kwargs=dict(k0=2, k1=max_wnd, diff=diff_thres)).sum('sample')
    loc_max = loc_max.compute()
    loc_max_flt = loc_max.stack(spatial=['height', 'width'])
    seeds = (loc_max_flt.where(loc_max_flt > 0, drop=True)
             .rename('seeds').to_dataframe().reset_index())
    return seeds[['height', 'width', 'seeds']].reset_index()
Example #22
Source File: test_gc.py From kartothek with MIT License
def _run_garbage_collect(*args, **kwargs):
    tasks = garbage_collect_dataset__delayed(*args, **kwargs)
    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)
    dask.compute(tasks)
Example #23
Source File: test_delete.py From kartothek with MIT License
def _delete(*args, **kwargs):
    tasks = delete_dataset__delayed(*args, **kwargs)
    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)
    dask.compute(tasks)
Example #24
Source File: semistructured.py From intake with BSD 2-Clause "Simplified" License
def read(self):
    self._load_metadata()
    return self.bag.compute()
Example #25
Source File: visualization.py From minian with GNU General Public License v3.0
def compute_subs(self, clicks=None):
    self.A_sub = self.A_sub.compute()
    self.C_sub = self.C_sub.compute()
    self.S_sub = self.S_sub.compute()
    self.org_sub = self.org_sub.compute()
    self.C_norm_sub = self.C_norm_sub.compute()
    self.S_norm_sub = self.S_norm_sub.compute()
Example #26
Source File: visualization.py From minian with GNU General Public License v3.0
def _temp_comp_sub(self, usub=None):
    if usub is None:
        usub = self.strm_usub.usub
    if self._normalize:
        C, S = self.C_norm_sub, self.S_norm_sub
    else:
        C, S = self.C_sub, self.S_sub
    cur_temp = dict()
    if self._showC:
        cur_temp['C'] = (
            hv.Dataset(C.sel(unit_id=usub)
                       .compute().rename("Intensity (A. U.)")
                       .dropna('frame', how='all')).to(hv.Curve, 'frame'))
    if self._showS:
        cur_temp['S'] = (
            hv.Dataset(S.sel(unit_id=usub)
                       .compute().rename("Intensity (A. U.)")
                       .dropna('frame', how='all')).to(hv.Curve, 'frame'))
    cur_vl = (hv.DynamicMap(
        lambda f, y: hv.VLine(f) if f else hv.VLine(0),
        streams=[self.strm_f])
        .opts(style=dict(color='red')))
    cur_cv = hv.Curve([], kdims=['frame'], vdims=['Internsity (A.U.)'])
    self.strm_f.source = cur_cv
    h_cv = len(self._w) // 8
    w_cv = len(self._w) * 2
    temp_comp = (cur_cv
                 * datashade_ndcurve(hv.HoloMap(cur_temp, 'trace')
                                     .collate().overlay('trace')
                                     .grid('unit_id')
                                     .add_dimension('time', 0, 0),
                                     'trace')
                 .opts(plot=dict(shared_xaxis=True))
                 .map(lambda p: p.opts(
                     plot=dict(frame_height=h_cv, frame_width=w_cv)),
                     hv.RGB)
                 * cur_vl)
    temp_comp[temp_comp.keys()[0]] = (temp_comp[temp_comp.keys()[0]]
                                      .opts(plot=dict(height=h_cv + 75)))
    return pn.panel(temp_comp)
Example #27
Source File: visualization.py From minian with GNU General Public License v3.0
def update_AC(self, usub=None):
    if usub is None:
        usub = self.strm_usub.usub
    if usub:
        if self._useAC:
            umask = ((self.A_sub.sel(unit_id=usub) > 0)
                     .any('unit_id'))
            A_sub = (self.A_sub.sel(unit_id=usub)
                     .where(umask, drop=True).fillna(0))
            C_sub = self.C_sub.sel(unit_id=usub)
            AC = xr.apply_ufunc(
                da.dot,
                A_sub,
                C_sub,
                input_core_dims=[['height', 'width', 'unit_id'],
                                 ['unit_id', 'frame']],
                output_core_dims=[['height', 'width', 'frame']],
                dask='allowed')
            self._AC = AC.compute()
            wndh, wndw = AC.coords['height'].values, AC.coords['width'].values
            window = self.A_sub.sel(
                height=slice(wndh.min(), wndh.max()),
                width=slice(wndw.min(), wndw.max()))
            self._AC = self._AC.reindex_like(window).fillna(0)
            self._mov = (self.org_sub.reindex_like(window)).compute()
        else:
            self._AC = self.A_sub.sel(unit_id=usub).sum('unit_id')
            self._mov = self.org_sub
        self.strm_f.event(x=0)
    else:
        self._AC = xr.DataArray([])
        self._mov = xr.DataArray([])
        self.strm_f.event(x=0)
Example #28
Source File: visualization.py From minian with GNU General Public License v3.0
def centroid(A, verbose=False):
    def rel_cent(im):
        im_nan = np.isnan(im)
        if im_nan.all():
            return np.array([np.nan, np.nan])
        if im_nan.any():
            im = np.nan_to_num(im)
        cent = np.array(center_of_mass(im))
        return cent / im.shape

    gu_rel_cent = da.gufunc(
        rel_cent,
        signature='(h,w)->(d)',
        output_dtypes=float,
        output_sizes=dict(d=2),
        vectorize=True
    )
    cents = (xr.apply_ufunc(
        gu_rel_cent,
        A.chunk(dict(height=-1, width=-1)),
        input_core_dims=[['height', 'width']],
        output_core_dims=[['dim']],
        dask='allowed')
        .assign_coords(dim=['height', 'width']))
    if verbose:
        print("computing centroids")
        with ProgressBar():
            cents = cents.compute()
    cents_df = (cents.rename('cents').to_series().dropna()
                .unstack('dim').rename_axis(None, axis='columns')
                .reset_index())
    h_rg = (A.coords['height'].min().values, A.coords['height'].max().values)
    w_rg = (A.coords['width'].min().values, A.coords['width'].max().values)
    cents_df['height'] = cents_df['height'] * (h_rg[1] - h_rg[0]) + h_rg[0]
    cents_df['width'] = cents_df['width'] * (w_rg[1] - w_rg[0]) + w_rg[0]
    return cents_df
Example #29
Source File: visualization_ply.py From minian with GNU General Public License v3.0
def _calculate_contours_centroids(self):
    cnts_df_list = []
    cts_df_list = []
    A = self.cnmf['A'].load()
    for uid in range(self._u):
        cur_A = A.sel(unit_id=uid)
        cur_idxs = cur_A.squeeze().dims
        cur_thres = dask.delayed(cur_A.max)()
        cur_thres = dask.delayed(float)(cur_thres * .3)
        cur_cnts = dask.delayed(find_contours)(cur_A, cur_thres)
        cur_cnts = dask.delayed(np.concatenate)(cur_cnts)
        cur_cnts = dask.delayed(pd.DataFrame)(cur_cnts, columns=cur_idxs)
        cur_cnts = cur_cnts.assign(unit_id=uid)
        cur_cts = dask.delayed(center_of_mass)(cur_A.values)
        cur_cts = dask.delayed(pd.Series)(cur_cts, index=cur_idxs)
        cur_cts = cur_cts.append(pd.Series(dict(unit_id=uid)))
        cnts_df_list.append(cur_cnts)
        cts_df_list.append(cur_cts)
    cnts_df_list = dask.compute(*cnts_df_list)
    cts_df_list = dask.compute(*cts_df_list)
    cnts_df = pd.concat(cnts_df_list)
    cts_df = pd.concat(cts_df_list, axis=1).T
    for dim in cur_idxs:
        cnts_df[dim].update(cnts_df[dim] / A.sizes[dim] * self._dims[dim])
        cts_df[dim].update(cts_df[dim] / A.sizes[dim] * self._dims[dim])
    return cnts_df, cts_df
Example #30
Source File: preprocessing.py From minian with GNU General Public License v3.0
def detect_brightspot_perframe(varray, thres=0.95):
    print("creating parallel schedule")
    spots = []
    for fid, fm in varray.rolling(frame=1):
        sp = delayed(lambda f: f > f.quantile(thres, interpolation='lower'))(
            fm)
        spots.append(sp)
    with ProgressBar():
        print("detecting bright spots by frame")
        spots, = compute(spots)
    print("concatenating results")
    spots = xr.concat(spots, dim='frame')
    return spots

# def correct_dust(varray, dust):
#     mov_corr = varray.values
#     nz = np.nonzero(dust)
#     nz_tp = [(d0, d1) for d0, d1 in zip(nz[0], nz[1])]
#     for i in range(np.count_nonzero(dust)):
#         cur_dust = (nz[0][i], nz[1][i])
#         cur_sur = set(
#             itt.product(
#                 range(cur_dust[0] - 1, cur_dust[0] + 2),
#                 range(cur_dust[1] - 1, cur_dust[1] + 2))) - set(
#                     cur_dust) - set(nz_tp)
#         cur_sur = list(
#             filter(
#                 lambda d: 0 < d[0] < mov.shape[1] and 0 < d[1] < mov.shape[2],
#                 cur_sur))
#         if len(cur_sur) > 0:
#             sur_arr = np.empty((mov.shape[0], len(cur_sur)))
#             for si, sur in enumerate(cur_sur):
#                 sur_arr[:, si] = mov[:, sur[0], sur[1]]
#             mov_corr[:, cur_dust[0], cur_dust[1]] = np.mean(sur_arr, axis=1)
#         else:
#             print("unable to correct for point ({}, {})".format(
#                 cur_dust[0], cur_dust[1]))
#     return mov_corr