Python joblib.hash() Examples
The following are 25 code examples of joblib.hash(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module joblib, or try the search function.
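Before the examples, a quick orientation: joblib.hash(obj) returns a hexadecimal digest string computed from the contents of an arbitrary Python object, and it hashes NumPy arrays by their data rather than by object identity. That is why the test suites below capture a hash before an operation and compare it afterwards to assert that an object was, or was not, mutated. Below is a minimal, hedged sketch of that pattern; the variable names are illustrative and not taken from any of the projects listed here.

import joblib
import numpy as np

a = np.arange(10)
b = np.arange(10)

# Distinct array objects with equal contents hash equally.
assert joblib.hash(a) == joblib.hash(b)

# Capture a checksum, perform some read-only work, then verify no mutation occurred.
before = joblib.hash(a)
a_view = a[:5]  # taking a view does not modify the parent array
assert joblib.hash(a) == before

# A different digest algorithm can be selected with `hash_name` ('md5' is the default).
print(joblib.hash(a, hash_name='sha1'))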
Example #1
Source File: test_obspvarp.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_setting_ndarray(adata):
    adata.obsp["a"] = np.ones((M, M))
    adata.varp["a"] = np.ones((N, N))
    assert np.all(adata.obsp["a"] == np.ones((M, M)))
    assert np.all(adata.varp["a"] == np.ones((N, N)))

    h = joblib.hash(adata)
    with pytest.raises(ValueError):
        adata.obsp["b"] = np.ones((int(M / 2), M))
    with pytest.raises(ValueError):
        adata.obsp["b"] = np.ones((M, int(M * 2)))
    with pytest.raises(ValueError):
        adata.varp["b"] = np.ones((int(N / 2), 10))
    with pytest.raises(ValueError):
        adata.varp["b"] = np.ones((N, int(N * 2)))
    assert h == joblib.hash(adata)
Example #2
Source File: train.py From cookiecutter-easydata with MIT License | 6 votes |
def train_model(algorithm_params=None, run_number=0, *, dataset_name, algorithm_name, hash_type, **kwargs):
    """Train a model using the specified algorithm on the given dataset."""
    metadata = {}
    ds = Dataset.load(dataset_name)
    metadata['data_hash'] = joblib.hash(ds.data, hash_name=hash_type)
    metadata['target_hash'] = joblib.hash(ds.target, hash_name=hash_type)
    model = available_algorithms(keys_only=False)[algorithm_name]
    model.set_params(**algorithm_params)

    start_time = time.time()
    model.fit(ds.data, y=ds.target)
    end_time = record_time_interval('train_model', start_time)

    metadata['start_time'] = start_time
    metadata['duration'] = end_time - start_time
    return model, metadata
Example #3
Source File: datasets.py From cookiecutter-easydata with MIT License | 6 votes |
def get_data_hashes(self, exclude_list=None, hash_type='sha1'):
    """Compute the hash of data items

    exclude_list: list or None
        List of attributes to skip. If None, skips ['metadata']
    hash_type: {'sha1', 'md5', 'sha256'}
        Algorithm to use for hashing
    """
    if exclude_list is None:
        exclude_list = ['metadata']

    ret = {'hash_type': hash_type}
    for key, value in self.items():
        if key in exclude_list:
            continue
        ret[f"{key}_hash"] = joblib.hash(value, hash_name=hash_type)
    return ret
Example #4
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_view_delitem(attr):
    adata = gen_adata((10, 10))
    getattr(adata, attr)["to_delete"] = np.ones((10, 10))
    # Shouldn't be a subclass, should be an ndarray
    assert type(getattr(adata, attr)["to_delete"]) is np.ndarray

    view = adata[5:7, :][:, :5]
    adata_hash = joblib.hash(adata)
    view_hash = joblib.hash(view)

    getattr(view, attr).__delitem__("to_delete")

    assert not view.is_view
    assert "to_delete" not in getattr(view, attr)
    assert "to_delete" in getattr(adata, attr)
    assert adata_hash == joblib.hash(adata)
    assert view_hash != joblib.hash(view)
Example #5
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_set_subset_varm(adata, subset_func):
    init_hash = joblib.hash(adata)
    orig_varm_val = adata.varm["o"].copy()

    while True:
        subset_idx = slice_subset(adata.var_names)
        if (adata[:, subset_idx]).shape[1] > 2:
            break
    subset = adata[:, subset_idx]

    internal_idx = _normalize_index(
        subset_func(np.arange(subset.varm["o"].shape[0])), subset.var_names
    )

    assert subset.is_view
    subset.varm["o"][internal_idx] = 1
    assert not subset.is_view
    assert np.all(adata.varm["o"] == orig_varm_val)

    assert init_hash == joblib.hash(adata)
Example #6
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_set_subset_obsm(adata, subset_func):
    init_hash = joblib.hash(adata)
    orig_obsm_val = adata.obsm["o"].copy()

    while True:
        subset_idx = slice_subset(adata.obs_names)
        if len(adata[subset_idx, :]) > 2:
            break
    subset = adata[subset_idx, :]

    internal_idx = _normalize_index(
        subset_func(np.arange(subset.obsm["o"].shape[0])), subset.obs_names
    )

    assert subset.is_view
    subset.obsm["o"][internal_idx] = 1
    assert not subset.is_view
    assert np.all(adata.obsm["o"] == orig_obsm_val)

    assert init_hash == joblib.hash(adata)
Example #7
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_not_set_subset_X(matrix_type, subset_func):
    adata = ad.AnnData(matrix_type(asarray(sparse.random(20, 20))))
    init_hash = joblib.hash(adata)
    orig_X_val = adata.X.copy()

    while True:
        subset_idx = slice_subset(adata.obs_names)
        if len(adata[subset_idx, :]) > 2:
            break
    subset = adata[subset_idx, :]

    subset = adata[:, subset_idx]

    internal_idx = _normalize_index(
        subset_func(np.arange(subset.X.shape[1])), subset.var_names
    )

    assert subset.is_view
    subset.X[:, internal_idx] = 1
    assert not subset.is_view
    assert not np.any(asarray(adata.X != orig_X_val))

    assert init_hash == joblib.hash(adata)
Example #8
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_set_obsm(adata):
    init_hash = joblib.hash(adata)

    dim0_size = np.random.randint(2, adata.shape[0] - 1)
    dim1_size = np.random.randint(1, 99)
    orig_obsm_val = adata.obsm["o"].copy()
    subset_idx = np.random.choice(adata.obs_names, dim0_size, replace=False)

    subset = adata[subset_idx, :]
    assert subset.is_view
    subset.obsm = dict(o=np.ones((dim0_size, dim1_size)))
    assert not subset.is_view
    assert np.all(orig_obsm_val == adata.obsm["o"])  # Checking for mutation
    assert np.all(subset.obsm["o"] == np.ones((dim0_size, dim1_size)))

    subset = adata[subset_idx, :]
    subset_hash = joblib.hash(subset)
    with pytest.raises(ValueError):
        subset.obsm = dict(o=np.ones((dim0_size + 1, dim1_size)))
    with pytest.raises(ValueError):
        subset.varm = dict(o=np.ones((dim0_size - 1, dim1_size)))
    assert subset_hash == joblib.hash(subset)

    # Only modifications have been made to a view
    assert init_hash == joblib.hash(adata)
Example #9
Source File: test_obsmvarm.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_setting_sparse(adata):
    obsm_sparse = sparse.random(M, 100)
    adata.obsm["a"] = obsm_sparse
    assert not np.any((adata.obsm["a"] != obsm_sparse).data)

    varm_sparse = sparse.random(N, 100)
    adata.varm["a"] = varm_sparse
    assert not np.any((adata.varm["a"] != varm_sparse).data)

    h = joblib.hash(adata)

    bad_obsm_sparse = sparse.random(M * 2, M)
    with pytest.raises(ValueError):
        adata.obsm["b"] = bad_obsm_sparse

    bad_varm_sparse = sparse.random(N * 2, N)
    with pytest.raises(ValueError):
        adata.varm["b"] = bad_varm_sparse

    assert h == joblib.hash(adata)
Example #10
Source File: test_obspvarp.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_setting_sparse(adata):
    obsp_sparse = sparse.random(M, M)
    adata.obsp["a"] = obsp_sparse
    assert not np.any((adata.obsp["a"] != obsp_sparse).data)

    varp_sparse = sparse.random(N, N)
    adata.varp["a"] = varp_sparse
    assert not np.any((adata.varp["a"] != varp_sparse).data)

    h = joblib.hash(adata)

    bad_obsp_sparse = sparse.random(M * 2, M)
    with pytest.raises(ValueError):
        adata.obsp["b"] = bad_obsp_sparse

    bad_varp_sparse = sparse.random(N * 2, N)
    with pytest.raises(ValueError):
        adata.varp["b"] = bad_varp_sparse

    assert h == joblib.hash(adata)
Example #11
Source File: test_obsmvarm.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_setting_ndarray(adata):
    adata.obsm["a"] = np.ones((M, 10))
    adata.varm["a"] = np.ones((N, 10))
    assert np.all(adata.obsm["a"] == np.ones((M, 10)))
    assert np.all(adata.varm["a"] == np.ones((N, 10)))

    h = joblib.hash(adata)
    with pytest.raises(ValueError):
        adata.obsm["b"] = np.ones((int(M / 2), 10))
    with pytest.raises(ValueError):
        adata.obsm["b"] = np.ones((int(M * 2), 10))
    with pytest.raises(ValueError):
        adata.varm["b"] = np.ones((int(N / 2), 10))
    with pytest.raises(ValueError):
        adata.varm["b"] = np.ones((int(N * 2), 10))
    assert h == joblib.hash(adata)
Example #12
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_set_var(adata, subset_func):
    init_hash = joblib.hash(adata)

    subset = adata[:, subset_func(adata.var_names)]
    new_var = pd.DataFrame(
        dict(a=np.ones(subset.n_vars), b=np.ones(subset.n_vars)),
        index=subset.var_names,
    )

    assert subset.is_view
    subset.var = new_var
    assert not subset.is_view
    assert np.all(subset.var == new_var)
    assert joblib.hash(adata) == init_hash
Example #13
Source File: estimator_checks.py From sktime with BSD 3-Clause "New" or "Revised" License | 5 votes |
def check_fit_does_not_overwrite_hyper_params(Estimator):
    # Check that we do not overwrite hyper-parameters in fit
    estimator = _construct_instance(Estimator)
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    fit_args = _make_args(estimator, "fit")
    estimator.fit(*fit_args)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert joblib.hash(new_value) == joblib.hash(original_value), (
            "Estimator %s should not change or mutate "
            "the parameter %s from %s to %s during fit."
            % (estimator.__class__.__name__, param_name, original_value, new_value)
        )
Example #14
Source File: train.py From cookiecutter-easydata with MIT License | 5 votes |
def save_model(metadata=None, model_path=None, hash_type='sha1', *, model_name, model):
    """Save a model to disk

    Parameters
    ----------
    model_name: str
        Unique key to use as model name (and filename)
    metadata: dict
        Model metadata
    model: sklearn estimator
        representing a model
    hash_type: {'sha1', 'md5'}
        hash algorithm to use for joblib hashing
    model_path: path, default `paths['trained_model_path']`
        Where model should be saved.

    Returns
    -------
    copy of metadata
    """
    if metadata is None:
        metadata = {}
    else:
        metadata = metadata.copy()
    if model_path is None:
        model_path = paths['trained_model_path']
    else:
        model_path = pathlib.Path(model_path)

    joblib.dump(model, model_path / f"{model_name}.model")
    metadata['model_hash'] = joblib.hash(model, hash_name=hash_type)
    save_json(model_path / f"{model_name}.metadata", metadata)
    return metadata
Example #15
Source File: datasets.py From cookiecutter-easydata with MIT License | 5 votes |
def __hash__(self):
    return hash(self.to_hash())
Example #16
Source File: datasets.py From cookiecutter-easydata with MIT License | 5 votes |
def add_url(self, url=None, *, hash_type='sha1', hash_value=None,
            name=None, file_name=None, force=False, unpack_action=None):
    """
    Add a URL to the file list

    hash_type: {'sha1', 'md5', 'sha256'}
    hash_value: string or None
        if None, hash will be computed from downloaded file
    file_name: string or None
        Name of downloaded file. If None, will be the last component of the URL
    url: string
        URL to fetch
    name: str
        text description of this file.
    force: boolean (default False)
        If True, overwrite an existing entry for this file
    unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None
        action to take in order to unpack this file. If None, infers from file type.
    """
    if url is None:
        raise Exception("`url` is required")

    file_name = infer_filename(file_name=file_name, url=url)

    fetch_dict = {
        'fetch_action': 'url',
        'file_name': file_name,
        'hash_type': hash_type,
        'hash_value': hash_value,
        'name': name,
        'url': url,
    }
    if unpack_action:
        fetch_dict.update({'unpack_action': unpack_action})

    if file_name in self.file_dict and not force:
        raise Exception(f"{file_name} already in file_dict. Use `force=True` to add anyway.")

    self.file_dict[file_name] = fetch_dict
    self.fetched_ = False
Example #17
Source File: ffx.py From sparsereg with MIT License | 5 votes |
def __hash__(self):
    return hash(joblib.hash((self._final_estimator.coef_, self._final_estimator.intercept_)))
Example #18
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_view_failed_delitem(attr):
    adata = gen_adata((10, 10))
    view = adata[5:7, :][:, :5]
    adata_hash = joblib.hash(adata)
    view_hash = joblib.hash(view)

    with pytest.raises(KeyError):
        getattr(view, attr).__delitem__("not a key")

    assert view.is_view
    assert adata_hash == joblib.hash(adata)
    assert view_hash == joblib.hash(view)
Example #19
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_set_obsm_key(adata):
    init_hash = joblib.hash(adata)

    orig_obsm_val = adata.obsm["o"].copy()
    subset_obsm = adata[:50]
    assert subset_obsm.is_view
    subset_obsm.obsm["o"] = np.ones((50, 20))
    assert not subset_obsm.is_view
    assert np.all(adata.obsm["o"] == orig_obsm_val)

    assert init_hash == joblib.hash(adata)
Example #20
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_set_varm_key(adata):
    init_hash = joblib.hash(adata)

    orig_varm_val = adata.varm["o"].copy()
    subset_varm = adata[:, :50]
    assert subset_varm.is_view
    subset_varm.varm["o"] = np.ones((50, 20))
    assert not subset_varm.is_view
    assert np.all(adata.varm["o"] == orig_varm_val)

    assert init_hash == joblib.hash(adata)
Example #21
Source File: test_views.py From anndata with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_set_obs(adata, subset_func):
    init_hash = joblib.hash(adata)

    subset = adata[subset_func(adata.obs_names), :]
    new_obs = pd.DataFrame(
        dict(a=np.ones(subset.n_obs), b=np.ones(subset.n_obs)),
        index=subset.obs_names,
    )

    assert subset.is_view
    subset.obs = new_obs
    assert not subset.is_view
    assert np.all(subset.obs == new_obs)
    assert joblib.hash(adata) == init_hash
Example #22
Source File: datasets.py From cookiecutter-easydata with MIT License | 4 votes |
def __init__(self, name='datasource', parse_function=None, dataset_dir=None, file_list=None):
    """Create a DataSource

    Parameters
    ----------
    name: str
        name of dataset
    parse_function: func (or partial)
        Function that will be called to process raw data into usable Dataset
    dataset_dir: path
        default location for raw files
    file_list: list
        list of file_dicts associated with this DataSource.
        Valid keys for each file_dict include:
            url: (optional) URL of resource to be fetched
            hash_type: {'sha1', 'md5', 'sha256'}
                Type of hash function used to verify file integrity
            hash_value: string
                Value of hash used to verify file integrity
            file_name: string (optional)
                filename to use when saving file locally.
                If omitted, it will be inferred from url or source_file
            name: string or {'DESCR', 'LICENSE'} (optional)
                description of the file. If DESCR or LICENSE, will be used as metadata
            unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None
                action to take in order to unpack this file. If None, infers from file type.
    """
    if file_list is None:
        file_list = []
    if dataset_dir is None:
        dataset_dir = paths['raw_data_path']
    if parse_function is None:
        parse_function = process_dataset_default

    self.name = name
    self.file_dict = {infer_filename(**item): item for item in file_list}
    self.parse_function = parse_function
    self.dataset_dir = dataset_dir

    # sklearn-style attributes. Usually these would be set in fit()
    self.fetched_ = False
    self.fetched_files_ = []
    self.unpacked_ = False
    self.unpack_path_ = None
Example #23
Source File: datasets.py From cookiecutter-easydata with MIT License | 4 votes |
def add_file(self, source_file=None, *, hash_type='sha1', hash_value=None,
             name=None, file_name=None, unpack_action=None, force=False):
    """
    Add a file to the file list.

    This file must exist on disk, as there is no method specified for fetching it.
    This is useful when the data source requires an offline procedure for downloading.

    hash_type: {'sha1', 'md5', 'sha256'}
    hash_value: string or None
        if None, hash will be computed from specified file
    file_name: string
        Name of destination file. relative to paths['raw_data_dir']
    name: str
        text description of this file.
    source_file: path
        file to be copied
    force: boolean (default False)
        If True, overwrite an existing entry for this file
    unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None
        action to take in order to unpack this file. If None, infers from file type.
    """
    if source_file is None:
        raise Exception("`source_file` is required")
    source_file = pathlib.Path(source_file)
    if not source_file.exists():
        logger.warning(f"{source_file} not found on disk")

    file_name = infer_filename(file_name=file_name, source_file=source_file)

    if hash_value is None:
        logger.debug(f"Hash unspecified. Computing {hash_type} hash of {source_file.name}")
        hash_value = hash_file(source_file, algorithm=hash_type).hexdigest()

    fetch_dict = {
        'fetch_action': 'copy',
        'file_name': file_name,
        'hash_type': hash_type,
        'hash_value': hash_value,
        'name': name,
        'source_file': str(source_file),
    }
    if unpack_action:
        fetch_dict.update({'unpack_action': unpack_action})

    existing_files = [f['source_file'] for k, f in self.file_dict.items()]
    existing_hashes = [f['hash_value'] for k, f in self.file_dict.items() if f['hash_value']]

    if file_name in self.file_dict and not force:
        raise Exception(f"{file_name} already in file_dict. Use `force=True` to add anyway.")
    if str(source_file.name) in existing_files and not force:
        raise Exception(f"source file: {source_file} already in file list. Use `force=True` to add anyway.")
    if hash_value in existing_hashes and not force:
        raise Exception(f"file with hash {hash_value} already in file list. Use `force=True` to add anyway.")

    logger.warning("Reproducibility Issue: add_file is often not reproducible. If possible, use add_manual_download instead")
    self.file_dict[file_name] = fetch_dict
    self.fetched_ = False
Example #24
Source File: datasets.py From cookiecutter-easydata with MIT License | 4 votes |
def fetch(self, fetch_path=None, force=False):
    """Fetch files in the `file_dict` to `raw_data_dir` and check hashes.

    Parameters
    ----------
    fetch_path: None or string
        By default, assumes dataset_dir
    force: Boolean
        If True, ignore the cache and re-download the fetch each time
    """
    if self.fetched_ and force is False:
        # validate the downloaded files:
        for filename, item in self.file_dict.items():
            raw_data_file = paths['raw_data_path'] / filename
            if not raw_data_file.exists():
                logger.warning(f"{raw_data_file.name} missing. Invalidating fetch cache")
                self.fetched_ = False
                break
            raw_file_hash = hash_file(raw_data_file, algorithm=item['hash_type']).hexdigest()
            if raw_file_hash != item['hash_value']:
                logger.warning(f"{raw_data_file.name} {item['hash_type']} hash invalid "
                               f"({raw_file_hash} != {item['hash_value']}). Invalidating fetch cache.")
                self.fetched_ = False
                break
        else:
            logger.debug(f'Data Source {self.name} is already fetched. Skipping')
            return

    if fetch_path is None:
        fetch_path = self.dataset_dir
    else:
        fetch_path = pathlib.Path(fetch_path)

    self.fetched_ = False
    self.fetched_files_ = []
    for key, item in self.file_dict.items():
        status, result, hash_value = fetch_file(**item)
        logger.debug(f"Fetching {key}: status:{status}")
        if status:  # True (cached) or HTTP Code (successful download)
            item['hash_value'] = hash_value
            item['file_name'] = result.name
            self.fetched_files_.append(result)
        else:
            if item.get('fetch_action', False) != 'message':
                logger.error(f"fetch of {key} returned: {result}")
            break
    else:
        self.fetched_ = True

    self.unpacked_ = False
    return self.fetched_
Example #25
Source File: test_hdf5_backing.py From anndata with BSD 3-Clause "New" or "Revised" License | 4 votes |
def test_backing(adata, tmp_path, backing_h5ad):
    assert not adata.isbacked

    adata.filename = backing_h5ad
    adata.write()
    assert not adata.file.is_open
    assert adata.isbacked
    assert adata[:, 0].is_view
    assert adata[:, 0].X.tolist() == np.reshape([1, 4, 7], (3, 1)).tolist()
    # this might give us trouble as the user might not
    # know that the file is open again....
    assert adata.file.is_open

    adata[:2, 0].X = [0, 0]
    assert adata[:, 0].X.tolist() == np.reshape([0, 0, 7], (3, 1)).tolist()

    adata_subset = adata[:2, [0, 1]]
    assert adata_subset.is_view
    subset_hash = joblib.hash(adata_subset)

    # cannot set view in backing mode...
    with pytest.raises(ValueError):
        adata_subset.obs["foo"] = range(2)
    with pytest.raises(ValueError):
        adata_subset.var["bar"] = -12
    with pytest.raises(ValueError):
        adata_subset.obsm["o2"] = np.ones((2, 2))
    with pytest.raises(ValueError):
        adata_subset.varm["v2"] = np.zeros((2, 2))
    with pytest.raises(ValueError):
        adata_subset.layers["float2"] = adata_subset.layers["float"].copy()

    # Things should stay the same after failed operations
    assert subset_hash == joblib.hash(adata_subset)
    assert adata_subset.is_view

    # need to copy first
    adata_subset = adata_subset.copy(tmp_path / "test.subset.h5ad")
    # now transition to actual object
    assert not adata_subset.is_view
    adata_subset.obs["foo"] = range(2)
    assert not adata_subset.is_view
    assert adata_subset.isbacked
    assert adata_subset.obs["foo"].tolist() == list(range(2))

    # save
    adata_subset.write()
    # TODO: Also test updating the backing file inplace