Python joblib.hash() Examples

The following are 25 code examples of joblib.hash(), collected from open-source projects. You can go to the original project or source file by following the source-file reference above each example. You may also want to check out all available functions and classes of the joblib module.
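Before looking at the project code, a minimal sketch of what joblib.hash() does may help: it pickles an object (with special handling for NumPy arrays) and returns a hex digest, so two objects with identical content hash identically and any in-place mutation changes the hash. That property is what the anndata tests below rely on to assert that an operation did not modify the original object.

import numpy as np
import joblib

a = np.arange(10)
b = np.arange(10)

# Equal content gives equal hashes, even for distinct array objects.
assert joblib.hash(a) == joblib.hash(b)

# Mutating the object changes its hash.
h = joblib.hash(a)
a[0] = 99
assert joblib.hash(a) != h

# hash_name selects the digest; joblib documents 'md5' (the default) and 'sha1'.
print(joblib.hash(b, hash_name='sha1'))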
Example #1
Source File: test_obspvarp.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_setting_ndarray(adata):
    adata.obsp["a"] = np.ones((M, M))
    adata.varp["a"] = np.ones((N, N))
    assert np.all(adata.obsp["a"] == np.ones((M, M)))
    assert np.all(adata.varp["a"] == np.ones((N, N)))

    h = joblib.hash(adata)
    with pytest.raises(ValueError):
        adata.obsp["b"] = np.ones((int(M / 2), M))
    with pytest.raises(ValueError):
        adata.obsp["b"] = np.ones((M, int(M * 2)))
    with pytest.raises(ValueError):
        adata.varp["b"] = np.ones((int(N / 2), 10))
    with pytest.raises(ValueError):
        adata.varp["b"] = np.ones((N, int(N * 2)))
    assert h == joblib.hash(adata) 
Example #2
Source File: train.py    From cookiecutter-easydata with MIT License
def train_model(algorithm_params=None,
                run_number=0, *, dataset_name, algorithm_name, hash_type,
                **kwargs):
    """Train a model using the specified algorithm using the given dataset.

    """
    metadata = {}
    ds = Dataset.load(dataset_name)
    metadata['data_hash'] = joblib.hash(ds.data, hash_name=hash_type)
    metadata['target_hash'] = joblib.hash(ds.target, hash_name=hash_type)
    model = available_algorithms(keys_only=False)[algorithm_name]
    model.set_params(**algorithm_params)
    start_time = time.time()
    model.fit(ds.data, y=ds.target)
    end_time = record_time_interval('train_model', start_time)
    metadata['start_time'] = start_time
    metadata['duration'] = end_time - start_time
    return model, metadata 
Example #3
Source File: datasets.py    From cookiecutter-easydata with MIT License
def get_data_hashes(self, exclude_list=None, hash_type='sha1'):
        """Compute a the hash of data items

        exclude_list: list or None
            List of attributes to skip.
            if None, skips ['metadata']

        hash_type: {'sha1', 'md5', 'sha256'}
            Algorithm to use for hashing
        """
        if exclude_list is None:
            exclude_list = ['metadata']

        ret = {'hash_type': hash_type}
        for key, value in self.items():
            if key in exclude_list:
                continue
            ret[f"{key}_hash"] = joblib.hash(value, hash_name=hash_type)
        return ret 
Example #4
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_view_delitem(attr):
    adata = gen_adata((10, 10))
    getattr(adata, attr)["to_delete"] = np.ones((10, 10))
    # Shouldn’t be a subclass, should be an ndarray
    assert type(getattr(adata, attr)["to_delete"]) is np.ndarray
    view = adata[5:7, :][:, :5]
    adata_hash = joblib.hash(adata)
    view_hash = joblib.hash(view)

    getattr(view, attr).__delitem__("to_delete")

    assert not view.is_view
    assert "to_delete" not in getattr(view, attr)
    assert "to_delete" in getattr(adata, attr)
    assert adata_hash == joblib.hash(adata)
    assert view_hash != joblib.hash(view) 
Example #5
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_set_subset_varm(adata, subset_func):
    init_hash = joblib.hash(adata)
    orig_varm_val = adata.varm["o"].copy()

    while True:
        subset_idx = slice_subset(adata.var_names)
        if (adata[:, subset_idx]).shape[1] > 2:
            break
    subset = adata[:, subset_idx]

    internal_idx = _normalize_index(
        subset_func(np.arange(subset.varm["o"].shape[0])), subset.var_names
    )

    assert subset.is_view
    subset.varm["o"][internal_idx] = 1
    assert not subset.is_view
    assert np.all(adata.varm["o"] == orig_varm_val)

    assert init_hash == joblib.hash(adata) 
Example #6
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_set_subset_obsm(adata, subset_func):
    init_hash = joblib.hash(adata)
    orig_obsm_val = adata.obsm["o"].copy()

    while True:
        subset_idx = slice_subset(adata.obs_names)
        if len(adata[subset_idx, :]) > 2:
            break
    subset = adata[subset_idx, :]

    internal_idx = _normalize_index(
        subset_func(np.arange(subset.obsm["o"].shape[0])), subset.obs_names
    )

    assert subset.is_view
    subset.obsm["o"][internal_idx] = 1
    assert not subset.is_view
    assert np.all(adata.obsm["o"] == orig_obsm_val)

    assert init_hash == joblib.hash(adata) 
Example #7
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_not_set_subset_X(matrix_type, subset_func):
    adata = ad.AnnData(matrix_type(asarray(sparse.random(20, 20))))
    init_hash = joblib.hash(adata)
    orig_X_val = adata.X.copy()
    while True:
        subset_idx = slice_subset(adata.var_names)
        if adata[:, subset_idx].shape[1] > 2:
            break
    subset = adata[:, subset_idx]

    internal_idx = _normalize_index(
        subset_func(np.arange(subset.X.shape[1])), subset.var_names
    )
    assert subset.is_view
    subset.X[:, internal_idx] = 1
    assert not subset.is_view
    assert not np.any(asarray(adata.X != orig_X_val))

    assert init_hash == joblib.hash(adata) 
Example #8
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_set_obsm(adata):
    init_hash = joblib.hash(adata)

    dim0_size = np.random.randint(2, adata.shape[0] - 1)
    dim1_size = np.random.randint(1, 99)
    orig_obsm_val = adata.obsm["o"].copy()
    subset_idx = np.random.choice(adata.obs_names, dim0_size, replace=False)

    subset = adata[subset_idx, :]
    assert subset.is_view
    subset.obsm = dict(o=np.ones((dim0_size, dim1_size)))
    assert not subset.is_view
    assert np.all(orig_obsm_val == adata.obsm["o"])  # Checking for mutation
    assert np.all(subset.obsm["o"] == np.ones((dim0_size, dim1_size)))

    subset = adata[subset_idx, :]
    subset_hash = joblib.hash(subset)
    with pytest.raises(ValueError):
        subset.obsm = dict(o=np.ones((dim0_size + 1, dim1_size)))
    with pytest.raises(ValueError):
        subset.varm = dict(o=np.ones((dim0_size - 1, dim1_size)))
    assert subset_hash == joblib.hash(subset)

    # Only modifications have been made to a view
    assert init_hash == joblib.hash(adata) 
Example #9
Source File: test_obsmvarm.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_setting_sparse(adata):
    obsm_sparse = sparse.random(M, 100)
    adata.obsm["a"] = obsm_sparse
    assert not np.any((adata.obsm["a"] != obsm_sparse).data)

    varm_sparse = sparse.random(N, 100)
    adata.varm["a"] = varm_sparse
    assert not np.any((adata.varm["a"] != varm_sparse).data)

    h = joblib.hash(adata)

    bad_obsm_sparse = sparse.random(M * 2, M)
    with pytest.raises(ValueError):
        adata.obsm["b"] = bad_obsm_sparse

    bad_varm_sparse = sparse.random(N * 2, N)
    with pytest.raises(ValueError):
        adata.varm["b"] = bad_varm_sparse

    assert h == joblib.hash(adata) 
Example #10
Source File: test_obspvarp.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_setting_sparse(adata):
    obsp_sparse = sparse.random(M, M)
    adata.obsp["a"] = obsp_sparse
    assert not np.any((adata.obsp["a"] != obsp_sparse).data)

    varp_sparse = sparse.random(N, N)
    adata.varp["a"] = varp_sparse
    assert not np.any((adata.varp["a"] != varp_sparse).data)

    h = joblib.hash(adata)

    bad_obsp_sparse = sparse.random(M * 2, M)
    with pytest.raises(ValueError):
        adata.obsp["b"] = bad_obsp_sparse

    bad_varp_sparse = sparse.random(N * 2, N)
    with pytest.raises(ValueError):
        adata.varp["b"] = bad_varp_sparse

    assert h == joblib.hash(adata) 
Example #11
Source File: test_obsmvarm.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_setting_ndarray(adata):
    adata.obsm["a"] = np.ones((M, 10))
    adata.varm["a"] = np.ones((N, 10))
    assert np.all(adata.obsm["a"] == np.ones((M, 10)))
    assert np.all(adata.varm["a"] == np.ones((N, 10)))

    h = joblib.hash(adata)
    with pytest.raises(ValueError):
        adata.obsm["b"] = np.ones((int(M / 2), 10))
    with pytest.raises(ValueError):
        adata.obsm["b"] = np.ones((int(M * 2), 10))
    with pytest.raises(ValueError):
        adata.varm["b"] = np.ones((int(N / 2), 10))
    with pytest.raises(ValueError):
        adata.varm["b"] = np.ones((int(N * 2), 10))
    assert h == joblib.hash(adata) 
Example #12
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_set_var(adata, subset_func):
    init_hash = joblib.hash(adata)

    subset = adata[:, subset_func(adata.var_names)]

    new_var = pd.DataFrame(
        dict(a=np.ones(subset.n_vars), b=np.ones(subset.n_vars)),
        index=subset.var_names,
    )

    assert subset.is_view
    subset.var = new_var
    assert not subset.is_view
    assert np.all(subset.var == new_var)

    assert joblib.hash(adata) == init_hash 
Example #13
Source File: estimator_checks.py    From sktime with BSD 3-Clause "New" or "Revised" License
def check_fit_does_not_overwrite_hyper_params(Estimator):
    # Check that we do not overwrite hyper-parameters in fit
    estimator = _construct_instance(Estimator)
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    fit_args = _make_args(estimator, "fit")
    estimator.fit(*fit_args)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is a possible RandomState instance, but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert joblib.hash(new_value) == joblib.hash(original_value), (
                "Estimator %s should not change or mutate "
                " the parameter %s from %s to %s during fit."
                % (estimator.__class__.__name__, param_name, original_value,
                   new_value)) 
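As a side note (this is an illustration, not part of the sktime check above), the reason joblib.hash works for this comparison is that it recursively walks nested containers and sub-objects, so an in-place change anywhere inside a parameter value yields a different checksum:

import joblib

# A minimal sketch: mutating a nested sub-object changes the checksum.
params = {"alpha": 1.0, "grid": [1, 2, 3]}
before = joblib.hash(params)
params["grid"].append(4)  # in-place mutation of a sub-object
assert joblib.hash(params) != before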
Example #14
Source File: train.py    From cookiecutter-easydata with MIT License
def save_model(metadata=None, model_path=None, hash_type='sha1',
               *, model_name, model):
    """Save a model to disk

    Parameters
    ----------
    model_name: str
        Unique key to use as model name (and filename)
    metadata: dict
        Model metadata
    model:
        sklearn estimator representing a model
    hash_type: {'sha1', 'md5'}
        hash algorithm to use for joblib hashing
    model_path: path, default `paths['trained_model_path']`
        Where model should be saved.

    Returns
    -------
    dict
        Copy of `metadata`, with the computed 'model_hash' added.
    """
    if metadata is None:
        metadata = {}
    else:
        metadata = metadata.copy()

    if model_path is None:
        model_path = paths['trained_model_path']
    else:
        model_path = pathlib.Path(model_path)

    joblib.dump(model, model_path / f"{model_name}.model")
    metadata['model_hash'] = joblib.hash(model, hash_name=hash_type)
    save_json(model_path / f"{model_name}.metadata", metadata)
    return metadata 
Example #15
Source File: datasets.py    From cookiecutter-easydata with MIT License
def __hash__(self):
        return hash(self.to_hash()) 
Example #16
Source File: datasets.py    From cookiecutter-easydata with MIT License
def add_url(self, url=None, *, hash_type='sha1', hash_value=None,
                name=None, file_name=None, force=False, unpack_action=None):
        """
        Add a URL to the file list

        hash_type: {'sha1', 'md5', 'sha256'}
        hash_value: string or None
            if None, hash will be computed from downloaded file
        file_name: string or None
            Name of downloaded file. If None, will be the last component of the URL
        url: string
            URL to fetch
        name: str
            text description of this file.
        force: boolean (default False)
            If True, overwrite an existing entry for this file
        unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None
            action to take in order to unpack this file. If None, infers from file type.
        """
        if url is None:
            raise Exception("`url` is required")

        file_name = infer_filename(file_name=file_name, url=url)

        fetch_dict = {
            'fetch_action': 'url',
            'file_name': file_name,
            'hash_type': hash_type,
            'hash_value': hash_value,
            'name': name,
            'url': url,
        }
        if unpack_action:
            fetch_dict.update({'unpack_action': unpack_action})

        if file_name in self.file_dict and not force:
            raise Exception(f"{file_name} already in file_dict. Use `force=True` to add anyway.")
        self.file_dict[file_name] = fetch_dict
        self.fetched_ = False 
Example #17
Source File: ffx.py    From sparsereg with MIT License
def __hash__(self):
        return hash(joblib.hash((self._final_estimator.coef_, self._final_estimator.intercept_))) 
Example #18
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_view_failed_delitem(attr):
    adata = gen_adata((10, 10))
    view = adata[5:7, :][:, :5]
    adata_hash = joblib.hash(adata)
    view_hash = joblib.hash(view)

    with pytest.raises(KeyError):
        getattr(view, attr).__delitem__("not a key")

    assert view.is_view
    assert adata_hash == joblib.hash(adata)
    assert view_hash == joblib.hash(view) 
Example #19
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_set_obsm_key(adata):
    init_hash = joblib.hash(adata)

    orig_obsm_val = adata.obsm["o"].copy()
    subset_obsm = adata[:50]
    assert subset_obsm.is_view
    subset_obsm.obsm["o"] = np.ones((50, 20))
    assert not subset_obsm.is_view
    assert np.all(adata.obsm["o"] == orig_obsm_val)

    assert init_hash == joblib.hash(adata) 
Example #20
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_set_varm_key(adata):
    init_hash = joblib.hash(adata)

    orig_varm_val = adata.varm["o"].copy()
    subset_varm = adata[:, :50]
    assert subset_varm.is_view
    subset_varm.varm["o"] = np.ones((50, 20))
    assert not subset_varm.is_view
    assert np.all(adata.varm["o"] == orig_varm_val)

    assert init_hash == joblib.hash(adata) 
Example #21
Source File: test_views.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_set_obs(adata, subset_func):
    init_hash = joblib.hash(adata)

    subset = adata[subset_func(adata.obs_names), :]

    new_obs = pd.DataFrame(
        dict(a=np.ones(subset.n_obs), b=np.ones(subset.n_obs)), index=subset.obs_names,
    )

    assert subset.is_view
    subset.obs = new_obs
    assert not subset.is_view
    assert np.all(subset.obs == new_obs)

    assert joblib.hash(adata) == init_hash 
Example #22
Source File: datasets.py    From cookiecutter-easydata with MIT License
def __init__(self,
                 name='datasource',
                 parse_function=None,
                 dataset_dir=None,
                 file_list=None):
        """Create a DataSource
        Parameters
        ----------
        name: str
            name of dataset
        parse_function: func (or partial)
            Function that will be called to process raw data into a usable Dataset
        dataset_dir: path
            default location for raw files
        file_list: list
            list of file_dicts associated with this DataSource.
            Valid keys for each file_dict include:
                url: (optional)
                    URL of resource to be fetched
                hash_type: {'sha1', 'md5', 'sha256'}
                    Type of hash function used to verify file integrity
                hash_value: string
                    Value of hash used to verify file integrity
                file_name: string (optional)
                    filename to use when saving file locally. If omitted, it will be inferred from url or source_file
                name: string or {'DESCR', 'LICENSE'} (optional)
                    description of the file. If DESCR or LICENSE, it will be used as metadata
                unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None
                    action to take in order to unpack this file. If None, infers from file type.

        """
        if file_list is None:
            file_list = []

        if dataset_dir is None:
            dataset_dir = paths['raw_data_path']
        if parse_function is None:
            parse_function = process_dataset_default
        self.name = name
        self.file_dict = {infer_filename(**item): item for item in file_list}
        self.parse_function = parse_function
        self.dataset_dir = dataset_dir

        # sklearn-style attributes. Usually these would be set in fit()
        self.fetched_ = False
        self.fetched_files_ = []
        self.unpacked_ = False
        self.unpack_path_ = None 
Example #23
Source File: datasets.py    From cookiecutter-easydata with MIT License
def add_file(self, source_file=None, *, hash_type='sha1', hash_value=None,
                 name=None, file_name=None, unpack_action=None,
                 force=False):
        """
        Add a file to the file list.

        This file must exist on disk, as there is no method specified for fetching it.
        This is useful when the data source requires an offline procedure for downloading.

        hash_type: {'sha1', 'md5', 'sha256'}
        hash_value: string or None
            if None, hash will be computed from specified file
        file_name: string
            Name of destination file. relative to paths['raw_data_dir']
        name: str
            text description of this file.
        source_file: path
            file to be copied
        force: boolean (default False)
            If True, overwrite an existing entry for this file
        unpack_action: {'zip', 'tgz', 'tbz2', 'tar', 'gzip', 'compress', 'copy'} or None
            action to take in order to unpack this file. If None, infers from file type.
        """
        if source_file is None:
            raise Exception("`source_file` is required")
        source_file = pathlib.Path(source_file)

        if not source_file.exists():
            logger.warning(f"{source_file} not found on disk")

        file_name = infer_filename(file_name=file_name, source_file=source_file)

        if hash_value is None:
            logger.debug(f"Hash unspecified. Computing {hash_type} hash of {source_file.name}")
            hash_value = hash_file(source_file, algorithm=hash_type).hexdigest()

        fetch_dict = {
            'fetch_action': 'copy',
            'file_name': file_name,
            'hash_type': hash_type,
            'hash_value': hash_value,
            'name': name,
            'source_file': str(source_file),
        }
        if unpack_action:
            fetch_dict.update({'unpack_action': unpack_action})

        # Entries added via add_url have no 'source_file' key, so use .get()
        existing_files = [f.get('source_file') for f in self.file_dict.values()]
        existing_hashes = [f['hash_value'] for f in self.file_dict.values() if f.get('hash_value')]
        if file_name in self.file_dict and not force:
            raise Exception(f"{file_name} already in file_dict. Use `force=True` to add anyway.")
        if str(source_file.name) in existing_files and not force:
            raise Exception(f"source file: {source_file} already in file list. Use `force=True` to add anyway.")
        if hash_value in existing_hashes and not force:
            raise Exception(f"file with hash {hash_value} already in file list. Use `force=True` to add anyway.")

        logger.warning("Reproducibility Issue: add_file is often not reproducible. If possible, use add_manual_download instead")
        self.file_dict[file_name] = fetch_dict
        self.fetched_ = False 
Example #24
Source File: datasets.py    From cookiecutter-easydata with MIT License
def fetch(self, fetch_path=None, force=False):
        """Fetch files in the `file_dict` to `raw_data_dir` and check hashes.

        Parameters
        ----------
        fetch_path: None or string
            By default, assumes dataset_dir

        force: Boolean
            If True, ignore the cache and re-fetch the files each time
        """
        if self.fetched_ and force is False:
            # validate the downloaded files:
            for filename, item in self.file_dict.items():
                raw_data_file = paths['raw_data_path'] / filename
                if not raw_data_file.exists():
                    logger.warning(f"{raw_data_file.name} missing. Invalidating fetch cache")
                    self.fetched_ = False
                    break
                raw_file_hash = hash_file(raw_data_file, algorithm=item['hash_type']).hexdigest()
                if raw_file_hash != item['hash_value']:
                    logger.warning(f"{raw_data_file.name} {item['hash_type']} hash invalid ({raw_file_hash} != {item['hash_value']}). Invalidating fetch cache.")
                    self.fetched_ = False
                    break
            else:
                logger.debug(f'Data Source {self.name} is already fetched. Skipping')
                return

        if fetch_path is None:
            fetch_path = self.dataset_dir
        else:
            fetch_path = pathlib.Path(fetch_path)

        self.fetched_ = False
        self.fetched_files_ = []
        for key, item in self.file_dict.items():
            status, result, hash_value = fetch_file(**item)
            logger.debug(f"Fetching {key}: status:{status}")
            if status:  # True (cached) or HTTP Code (successful download)
                item['hash_value'] = hash_value
                item['file_name'] = result.name
                self.fetched_files_.append(result)
            else:
                if item.get('fetch_action', False) != 'message':
                    logger.error(f"fetch of {key} returned: {result}")
                break
        else:
            self.fetched_ = True

        self.unpacked_ = False
        return self.fetched_ 
Example #25
Source File: test_hdf5_backing.py    From anndata with BSD 3-Clause "New" or "Revised" License
def test_backing(adata, tmp_path, backing_h5ad):
    assert not adata.isbacked

    adata.filename = backing_h5ad
    adata.write()
    assert not adata.file.is_open
    assert adata.isbacked
    assert adata[:, 0].is_view
    assert adata[:, 0].X.tolist() == np.reshape([1, 4, 7], (3, 1)).tolist()
    # this might give us trouble, as the user might not
    # know that the file is open again....
    assert adata.file.is_open

    adata[:2, 0].X = [0, 0]
    assert adata[:, 0].X.tolist() == np.reshape([0, 0, 7], (3, 1)).tolist()

    adata_subset = adata[:2, [0, 1]]
    assert adata_subset.is_view
    subset_hash = joblib.hash(adata_subset)

    # cannot set view in backing mode...
    with pytest.raises(ValueError):
        adata_subset.obs["foo"] = range(2)
    with pytest.raises(ValueError):
        adata_subset.var["bar"] = -12
    with pytest.raises(ValueError):
        adata_subset.obsm["o2"] = np.ones((2, 2))
    with pytest.raises(ValueError):
        adata_subset.varm["v2"] = np.zeros((2, 2))
    with pytest.raises(ValueError):
        adata_subset.layers["float2"] = adata_subset.layers["float"].copy()

    # Things should stay the same after failed operations
    assert subset_hash == joblib.hash(adata_subset)
    assert adata_subset.is_view

    # need to copy first
    adata_subset = adata_subset.copy(tmp_path / "test.subset.h5ad")
    # now transition to actual object
    assert not adata_subset.is_view
    adata_subset.obs["foo"] = range(2)
    assert not adata_subset.is_view
    assert adata_subset.isbacked
    assert adata_subset.obs["foo"].tolist() == list(range(2))

    # save
    adata_subset.write()


# TODO: Also test updating the backing file inplace