Python tqdm.tqdm.pandas() Examples
The following are 19 code examples of tqdm.tqdm.pandas().
Each example is an excerpt from the project and source file named above it.
You may also want to check out all available functions and classes of the module
tqdm.tqdm.
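
For context before the examples: tqdm.pandas() is a classmethod that patches pandas so that Series, DataFrame and GroupBy objects gain progress_apply and progress_map counterparts of apply/map. A minimal sketch of the typical call pattern (the data and the lambda here are illustrative, not taken from any example below):

import pandas as pd
from tqdm import tqdm

tqdm.pandas(desc="squaring")  # registers progress_apply/progress_map on pandas objects

df = pd.DataFrame({"x": range(10000)})
# behaves exactly like .apply, but renders a progress bar as it iterates
result = df["x"].progress_apply(lambda v: v * v)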
Example #1
Source File: tests_pandas.py From Tautulli with GNU General Public License v3.0
def test_pandas_apply_args_deprecation():
    """Test warning info in `pandas.Dataframe(Series).progress_apply(func, *args)`"""
    # `closing`, `StringIO`, `SkipTest` and `tqdm` come from module-level
    # imports in tests_pandas.py.
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True, ncols=20))
        df = pd.DataFrame(randint(0, 50, (500, 3)))
        df.progress_apply(lambda x: None, 1)  # 1 shall cause a warning
        # Check deprecation message
        res = our_file.getvalue()
        assert all([i in res for i in (
            "TqdmDeprecationWarning", "not supported",
            "keyword arguments instead")])
Example #2
Source File: tests_pandas.py From Tautulli with GNU General Public License v3.0
def test_pandas_leave():
    """Test pandas with `leave=True`"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 100, (1000, 6)))
        tqdm.pandas(file=our_file, leave=True, ascii=True)
        df.groupby(0).progress_apply(lambda x: None)

        our_file.seek(0)

        exres = '100%|##########| 100/100'
        if exres not in our_file.read():
            our_file.seek(0)
            raise AssertionError(
                "\nExpected:\n{0}\nIn:{1}\n".format(exres, our_file.read()))
Example #3
Source File: bert.py From nyaggle with MIT License
def _process(self, X: pd.DataFrame, func: Callable[[str, np.ndarray], Any]):
    is_pandas = isinstance(X, pd.DataFrame)
    X = convert_input(X)
    tqdm.pandas()

    columns = self.text_columns or [c for c in X.columns if X[c].dtype == np.object]
    non_text_columns = [c for c in X.columns if c not in columns]

    column_names = []
    processed = []
    for c in columns:
        emb = np.vstack(X[c].progress_apply(lambda x: self._process_text(x)))
        emb = func(c, emb)
        processed.append(emb)
        column_names += [self.column_format.format(col=c, idx=i) for i in range(emb.shape[1])]

    processed_df = pd.DataFrame(np.hstack(processed), columns=column_names)

    if non_text_columns:
        X_ = X[non_text_columns].copy()
        X_ = pd.concat([X_, processed_df], axis=1)
    else:
        X_ = processed_df

    return X_ if self.return_same_type and is_pandas else X_.values
Example #4
Source File: loader.py From fine-grained-sentiment with MIT License
def create_dataloader(self,
                      df: pd.DataFrame,
                      batch_size: int = 32,
                      shuffle: bool = False,
                      valid_pct: float = None):
    "Process rows in pd.DataFrame using n_cpus and return a DataLoader"

    tqdm.pandas()
    with ProcessPoolExecutor(max_workers=n_cpu) as executor:
        result = list(
            tqdm(executor.map(self.process_row, df.iterrows(), chunksize=8192),
                 desc=f"Processing {len(df)} examples on {n_cpu} cores",
                 total=len(df)))

    features = [r[0] for r in result]
    labels = [r[1] for r in result]
    dataset = TensorDataset(torch.tensor(features, dtype=torch.long),
                            torch.tensor(labels, dtype=torch.long))

    if valid_pct is not None:
        valid_size = int(valid_pct * len(df))
        train_size = len(df) - valid_size
        valid_dataset, train_dataset = random_split(dataset, [valid_size, train_size])
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        return train_loader, valid_loader

    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             num_workers=0,
                             shuffle=shuffle,
                             pin_memory=torch.cuda.is_available())
    return data_loader
Example #5
Source File: data_pack.py From MatchZoo-py with Apache License 2.0
def frame(self) -> 'DataPack.FrameView':
    """
    View the data pack as a :class:`pandas.DataFrame`.

    Returned data frame is created by merging the left data frame,
    the right dataframe and the relation data frame. Use `[]` to access
    an item or a slice of items.

    :return: A :class:`matchzoo.DataPack.FrameView` instance.

    Example:
        >>> import matchzoo as mz
        >>> data_pack = mz.datasets.toy.load_data()
        >>> type(data_pack.frame)
        <class 'matchzoo.data_pack.data_pack.DataPack.FrameView'>
        >>> frame_slice = data_pack.frame[0:5]
        >>> type(frame_slice)
        <class 'pandas.core.frame.DataFrame'>
        >>> list(frame_slice.columns)
        ['id_left', 'text_left', 'id_right', 'text_right', 'label']
        >>> full_frame = data_pack.frame()
        >>> len(full_frame) == len(data_pack)
        True

    """
    return DataPack.FrameView(self)
Example #6
Source File: tests_pandas.py From Tautulli with GNU General Public License v3.0
def test_pandas_data_frame():
    """Test pandas.DataFrame.progress_apply and .progress_applymap"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)
        df = pd.DataFrame(randint(0, 50, (100, 200)))

        def task_func(x):
            return x + 1

        # applymap
        res1 = df.progress_applymap(task_func)
        res2 = df.applymap(task_func)
        assert res1.equals(res2)

        # apply
        for axis in [0, 1]:
            res3 = df.progress_apply(task_func, axis=axis)
            res4 = df.apply(task_func, axis=axis)
            assert res3.equals(res4)

        our_file.seek(0)
        if our_file.read().count('100%') < 3:
            our_file.seek(0)
            raise AssertionError("\nExpected:\n{0}\nIn:\n{1}\n".format(
                '100% at least three times', our_file.read()))

        # apply_map, apply axis=0, apply axis=1
        expects = ['20000/20000', '200/200', '100/100']
        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 1:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n {1}\n".format(
                        exres + " at least once.", our_file.read()))
Example #7
Source File: tests_pandas.py From Tautulli with GNU General Public License v3.0
def test_pandas_series():
    """Test pandas.Series.progress_apply and .progress_map"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)
        series = pd.Series(randint(0, 50, (123,)))

        res1 = series.progress_apply(lambda x: x + 10)
        res2 = series.apply(lambda x: x + 10)
        assert res1.equals(res2)

        res3 = series.progress_map(lambda x: x + 10)
        res4 = series.map(lambda x: x + 10)
        assert res3.equals(res4)

        expects = ['100%', '123/123']
        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 2:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n{1}\n".format(
                        exres + " at least twice.", our_file.read()))
Example #8
Source File: classifiers.py From fine-grained-sentiment with MIT License
def predict(self, train_file: None, test_file: str, lower_case: bool) -> pd.DataFrame:
    "Use tqdm to display model prediction status bar"
    # pip install tqdm
    from tqdm import tqdm
    tqdm.pandas()
    df = self.read_data(test_file, lower_case)
    df['pred'] = df['text'].progress_apply(self.score)
    return df
Example #9
Source File: classifiers.py From fine-grained-sentiment with MIT License
def predict(self, train_file: None, test_file: str, lower_case: bool) -> pd.DataFrame:
    "Use tqdm to display model prediction status bar"
    # pip install tqdm
    from tqdm import tqdm
    tqdm.pandas()
    df = self.read_data(test_file, lower_case)
    df['pred'] = df['text'].progress_apply(self.score)
    return df
Example #10
Source File: data_pack.py From MatchZoo with Apache License 2.0
def _apply_on_text_left(self, func, rename, verbose=1):
    name = rename or 'text_left'
    if verbose:
        tqdm.pandas(desc="Processing " + name + " with " + func.__name__)
        self._left[name] = self._left['text_left'].progress_apply(func)
    else:
        self._left[name] = self._left['text_left'].apply(func)
Example #11
Source File: data_pack.py From MatchZoo with Apache License 2.0
def _apply_on_text_right(self, func, rename, verbose=1):
    name = rename or 'text_right'
    if verbose:
        tqdm.pandas(desc="Processing " + name + " with " + func.__name__)
        self._right[name] = self._right['text_right'].progress_apply(func)
    else:
        self._right[name] = self._right['text_right'].apply(func)
Example #12
Source File: data_pack.py From MatchZoo with Apache License 2.0
def frame(self) -> 'DataPack.FrameView':
    """
    View the data pack as a :class:`pandas.DataFrame`.

    Returned data frame is created by merging the left data frame,
    the right dataframe and the relation data frame. Use `[]` to access
    an item or a slice of items.

    :return: A :class:`matchzoo.DataPack.FrameView` instance.

    Example:
        >>> import matchzoo as mz
        >>> data_pack = mz.datasets.toy.load_data()
        >>> type(data_pack.frame)
        <class 'matchzoo.data_pack.data_pack.DataPack.FrameView'>
        >>> frame_slice = data_pack.frame[0:5]
        >>> type(frame_slice)
        <class 'pandas.core.frame.DataFrame'>
        >>> list(frame_slice.columns)
        ['id_left', 'text_left', 'id_right', 'text_right', 'label']
        >>> full_frame = data_pack.frame()
        >>> len(full_frame) == len(data_pack)
        True

    """
    return DataPack.FrameView(self)
Example #13
Source File: pandas.py From snorkel with Apache License 2.0
def apply(
    self,
    df: pd.DataFrame,
    progress_bar: bool = True,
    fault_tolerant: bool = False,
    return_meta: bool = False,
) -> Union[np.ndarray, Tuple[np.ndarray, ApplierMetadata]]:
    """Label Pandas DataFrame of data points with LFs.

    Parameters
    ----------
    df
        Pandas DataFrame containing data points to be labeled by LFs
    progress_bar
        Display a progress bar?
    fault_tolerant
        Output ``-1`` if LF execution fails?
    return_meta
        Return metadata from apply call?

    Returns
    -------
    np.ndarray
        Matrix of labels emitted by LFs
    ApplierMetadata
        Metadata, such as fault counts, for the apply call
    """
    f_caller = _FunctionCaller(fault_tolerant)
    apply_fn = partial(apply_lfs_to_data_point, lfs=self._lfs, f_caller=f_caller)
    call_fn = df.apply
    if progress_bar:
        tqdm.pandas()
        call_fn = df.progress_apply
    labels = call_fn(apply_fn, axis=1)
    labels_with_index = rows_to_triplets(labels)
    L = self._numpy_from_row_data(labels_with_index)
    if return_meta:
        return L, ApplierMetadata(f_caller.fault_counts)
    return L
Example #14
Source File: data_pack.py From MatchZoo-py with Apache License 2.0
def _apply_on_text_left(self, func, rename, verbose=1):
    name = rename or 'text_left'
    if verbose:
        tqdm.pandas(desc="Processing " + name + " with " + func.__name__)
        self._left[name] = self._left['text_left'].progress_apply(func)
    else:
        self._left[name] = self._left['text_left'].apply(func)
Example #15
Source File: data_pack.py From MatchZoo-py with Apache License 2.0
def _apply_on_text_right(self, func, rename, verbose=1):
    name = rename or 'text_right'
    if verbose:
        tqdm.pandas(desc="Processing " + name + " with " + func.__name__)
        self._right[name] = self._right['text_right'].progress_apply(func)
    else:
        self._right[name] = self._right['text_right'].apply(func)
Example #16
Source File: preprocess_aclImdb_v1.py From lambda-deep-learning-demo with Apache License 2.0
def process_csv(args, split_name):
    raw_csv = os.path.join(args.output_dir, split_name + "_raw.csv")
    clean_csv = os.path.join(args.output_dir, split_name + ".csv")

    data, labels = load_dataset(os.path.join(args.input_dir, split_name))

    # save as csv file, separated by tab
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    with open(raw_csv, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        for sentence, label in zip(data, labels):
            writer.writerow([sentence, label])

    data = ingest_data(raw_csv)
    tqdm.pandas(desc="progress-bar")
    data = post_process(data, args.remove_punctuation)
    data.to_csv(clean_csv, sep='\t', header=False, index=False)
Example #17
Source File: utils.py From DALEX with GNU General Public License v3.0
def aggregate_profiles(all_profiles, type, groups, intercept, span):
    if type == 'partial':
        aggregated_profiles = \
            all_profiles.groupby(['_vname_', '_label_', '_x_'] + groups)['_yhat_'].mean().reset_index()
    else:
        # split all_profiles into groups
        tqdm.pandas(desc='Calculating accumulated dependency') if type == 'accumulated' else tqdm.pandas(
            desc="Calculating conditional dependency")
        aggregated_profiles = \
            all_profiles. \
            loc[:, ["_vname_", "_label_", "_x_", "_yhat_", "_ids_", "_original_"] + groups]. \
            groupby(['_vname_', '_label_']). \
            progress_apply(lambda split_profile:
                           split_over_variables_and_labels(split_profile, type, groups, span))

    aggregated_profiles.loc[:, '_ids_'] = 0

    if type == 'partial':
        if not intercept:
            aggregated_profiles.loc[:, '_yhat_'] = \
                aggregated_profiles.loc[:, '_yhat_'] - all_profiles['_yhat_'].mean()
        aggregated_profiles = aggregated_profiles
    elif type == 'conditional':
        if not intercept:
            aggregated_profiles.loc[:, '_yhat_'] = \
                aggregated_profiles.loc[:, '_yhat_'] - all_profiles['_yhat_'].mean()
        aggregated_profiles = aggregated_profiles.reset_index().rename(columns={'level_2': '_grid_'})
    else:
        if intercept:
            aggregated_profiles.loc[:, '_yhat_'] = \
                aggregated_profiles.loc[:, '_yhat_'] + all_profiles['_yhat_'].mean()
        aggregated_profiles = aggregated_profiles.reset_index().rename(columns={'level_2': '_grid_'})

    # postprocessing
    if len(groups) != 0:
        aggregated_profiles['_groups_'] = \
            aggregated_profiles.loc[:, groups].apply(lambda row: '_'.join(row), axis=1)
        # NB: drop() returns a copy; the original source does not assign this result
        aggregated_profiles.drop(columns=groups)
        aggregated_profiles.loc[:, '_label_'] = \
            aggregated_profiles.loc[:, ['_label_', '_groups_']].apply(lambda row: '_'.join(row), axis=1)

    return aggregated_profiles
Example #18
Source File: utils.py From DALEX with GNU General Public License v3.0
def split_over_variables_and_labels(split_profile, type, groups, span):
    """
    Inner function that calculates actual conditional profiles for one variable only.
    Iterated over each variable and group.

    :param split_profile: pandas.DataFrame, one group of the dataset (with only one variable)
    :param groups: str, name of grouping variable
    :return: pd.DataFrame, dataframe with calculated conditional profile for only one variable
    """
    if split_profile.shape[0] == 0:
        return None

    if pd.api.types.is_numeric_dtype(split_profile['_x_']):
        # for continuous variables we will calculate weighted average
        # where weights come from gaussian kernel and distance between points
        # scaling factor: the range, if the range is > 0
        split_profile['_original_'] = split_profile['_original_'].astype('float')
        range_x = split_profile['_x_'].max() - split_profile['_x_'].min()
        if range_x == 0:
            range_x = 1

        # scaled differences
        diffs = (split_profile['_original_'] - split_profile['_x_']) / range_x
        split_profile['_w_'] = norm(diffs, 0, span)
    else:
        # for categorical variables we will calculate weighted average
        # but weights are 0-1, 1 if it's the same level and 0 otherwise
        split_profile['_w_'] = split_profile['_original_'] == split_profile['_x_']

    if type == 'accumulated':
        # diffs
        split_profile['_yhat_'] = split_profile. \
            groupby('_ids_')['_yhat_']. \
            transform(lambda column: column.diff())
        # diff causes NaNs at the beginning of each group
        split_profile.loc[np.isnan(split_profile['_yhat_']), '_yhat_'] = 0

    par_profile = split_profile.groupby(['_x_'] + groups). \
        apply(lambda point: (point['_yhat_'] * point['_w_']).sum() / point['_w_'].sum()
              if point['_w_'].sum() != 0 else 0)
    par_profile.name = '_yhat_'
    par_profile = par_profile.reset_index()

    if type == 'accumulated':
        if len(groups) == 0:
            par_profile['_yhat_'] = par_profile['_yhat_'].cumsum()
        else:
            par_profile['_yhat_'] = par_profile.groupby(groups)['_yhat_'].transform(
                lambda column: column.cumsum())

    return par_profile
Example #19
Source File: tests_pandas.py From Tautulli with GNU General Public License v3.0
def test_pandas_groupby_apply():
    """Test pandas.DataFrame.groupby(...).progress_apply"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=False, ascii=True)

        df = pd.DataFrame(randint(0, 50, (500, 3)))
        df.groupby(0).progress_apply(lambda x: None)

        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))
        dfs.groupby(['a']).progress_apply(lambda x: None)

        our_file.seek(0)

        # don't expect final output since no `leave` and
        # high dynamic `miniters`
        nexres = '100%|##########|'
        if nexres in our_file.read():
            our_file.seek(0)
            raise AssertionError("\nDid not expect:\n{0}\nIn:{1}\n".format(
                nexres, our_file.read()))

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)

        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))
        dfs.loc[0] = [2, 1, 1]
        dfs['d'] = 100

        expects = ['500/500', '1/1', '4/4', '2/2']
        dfs.groupby(dfs.index).progress_apply(lambda x: None)
        dfs.groupby('d').progress_apply(lambda x: None)
        dfs.groupby(dfs.columns, axis=1).progress_apply(lambda x: None)
        dfs.groupby([2, 2, 1, 1], axis=1).progress_apply(lambda x: None)

        our_file.seek(0)
        if our_file.read().count('100%') < 4:
            our_file.seek(0)
            raise AssertionError("\nExpected:\n{0}\nIn:\n{1}\n".format(
                '100% at least four times', our_file.read()))

        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 1:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n {1}\n".format(
                        exres + " at least once.", our_file.read()))