Python tqdm.tqdm.pandas() Examples

The following are 19 code examples of tqdm.tqdm.pandas(), drawn from open-source projects. You can go to the original project or source file via the link above each example. You may also want to check out all available functions/classes of the module tqdm.tqdm, or try the search function.
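Before the project examples, here is a minimal sketch of the basic pattern (the data and names are illustrative, not taken from any project below). Calling tqdm.pandas() once patches pandas, adding progress_apply, progress_map and progress_applymap, which behave exactly like apply, map and applymap but draw a progress bar:

import pandas as pd
from tqdm import tqdm

tqdm.pandas(desc="squaring")  # registers the progress_* methods on pandas objects

df = pd.DataFrame({"x": range(10000)})
df["y"] = df["x"].progress_apply(lambda v: v * v)  # same result as .apply, plus a bar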
Example #1
Source File: tests_pandas.py    From Tautulli with GNU General Public License v3.0
def test_pandas_apply_args_deprecation():
    """Test warning info in
    `pandas.Dataframe(Series).progress_apply(func, *args)`"""
    try:
        from numpy.random import randint
        from tqdm import tqdm_pandas
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm_pandas(tqdm(file=our_file, leave=False, ascii=True, ncols=20))
        df = pd.DataFrame(randint(0, 50, (500, 3)))
        df.progress_apply(lambda x: None, 1)  # positional arg 1 (the axis) triggers the deprecation warning
        # Check deprecation message
        res = our_file.getvalue()
        assert all([i in res for i in (
            "TqdmDeprecationWarning", "not supported",
            "keyword arguments instead")]) 
Example #2
Source File: tests_pandas.py    From Tautulli with GNU General Public License v3.0
def test_pandas_leave():
    """Test pandas with `leave=True`"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        df = pd.DataFrame(randint(0, 100, (1000, 6)))
        tqdm.pandas(file=our_file, leave=True, ascii=True)
        df.groupby(0).progress_apply(lambda x: None)

        our_file.seek(0)

        exres = '100%|##########| 100/100'
        if exres not in our_file.read():
            our_file.seek(0)
            raise AssertionError(
                "\nExpected:\n{0}\nIn:{1}\n".format(exres, our_file.read())) 
Example #3
Source File: bert.py    From nyaggle with MIT License
def _process(self, X: pd.DataFrame, func: Callable[[str, np.ndarray], Any]):
        is_pandas = isinstance(X, pd.DataFrame)
        X = convert_input(X)

        tqdm.pandas()
        # use `object` rather than `np.object`, a deprecated alias removed in NumPy 1.24
        columns = self.text_columns or [c for c in X.columns if X[c].dtype == object]
        non_text_columns = [c for c in X.columns if c not in columns]

        column_names = []
        processed = []
        for c in columns:
            emb = np.vstack(X[c].progress_apply(lambda x: self._process_text(x)))
            emb = func(c, emb)
            processed.append(emb)
            column_names += [self.column_format.format(col=c, idx=i) for i in range(emb.shape[1])]

        processed_df = pd.DataFrame(np.hstack(processed), columns=column_names)

        if non_text_columns:
            X_ = X[non_text_columns].copy()
            X_ = pd.concat([X_, processed_df], axis=1)
        else:
            X_ = processed_df

        return X_ if self.return_same_type and is_pandas else X_.values 
Example #4
Source File: loader.py    From fine-grained-sentiment with MIT License
def create_dataloader(self,
                          df: pd.DataFrame,
                          batch_size: int = 32,
                          shuffle: bool = False,
                          valid_pct: float = None):
        "Process rows in pd.DataFrame using n_cpus and return a DataLoader"

        tqdm.pandas()
        with ProcessPoolExecutor(max_workers=n_cpu) as executor:
            result = list(
                tqdm(executor.map(self.process_row, df.iterrows(), chunksize=8192),
                     desc=f"Processing {len(df)} examples on {n_cpu} cores",
                     total=len(df)))

        features = [r[0] for r in result]
        labels = [r[1] for r in result]

        dataset = TensorDataset(torch.tensor(features, dtype=torch.long),
                                torch.tensor(labels, dtype=torch.long))

        if valid_pct is not None:
            valid_size = int(valid_pct * len(df))
            train_size = len(df) - valid_size
            valid_dataset, train_dataset = random_split(dataset, [valid_size, train_size])
            valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            return train_loader, valid_loader

        data_loader = DataLoader(dataset,
                                 batch_size=batch_size,
                                 num_workers=0,
                                 shuffle=shuffle,
                                 pin_memory=torch.cuda.is_available())
        return data_loader 
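Note that create_dataloader wraps executor.map in a plain tqdm call instead of progress_apply, because the work is fanned out across processes rather than applied by pandas. The same pattern works on its own; a minimal sketch, assuming a picklable top-level worker function:

from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

def work(n):
    # must be defined at module top level so ProcessPoolExecutor can pickle it
    return n * n

items = list(range(100000))
with ProcessPoolExecutor() as executor:
    # total= is needed because executor.map returns a lazy iterator with no len()
    results = list(tqdm(executor.map(work, items, chunksize=1024), total=len(items)))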
Example #5
Source File: data_pack.py    From MatchZoo-py with Apache License 2.0
def frame(self) -> 'DataPack.FrameView':
        """
        View the data pack as a :class:`pandas.DataFrame`.

        The returned data frame is created by merging the left data frame,
        the right data frame and the relation data frame. Use `[]` to access
        an item or a slice of items.

        :return: A :class:`matchzoo.DataPack.FrameView` instance.

        Example:
            >>> import matchzoo as mz
            >>> data_pack = mz.datasets.toy.load_data()
            >>> type(data_pack.frame)
            <class 'matchzoo.data_pack.data_pack.DataPack.FrameView'>
            >>> frame_slice = data_pack.frame[0:5]
            >>> type(frame_slice)
            <class 'pandas.core.frame.DataFrame'>
            >>> list(frame_slice.columns)
            ['id_left', 'text_left', 'id_right', 'text_right', 'label']
            >>> full_frame = data_pack.frame()
            >>> len(full_frame) == len(data_pack)
            True

        """
        return DataPack.FrameView(self) 
Example #6
Source File: tests_pandas.py    From Tautulli with GNU General Public License v3.0
def test_pandas_data_frame():
    """Test pandas.DataFrame.progress_apply and .progress_applymap"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)
        df = pd.DataFrame(randint(0, 50, (100, 200)))

        def task_func(x):
            return x + 1

        # applymap
        res1 = df.progress_applymap(task_func)
        res2 = df.applymap(task_func)
        assert res1.equals(res2)

        # apply
        for axis in [0, 1]:
            res3 = df.progress_apply(task_func, axis=axis)
            res4 = df.apply(task_func, axis=axis)
            assert res3.equals(res4)

        our_file.seek(0)
        if our_file.read().count('100%') < 3:
            our_file.seek(0)
            raise AssertionError("\nExpected:\n{0}\nIn:\n{1}\n".format(
                '100% at least three times', our_file.read()))

        # applymap (20000 elements), apply axis=0 (200 columns), apply axis=1 (100 rows)
        expects = ['20000/20000', '200/200', '100/100']
        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 1:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n {1}\n".format(
                        exres + " at least once.", our_file.read())) 
Example #7
Source File: tests_pandas.py    From Tautulli with GNU General Public License v3.0
def test_pandas_series():
    """Test pandas.Series.progress_apply and .progress_map"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)

        series = pd.Series(randint(0, 50, (123,)))
        res1 = series.progress_apply(lambda x: x + 10)
        res2 = series.apply(lambda x: x + 10)
        assert res1.equals(res2)

        res3 = series.progress_map(lambda x: x + 10)
        res4 = series.map(lambda x: x + 10)
        assert res3.equals(res4)

        expects = ['100%', '123/123']
        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 2:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n{1}\n".format(
                        exres + " at least twice.", our_file.read())) 
Example #8
Source File: classifiers.py    From fine-grained-sentiment with MIT License
def predict(self, train_file: None, test_file: str, lower_case: bool) -> pd.DataFrame:
        "Use tqdm to display model prediction status bar"
        # pip install tqdm
        from tqdm import tqdm
        tqdm.pandas()
        df = self.read_data(test_file, lower_case)
        df['pred'] = df['text'].progress_apply(self.score)
        return df 
Example #9
Source File: classifiers.py    From fine-grained-sentiment with MIT License
def predict(self, train_file: None, test_file: str, lower_case: bool) -> pd.DataFrame:
        "Use tqdm to display model prediction status bar"
        # pip install tqdm
        from tqdm import tqdm
        tqdm.pandas()
        df = self.read_data(test_file, lower_case)
        df['pred'] = df['text'].progress_apply(self.score)
        return df 
Example #10
Source File: data_pack.py    From MatchZoo with Apache License 2.0
def _apply_on_text_left(self, func, rename, verbose=1):
        name = rename or 'text_left'
        if verbose:
            tqdm.pandas(desc="Processing " + name + " with " + func.__name__)
            self._left[name] = self._left['text_left'].progress_apply(func)
        else:
            self._left[name] = self._left['text_left'].apply(func) 
Example #11
Source File: data_pack.py    From MatchZoo with Apache License 2.0
def _apply_on_text_right(self, func, rename, verbose=1):
        name = rename or 'text_right'
        if verbose:
            tqdm.pandas(desc="Processing " + name + " with " + func.__name__)
            self._right[name] = self._right['text_right'].progress_apply(func)
        else:
            self._right[name] = self._right['text_right'].apply(func) 
Example #12
Source File: data_pack.py    From MatchZoo with Apache License 2.0
def frame(self) -> 'DataPack.FrameView':
        """
        View the data pack as a :class:`pandas.DataFrame`.

        The returned data frame is created by merging the left data frame,
        the right data frame and the relation data frame. Use `[]` to access
        an item or a slice of items.

        :return: A :class:`matchzoo.DataPack.FrameView` instance.

        Example:
            >>> import matchzoo as mz
            >>> data_pack = mz.datasets.toy.load_data()
            >>> type(data_pack.frame)
            <class 'matchzoo.data_pack.data_pack.DataPack.FrameView'>
            >>> frame_slice = data_pack.frame[0:5]
            >>> type(frame_slice)
            <class 'pandas.core.frame.DataFrame'>
            >>> list(frame_slice.columns)
            ['id_left', 'text_left', 'id_right', 'text_right', 'label']
            >>> full_frame = data_pack.frame()
            >>> len(full_frame) == len(data_pack)
            True

        """
        return DataPack.FrameView(self) 
Example #13
Source File: pandas.py    From snorkel with Apache License 2.0
def apply(
        self,
        df: pd.DataFrame,
        progress_bar: bool = True,
        fault_tolerant: bool = False,
        return_meta: bool = False,
    ) -> Union[np.ndarray, Tuple[np.ndarray, ApplierMetadata]]:
        """Label Pandas DataFrame of data points with LFs.

        Parameters
        ----------
        df
            Pandas DataFrame containing data points to be labeled by LFs
        progress_bar
            Display a progress bar?
        fault_tolerant
            Output ``-1`` if LF execution fails?
        return_meta
            Return metadata from apply call?

        Returns
        -------
        np.ndarray
            Matrix of labels emitted by LFs
        ApplierMetadata
            Metadata, such as fault counts, for the apply call
        """
        f_caller = _FunctionCaller(fault_tolerant)
        apply_fn = partial(apply_lfs_to_data_point, lfs=self._lfs, f_caller=f_caller)
        call_fn = df.apply
        if progress_bar:
            tqdm.pandas()
            call_fn = df.progress_apply
        labels = call_fn(apply_fn, axis=1)
        labels_with_index = rows_to_triplets(labels)
        L = self._numpy_from_row_data(labels_with_index)
        if return_meta:
            return L, ApplierMetadata(f_caller.fault_counts)
        return L 
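This apply method comes from Snorkel's Pandas labeling-function applier; a minimal usage sketch, assuming Snorkel's public labeling API (the labeling function and data are illustrative):

import pandas as pd
from snorkel.labeling import labeling_function, PandasLFApplier

@labeling_function()
def lf_contains_spam(x):
    # emit label 1 for spam-looking text, -1 to abstain
    return 1 if "spam" in x.text else -1

df = pd.DataFrame({"text": ["buy spam now", "hello friend"]})
applier = PandasLFApplier(lfs=[lf_contains_spam])
L = applier.apply(df, progress_bar=True)  # (n_rows, n_lfs) label matrix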
Example #14
Source File: data_pack.py    From MatchZoo-py with Apache License 2.0
def _apply_on_text_left(self, func, rename, verbose=1):
        name = rename or 'text_left'
        if verbose:
            tqdm.pandas(desc="Processing " + name + " with " + func.__name__)
            self._left[name] = self._left['text_left'].progress_apply(func)
        else:
            self._left[name] = self._left['text_left'].apply(func) 
Example #15
Source File: data_pack.py    From MatchZoo-py with Apache License 2.0
def _apply_on_text_right(self, func, rename, verbose=1):
        name = rename or 'text_right'
        if verbose:
            tqdm.pandas(desc="Processing " + name + " with " + func.__name__)
            self._right[name] = self._right['text_right'].progress_apply(func)
        else:
            self._right[name] = self._right['text_right'].apply(func) 
Example #16
Source File: preprocess_aclImdb_v1.py    From lambda-deep-learning-demo with Apache License 2.0
def process_csv(args, split_name):
  raw_csv = os.path.join(args.output_dir, split_name + "_raw.csv")
  clean_csv = os.path.join(args.output_dir, split_name + ".csv")
  data, labels = load_dataset(os.path.join(args.input_dir, split_name))

  # save as a CSV file, separated by tabs
  if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

  with open(raw_csv, 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for sentence, label in zip(data, labels):
      writer.writerow([sentence, label])

  data = ingest_data(raw_csv)

  tqdm.pandas(desc="progress-bar")

  data = post_process(data, args.remove_punctuation)

  data.to_csv(clean_csv, sep='\t', header=False, index=False) 
Example #17
Source File: utils.py    From DALEX with GNU General Public License v3.0
def aggregate_profiles(all_profiles, type, groups, intercept, span):
    if type == 'partial':
        aggregated_profiles = \
            all_profiles.groupby(['_vname_', '_label_', '_x_'] + groups)['_yhat_'].mean().reset_index()

    else:
        # split all_profiles into groups
        desc = ('Calculating accumulated dependency' if type == 'accumulated'
                else 'Calculating conditional dependency')
        tqdm.pandas(desc=desc)
        aggregated_profiles = \
            all_profiles. \
                loc[:, ["_vname_", "_label_", "_x_", "_yhat_", "_ids_", "_original_"] + groups]. \
                groupby(['_vname_', '_label_']). \
                progress_apply(lambda split_profile: split_over_variables_and_labels(split_profile, type, groups, span))

    aggregated_profiles.loc[:, '_ids_'] = 0

    if type == 'partial':
        if not intercept:
            aggregated_profiles.loc[:, '_yhat_'] = aggregated_profiles.loc[:, '_yhat_'] - all_profiles[
                '_yhat_'].mean()
    elif type == 'conditional':
        if not intercept:
            aggregated_profiles.loc[:, '_yhat_'] = aggregated_profiles.loc[:, '_yhat_'] - all_profiles[
                '_yhat_'].mean()
        aggregated_profiles = aggregated_profiles.reset_index().rename(columns={'level_2': '_grid_'})
    else:
        if intercept:
            aggregated_profiles.loc[:, '_yhat_'] = aggregated_profiles.loc[:, '_yhat_'] + all_profiles[
                '_yhat_'].mean()
        aggregated_profiles = aggregated_profiles.reset_index().rename(columns={'level_2': '_grid_'})

    # postprocessing
    if len(groups) != 0:
        aggregated_profiles['_groups_'] = aggregated_profiles.loc[:, groups].apply(lambda row: '_'.join(row), axis=1)
        aggregated_profiles = aggregated_profiles.drop(columns=groups)  # drop() is not in-place; keep the result

        aggregated_profiles.loc[:, '_label_'] = \
            aggregated_profiles.loc[:, ['_label_', '_groups_']].apply(lambda row: '_'.join(row), axis=1)

    return aggregated_profiles 
Example #18
Source File: utils.py    From DALEX with GNU General Public License v3.0
def split_over_variables_and_labels(split_profile, type, groups, span):
    """
    Inner function that calculates actual conditional profiles for one variable only. Iterated over each variable and group.

    :param split_profile: pandas.DataFrame, one group of the dataset (with only one variable)
    :param groups: str, name of grouping variable
    :return: pd.DataFrame, dataframe with calculated conditional profile for only one variable
    """

    if split_profile.shape[0] == 0:
        return None

    if pd.api.types.is_numeric_dtype(split_profile['_x_']):
        # for continuous variables we calculate a weighted average,
        # where the weights come from a gaussian kernel over the distance between points
        # scaling factor: the range of x, falling back to 1 if the range is 0
        split_profile['_original_'] = split_profile['_original_'].astype('float')
        range_x = split_profile['_x_'].max() - split_profile['_x_'].min()

        if range_x == 0:
            range_x = 1

        # scaled differences
        diffs = (split_profile['_original_'] - split_profile['_x_']) / range_x

        split_profile['_w_'] = norm(diffs, 0, span)

    else:
        # for categorical variables we will calculate weighted average
        # but weights are 0-1, 1 if it's the same level and 0 otherwise
        split_profile['_w_'] = split_profile['_original_'] == split_profile['_x_']

    if type == 'accumulated':
        # diffs
        split_profile['_yhat_'] = split_profile. \
            groupby('_ids_')['_yhat_']. \
            transform(lambda column: column.diff())

        # diff causes NaNs at the beginning of each group
        split_profile.loc[np.isnan(split_profile['_yhat_']), '_yhat_'] = 0

    par_profile = split_profile.groupby(['_x_'] + groups). \
        apply(lambda point: (point['_yhat_'] * point['_w_']).sum() / point['_w_'].sum() \
        if point['_w_'].sum() != 0 else 0)

    par_profile.name = '_yhat_'
    par_profile = par_profile.reset_index()

    if type == 'accumulated':
        if len(groups) == 0:
            par_profile['_yhat_'] = par_profile['_yhat_'].cumsum()
        else:
            par_profile['_yhat_'] = par_profile.groupby(groups)['_yhat_'].transform(
                lambda column: column.cumsum())

    return par_profile 
Example #19
Source File: tests_pandas.py    From Tautulli with GNU General Public License v3.0
def test_pandas_groupby_apply():
    """Test pandas.DataFrame.groupby(...).progress_apply"""
    try:
        from numpy.random import randint
        import pandas as pd
    except ImportError:
        raise SkipTest

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=False, ascii=True)

        df = pd.DataFrame(randint(0, 50, (500, 3)))
        df.groupby(0).progress_apply(lambda x: None)

        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))
        dfs.groupby(['a']).progress_apply(lambda x: None)

        our_file.seek(0)

        # don't expect final output since `leave=False` and
        # highly dynamic `miniters`
        nexres = '100%|##########|'
        if nexres in our_file.read():
            our_file.seek(0)
            raise AssertionError("\nDid not expect:\n{0}\nIn:{1}\n".format(
                nexres, our_file.read()))

    with closing(StringIO()) as our_file:
        tqdm.pandas(file=our_file, leave=True, ascii=True)

        dfs = pd.DataFrame(randint(0, 50, (500, 3)), columns=list('abc'))
        dfs.loc[0] = [2, 1, 1]
        dfs['d'] = 100

        expects = ['500/500', '1/1', '4/4', '2/2']
        dfs.groupby(dfs.index).progress_apply(lambda x: None)
        dfs.groupby('d').progress_apply(lambda x: None)
        dfs.groupby(dfs.columns, axis=1).progress_apply(lambda x: None)
        dfs.groupby([2, 2, 1, 1], axis=1).progress_apply(lambda x: None)

        our_file.seek(0)
        if our_file.read().count('100%') < 4:
            our_file.seek(0)
            raise AssertionError("\nExpected:\n{0}\nIn:\n{1}\n".format(
                '100% at least four times', our_file.read()))

        for exres in expects:
            our_file.seek(0)
            if our_file.getvalue().count(exres) < 1:
                our_file.seek(0)
                raise AssertionError(
                    "\nExpected:\n{0}\nIn:\n {1}\n".format(
                        exres + " at least once.", our_file.read()))