Python pandas.value_counts() Examples

The following are 27 code examples of pandas.value_counts(), drawn from open source projects; the source file, project, and license for each example are listed above its code. You may also want to check out all available functions and classes of the pandas module, or try the search function.
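As a quick orientation, here is a minimal sketch of what pandas.value_counts() returns, using a small made-up Series. Note that newer pandas releases deprecate the top-level pd.value_counts in favor of the equivalent Series.value_counts method, which several of the examples below also call.

import pandas as pd

s = pd.Series(["a", "b", "a", "c", "a", None])

# Top-level form used throughout the examples below: counts each unique
# value, sorted in descending order, with NaN dropped by default.
print(pd.value_counts(s))

# Equivalent method form; normalize=True returns relative frequencies.
print(s.value_counts(normalize=True))

Passing bins groups numeric values into intervals before counting, and dropna=False keeps missing values in the result; Example #26 below exercises both parameters.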
Example #1
Source File: spatial_heatmap.py    From NanoPlot with GNU General Public License v3.0    6 votes
def spatial_heatmap(array, path, title=None, color="Greens", figformat="png"):
    """Taking channel information and creating post run channel activity plots."""
    logging.info("Nanoplotter: Creating heatmap of reads per channel using {} reads."
                 .format(array.size))
    activity_map = Plot(
        path=path + "." + figformat,
        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    valueCounts = pd.value_counts(pd.Series(array))
    for entry in valueCounts.keys():
        layout.template[np.where(layout.structure == entry)] = valueCounts[entry]
    plt.figure()
    ax = sns.heatmap(
        data=pd.DataFrame(layout.template, index=layout.yticks, columns=layout.xticks),
        xticklabels="auto",
        yticklabels="auto",
        square=True,
        cbar_kws={"orientation": "horizontal"},
        cmap=color,
        linewidths=0.20)
    ax.set_title(title or activity_map.title)
    activity_map.fig = ax.get_figure()
    activity_map.save(format=figformat)
    plt.close("all")
    return [activity_map] 
Example #2
Source File: test_split.py    From dask-ml with BSD 3-Clause "New" or "Revised" License    6 votes
def test_blockwise_shufflesplit():
    splitter = dask_ml.model_selection.ShuffleSplit(random_state=0)
    assert splitter.get_n_splits() == 10
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (99,)  # 90% of 110
    assert test_idx.shape == (11,)

    assert train_idx.chunks == ((45, 45, 9),)
    assert test_idx.chunks == ((5, 5, 1),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)

    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    ) 
Example #3
Source File: tagged_corpus.py    From underthesea with GNU General Public License v3.0    6 votes
def _analyze_field(self, df, id, output_folder=".", n_head=10):
        id = str(id)
        m = df.shape[1]
        df.columns = [str(i) for i in range(m)]

        agg_dict = dict()
        agg_dict[id] = "size"
        for i in range(int(id)):
            agg_dict[str(i)] = lambda x: ", ".join(
                pd.value_counts(x).index[:n_head])
        name_dict = dict()
        name_dict[id] = "count"
        df_analyze = df.groupby(id).agg(agg_dict).rename(
            columns=name_dict).reset_index()
        filename = join(output_folder, "column-%s-analyze.xlsx" % id)

        log = u""
        log += u"Tags         : {}\n".format(df_analyze.shape[0])
        tags = df_analyze[id].to_dict().values()
        tags = sorted(tags)
        log += u"List tags    : {}\n".format(u", ".join(tags))
        df_analyze.to_excel(filename, index=False)
        return log 
Example #4
Source File: tc_.py    From underthesea with GNU General Public License v3.0    6 votes
def _analyze_field(self, df, id, output_folder=".", n_head=10):
        id = str(id)
        m = df.shape[1]
        df.columns = [str(i) for i in range(m)]

        agg_dict = dict()
        agg_dict[id] = "size"
        for i in range(int(id)):
            agg_dict[str(i)] = lambda x: ", ".join(
                pd.value_counts(x).index[:n_head])
        name_dict = dict()
        name_dict[id] = "count"
        df_analyze = df.groupby(id).agg(agg_dict).rename(
            columns=name_dict).reset_index()
        filename = join(output_folder, "column-%s-analyze.xlsx" % id)

        log = u""
        log += u"Tags         : {}\n".format(df_analyze.shape[0])
        tags = df_analyze[id].to_dict().values()
        tags = sorted(tags)
        log += u"List tags    : {}\n".format(u", ".join(tags))
        df_analyze.to_excel(filename, index=False)
        return log 
Example #5
Source File: test_example_sleeping_giant.py    From postman_problems with MIT License    5 votes
def test_get_shortest_paths_distances():
    df = read_edgelist(EDGELIST)
    graph = create_networkx_graph_from_edgelist(df, edge_id='id')

    odd_nodes = get_odd_nodes(graph)
    odd_node_pairs = list(itertools.combinations(odd_nodes, 2))

    # coarsely checking structure of `get_shortest_paths_distances` return value
    odd_node_pairs_shortest_paths = get_shortest_paths_distances(graph, odd_node_pairs, 'distance')
    assert len(odd_node_pairs_shortest_paths) == 630
    assert type(odd_node_pairs_shortest_paths) == dict

    # check that each node name appears the same number of times in `get_shortest_paths_distances` return value
    node_names = list(itertools.chain(*[i[0] for i in odd_node_pairs_shortest_paths.items()]))
    assert set(pd.value_counts(node_names)) == set([35]) 
Example #6
Source File: testPlotting.py    From fitbit-analyzer with Apache License 2.0    5 votes
def test_plottingOnIntradayStats(self):
        filepath =  RESOURCE_PATH + "\\unittest\\test_sleep_basic01.csv"
        data1 = utils.loadIntradayData(filepath)
        filepath =  RESOURCE_PATH + "\\unittest\\test_sleep_basic02.csv"
        data2 = utils.loadIntradayData(filepath)
        stats = sleepStats.generateStatsFrom([data1, data2],
                                             sleepStats.STATS_NAME_INTRADAY)

        data = stats.apply(pd.value_counts)
        mplot.plotSleepValueHeatmap(data, sleepValue=1) 
Example #7
Source File: sleepStats.py    From fitbit-analyzer with Apache License 2.0    5 votes
def normalizedIntradayCountStats(intradayStats, limitCount=5):
    # For each minute, number of days for which we have a valid measure (record)
    notNullCount = intradayStats.count()
    # Ignore minutes where we have low level of records
    notNullCount[notNullCount < limitCount] = None
    # Count how many times each value appears for each minute
    valueCount = intradayStats.apply(pd.value_counts)
    # Normalize each minute by records count
    res = valueCount.div(notNullCount, axis=1)
    return res 
Example #8
Source File: test_split.py    From dask-ml with BSD 3-Clause "New" or "Revised" License    5 votes
def test_kfold(shuffle):
    splitter = dask_ml.model_selection.KFold(
        n_splits=5, random_state=0, shuffle=shuffle
    )
    assert splitter.get_n_splits() == 5
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (88,)  # 80% of 110
    assert test_idx.shape == (22,)

    assert train_idx.chunks == ((28, 50, 10),)
    assert test_idx.chunks == ((22,),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)

    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )

    expected_chunks = [
        (((22, 6, 50, 10),), ((22,),)),
        (((44, 34, 10),), ((6, 16),)),
        (((50, 16, 12, 10),), ((22,),)),
        (((50, 38),), ((12, 10),)),
    ]

    for (exp_train_idx, exp_test_idx), (train_idx, test_idx) in zip(
        expected_chunks, gen
    ):
        assert train_idx.chunks == exp_train_idx
        assert test_idx.chunks == exp_test_idx 
Example #9
Source File: core.py    From econtools with BSD 3-Clause "New" or "Revised" License    5 votes
def df_cluster(n, k, cluster_id):
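    # g: number of clusters; df: clustered degrees of freedom (g - 1);
    # vce_correct: finite-sample correction factor for cluster-robust variance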
    g = len(pd.value_counts(cluster_id))
    df = g - 1
    vce_correct = ((n - 1) / (n - k)) * (g / (g - 1))
    return df, vce_correct, g 
Example #10
Source File: prepare_dataset.py    From moses with MIT License    5 votes
def split_dataset(dataset, seed):
    logger.info('Splitting the dataset')
    scaffolds = pd.value_counts(dataset['scaffold'])
    scaffolds = sorted(scaffolds.items(), key=lambda x: (-x[1], x[0]))
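    # Hold out every 10th scaffold (by descending frequency) for the scaffold test split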
    test_scaffolds = set([x[0] for x in scaffolds[9::10]])
    dataset['SPLIT'] = 'train'
    test_scaf_idx = [x in test_scaffolds for x in dataset['scaffold']]
    dataset.loc[test_scaf_idx, 'SPLIT'] = 'test_scaffolds'
    test_idx = dataset.loc[dataset['SPLIT'] == 'train'].sample(
        frac=0.1, random_state=seed
    ).index
    dataset.loc[test_idx, 'SPLIT'] = 'test'
    dataset.drop('scaffold', axis=1, inplace=True)
    return dataset 
Example #11
Source File: others.py    From arche with MIT License    5 votes
def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*?/??>|<!--|-->)"
    )

    errors = {}
    row_keys: Set = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))

    for column in tqdm(df.select_dtypes([np.object]).columns, desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = matches.stack().value_counts().index.sort_values().tolist()
            # escape backslashes for markdown repr, `\n > \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (
                f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                f"values contain `{', '.join(bad_texts)}`"
            )

            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )
    return rule_result 
Example #12
Source File: tagged_corpus.py    From underthesea with GNU General Public License v3.0    5 votes
def _analyze_first_token(self, df, id, output_folder="."):
        filename = join(output_folder, "column-%s-analyze.xlsx" % id)
        df_analyze = df[id].value_counts().reset_index(name="count")
        df_analyze = df_analyze.rename(columns={"index": "0"})
        df_analyze.to_excel(filename, index=False)
        log = u""
        log += u"Unique words : {}\n".format(df_analyze.shape[0])
        log += u"Top words    : {}\n".format(
            u", ".join(list(df_analyze["0"].to_dict().values())[:20]))
        return log 
Example #13
Source File: tc_.py    From underthesea with GNU General Public License v3.0    5 votes
def _analyze_first_token(self, df, id, output_folder="."):
        filename = join(output_folder, "column-%s-analyze.xlsx" % id)
        df_analyze = df[id].value_counts().reset_index(name="count")
        df_analyze = df_analyze.rename(columns={"index": "0"})
        df_analyze.to_excel(filename, index=False)
        log = u""
        log += u"Unique words : {}\n".format(df_analyze.shape[0])
        log += u"Top words    : {}\n".format(
            u", ".join(list(df_analyze["0"].to_dict().values())[:20]))
        return log 
Example #14
Source File: __init__.py    From pandas-summary with MIT License    5 votes
def _get_bool_summary(self, column):
        series = self.df[column]

        stats = {}
        for class_name, class_value in dict(series.value_counts()).items():
            stats['"{}" count'.format(class_name)] = '{}'.format(class_value)
            stats['"{}" perc'.format(class_name)] = '{}'.format(
                self._percent(class_value / self.length))

        return pd.concat([pd.Series(stats, name=column),
                          self.columns_stats[column]],
                         sort=True) 
Example #15
Source File: __init__.py    From pandas-summary with MIT License    5 votes
def _get_categorical_summary(self, column):
        series = self.df[column]
        # Only run if at least 1 non-missing value
        value_counts = series.value_counts()
        stats = {
            'top': '{}: {}'.format(value_counts.index[0], value_counts.iloc[0]),
        }
        return pd.concat([pd.Series(stats, name=column),
                          self.columns_stats[column]],
                         sort=True) 
Example #16
Source File: __init__.py    From pandas-summary with MIT License    5 votes
def _get_median_absolute_deviation(self, series, multiplier=3):
        """
        Returns count of values larger than `multiplier` * `mad`
        :type series:
        :param multiplier:
        :return (array):
        """
        capped_series = np.minimum(
            series, series.median() + multiplier * series.mad())
        count = pd.value_counts(series != capped_series)
        count = count[True] if True in count else 0
        perc = self._percent(count / self.length)
        return count, perc 
Example #17
Source File: __init__.py    From pandas-summary with MIT License    5 votes
def _get_deviation_of_mean(self, series, multiplier=3):
        """
        Returns count of values deviating of the mean, i.e. larger than `multiplier` * `std`.
        :type series:
        :param multiplier:
        :return:
        """
        capped_series = np.minimum(
            series, series.mean() + multiplier * series.std())
        count = pd.value_counts(series != capped_series)
        count = count[True] if True in count else 0
        perc = self._percent(count / self.length)
        return count, perc 
Example #18
Source File: __init__.py    From pandas-summary with MIT License    5 votes
def columns_types(self):
        return pd.value_counts(self.columns_stats.loc['types']) 
Example #19
Source File: test_randomness_index_map.py    From vivarium with GNU General Public License v3.0    5 votes
def test_hash_uniformity(map_size_and_hashed_values):
    n, h = map_size_and_hashed_values

    k = len(h)
    num_bins = k//5  # Want about 5 items per bin for chi-squared
    bins = np.linspace(0, n + 1, num_bins)

    binned_data = pd.cut(h, bins)
    distribution = pd.value_counts(binned_data).sort_index()
    c, p = chisquare(distribution)

    assert p > 0.05, "Data not uniform" 
Example #20
Source File: train_main.py    From ibeis with Apache License 2.0    5 votes
def class_weights(self):
        import pandas as pd
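        # Inverse-frequency class weights, scaled so a class with median frequency gets weight 1.0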
        label_freq = pd.value_counts(self.labels)
        class_weights = label_freq.median() / label_freq
        class_weights = class_weights.sort_index().values
        class_weights = torch.from_numpy(class_weights.astype(np.float32))
        return class_weights 
Example #21
Source File: pairwise.py    From CausalGAN with MIT License    5 votes
def calc_tvd(label_dict,attr):
    '''
    attr should be a 0,1 pandas dataframe with
    columns corresponding to label names

    for example:
    names=zip(*self.graph)[0]
    calc_tvd(label_dict,attr[names])

    label_dict should be a dictionary key:1d-array of samples
    '''
    ####Calculate Total Variation####
    if np.min(attr.values)<0:
        raise ValueError('calc_tvd received \
                 attr that may not have been in {0,1}')

    label_names=label_dict.keys()
    attr=attr[label_names]

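    # Assign an integer ID to each unique attribute combination, then compare
    # the empirical distributions of those IDs in the real and sampled data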
    df2=attr.drop_duplicates()
    df2 = df2.reset_index(drop = True).reset_index()
    df2=df2.rename(columns = {'index':'ID'})
    real_data_id=pd.merge(attr,df2)
    real_counts = pd.value_counts(real_data_id['ID'])
    real_pdf=real_counts/len(attr)

    label_list_dict={k:np.round(v.ravel()) for k,v in label_dict.items()}
    df_dat=pd.DataFrame.from_dict(label_list_dict)
    dat_id=pd.merge(df_dat,df2,on=label_names,how='left')
    dat_counts=pd.value_counts(dat_id['ID'])
    dat_pdf = dat_counts / dat_counts.sum()
    diff=real_pdf.subtract(dat_pdf, fill_value=0)
    tvd=0.5*diff.abs().sum()
    return tvd 
Example #22
Source File: visualize.py    From adversarial-policies with MIT License    5 votes
def _visualize_helper(
    model_dir, output_dir, subsample_rate, save_type, ordering, external_legend_params
):
    logger.info("Generating figures")

    # Data
    metadata_df = pd.read_csv(os.path.join(model_dir, "metadata.csv"))
    cluster_ids = np.load(os.path.join(model_dir, "cluster_ids.npy"))
    metadata_df["ax_1"] = cluster_ids[:, 0]
    metadata_df["ax_2"] = cluster_ids[:, 1]
    metadata_df["opponent_id"] = metadata_df["opponent_id"].apply(ABBREVIATIONS.get)

    def save_path(prefix):
        return osp.join(output_dir, f"{prefix}.{save_type}")

    counts = pd.value_counts(metadata_df["opponent_id"])
    min_counts = counts.min()
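    # Downsample every opponent group to the size of the rarest opponent so the groups are balanced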
    opponent_groups = metadata_df.groupby("opponent_id")
    opponent_dfs = {name: group.sample(n=min_counts) for name, group in opponent_groups}
    opponent_dfs = [opponent_dfs[label] for label in ordering]
    metadata_df = pd.concat(opponent_dfs)

    _plot_and_save_chart(save_path("combined"), [metadata_df])
    _plot_and_save_chart(save_path("subsampled"), [metadata_df.sample(frac=subsample_rate)])
    _plot_and_save_chart(save_path("sidebyside"), opponent_dfs)

    if external_legend_params is not None:
        _external_legend(osp.join(output_dir, "external_legend.pdf"))

    logger.info("Visualization complete") 
Example #23
Source File: views.py    From dtale with GNU Lesser General Public License v2.1    4 votes
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column data using
    :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: name of the column in the dataframe whose data should be described
    :type column: str
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        unique_data: array of unique values when data has <= 100 unique values
        success: True/False
    }

    """
    data = global_state.get_data(data_id)[[column]]
    additional_aggs = None
    curr_dtypes = global_state.get_dtypes(data_id)
    dtype = next(
        (
            dtype_info["dtype"]
            for dtype_info in curr_dtypes
            if dtype_info["name"] == column
        ),
        None,
    )
    if classify_type(dtype) in ["I", "F"]:
        additional_aggs = ["sum", "median", "mode", "var", "sem", "skew", "kurt"]
    code = build_code_export(data_id)
    desc, desc_code = load_describe(data[column], additional_aggs=additional_aggs)
    code += desc_code
    return_data = dict(describe=desc, success=True)
    uniq_vals = data[column].value_counts().sort_values(ascending=False)
    total_uniq_vals = len(uniq_vals)
    if "unique" not in return_data["describe"]:
        return_data["describe"]["unique"] = json_int(total_uniq_vals, as_string=True)
    uniq_vals.index.name = "value"
    uniq_vals.name = "count"
    uniq_vals = uniq_vals.reset_index()
    uniq_f, _ = build_formatters(uniq_vals)
    if total_uniq_vals <= 100:
        code.append("uniq_vals = data['{}'].unique()".format(column))
        return_data["uniques"] = dict(
            data=uniq_f.format_dicts(uniq_vals.itertuples()),
            total=total_uniq_vals,
            top=False,
        )
    else:  # get top 100 most common values
        return_data["uniques"] = dict(
            data=uniq_f.format_dicts(uniq_vals.head(100).itertuples()),
            total=total_uniq_vals,
            top=True,
        )
        uniq_code = "uniq_vals = data['{}'].value_counts().sort_values(ascending=False).head(100).index.values"
        code.append(uniq_code.format(column))
    return_data["code"] = "\n".join(code)
    return jsonify(return_data) 
Example #24
Source File: format_utils.py    From pyseqlogo with MIT License    4 votes
def read_alignment(infile, data_type='fasta', seq_type='dna', pseudo_count=1):
    """Read alignment file as motif

    Parameters
    ----------

    infile: str
        Path to input alignment file

    data_type: str
        'fasta', 'stockholm', etc., as supported by Bio.AlignIO

    seq_type: str
        'dna', 'rna' or 'aa'

    pseudo_count: int
        pseudo counts to add before calculating the information content

    Returns
    -------

    (counts, total) : tuple
        A dict mapping each symbol to its per-position counts, followed by the
        total count of the first alignment column

    """
    alignment = AlignIO.read(infile, data_type)
    data = []
    for aln in alignment:
        data.append([x for x in str(aln.seq)])
    df = pd.DataFrame(data)
    df_counts = df.apply(pd.value_counts, 0)
    total = df_counts[[0]].sum()
    df_counts = df_counts[df_counts.index != '-']
    # Remove - from counts
    counts_dict = df_counts.to_dict(orient='index')
    counts = {}
    for key, val in counts_dict.items():
        counts[key] = list(val.values())
    return counts, total
    """
    summary_align = AlignInfo.SummaryInfo(alignment)
    if seq_type == 'dna':
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['dna'],
                                                         chars_to_ignore = ['N'],
                                                         pseudo_count = pseudo_count)
    elif seq_type == 'rna':
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['rna'],
                                                         chars_to_ignore = ['N'],
                                                         pseudo_count = pseudo_count)
    else:
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['aa'],
                                                         pseudo_count = pseudo_count)
    motif = create_motif_from_alignment(alignment)
    return (motif, summary_align.ic_vector)
    """ 
Example #25
Source File: others.py    From arche with MIT License    4 votes
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare booleans distribution between two dataframes

    Returns:
        A result containing dataframe with distributions and messages if differences
        are in thresholds
    """

    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.outcome = Outcome.SKIPPED
        return result

    dummy = pd.DataFrame(columns=[True, False])
    source_counts = pd.concat(
        [dummy, source_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    target_counts = pd.concat(
        [dummy, target_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    difs = (source_counts - target_counts)[True]

    bool_covs = pd.concat(
        [
            source_counts.rename("{}_source".format),
            target_counts.rename("{}_target".format),
        ]
    ).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs.abs() > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}"
        )

    warn_diffs = difs[(difs.abs() > warn_thr) & (difs.abs() <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )

    return result 
Example #26
Source File: test_general.py    From modin with Apache License 2.0    4 votes
def test_value_counts(normalize, bins, dropna):
    def sort_index_for_equal_values(result, ascending):
        is_range = False
        is_end = False
        i = 0
        new_index = np.empty(len(result), dtype=type(result.index))
        while i < len(result):
            j = i
            if i < len(result) - 1:
                while result[result.index[i]] == result[result.index[i + 1]]:
                    i += 1
                    if is_range is False:
                        is_range = True
                    if i == len(result) - 1:
                        is_end = True
                        break
            if is_range:
                k = j
                for val in sorted(result.index[j : i + 1], reverse=not ascending):
                    new_index[k] = val
                    k += 1
                if is_end:
                    break
                is_range = False
            else:
                new_index[j] = result.index[j]
            i += 1
        return pandas.Series(result, index=new_index)

    # We sort indices for pandas result because of issue #1650
    values = np.array([3, 1, 2, 3, 4, np.nan])
    modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, normalize=normalize, ascending=False), False
    )
    df_equals(modin_result, pandas_result)

    modin_result = pd.value_counts(values, bins=bins, ascending=False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, bins=bins, ascending=False), False
    )
    df_equals(modin_result, pandas_result)

    modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, dropna=dropna, ascending=True), True
    )
    df_equals(modin_result, pandas_result) 
Example #27
Source File: kaggle_titanic.py    From stacked_generalization with Apache License 2.0    4 votes
def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True):
        def get_title(name):
            title_search = re.search(r' ([A-Za-z]+)\.', name)
            if title_search:
                return title_search.group(1)
            return ""

        def normalize_fare(data):
            new_data = None
            for embarked in (0, 1, 2):
                temp = data[data.Embarked == embarked]
                temp['Fare'] /= temp['Fare'].values.mean()
                if new_data is None:
                    new_data = temp
                else:
                    new_data = pd.concat([new_data, temp])
            new_data = new_data.sort_values('PassengerId')  # DataFrame.sort() was removed in later pandas versions
            return new_data

        data = pd.read_csv(self.file_name).replace('male',0).replace('female',1)
        data['Age'].fillna(data.Age.median(), inplace=True)
        data['Fare'].fillna(data.Fare.median(), inplace=True)
        data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
        data['Embarked'] = data['Embarked'].replace('S',0).replace('C',1).replace('Q',2)
        data['Embarked'].fillna(0, inplace=True)
        if norm_fare:
            data = normalize_fare(data)

        # Get all the titles and print how often each one occurs.
        titles = data["Name"].apply(get_title)
        print(pd.value_counts(titles))

        # Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
        title_mapping = {"Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
        for k,v in title_mapping.items():
            titles[titles == k] = v

        # Add in the title column.
        data['Title'] = titles
        data['Title'].fillna(1, inplace=True)
        #data['Pos'] = data["Title"] + data['Pclass']
        if drop:
            #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Pclass', 'Cabin', 'Embarked'], axis=1)
            data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1)
            #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Title'], axis=1)
        print(data.keys())
        if title_to_onehot:
            self.encode(data, 'Title', [i for i in range(1, 11)])
            data = data.drop(['Title'], axis=1)
        return data