Python pandas.value_counts() Examples
The following are 27 code examples of pandas.value_counts(), collected from open-source projects. You can follow the link above each example to the original project or source file, or browse the other available functions and classes of the pandas module.
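Before the project examples, here is a minimal, self-contained sketch of the call the examples rely on (the sample array is illustrative). pd.value_counts returns a Series mapping each unique value to its frequency, sorted by count in descending order by default; newer pandas releases deprecate the top-level function in favor of the equivalent Series.value_counts method, which takes the same normalize, bins, and dropna options.

import numpy as np
import pandas as pd

values = np.array([3, 1, 2, 3, 4, np.nan])

# Frequency of each unique value; NaN is dropped by default and the result
# is sorted by count in descending order.
print(pd.value_counts(values))

# Variations that appear in the examples below:
print(pd.value_counts(values, normalize=True))   # relative frequencies
print(pd.value_counts(values, bins=2))           # bin numeric data before counting
print(pd.value_counts(values, dropna=False))     # keep NaN as its own entry

# Equivalent method form on a Series:
print(pd.Series(values).value_counts())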
Example #1
Source File: spatial_heatmap.py From NanoPlot with GNU General Public License v3.0 | 6 votes |
def spatial_heatmap(array, path, title=None, color="Greens", figformat="png"):
    """Taking channel information and creating post run channel activity plots."""
    logging.info("Nanoplotter: Creating heatmap of reads per channel using {} reads."
                 .format(array.size))
    activity_map = Plot(
        path=path + "." + figformat,
        title="Number of reads generated per channel")
    layout = make_layout(maxval=np.amax(array))
    valueCounts = pd.value_counts(pd.Series(array))
    for entry in valueCounts.keys():
        layout.template[np.where(layout.structure == entry)] = valueCounts[entry]
    plt.figure()
    ax = sns.heatmap(
        data=pd.DataFrame(layout.template, index=layout.yticks, columns=layout.xticks),
        xticklabels="auto",
        yticklabels="auto",
        square=True,
        cbar_kws={"orientation": "horizontal"},
        cmap=color,
        linewidths=0.20)
    ax.set_title(title or activity_map.title)
    activity_map.fig = ax.get_figure()
    activity_map.save(format=figformat)
    plt.close("all")
    return [activity_map]
Example #2
Source File: test_split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_blockwise_shufflesplit():
    splitter = dask_ml.model_selection.ShuffleSplit(random_state=0)
    assert splitter.get_n_splits() == 10
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (99,)  # 90% of 110
    assert test_idx.shape == (11,)

    assert train_idx.chunks == ((45, 45, 9),)
    assert test_idx.chunks == ((5, 5, 1),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)
    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )
Example #3
Source File: tagged_corpus.py From underthesea with GNU General Public License v3.0 | 6 votes |
def _analyze_field(self, df, id, output_folder=".", n_head=10):
    id = str(id)
    m = df.shape[1]
    df.columns = [str(i) for i in range(m)]
    agg_dict = dict()
    agg_dict[id] = "size"
    for i in range(int(id)):
        agg_dict[str(i)] = lambda x: ", ".join(
            pd.value_counts(x).index[:n_head])
    name_dict = dict()
    name_dict[id] = "count"
    df_analyze = df.groupby(id).agg(agg_dict).rename(
        columns=name_dict).reset_index()
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    log = u""
    log += u"Tags : {}\n".format(df_analyze.shape[0])
    tags = df_analyze[id].to_dict().values()
    tags = sorted(tags)
    log += u"List tags : {}\n".format(u", ".join(tags))
    df_analyze.to_excel(filename, index=False)
    return log
Example #4
Source File: tc_.py From underthesea with GNU General Public License v3.0 | 6 votes |
def _analyze_field(self, df, id, output_folder=".", n_head=10):
    id = str(id)
    m = df.shape[1]
    df.columns = [str(i) for i in range(m)]
    agg_dict = dict()
    agg_dict[id] = "size"
    for i in range(int(id)):
        agg_dict[str(i)] = lambda x: ", ".join(
            pd.value_counts(x).index[:n_head])
    name_dict = dict()
    name_dict[id] = "count"
    df_analyze = df.groupby(id).agg(agg_dict).rename(
        columns=name_dict).reset_index()
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    log = u""
    log += u"Tags : {}\n".format(df_analyze.shape[0])
    tags = df_analyze[id].to_dict().values()
    tags = sorted(tags)
    log += u"List tags : {}\n".format(u", ".join(tags))
    df_analyze.to_excel(filename, index=False)
    return log
Example #5
Source File: test_example_sleeping_giant.py From postman_problems with MIT License | 5 votes |
def test_get_shortest_paths_distances():
    df = read_edgelist(EDGELIST)
    graph = create_networkx_graph_from_edgelist(df, edge_id='id')
    odd_nodes = get_odd_nodes(graph)
    odd_node_pairs = list(itertools.combinations(odd_nodes, 2))

    # coarsely checking structure of `get_shortest_paths_distances` return value
    odd_node_pairs_shortest_paths = get_shortest_paths_distances(graph, odd_node_pairs, 'distance')
    assert len(odd_node_pairs_shortest_paths) == 630
    assert type(odd_node_pairs_shortest_paths) == dict

    # check that each node name appears the same number of times in `get_shortest_paths_distances` return value
    node_names = list(itertools.chain(*[i[0] for i in odd_node_pairs_shortest_paths.items()]))
    assert set(pd.value_counts(node_names)) == set([35])
Example #6
Source File: testPlotting.py From fitbit-analyzer with Apache License 2.0 | 5 votes |
def test_plottingOnIntradayStats(self):
    filepath = RESOURCE_PATH + "\\unittest\\test_sleep_basic01.csv"
    data1 = utils.loadIntradayData(filepath)
    filepath = RESOURCE_PATH + "\\unittest\\test_sleep_basic02.csv"
    data2 = utils.loadIntradayData(filepath)
    stats = sleepStats.generateStatsFrom([data1, data2],
                                         sleepStats.STATS_NAME_INTRADAY)
    data = stats.apply(pd.value_counts)
    mplot.plotSleepValueHeatmap(data, sleepValue=1)
Example #7
Source File: sleepStats.py From fitbit-analyzer with Apache License 2.0 | 5 votes |
def normalizedIntradayCountStats(intradayStats, limitCount=5):
    # For each minute, number of days for which we have a valid measure (record)
    notNullCount = intradayStats.count()
    # Ignore minutes where we have low level of records
    notNullCount[notNullCount < limitCount] = None
    # Count how many times each value appears for each minute
    valueCount = intradayStats.apply(pd.value_counts)
    # Normalize each minute by records count
    res = valueCount.div(notNullCount, axis=1)
    return res
Example #8
Source File: test_split.py From dask-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_kfold(shuffle):
    splitter = dask_ml.model_selection.KFold(
        n_splits=5, random_state=0, shuffle=shuffle
    )
    assert splitter.get_n_splits() == 5
    gen = splitter.split(dX)

    train_idx, test_idx = next(gen)
    assert isinstance(train_idx, da.Array)
    assert isinstance(test_idx, da.Array)

    assert train_idx.shape == (88,)  # 80% of 110
    assert test_idx.shape == (22,)

    assert train_idx.chunks == ((28, 50, 10),)
    assert test_idx.chunks == ((22,),)

    counts = pd.value_counts(train_idx.compute())
    assert counts.max() == 1

    N = len(X)
    np.testing.assert_array_equal(
        np.unique(da.concatenate([train_idx, test_idx])), np.arange(N)
    )

    expected_chunks = [
        (((22, 6, 50, 10),), ((22,),)),
        (((44, 34, 10),), ((6, 16),)),
        (((50, 16, 12, 10),), ((22,),)),
        (((50, 38),), ((12, 10),)),
    ]

    for (exp_train_idx, exp_test_idx), (train_idx, test_idx) in zip(
        expected_chunks, gen
    ):
        assert train_idx.chunks == exp_train_idx
        assert test_idx.chunks == exp_test_idx
Example #9
Source File: core.py From econtools with BSD 3-Clause "New" or "Revised" License | 5 votes |
def df_cluster(n, k, cluster_id):
    g = len(pd.value_counts(cluster_id))
    df = g - 1
    vce_correct = ((n - 1) / (n - k)) * (g / (g - 1))
    return df, vce_correct, g
Example #10
Source File: prepare_dataset.py From moses with MIT License | 5 votes |
def split_dataset(dataset, seed):
    logger.info('Splitting the dataset')
    scaffolds = pd.value_counts(dataset['scaffold'])
    scaffolds = sorted(scaffolds.items(), key=lambda x: (-x[1], x[0]))
    test_scaffolds = set([x[0] for x in scaffolds[9::10]])
    dataset['SPLIT'] = 'train'
    test_scaf_idx = [x in test_scaffolds for x in dataset['scaffold']]
    dataset.loc[test_scaf_idx, 'SPLIT'] = 'test_scaffolds'
    test_idx = dataset.loc[dataset['SPLIT'] == 'train'].sample(
        frac=0.1, random_state=seed
    ).index
    dataset.loc[test_idx, 'SPLIT'] = 'test'
    dataset.drop('scaffold', axis=1, inplace=True)
    return dataset
Example #11
Source File: others.py From arche with MIT License | 5 votes |
def garbage_symbols(df: pd.DataFrame) -> Result:
    """Find unwanted symbols in `np.object` columns.

    Returns:
        A result containing item keys per field which contained any trash symbol
    """
    garbage = (
        r"(?P<spaces>^\s|\s$)"
        r"|(?P<html_entities>&[a-zA-Z]{2,}?;|&#\d*?;)"
        r"|(?P<css>[.#@][^\d{}#.\s][^{}#.]+?{(?:[^:;{}]+?:[^:;{}]+?;)+?\s*?})"
        r"|(?P<html_tags></??(?:h\d|b|u|i|div|ul|ol|li|table|tbody|th|tr|td|p|a|br|img|sup|SUP|"
        r"blockquote)\s*?/??>|<!--|-->)"
    )

    errors = {}
    row_keys: Set = set()
    rule_result = Result("Garbage Symbols", items_count=len(df))

    for column in tqdm(df.select_dtypes([np.object]).columns, desc="Garbage Symbols"):
        matches = df[column].apply(str).str.extractall(garbage, flags=re.IGNORECASE)
        if not matches.empty:
            error_keys = df.loc[matches.unstack().index.values].index
            bad_texts = matches.stack().value_counts().index.sort_values().tolist()
            # escape backslashes for markdown repr, `\n > \\n`
            bad_texts = [
                f"'{codecs.encode(bx, 'unicode_escape').decode()[:20]}'"
                for bx in bad_texts
            ]
            error = (
                f"{len(error_keys)/len(df)*100:.1f}% of '{column}' "
                f"values contain `{', '.join(bad_texts)}`"
            )

            errors[error] = list(error_keys)
            row_keys = row_keys.union(error_keys)
    if errors:
        rule_result.add_error(
            f"{len(row_keys)/len(df) * 100:.1f}% ({len(row_keys)}) items affected",
            errors=errors,
        )

    return rule_result
Example #12
Source File: tagged_corpus.py From underthesea with GNU General Public License v3.0 | 5 votes |
def _analyze_first_token(self, df, id, output_folder="."):
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    df_analyze = df[id].value_counts().reset_index(name="count")
    df_analyze = df_analyze.rename(columns={"index": "0"})
    df_analyze.to_excel(filename, index=False)
    log = u""
    log += u"Unique words : {}\n".format(df_analyze.shape[0])
    log += u"Top words : {}\n".format(
        u", ".join(list(df_analyze["0"].to_dict().values())[:20]))
    return log
Example #13
Source File: tc_.py From underthesea with GNU General Public License v3.0 | 5 votes |
def _analyze_first_token(self, df, id, output_folder="."):
    filename = join(output_folder, "column-%s-analyze.xlsx" % id)
    df_analyze = df[id].value_counts().reset_index(name="count")
    df_analyze = df_analyze.rename(columns={"index": "0"})
    df_analyze.to_excel(filename, index=False)
    log = u""
    log += u"Unique words : {}\n".format(df_analyze.shape[0])
    log += u"Top words : {}\n".format(
        u", ".join(list(df_analyze["0"].to_dict().values())[:20]))
    return log
Example #14
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def _get_bool_summary(self, column):
    series = self.df[column]

    stats = {}
    for class_name, class_value in dict(series.value_counts()).items():
        stats['"{}" count'.format(class_name)] = '{}'.format(class_value)
        stats['"{}" perc'.format(class_name)] = '{}'.format(
            self._percent(class_value / self.length))

    return pd.concat([pd.Series(stats, name=column), self.columns_stats[column]], sort=True)
Example #15
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def _get_categorical_summary(self, column):
    series = self.df[column]
    # Only run if at least 1 non-missing value
    value_counts = series.value_counts()
    stats = {
        'top': '{}: {}'.format(value_counts.index[0], value_counts.iloc[0]),
    }
    return pd.concat([pd.Series(stats, name=column), self.columns_stats[column]], sort=True)
Example #16
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def _get_median_absolute_deviation(self, series, multiplier=3):
    """
    Returns count of values larger than `multiplier` * `mad`

    :type series:
    :param multiplier:
    :return (array):
    """
    capped_series = np.minimum(
        series, series.median() + multiplier * series.mad())
    count = pd.value_counts(series != capped_series)
    count = count[True] if True in count else 0
    perc = self._percent(count / self.length)
    return count, perc
Example #17
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def _get_deviation_of_mean(self, series, multiplier=3):
    """
    Returns count of values deviating of the mean, i.e. larger than `multiplier` * `std`.

    :type series:
    :param multiplier:
    :return:
    """
    capped_series = np.minimum(
        series, series.mean() + multiplier * series.std())
    count = pd.value_counts(series != capped_series)
    count = count[True] if True in count else 0
    perc = self._percent(count / self.length)
    return count, perc
Example #18
Source File: __init__.py From pandas-summary with MIT License | 5 votes |
def columns_types(self):
    return pd.value_counts(self.columns_stats.loc['types'])
Example #19
Source File: test_randomness_index_map.py From vivarium with GNU General Public License v3.0 | 5 votes |
def test_hash_uniformity(map_size_and_hashed_values):
    n, h = map_size_and_hashed_values
    k = len(h)
    num_bins = k // 5  # Want about 5 items per bin for chi-squared
    bins = np.linspace(0, n + 1, num_bins)
    binned_data = pd.cut(h, bins)
    distribution = pd.value_counts(binned_data).sort_index()
    c, p = chisquare(distribution)
    assert p > 0.05, "Data not uniform"
Example #20
Source File: train_main.py From ibeis with Apache License 2.0 | 5 votes |
def class_weights(self):
    import pandas as pd
    label_freq = pd.value_counts(self.labels)
    class_weights = label_freq.median() / label_freq
    class_weights = class_weights.sort_index().values
    class_weights = torch.from_numpy(class_weights.astype(np.float32))
    return class_weights
Example #21
Source File: pairwise.py From CausalGAN with MIT License | 5 votes |
def calc_tvd(label_dict, attr):
    '''
    attr should be a 0,1 pandas dataframe with
    columns corresponding to label names

    for example:
    names=zip(*self.graph)[0]
    calc_tvd(label_dict,attr[names])

    label_dict should be a dictionary key:1d-array of samples
    '''
    ####Calculate Total Variation####
    if np.min(attr.values) < 0:
        raise ValueError('calc_tvd received \
            attr that may not have been in {0,1}')

    label_names = label_dict.keys()
    attr = attr[label_names]

    df2 = attr.drop_duplicates()
    df2 = df2.reset_index(drop=True).reset_index()
    df2 = df2.rename(columns={'index': 'ID'})
    real_data_id = pd.merge(attr, df2)
    real_counts = pd.value_counts(real_data_id['ID'])
    real_pdf = real_counts / len(attr)

    label_list_dict = {k: np.round(v.ravel()) for k, v in label_dict.items()}
    df_dat = pd.DataFrame.from_dict(label_list_dict)
    dat_id = pd.merge(df_dat, df2, on=label_names, how='left')
    dat_counts = pd.value_counts(dat_id['ID'])
    dat_pdf = dat_counts / dat_counts.sum()
    diff = real_pdf.subtract(dat_pdf, fill_value=0)
    tvd = 0.5 * diff.abs().sum()

    return tvd
Example #22
Source File: visualize.py From adversarial-policies with MIT License | 5 votes |
def _visualize_helper(
    model_dir, output_dir, subsample_rate, save_type, ordering, external_legend_params
):
    logger.info("Generating figures")

    # Data
    metadata_df = pd.read_csv(os.path.join(model_dir, "metadata.csv"))
    cluster_ids = np.load(os.path.join(model_dir, "cluster_ids.npy"))
    metadata_df["ax_1"] = cluster_ids[:, 0]
    metadata_df["ax_2"] = cluster_ids[:, 1]
    metadata_df["opponent_id"] = metadata_df["opponent_id"].apply(ABBREVIATIONS.get)

    def save_path(prefix):
        return osp.join(output_dir, f"{prefix}.{save_type}")

    counts = pd.value_counts(metadata_df["opponent_id"])
    min_counts = counts.min()
    opponent_groups = metadata_df.groupby("opponent_id")
    opponent_dfs = {name: group.sample(n=min_counts) for name, group in opponent_groups}
    opponent_dfs = [opponent_dfs[label] for label in ordering]
    metadata_df = pd.concat(opponent_dfs)

    _plot_and_save_chart(save_path("combined"), [metadata_df])
    _plot_and_save_chart(save_path("subsampled"), [metadata_df.sample(frac=subsample_rate)])
    _plot_and_save_chart(save_path("sidebyside"), opponent_dfs)

    if external_legend_params is not None:
        _external_legend(osp.join(output_dir, "external_legend.pdf"))

    logger.info("Visualization complete")
Example #23
Source File: views.py From dtale with GNU Lesser General Public License v2.1 | 4 votes |
def describe(data_id, column):
    """
    :class:`flask:flask.Flask` route which returns standard details about column data using
    :meth:`pandas:pandas.DataFrame.describe` to the front-end as JSON

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param column: required dash separated string "START-END" stating a range of row indexes to be returned
                   to the screen
    :return: JSON {
        describe: object representing output from :meth:`pandas:pandas.Series.describe`,
        unique_data: array of unique values when data has <= 100 unique values
        success: True/False
    }
    """
    data = global_state.get_data(data_id)[[column]]
    additional_aggs = None
    curr_dtypes = global_state.get_dtypes(data_id)
    dtype = next(
        (
            dtype_info["dtype"]
            for dtype_info in curr_dtypes
            if dtype_info["name"] == column
        ),
        None,
    )
    if classify_type(dtype) in ["I", "F"]:
        additional_aggs = ["sum", "median", "mode", "var", "sem", "skew", "kurt"]
    code = build_code_export(data_id)
    desc, desc_code = load_describe(data[column], additional_aggs=additional_aggs)
    code += desc_code
    return_data = dict(describe=desc, success=True)
    uniq_vals = data[column].value_counts().sort_values(ascending=False)
    total_uniq_vals = len(uniq_vals)
    if "unique" not in return_data["describe"]:
        return_data["describe"]["unique"] = json_int(total_uniq_vals, as_string=True)

    uniq_vals.index.name = "value"
    uniq_vals.name = "count"
    uniq_vals = uniq_vals.reset_index()

    uniq_f, _ = build_formatters(uniq_vals)
    if total_uniq_vals <= 100:
        code.append("uniq_vals = data['{}'].unique()".format(column))
        return_data["uniques"] = dict(
            data=uniq_f.format_dicts(uniq_vals.itertuples()),
            total=total_uniq_vals,
            top=False,
        )
    else:  # get top 100 most common values
        return_data["uniques"] = dict(
            data=uniq_f.format_dicts(uniq_vals.head(100).itertuples()),
            total=total_uniq_vals,
            top=True,
        )
        uniq_code = "uniq_vals = data['{}'].value_counts().sort_values(ascending=False).head(100).index.values"
        code.append(uniq_code.format(column))
    return_data["code"] = "\n".join(code)
    return jsonify(return_data)
Example #24
Source File: format_utils.py From pyseqlogo with MIT License | 4 votes |
def read_alignment(infile, data_type='fasta', seq_type='dna', pseudo_count=1):
    """Read alignment file as motif

    Parameters
    ----------
    infile: str
        Path to input alignment file

    data_type: str
        'fasta', 'stockholm', etc/. as supported by Bio.AlignIO

    seq_type: str
        'dna', 'rna' or 'aa'

    pseudo_count: int
        psuedo counts to add before calculating information cotent

    Returns
    -------
    (motif, information_content) : tuple
        A motif instance followd by total informatio content of the motif
    """
    alignment = AlignIO.read(infile, data_type)
    data = []
    for aln in alignment:
        data.append([x for x in str(aln.seq)])
    df = pd.DataFrame(data)
    df_counts = df.apply(pd.value_counts, 0)
    total = df_counts[[0]].sum()
    df_counts = df_counts[df_counts.index != '-']  # Remove - from counts
    counts_dict = df_counts.to_dict(orient='index')
    counts = {}
    for key, val in counts_dict.items():
        counts[key] = list(val.values())
    return counts, total
    """
    summary_align = AlignInfo.SummaryInfo(alignment)
    if seq_type == 'dna':
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['dna'],
                                                          chars_to_ignore = ['N'],
                                                          pseudo_count = pseudo_count)
    elif seq_type == 'rna':
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['rna'],
                                                          chars_to_ignore = ['N'],
                                                          pseudo_count = pseudo_count)
    else:
        info_content = summary_align.information_content(e_freq_table = naive_freq_tables['aa'],
                                                          pseudo_count = pseudo_count)
    motif = create_motif_from_alignment(alignment)
    return (motif, summary_align.ic_vector)
    """
Example #25
Source File: others.py From arche with MIT License | 4 votes |
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare booleans distribution between two dataframes

    Returns:
        A result containing dataframe with distributions and messages if differences
        are in thresholds
    """
    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.outcome = Outcome.SKIPPED
        return result

    dummy = pd.DataFrame(columns=[True, False])
    source_counts = pd.concat(
        [dummy, source_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    target_counts = pd.concat(
        [dummy, target_bool.apply(pd.value_counts, normalize=True).T], sort=False
    ).fillna(0.0)
    difs = (source_counts - target_counts)[True]

    bool_covs = pd.concat(
        [
            source_counts.rename("{}_source".format),
            target_counts.rename("{}_target".format),
        ]
    ).sort_index()
    bool_covs.name = "Coverage for boolean fields"
    result.stats.append(bool_covs)

    err_diffs = difs[difs.abs() > err_thr]
    if not err_diffs.empty:
        result.add_error(
            f"{', '.join(err_diffs.index)} relative frequencies differ "
            f"by more than {err_thr:.0%}"
        )

    warn_diffs = difs[(difs.abs() > warn_thr) & (difs.abs() <= err_thr)]
    if not warn_diffs.empty:
        result.add_warning(
            f"{', '.join(warn_diffs.index)} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )

    return result
Example #26
Source File: test_general.py From modin with Apache License 2.0 | 4 votes |
def test_value_counts(normalize, bins, dropna):
    def sort_index_for_equal_values(result, ascending):
        is_range = False
        is_end = False
        i = 0
        new_index = np.empty(len(result), dtype=type(result.index))
        while i < len(result):
            j = i
            if i < len(result) - 1:
                while result[result.index[i]] == result[result.index[i + 1]]:
                    i += 1
                    if is_range is False:
                        is_range = True
                    if i == len(result) - 1:
                        is_end = True
                        break
            if is_range:
                k = j
                for val in sorted(result.index[j : i + 1], reverse=not ascending):
                    new_index[k] = val
                    k += 1
                if is_end:
                    break
                is_range = False
            else:
                new_index[j] = result.index[j]
            i += 1
        return pandas.Series(result, index=new_index)

    # We sort indices for pandas result because of issue #1650
    values = np.array([3, 1, 2, 3, 4, np.nan])

    modin_result = pd.value_counts(values, normalize=normalize, ascending=False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, normalize=normalize, ascending=False), False
    )
    df_equals(modin_result, pandas_result)

    modin_result = pd.value_counts(values, bins=bins, ascending=False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, bins=bins, ascending=False), False
    )
    df_equals(modin_result, pandas_result)

    modin_result = pd.value_counts(values, dropna=dropna, ascending=True)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, dropna=dropna, ascending=True), True
    )
    df_equals(modin_result, pandas_result)
Example #27
Source File: kaggle_titanic.py From stacked_generalization with Apache License 2.0 | 4 votes |
def pre_process(self, drop=True, title_to_onehot=True, norm_fare=True):
    def get_title(name):
        title_search = re.search(' ([A-Za-z]+)\.', name)
        if title_search:
            return title_search.group(1)
        return ""

    def normalize_fare(data):
        new_data = None
        for embarked in (0, 1, 2):
            temp = data[data.Embarked == embarked]
            temp['Fare'] /= temp['Fare'].values.mean()
            if new_data is None:
                new_data = temp
            else:
                new_data = pd.concat([new_data, temp])
        new_data = new_data.sort('PassengerId')
        return new_data

    data = pd.read_csv(self.file_name).replace('male', 0).replace('female', 1)
    data['Age'].fillna(data.Age.median(), inplace=True)
    data['Fare'].fillna(data.Fare.median(), inplace=True)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Embarked'] = data['Embarked'].replace('S', 0).replace('C', 1).replace('Q', 2)
    data['Embarked'].fillna(0, inplace=True)
    if norm_fare:
        data = normalize_fare(data)

    # Get all the titles and print how often each one occurs.
    titles = data["Name"].apply(get_title)
    print(pd.value_counts(titles))

    # Map each title to an integer. Some titles are very rare, and are compressed into the same codes as other titles.
    title_mapping = {"Dona": 1, "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                     "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10,
                     "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
    for k, v in title_mapping.items():
        titles[titles == k] = v

    # Add in the title column.
    data['Title'] = titles
    data['Title'].fillna(1, inplace=True)
    #data['Pos'] = data["Title"] + data['Pclass']
    if drop:
        #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Pclass', 'Cabin', 'Embarked'], axis=1)
        data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis=1)
        #data = data.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Pclass', 'Title'], axis=1)
    print(data.keys())
    if title_to_onehot:
        self.encode(data, 'Title', [i for i in range(1, 11)])
        data = data.drop(['Title'], axis=1)
    return data