Python altair.X Examples

The following are 23 code examples of altair.X(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module altair, or try the search function.
Example #1
Source File: BubbleDiachronicVisualization.py    From scattertext with Apache License 2.0 8 votes vote down vote up
def visualize(display_df):
    """Build a circle chart of terms against variables, colored by trend.

    The chart reads the ``variable``, ``term``, ``frequency`` and ``trending``
    columns of ``display_df``. Circle size follows ``frequency``; color follows
    ``trending`` on a three-stop scale whose domain is (min, 0, max) of the
    non-null ``trending`` values.

    Returns the resulting Altair chart.
    """
    import altair as alt

    viridis = ['#440154', '#472c7a', '#3b518b', '#2c718e', '#21908d', '#27ad81', '#5cc863', '#aadc32', '#fde725']

    # Rows containing NaNs are ignored when looking up the trend extremes.
    trending = display_df.dropna().trending
    low_color, mid_color, high_color = viridis[0], viridis[len(viridis) // 2], viridis[-1]
    color_scale = alt.Scale(
        domain=(trending.min(), 0, trending.max()),
        range=[low_color, mid_color, high_color],
    )

    bubbles = alt.Chart(display_df).mark_circle()
    return bubbles.encode(
        alt.X('variable'),
        alt.Y('term'),
        size='frequency',
        color=alt.Color('trending:Q', scale=color_scale),
    )
Example #2
Source File: core.py    From starborn with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def violinplot(x=None, y=None, data=None, orient=None):
    """Draw a violin-like plot as centered, stacked area charts of binned counts.

    With ``orient`` of ``None`` or ``'v'`` the values of ``y`` are binned along
    the y axis and one column facet is drawn per value of ``x``; for any other
    ``orient`` the roles are mirrored (``x`` binned, one row facet per ``y``).
    """
    # TODO: automatically infer orientation

    # Settings shared by whichever axis carries the record count.
    count_options = dict(
        axis=alt.Axis(grid=False, labels=False),
        stack='center',
        title='',
    )
    bins = alt.Bin(maxbins=100)

    if orient is None or orient == 'v':
        group_field = '{x}:N'.format(x=x)
        encodings = dict(
            x=alt.X('count(*):Q', **count_options),
            y=alt.Y('{y}:Q'.format(y=y), bin=bins),
            column=group_field,
            color=group_field,
        )
    else:
        group_field = '{y}:N'.format(y=y)
        encodings = dict(
            y=alt.Y('count(*):Q', **count_options),
            x=alt.X('{x}:Q'.format(x=x), bin=bins),
            row=group_field,
            color=group_field,
        )

    return alt.Chart(data).mark_area().encode(**encodings)
Example #3
Source File: plot.py    From retentioneering-tools with Mozilla Public License 2.0 6 votes vote down vote up
def altair_step_matrix(diff, plot_name=None, title='', vmin=None, vmax=None, font_size=12, **kwargs):
    """Render ``diff`` as an annotated heatmap.

    ``diff`` is melted into long form (columns ``y``, ``x``, ``z``), drawn as
    colored rectangles using the ``blues`` scheme and overlaid with the cell
    values as text. The chart size is derived from ``font_size`` and the shape
    of ``diff``.

    ``title``, ``vmin``, ``vmax`` and ``**kwargs`` are accepted but not used
    by this function.

    Returns a 4-tuple: (layered chart, ``plot_name``, ``None``,
    ``diff.retention.retention_config``).
    """
    long_form = diff.reset_index().melt('index')
    long_form.columns = ['y', 'x', 'z']

    # Shared positional encoding; sort=None keeps the data order on both axes.
    base = alt.Chart(long_form).encode(
        x=alt.X('x:O', sort=None),
        y=alt.Y('y:O', sort=None),
    )

    cell_color = alt.Color('z:Q', scale=alt.Scale(scheme='blues'))
    cells = base.mark_rect().encode(color=cell_color)

    # Black labels for |z| < 0.8, white labels otherwise.
    label_color = alt.condition(
        abs(alt.datum.z) < 0.8,
        alt.value('black'),
        alt.value('white'),
    )
    labels = base.mark_text(align='center', fontSize=font_size).encode(
        text='z',
        color=label_color,
    )

    n_rows = diff.shape[0]
    n_cols = len(diff.columns)
    layered = (cells + labels).properties(
        width=3 * font_size * n_cols,
        height=2 * font_size * n_rows,
    )
    return layered, plot_name, None, diff.retention.retention_config
Example #4
Source File: core.py    From starborn with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def pairplot(data, hue=None, vars=None):
    """Draw a grid of pairwise scatter plots for the given columns.

    Parameters
    ----------
    data
        Data source handed to ``alt.Chart``.
    hue
        Optional column name used as a nominal color channel. When omitted
        (or otherwise falsy) no color encoding is added.
    vars
        Column names to plot against each other; defaults to every column
        of ``data``.

    Returns
    -------
    The repeated chart, one 250x250 panel per (row, column) pair of ``vars``.
    """
    if vars is None:
        vars = list(data.columns)

    # Only bind a color channel when a hue column was requested; formatting
    # hue=None would otherwise point the encoding at a literal "None" field.
    color_args = {'color': '{hue}:N'.format(hue=hue)} if hue else {}

    chart = alt.Chart(data).mark_circle().encode(
                alt.X(alt.repeat("column"), type='quantitative'),
                alt.Y(alt.repeat("row"), type='quantitative'),
                **color_args
            ).properties(
                width=250,
                height=250
            ).repeat(
                row=vars,
                column=vars
            )
    return chart
Example #5
Source File: core.py    From starborn with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def scatterplot(x, y, data, hue=None, xlim=None, ylim=None):
    """Scatter ``data[x]`` against ``data[y]`` as circles.

    Axis limits default to ``get_limit_tuple`` of the respective column when
    not supplied. A nominal color channel is added only when ``hue`` is truthy.
    """
    # TODO: refactor so it uses category_chart_kwargs?
    xlim = get_limit_tuple(data[x]) if xlim is None else xlim
    ylim = get_limit_tuple(data[y]) if ylim is None else ylim

    extra_encodings = {}
    if hue:
        extra_encodings['color'] = '{hue}:N'.format(hue=hue)

    return alt.Chart(data).mark_circle().encode(
        alt.X(x, scale=alt.Scale(domain=xlim)),
        alt.Y(y, scale=alt.Scale(domain=ylim)),
        **extra_encodings
    )
Example #6
Source File: _core.py    From altair_pandas with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def hist(self, bins=None, orientation="vertical", **kwargs):
    """Return a bar-chart histogram of the first column of the preprocessed data.

    ``bins`` may be ``None`` (default binning), an int (used as ``maxbins``)
    or anything else, which is forwarded unchanged as the ``bin`` argument.
    ``orientation`` selects which axis carries the binned values.

    Raises ``ValueError`` if ``orientation`` is neither ``'vertical'`` nor
    ``'horizontal'``.
    """
    frame = self._preprocess_data(with_index=False)
    field = frame.columns[0]

    if bins is None:
        bins = True
    elif isinstance(bins, int):
        bins = alt.Bin(maxbins=bins)

    if orientation not in ("vertical", "horizontal"):
        raise ValueError("orientation must be 'horizontal' or 'vertical'.")

    # Vertical bars bin along x; horizontal bars swap the two channels.
    independent, dependent = alt.X, alt.Y
    if orientation == "horizontal":
        independent, dependent = dependent, independent

    mark_def = self._get_mark_def({"type": "bar", "orient": orientation}, kwargs)
    binned_axis = independent(field, title=None, bin=bins)
    count_axis = dependent("count()", title="Frequency")
    return alt.Chart(frame, mark=mark_def).encode(binned_axis, count_axis)
Example #7
Source File: explore.py    From gobbli with Apache License 2.0 5 votes vote down vote up
def st_heatmap(
    heatmap_df: pd.DataFrame, x_col_name: str, y_col_name: str, color_col_name: str
):
    """Display a 700x700 rect-mark heatmap of ``heatmap_df`` via Streamlit.

    The three column names select the x position, y position and fill color
    of each cell respectively.
    """
    canvas = alt.Chart(heatmap_df, height=700, width=700)
    channels = (alt.X(x_col_name), alt.Y(y_col_name), alt.Color(color_col_name))
    st.altair_chart(canvas.mark_rect().encode(*channels))
Example #8
Source File: core.py    From starborn with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def heatmap(data, vmin=None, vmax=None, annot=None, fmt='.2g'):
    """Draw a matrix as a heatmap, optionally annotated with the cell values.

    Parameters
    ----------
    data
        A ``pd.DataFrame``, or anything ``np.asarray`` can turn into a 2-D
        matrix (it is then wrapped in a DataFrame).
    vmin, vmax
        Accepted but not used by this function.
    annot
        When truthy, overlay each cell with its value as text.
    fmt
        Number format applied to the annotation text.

    Returns
    -------
    The rect-mark chart, layered with the text chart when ``annot`` is truthy.
    """
    # We always want to have a DataFrame with semantic information
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(np.asarray(data))

    melted = data.stack().reset_index(name='Value')

    # Use the axis names when they exist. Unnamed axes (always the case for
    # array input) are given generated column names by reset_index(), so read
    # those back from the melted frame instead of formatting None into a
    # non-existent "None" field.
    y = data.index.name if data.index.name is not None else melted.columns[0]
    x = data.columns.name if data.columns.name is not None else melted.columns[-2]

    x_field = '{x}:O'.format(x=x)
    y_field = '{y}:O'.format(y=y)

    heatmap = alt.Chart(melted).mark_rect().encode(
        alt.X(x_field, scale=alt.Scale(paddingInner=0)),
        alt.Y(y_field, scale=alt.Scale(paddingInner=0)),
        color='Value:Q'
    )

    if not annot:
        return heatmap

    # Overlay text; values above the hard-coded threshold of 70 are drawn in
    # black, all others in white.
    text = alt.Chart(melted).mark_text(baseline='middle').encode(
        x=x_field,
        y=y_field,
        text=alt.Text('Value', format=fmt),
        color=alt.condition(alt.expr.datum['Value'] > 70,
                            alt.value('black'),
                            alt.value('white'))
    )
    return heatmap + text
Example #9
Source File: core.py    From starborn with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def jointplot(x, y, data, kind='scatter', hue=None, xlim=None, ylim=None):
    """Draw a scatter plot of ``x`` vs ``y`` with marginal histograms.

    Parameters
    ----------
    x, y
        Column names of ``data`` to plot.
    data
        Data source handed to ``alt.Chart``.
    kind
        Accepted but not used by this function.
    hue
        Optional column name used as a nominal color channel in all three
        panels. When omitted (or otherwise falsy) no color encoding is added.
    xlim, ylim
        Axis limits; default to ``get_limit_tuple`` of the respective column.

    Returns
    -------
    A compound chart: the x histogram stacked above the scatter plot, with
    the y histogram to the right of the scatter plot.
    """
    if xlim is None:
        xlim = get_limit_tuple(data[x])
    if ylim is None:
        ylim = get_limit_tuple(data[y])
    xscale = alt.Scale(domain=xlim)
    yscale = alt.Scale(domain=ylim)

    points = scatterplot(x, y, data, hue=hue, xlim=xlim, ylim=ylim)

    area_args = {'opacity': .3, 'interpolate': 'step'}

    blank_axis = alt.Axis(title='')

    def color_channels():
        # Only bind a color channel when a hue column was requested;
        # formatting hue=None would otherwise reference a literal "None"
        # field in the marginal histograms.
        if not hue:
            return []
        return [alt.Color('{hue}:N'.format(hue=hue))]

    top_hist = alt.Chart(data).mark_area(**area_args).encode(
        alt.X('{x}:Q'.format(x=x),
              # when using bins, the axis scale is set through
              # the bin extent, so we do not specify the scale here
              # (which would be ignored anyway)
              bin=alt.Bin(maxbins=20, extent=xscale.domain),
              stack=None,
              axis=blank_axis,
             ),
        alt.Y('count()', stack=None, axis=blank_axis),
        *color_channels()
    ).properties(height=60)

    right_hist = alt.Chart(data).mark_area(**area_args).encode(
        alt.Y('{y}:Q'.format(y=y),
              bin=alt.Bin(maxbins=20, extent=yscale.domain),
              stack=None,
              axis=blank_axis,
             ),
        alt.X('count()', stack=None, axis=blank_axis),
        *color_channels()
    ).properties(width=60)

    return top_hist & (points | right_hist)
Example #10
Source File: _core.py    From altair_pandas with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _xy(self, mark, **kwargs):
    """Return an interactive chart of the first two preprocessed columns.

    The first column of the preprocessed frame goes on x and the second on y
    (both without axis titles); every column is exposed in the tooltip.
    """
    frame = self._preprocess_data(with_index=True)
    mark_def = self._get_mark_def(mark, kwargs)
    base = alt.Chart(frame, mark=mark_def)

    x_field = frame.columns[0]
    y_field = frame.columns[1]
    encoded = base.encode(
        x=alt.X(x_field, title=None),
        y=alt.Y(y_field, title=None),
        tooltip=list(frame.columns),
    )
    return encoded.interactive()
Example #11
Source File: _core.py    From altair_pandas with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def box(self, vert=True, **kwargs):
    """Return a box plot with one box per column of the preprocessed data.

    All columns are folded into ``column``/``value`` pairs. With ``vert``
    truthy the column names are on x and the values on y; otherwise the two
    encodings are swapped. ``**kwargs`` is accepted but not used here.
    """
    frame = self._preprocess_data(with_index=False)
    folded = alt.Chart(frame).transform_fold(
        list(frame.columns), as_=["column", "value"]
    )
    chart = folded.mark_boxplot().encode(
        x=alt.X("column:N", title=None),
        y="value:Q",
    )
    if vert:
        return chart

    # Horizontal layout: exchange the x and y channel definitions in place.
    chart.encoding.x, chart.encoding.y = chart.encoding.y, chart.encoding.x
    return chart
Example #12
Source File: explore.py    From gobbli with Apache License 2.0 5 votes vote down vote up
def show_document_length_distribution(tokens: List[List[str]]):
    """Show a Streamlit histogram of per-document lengths.

    Lengths come from ``get_document_lengths(tokens)`` and are binned into at
    most 30 bins on the x axis, with the record count on the y axis.
    """
    st.header("Document Length Distribution")

    lengths_df = pd.DataFrame({"Token Count": get_document_lengths(tokens)})
    bars = alt.Chart(lengths_df, height=500, width=700).mark_bar()
    histogram = bars.encode(
        alt.X("Token Count", bin=alt.Bin(maxbins=30)),
        alt.Y("count()", type="quantitative"),
    )

    st.altair_chart(histogram)
Example #13
Source File: _core.py    From altair_pandas with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def hist(self, bins=None, stacked=None, orientation="vertical", **kwargs):
    """Return overlaid (or faceted) histograms, one per preprocessed column.

    All columns are folded into ``column``/``value`` pairs; ``value`` is
    binned and colored by ``column``. ``bins`` may be ``None`` (default
    binning), an int (used as ``maxbins``) or any other value forwarded as
    the ``bin`` argument. ``stacked`` is forwarded as the ``stack`` option of
    the count axis.

    If ``kwargs`` contains a truthy ``subplots`` entry the chart is faceted
    by column, with the number of facet columns taken from
    ``_get_layout`` applied to ``kwargs['layout']`` (default ``(-1, 1)``).

    Raises ``ValueError`` if ``orientation`` is neither ``'vertical'`` nor
    ``'horizontal'``.
    """
    frame = self._preprocess_data(with_index=False)

    if bins is None:
        bins = True
    elif isinstance(bins, int):
        bins = alt.Bin(maxbins=bins)

    if orientation not in ("vertical", "horizontal"):
        raise ValueError("orientation must be 'horizontal' or 'vertical'.")

    # Vertical bars bin along x; horizontal bars swap the two channels.
    independent, dependent = alt.X, alt.Y
    if orientation == "horizontal":
        independent, dependent = dependent, independent

    mark_def = self._get_mark_def({"type": "bar", "orient": orientation}, kwargs)
    folded = alt.Chart(frame, mark=mark_def).transform_fold(
        list(frame.columns), as_=["column", "value"]
    )
    chart = folded.encode(
        independent("value:Q", title=None, bin=bins),
        dependent("count()", title="Frequency", stack=stacked),
        color="column:N",
    )

    if not kwargs.get("subplots"):
        return chart

    # Only the column count of the computed layout is needed for faceting.
    _nrows, ncols = _get_layout(frame.shape[1], kwargs.get("layout", (-1, 1)))
    faceted = chart.encode(facet=alt.Facet("column:N", title=None))
    return faceted.properties(columns=ncols)
Example #14
Source File: _core.py    From altair_pandas with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def hist_frame(self, column=None, layout=(-1, 2), **kwargs):
    """Return a repeated chart with one histogram per numeric column.

    ``column`` may be a single column name or a collection of names; a single
    string is wrapped in a list before being passed on as ``usecols``. The
    number of chart columns is taken from ``_get_layout`` applied to
    ``layout``.
    """
    # A lone column name is normalised to a one-element list.
    if isinstance(column, str):
        column = [column]

    frame = self._preprocess_data(with_index=False, usecols=column)
    frame = frame._get_numeric_data()
    _nrows, ncols = _get_layout(frame.shape[1], layout)

    base = alt.Chart(frame, mark=self._get_mark_def("bar", kwargs))
    single = base.encode(
        x=alt.X(alt.repeat("repeat"), type="quantitative", bin=True),
        y=alt.Y("count()", title="Frequency"),
    )
    return single.repeat(repeat=list(frame.columns), columns=ncols)
Example #15
Source File: _core.py    From pdvega with MIT License 5 votes vote down vote up
def _x(x, df, ordinal_threshold=6, **kwargs):
    """Build an ``alt.X`` channel for column ``x`` of ``df``.

    The channel type is whatever ``infer_vegalite_type`` reports for the
    column; any extra keyword arguments are forwarded to ``alt.X``.
    """
    inferred_type = infer_vegalite_type(df[x], ordinal_threshold=ordinal_threshold)
    return alt.X(field=x, type=inferred_type, **kwargs)
Example #16
Source File: app.py    From demo-self-driving with Apache License 2.0 5 votes vote down vote up
def frame_selector_ui(summary):
    """Render the frame-selection widgets in the Streamlit sidebar.

    Returns ``(selected_frame_index, selected_frame)``, or ``(None, None)``
    when no frame matches the chosen object type and count range.
    """
    sidebar = st.sidebar
    sidebar.markdown("# Frame")

    # Pick the object type to search for among the columns of `summary`
    # (index 2 is passed as the initial selection).
    object_type = sidebar.selectbox("Search for which objects?", summary.columns, 2)

    # Pick the range for how many of the selected object should be present.
    min_elts, max_elts = sidebar.slider(
        "How many %ss (select a range)?" % object_type, 0, 25, [10, 20]
    )
    selected_frames = get_selected_frames(summary, object_type, min_elts, max_elts)
    if len(selected_frames) == 0:
        return None, None

    # Pick one frame out of the matching frames.
    last_index = len(selected_frames) - 1
    selected_frame_index = sidebar.slider("Choose a frame (index)", 0, last_index, 0)

    # Area chart of the object count per matching frame...
    objects_per_frame = (
        summary.loc[selected_frames, object_type]
        .reset_index(drop=True)
        .reset_index()
    )
    counts_chart = alt.Chart(objects_per_frame, height=120).mark_area().encode(
        alt.X("index:Q", scale=alt.Scale(nice=False)),
        alt.Y("%s:Q" % object_type),
    )

    # ...with a red rule marking the currently selected frame.
    marker_df = pd.DataFrame({"selected_frame": [selected_frame_index]})
    marker = alt.Chart(marker_df).mark_rule(color="red").encode(
        alt.X("selected_frame:Q", axis=None)
    )
    sidebar.altair_chart(alt.layer(counts_chart, marker))

    return selected_frame_index, selected_frames[selected_frame_index]

# Select frames based on the selection in the sidebar 
Example #17
Source File: _misc.py    From altair_pandas with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def scatter_matrix(
    df,
    color: Union[str, None] = None,
    alpha: float = 1.0,
    tooltip: Union[List[str], tooltipList, None] = None,
    **kwargs
) -> alt.Chart:
    """Plot a scatter matrix.

    At the moment supports neither histograms nor KDE on the diagonal;
    plain scatterplots of each column against itself are drawn instead.
    The result is interactive and has a customizable tooltip.

    Parameters
    ----------
    df : DataFrame
        DataFrame to be used for the scatterplot. Only numeric columns will
        be included.
    color : string [optional]
        Can be a column name or a specific color value (hex, webcolors).
    alpha : float
        Opacity of the markers, within [0,1]
    tooltip: list [optional]
        List of specific column names or alt.Tooltip objects. If None
        (default), will show all columns.
    **kwargs
        Only ``colormap`` is read: when ``color`` names a column, it is used
        as the color scheme of that channel.

    Returns
    -------
    The repeated, interactive chart with one 150x150 panel per pair of
    numeric columns.
    """
    dfc = _preprocess_data(df)
    tooltip = _process_tooltip(tooltip) or dfc.columns.tolist()
    cols = dfc._get_numeric_data().columns.tolist()

    chart = (
        alt.Chart(dfc)
        .mark_circle()
        .encode(
            x=alt.X(alt.repeat("column"), type="quantitative"),
            # The y channel must be built with alt.Y, not alt.X.
            y=alt.Y(alt.repeat("row"), type="quantitative"),
            opacity=alt.value(alpha),
            tooltip=tooltip,
        )
        .properties(width=150, height=150)
    )

    if color:
        color = str(color)

        if color in dfc:
            # A column name: color by that field, optionally with a scheme.
            color = alt.Color(color)
            if "colormap" in kwargs:
                color.scale = alt.Scale(scheme=kwargs.get("colormap"))
        else:
            # Anything else is treated as a constant color value.
            color = alt.value(color)
        chart = chart.encode(color=color)

    return chart.repeat(row=cols, column=cols).interactive()
Example #18
Source File: rewrite.py    From errudite with GNU General Public License v2.0 4 votes vote down vote up
def visualize_delta_confidence_per_model(self, 
        instance_hash: Dict[InstanceKey, Instance]={},
        instance_hash_rewritten: Dict[InstanceKey, Instance]={},
        filtered_instances: List[InstanceKey]=None,
        model: str=None):
    """
    Plot a histogram of the change in model confidence caused by this rewrite.

    The histogram depends on the model, so a separate chart is produced
    for each model.

    Parameters
    ----------
    instance_hash : Dict[InstanceKey, Instance]
        The *original* instances, indexed by instance key. By default {},
        which resolves to ``Instance.instance_hash``.
    instance_hash_rewritten : Dict[InstanceKey, Instance]
        The *rewritten* instances, indexed by instance key. By default {},
        which resolves to ``Instance.instance_hash_rewritten``.
    filtered_instances : List[InstanceKey], optional
        If given, only the distribution of these instances is displayed,
        by default None
    model : str, optional
        The selected model, by default ``None``, which resolves through
        ``Instance.resolve_default_model``.

    Returns
    -------
    alt.Chart
        An altair chart object.
    """
    model = Instance.resolve_default_model(model)
    instance_hash = instance_hash or Instance.instance_hash
    instance_hash_rewritten = instance_hash_rewritten or Instance.instance_hash_rewritten

    # Restrict to the unique question ids of the selected instances, if any.
    qids = None
    if filtered_instances:
        qids = list(np.unique([i.qid for i in filtered_instances]))

    performance = Rewrite.get_delta_performance(self,
        qids, instance_hash, instance_hash_rewritten, model)
    records = [{"delta_confidence": delta} for delta in performance['delta_confidences']]

    bars = alt.Chart(pd.DataFrame(records)).mark_bar()
    histogram = bars.encode(
        y=alt.Y('count()'),
        x=alt.X('delta_confidence:Q', bin=True)
    )
    return histogram.properties(width=150, height=100, title=f'{self.rid} on {model}')
Example #19
Source File: rewrite.py    From errudite with GNU General Public License v2.0 4 votes vote down vote up
def visualize_models(self, 
        instance_hash: Dict[InstanceKey, Instance]={},
        instance_hash_rewritten: Dict[InstanceKey, Instance]={},
        filtered_instances: List[InstanceKey]=None,
        models: List[str]=[]):
    """
    Visualize the rewrite distribution. 
    It's a one-bar histogram that displays the count of instances rewritten, and
    the proportion of "flip_to_correct", "flip_to_incorrect", "unflip".
    Because of the flipping proportion, this histogram is different
    for each different model. 

    Parameters
    ----------
    instance_hash : Dict[InstanceKey, Instance]
        A dict that saves all the *original* instances, by default {}. 
        It is indexed by the corresponding instance keys.
        If ``{}``, resolve to ``Instance.instance_hash``.
    instance_hash_rewritten : Dict[InstanceKey, Instance]
        A dict that saves all the *rewritten* instances, by default {}. 
        It is indexed by the corresponding instance keys.
        If ``{}``, resolve to ``Instance.instance_hash_rewritten``.
    filtered_instances : List[InstanceKey], optional
        A selected list of instances. If given, only display the distribution
        of the selected instances, by default None
    models : List[str], optional
        A list of models, with the bars for each model stacked vertically.
        By default []. If [], resolve to the default model returned by
        ``Instance.resolve_default_model(None)``.

    Returns
    -------
    alt.Chart
        An altair chart object. 
    """
    # The default arguments are never mutated; empty values fall back to the
    # class-level stores.
    instance_hash = instance_hash or Instance.instance_hash
    instance_hash_rewritten = instance_hash_rewritten or Instance.instance_hash_rewritten
    models = models or [ Instance.resolve_default_model(None) ]

    # One row per (model, flip category) with its instance count.
    output = []
    for model in models:
        data = self.serialize(instance_hash, instance_hash_rewritten, filtered_instances, model)
        for flip, count in data["counts"].items():
            output.append({
                "flip": flip,
                "count": count,
                "model": model
            })
    df = pd.DataFrame(output)
    chart = alt.Chart(df).mark_bar().encode(
        y=alt.Y('model:N'),
        x=alt.X('count:Q', stack="zero"),
        color=alt.Color('flip:N', scale=alt.Scale(
            range=["#1f77b4", "#ff7f0e", "#c7c7c7"],
            domain=["flip_to_correct", "flip_to_incorrect", "unflip"])),
        # The frame has no "correctness" column; the category shown in the
        # tooltip is the same "flip" field that drives the color.
        tooltip=['model:N', 'count:Q', 'flip:N']
    ).properties(width=100)
    return chart
Example #20
Source File: group.py    From errudite with GNU General Public License v2.0 4 votes vote down vote up
def visualize_models(self, 
        instance_hash: Dict[InstanceKey, Instance]={},
        instance_hash_rewritten: Dict[InstanceKey, Instance]={},
        filtered_instances: List[InstanceKey]=None,
        models: List[str]=[]):
    """
    Plot the group distribution as one stacked bar per model.

    Each bar shows the number of instances in the group, split by the
    correctness of the model's predictions, so the chart differs from
    model to model.

    Parameters
    ----------
    instance_hash : Dict[InstanceKey, Instance]
        The *original* instances, indexed by instance key. By default {},
        which resolves to ``Instance.instance_hash``.
    instance_hash_rewritten : Dict[InstanceKey, Instance]
        The *rewritten* instances, indexed by instance key. By default {},
        which resolves to ``Instance.instance_hash_rewritten``.
    filtered_instances : List[InstanceKey], optional
        If given, only the distribution of these instances is displayed,
        by default None
    models : List[str], optional
        The models to show, one bar each. By default [], which resolves to
        the single model returned by ``Instance.resolve_default_model(None)``.

    Returns
    -------
    alt.Chart
        An altair chart object.
    """
    instance_hash = instance_hash or Instance.instance_hash
    instance_hash_rewritten = instance_hash_rewritten or Instance.instance_hash_rewritten
    models = models or [ Instance.resolve_default_model(None) ]

    # One record per (model, correctness category) with its instance count.
    records = []
    for model in models:
        serialized = self.serialize(instance_hash, instance_hash_rewritten, filtered_instances, model)
        records.extend(
            {"correctness": correctness, "count": count, "model": model}
            for correctness, count in serialized["counts"].items()
        )

    correctness_color = alt.Color(
        'correctness:N', scale=alt.Scale(domain=["correct", "incorrect"]))
    bars = alt.Chart(pd.DataFrame(records)).mark_bar().encode(
        y=alt.Y('model:N'),
        x=alt.X('count:Q', stack="zero"),
        color=correctness_color,
        tooltip=['model:N', 'count:Q', 'correctness:N']
    )
    return bars.properties(width=100)
Example #21
Source File: explore.py    From gobbli with Apache License 2.0 4 votes vote down vote up
def show_label_distribution(
    sample_labels: Union[List[str], List[List[str]]],
    all_labels: Optional[Union[List[str], List[List[str]]]] = None,
):
    """Show a Streamlit bar chart of label proportions.

    Nothing is rendered when ``sample_labels`` is ``None``. With only
    ``sample_labels`` a single bar chart of proportion per label is drawn.
    When ``all_labels`` is also given, the sample is shown side by side with
    the full set, one chart column per label.
    """
    if sample_labels is None:
        return

    st.header("Label Distribution")
    label_counts = _collect_label_counts(sample_labels)

    if all_labels is None:
        bars = alt.Chart(label_counts, height=500, width=700).mark_bar()
        label_chart = bars.encode(
            alt.X("Label", type="nominal"),
            alt.Y("Proportion", type="quantitative"),
        )
    else:
        # Tag each frame with its origin before stacking them together.
        label_counts["Label Set"] = "Sample"
        all_label_counts = _collect_label_counts(all_labels)
        all_label_counts["Label Set"] = "All Documents"
        label_counts = pd.concat([label_counts, all_label_counts])

        label_set_axis = alt.X(
            "Label Set",
            type="nominal",
            title=None,
            sort=["Sample", "All Documents"],
        )
        label_columns = alt.Column(
            "Label", type="nominal", header=alt.Header(labelAngle=0)
        )
        bars = alt.Chart(label_counts, width=100).mark_bar()
        label_chart = bars.encode(
            label_set_axis,
            alt.Y("Proportion", type="quantitative"),
            label_columns,
            alt.Color("Label Set", type="nominal", legend=None),
        )

    st.altair_chart(label_chart)
Example #22
Source File: evaluate.py    From gobbli with Apache License 2.0 4 votes vote down vote up
def errors_report(self, k: int = 10) -> str:
    """
    Args:
      k: The number of results to return for each of false positives and false negatives.

    Returns:
      A nicely-formatted human-readable report describing the biggest mistakes made by
      the classifier for each class.
    """

    def make_errors_str(label, class_errors: List[ClassificationError]) -> str:
        """Format the given errors for one label, one entry per error."""
        if self.multilabel:
            return "\n".join(
                (
                    f"Correct Value: {label in e.y_true}\n"
                    # The trailing newline keeps the probability and the
                    # text on separate lines.
                    f"Predicted Probability: {e.y_pred_proba[label]}\n"
                    f"Text: {truncate_text(escape_line_delimited_text(e.X), 500)}\n"
                )
                for e in class_errors
            )
        return "\n".join(
            (
                f"True Class: {e.y_true}\n"
                f"Predicted Class: {e.y_pred} (Probability: {e.y_pred_proba[e.y_pred]})\n"
                f"Text: {truncate_text(escape_line_delimited_text(e.X), 500)}\n"
            )
            for e in class_errors
        )

    header_name = "CLASS" if self.multilabel else "LABEL"

    # Collect the report sections and join once at the end.
    sections = ["Errors Report\n" "------------\n\n"]

    for label, (false_positives, false_negatives) in self.errors(k=k).items():
        # An empty list of errors formats to "", which is reported as "None".
        false_positives_str = make_errors_str(label, false_positives) or "None"
        false_negatives_str = make_errors_str(label, false_negatives) or "None"

        sections.append(
            " -------\n"
            f"| {header_name}: {label}\n"
            " -------\n\n"
            "False Positives\n"
            "***************\n\n"
            f"{false_positives_str}\n\n"
            "False Negatives\n"
            "***************\n\n"
            f"{false_negatives_str}\n\n"
        )

    return "".join(sections)
Example #23
Source File: evaluate.py    From gobbli with Apache License 2.0 4 votes vote down vote up
def errors_for_label(self, label: str, k: int = 10):
    """
    Collect the classifier's biggest mistakes for one label.

    Args:
      label: The label to return errors for.
      k: The number of results to return for each of false positives and false negatives.

    Returns:
      A 2-tuple.  The first element is a list of the top ``k`` false positives, and the
      second element is a list of the top ``k`` false negatives.
    """
    predicted = self.y_pred_multilabel[label].astype("bool")
    actual = self.y_true_multilabel[label].astype("bool")

    def worst_rows(mask, ascending: bool) -> pd.DataFrame:
        # Keep the k rows selected by `mask` with the most extreme predicted
        # probability for `label`, in the requested sort direction.
        selected = self.y_pred_proba.loc[mask]
        return selected.sort_values(by=label, ascending=ascending).iloc[:k]

    # False positives: highest predicted probability first.
    false_positives = worst_rows(predicted & ~actual, ascending=False)
    # False negatives: lowest predicted probability first.
    false_negatives = worst_rows(~predicted & actual, ascending=True)

    def create_classification_errors(
        y_pred_proba: pd.DataFrame,
    ) -> List[ClassificationError]:
        return [
            ClassificationError(
                X=self.X[ndx],
                y_true=self.y_true[ndx],
                y_pred_proba=row.to_dict(),
            )
            for ndx, row in y_pred_proba.iterrows()
        ]

    return (
        create_classification_errors(false_positives),
        create_classification_errors(false_negatives),
    )