Source code for lens.explorer

"""Explore a Summary"""

import sys
import logging

import numpy as np
import matplotlib.pyplot as plt

import plotly.tools
import plotly.offline as py

from lens.summarise import Summary
from lens.formatting import JupyterTable
from lens.plotting import (plot_distribution, plot_pairdensity,
                           plot_correlation, plot_cdf)

# Module-level logger; a stream handler is attached when the module is loaded.
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())

# A loaded ``ipykernel`` module is taken as the sign that we are running in a
# notebook. The Jupyter console loads it too, so this gives a false positive
# there.
IN_NOTEBOOK = 'ipykernel' in sys.modules

# Keyword arguments passed to ``plotly.tools.mpl_to_plotly``.
PLOTLY_TO_MPL_KWS = dict(strip_style=True, resize=True)

# Keyword arguments passed to ``plotly.offline.iplot``.
PLOTLY_KWS = dict(show_link=False)


def _render(fig, showlegend=None):
    """Plot a matplotlib or plotly figure in a Jupyter notebook.

    Parameters
    ----------
    fig : matplotlib or plotly figure
        Figure to display. A matplotlib ``Figure`` is converted to a plotly
        figure with ``plotly.tools.mpl_to_plotly`` before being shown.
    showlegend : bool, optional
        If given, overrides the ``showlegend`` entry of the figure layout.

    Returns
    -------
    The return value of ``plotly.offline.iplot``.

    Raises
    ------
    ValueError
        If the module was not loaded from within a notebook kernel.
    """
    # Fail fast: converting the figure is wasted work if it cannot be shown.
    if not IN_NOTEBOOK:
        message = 'Lens explorer can only plot in a Jupyter notebook'
        logger.error(message)
        raise ValueError(message)

    if isinstance(fig, plt.Figure):
        fig = plotly.tools.mpl_to_plotly(fig, **PLOTLY_TO_MPL_KWS)

    if showlegend is not None:
        fig.layout['showlegend'] = showlegend

    # The initialisation flag is a private plotly attribute. Read it
    # defensively so that a missing flag triggers initialisation instead of
    # an AttributeError.
    if not getattr(py.offline, '__PLOTLY_OFFLINE_INITIALIZED', False):
        py.init_notebook_mode()
    return py.iplot(fig, **PLOTLY_KWS)


class Explorer(object):
    """An explorer to visualise a Lens Summary

    Once a Lens ``Summary`` has been generated with
    :func:`lens.summarise.summarise`, this class provides the methods necessary
    to explore the summary through tables and plots. It is best used from
    within a Jupyter notebook.

    Parameters
    ----------
    summary : Summary
        The Lens summary to explore.
    plot_renderer : callable, optional
        Function called with the figure produced by each plotting method.
        Defaults to the module-level ``_render`` function.

    Raises
    ------
    TypeError
        If ``summary`` is not a ``Summary`` instance.
    """
    # Number of points to show in the CDF plot
    _N_cdf = 1000

    def __init__(self, summary, plot_renderer=_render):
        if not isinstance(summary, Summary):
            raise TypeError('Can only explore a lens Summary')
        self.summary = summary
        self.plot_renderer = plot_renderer

    def describe(self):
        """General description of the dataset.

        Produces a table including the following information about each column:

        ``desc``
            the type of data: currently ``categorical`` or ``numeric``.
            Lens will calculate different quantities for this column
            depending on the value of ``desc``.

        ``dtype``
            the type of data in Pandas.

        ``name``
            column name

        ``notnulls``
            number of non-null values in the column

        ``nulls``
            number of null-values in the column

        ``unique``
            number of unique values in the column

        Returns
        -------
        JupyterTable
            Table with one table column per dataset column and one row per
            quantity.
        """
        summary = self.summary
        columns = summary.columns

        header = ['']
        header.extend(columns)

        desc = ['desc']
        desc.extend(summary._desc(column) for column in columns)

        # Fetch each column summary once and reuse it for every table row.
        column_summaries = [summary.summary(column) for column in columns]

        rows = [header, desc]
        for key in ('dtype', 'notnulls', 'nulls', 'unique'):
            rows.append(
                [key] + [col_summary[key] for col_summary in column_summaries])

        return JupyterTable(rows)

    def column_details(self, column, sort=False):
        """Show type-specific column details.

        For numeric columns, this method produces a table with summary
        statistics, including minimum, maximum, mean, and median. For
        categorical columns, it produces a frequency table for each category
        sorted in descending order of frequency. For any other column type,
        only the name, description and dtype are shown.

        Parameters
        ----------
        column : str
            Name of the column.
        sort : boolean, optional
            Sort frequency tables in categorical variables by
            category name instead of by descending frequency.

        Returns
        -------
        JupyterTable
            Table with the details of the column.
        """
        details = self.summary.details(column)
        desc = details['desc']

        if desc == 'numeric':
            data = [['', details['name']], ['desc', details['desc']],
                    ['dtype', self.summary.summary(column)['dtype']],
                    ['min', details['min']], ['max', details['max']],
                    ['mean', details['mean']], ['median', details['median']],
                    ['std', details['std']], ['sum', details['sum']],
                    ['IQR', details['iqr']]]
            return JupyterTable(data)

        if desc == 'categorical':
            caption = '<p>desc: {}, dtype: {}</p>'.format(
                details['desc'], self.summary.summary(column)['dtype'])
            frequencies = [[item, frequency] for item, frequency
                           in details['frequencies'].items()]
            if sort:
                # Order rows by category name.
                frequencies = sorted(frequencies, key=lambda x: x[0])
            else:
                # Order rows from most to least frequent.
                frequencies = sorted(frequencies, key=lambda x: -x[1])
            data = [['item', 'frequency']] + frequencies
        else:
            caption = ''
            data = [['', details['name']], ['desc', details['desc']],
                    ['dtype', self.summary.summary(column)['dtype']]]

        return JupyterTable(data, caption=caption)

    def distribution(self, column):
        """Show properties of the distribution of values in the column.

        Parameters
        ----------
        column : str
            Name of the column.

        Raises
        ------
        NotImplementedError
            Always; this method is not implemented.
        """
        raise NotImplementedError

    def distribution_plot(self, column, bins=None):
        """Plot the distribution of a numeric column.

        Create a plotly plot with a histogram of the values in a column. The
        number of bins in the histogram is decided according to the
        Freedman-Diaconis rule unless given by the `bins` parameter.

        Parameters
        ----------
        column : str
            Name of the column.
        bins : int, optional
            Number of bins to use for histogram. If not given, the
            Freedman-Diaconis rule will be used to estimate the best number of
            bins. This argument also accepts the formats taken by the `bins`
            parameter of matplotlib's :func:`~matplotlib.pyplot.hist`.
        """
        fig = plot_distribution(self.summary, column, bins)
        self.plot_renderer(fig)

    def cdf_plot(self, column):
        """Plot the empirical cumulative distribution function of a column.

        Creates a plotly plot with the empirical CDF of a column.

        Parameters
        ----------
        column : str
            Name of the column.
        """
        fig = plot_cdf(self.summary, column, self._N_cdf)
        self.plot_renderer(fig)

    def crosstab(self, column1, column2):
        """Show a contingency table of two categorical columns.

        Print a contingency table for two categorical variables showing the
        multivariate frequency distribution of the columns.

        Parameters
        ----------
        column1 : str
            First column.
        column2 : str
            Second column.

        Returns
        -------
        JupyterTable
            Table whose header row holds the sorted ``x`` categories of the
            pair density and whose first column holds the sorted ``y``
            categories.

        Raises
        ------
        ValueError
            If either column is not categorical.
        """
        # Validate both columns before requesting the pair details.
        for column in [column1, column2]:
            column_details = self.summary.details(column)
            if column_details['desc'] != 'categorical':
                raise ValueError('Column `{}` is not categorical'
                                 .format(column))

        pair_details = self.summary.pair_details(column1, column2)
        pairdensity = pair_details['pairdensity']

        # Convert to numpy arrays for ease of reindexing
        x = np.array(pairdensity['x'])
        y = np.array(pairdensity['y'])
        crosstab = np.array(pairdensity['density'])

        # Sort by first column category names (columns of the density array)
        idx = np.argsort(x)
        x = x[idx]
        crosstab = crosstab[:, idx]

        # Sort by second column category names (rows of the density array)
        idx = np.argsort(y)
        y = y[idx]
        crosstab = crosstab[idx]

        table = [[''] + x.tolist()]
        for y_category, crosstab_row in zip(y, crosstab):
            table.append([y_category] + crosstab_row.tolist())

        return JupyterTable(table)

    def pairwise_density_plot(self, column1, column2):
        """Plot the pairwise density between two columns.

        This plot is an approximation of a scatterplot through a 2D Kernel
        Density Estimate for two numerical variables. When one of the variables
        is categorical, a 1D KDE for each of the categories is shown,
        normalised to the total number of non-null observations. For two
        categorical variables, the plot produced is a heatmap representation of
        the contingency table.

        Parameters
        ----------
        column1 : str
            First column.
        column2 : str
            Second column.

        Raises
        ------
        ValueError
            If either column is neither numeric nor categorical.
        """
        allowed_descriptions = ['numeric', 'categorical']
        for column in [column1, column2]:
            column_description = self.summary.summary(column)['desc']
            if column_description not in allowed_descriptions:
                raise ValueError(
                    'Column {} is not numeric or categorical'.format(column))

        fig = plot_pairdensity(self.summary, column1, column2)
        self.plot_renderer(fig)

    def correlation_plot(self, include=None, exclude=None):
        """Plot the correlation matrix for numeric columns

        Plot a Spearman rank order correlation coefficient matrix showing the
        correlation between columns. The matrix is reordered to group together
        columns that have a higher correlation coefficient.  The columns to be
        plotted in the correlation plot can be selected through either the
        ``include`` or ``exclude`` keyword arguments. Only one of them can be
        given.

        Parameters
        ----------

        include : list of str
            List of columns to include in the correlation plot.
        exclude : list of str
            List of columns to exclude from the correlation plot.
        """
        fig = plot_correlation(self.summary, include, exclude)
        self.plot_renderer(fig)

    def correlation(self, include=None, exclude=None):
        """Show the correlation matrix for numeric columns.

        Print a Spearman rank order correlation coefficient matrix in tabular
        form, showing the correlation between columns. The matrix is reordered
        to group together columns that have a higher correlation coefficient.
        The columns to be shown in the table can be selected
        through either the ``include`` or ``exclude`` keyword arguments. Only
        one of them can be given.

        Parameters
        ----------

        include : list of str
            List of columns to include in the correlation table.
        exclude : list of str
            List of columns to exclude from the correlation table.

        Returns
        -------
        JupyterTable
            Table with the column names as both header row and first column.
        """
        columns, correlation_matrix = self.summary.correlation_matrix(
            include, exclude)
        # ``list`` lets any sequence of names be concatenated with the
        # empty corner cell.
        headers = [''] + list(columns)
        rows = [[column] + correlation_row.tolist()
                for column, correlation_row
                in zip(columns, correlation_matrix)]
        return JupyterTable([headers] + rows)


def explore(summary):
    """Build an :class:`Explorer` for a Lens Summary.

    Parameters
    ----------
    summary : Summary
        The Lens summary to wrap.

    Returns
    -------
    Explorer
        Explorer constructed from ``summary`` with the default plot renderer.
    """
    explorer = Explorer(summary)
    return explorer