Source code for lens.summarise

"""Summarise a Pandas DataFrame"""

import time
import os
import pandas as pd
import numpy as np
import logging
import json

import scipy.interpolate

from dask.multiprocessing import get as get_multiprocessing
from dask.threaded import get as get_threaded
from dask.local import get_sync

from .dask_graph import create_dask_graph
from .tdigest_utils import tdigest_from_centroids
from .utils import hierarchical_ordering
from .version import __version__

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())


class LensSummaryError(Exception):
    pass


class EmptyDataFrameError(Exception):
    pass


def _validate_report(report):
    """Validates a dict report"""
    columns = report['_columns']
    column_props = report['column_properties']
    num_cols = [col for col in columns if (column_props[col]['numeric'])]
    for num_col in num_cols:
        if (num_col not in report['column_summary'].keys() or
                num_col not in report['correlation']['_columns'] or
                num_col not in report['outliers'].keys()):
            raise LensSummaryError('Column `{}` is marked as numeric but '
                                   'the report lacks its numeric summary'
                                   ' and correlation'.format(num_col))

    cat_cols = [col for col in columns
                if column_props[col]['is_categorical']]
    for cat_col in cat_cols:
        if cat_col not in report['frequencies'].keys():
            raise LensSummaryError('Column `{}` is marked as categorical but '
                                   'the report lacks its frequency analysis'
                                   .format(cat_col))


[docs]class Summary(object): """A summary of a pandas DataFrame. Create a summary instance by calling :func:`lens.summarise.summarise` on a DataFrame. This calculates several quantities of interest to data scientists. The Summary object is designed for programmatic use. For more direct visual inspection, use the :class:`lens.explorer.Explorer` class in a Jupyter notebook. """ def __init__(self, report): if not isinstance(report, dict): raise TypeError('report argument must be a dict') _validate_report(report) self._report = report
[docs] @staticmethod def from_json(file): """Create a Summary from a report saved in JSON format. Parameters ---------- file : str or buffer Path to file containing the JSON report or buffer from which the report can be read. Returns ------- :class:`~lens.summarise.Summary` ``Summary`` object containing the summary in the JSON file. """ if hasattr(file, 'read'): report = json.load(file) else: with open(file, 'r') as f: report = json.load(f) return Summary(report)
[docs] def to_json(self, file=None): """Produce a JSON serialization of the report. Parameters ---------- file : str or buffer, optional File name or writeable buffer to save the JSON report. If omitted, a string containing the report will be returned. Returns ------- str JSON serialization of the summary report """ if file is None: return json.dumps(self._report, separators=(',', ':')) else: if hasattr(file, 'write'): json.dump(self._report, file, separators=(',', ':')) else: with open(file, 'w') as f: json.dump(self._report, f, separators=(',', ':'))
@property def columns(self): """Get a list of column names of the dataset. Returns ------- list Column names Examples -------- >>> summary.columns ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'] """ return self._report['_columns'] @property def rows(self): """Get the number of rows in the dataset. Returns ------- int Number of rows Examples -------- >>> summary.rows 4898 """ return self._report['row_count']['total'] @property def rows_unique(self): """Get the number of unique rows in the dataset. Returns ------- int Number of unique rows. """ return self._report['row_count']['unique'] def _desc(self, column): """Return the inferred description of a column. Parameters ---------- column : str Column name. Returns ------- str Description of the column. """ column_props = self._report['column_properties'][column] if column_props['is_categorical']: return 'categorical' elif column_props['numeric']: return 'numeric' elif column_props['is_ID']: return 'ID_like' else: return None
[docs] def summary(self, column): """Basic information about the column This returns information about the number of nulls and unique values in ``column`` as well as which type this column is. This is guaranteed to return a dictionary with the same keys for every column. The dictionary contains the following keys: ``desc`` the type of data: currently ``categorical`` or ``numeric``. Lens will calculate different quantities for this column depending on the value of ``desc``. ``dtype`` the type of data in Pandas. ``name`` column name ``notnulls`` number of non-null values in the column ``nulls`` number of null-values in the column ``unique`` number of unique values in the column Examples -------- >>> summary.summary('quality') {'desc': 'categorical', 'dtype': 'int64', 'name': 'quality', 'notnulls': 4898, 'nulls': 0, 'unique': 7} >>> summary.summary('chlorides') {'desc': 'numeric', 'dtype': 'float64', 'name': 'chlorides', 'notnulls': 4898, 'nulls': 0, 'unique': 160} Parameters ---------- column : str Column name Returns ------- dict Dictionary of summary information. """ if column not in self._report['_columns']: raise LensSummaryError('The data summary does not contain' ' information about column `{}`.' .format(column)) column_props = self._report['column_properties'][column] summary = {'name': column, 'desc': self._desc(column)} for key in ['nulls', 'notnulls', 'unique', 'dtype']: summary[key] = column_props[key] return summary
[docs] def details(self, column): """Type-specific information for a column The `details` method returns additional information on ``column``, beyond that provided by the ``summary`` method. If ``column`` is numeric, this returns summary statistics. If it is categorical, it returns a dictionary of how often each category occurs. Examples -------- >>> summary.details('alcohol') {'desc': 'numeric', 'iqr': 1.9000000000000004, 'max': 14.199999999999999, 'mean': 10.514267047774602, 'median': 10.4, 'min': 8.0, 'name': 'alcohol', 'std': 1.2306205677573181, 'sum': 51498.880000000005} >>> summary.details('quality') {'desc': 'categorical', 'frequencies': {3: 20, 4: 163, 5: 1457, 6: 2198, 7: 880, 8: 175, 9: 5}, 'iqr': 1.0, 'max': 9, 'mean': 5.8779093507554103, 'median': 6.0, 'min': 3, 'name': 'quality', 'std': 0.88563857496783116, 'sum': 28790} Parameters ---------- column : str Column name Returns ------- dict Dictionary of detailed information. """ if column not in self._report['_columns']: raise LensSummaryError('The data summary does not contain' ' information about column `{}`.' .format(column)) column_props = self._report['column_properties'][column] details = {'name': column, 'desc': self._desc(column)} if column_props['is_categorical']: details['frequencies'] = self._report['frequencies'][column] if column_props['numeric']: column_summ = self._report['column_summary'][column] for k in ['min', 'max', 'mean', 'median', 'std', 'sum', 'iqr']: details[k] = column_summ[k] return details
[docs] def pair_details(self, first, second): """Get pairwise information for a column pair. The information returned depends on the types of the two columns. It may contain the following keys. correlation dictionary with the Spearman rank correlation coefficient and Pearson product-moment correlation coefficient between the columns. This is returned when both columns are numeric. pairdensity dictionary with an estimate of the pairwise density between the columns. The density is either a 2D KDE estimate if both columns are numerical, or several 1D KDE estimates if one of the columns is categorical and the other numerical (grouped by the categorical column) or a cross-tabuluation. Examples -------- >>> summary.pair_details('chlorides', 'quality') {'correlation': { 'pearson': -0.20993441094675602, 'spearman': -0.31448847828244203}, {'pairdensity': { 'density': <2d numpy array> 'x': <1d numpy array of x-values> 'y': <1d numpy array of y-values> 'x_scale': 'linear', 'y_scale': 'cat'} } >>> summary.pair_details('alcohol', 'chlorides') {'correlation': { 'pearson': -0.36018871210816106, 'spearman': -0.5708064071153713}, {'pairdensity': { 'density': <2d numpy array> 'x': <1d numpy array of x-values> 'y': <1d numpy array of y-values> 'x_scale': 'linear', 'y_scale': 'linear'} } Parameters ---------- first : str Name of the first column. second : str Name of the second column. Returns ------- dict Dictionary of pairwise information. """ if first == second: raise ValueError( 'Can only return the pair details of two different columns: ' 'received {} twice.'.format(first)) pair_details = {} # Correlation corr_report = self._report['correlation'] try: idx = [corr_report['_columns'].index(col) for col in [first, second]] except ValueError as e: logger.info('No correlation information for column `{}`' .format(e.args[0].split()[0])) else: correlation = {k: corr_report[k][idx[0]][idx[1]] for k in ['spearman', 'pearson']} pair_details['correlation'] = correlation # Pair density / Crosstab pairdensity_report = self._report['pairdensity'] # We store pairdensity information for both first/second and # second/first in a single key in the report, so we check for both # report[first][second] and report[second][first] to find it and # transpose if necessary. try: pairdensity = pairdensity_report[first][second] scales = pairdensity['scales'] density = np.array(pairdensity['density']) except KeyError: try: pairdensity = pairdensity_report[second][first] # Invert scale information and transpose matrix scales = pairdensity['scales'][::-1] density = np.array(pairdensity['density']).T except KeyError: logger.info('No pairdensity information for columns `{}`' ' and `{}`'.format(first, second)) pairdensity = None if pairdensity is not None: pairdensity = {'density': density, 'x': pairdensity['axes'][first], 'y': pairdensity['axes'][second], 'x_scale': scales[0], 'y_scale': scales[1]} pair_details['pairdensity'] = pairdensity return pair_details
[docs] def histogram(self, column): """ Return the histogram for `column`. This function returns a histogram for the column. The number of bins is estimated through the Freedman-Diaconis rule. Parameters ---------- column: str Name of the column Returns ------- counts: array Counts for each of the bins of the histogram. bin_edges : array Edges of the bins in the histogram. Length is ``length(counts)+1``. """ self._check_column_name(column) try: histogram = self._report['column_summary'][column]['histogram'] except KeyError: raise ValueError('{} is not a numeric column'.format(column)) return [np.array(histogram[key]) for key in ['counts', 'bin_edges']]
[docs] def kde(self, column): """ Return a Kernel Density Estimate for `column`. This function returns a KDE for the column. It is computed between the minimum and maximum values of the column and uses Scott's rule to compute the bandwith. Parameters ---------- column: str Name of the column Returns ------- x: array Values at which the KDE has been evaluated. y : array Values of the KDE. """ self._check_column_name(column) try: kde = self._report['column_summary'][column]['kde'] except KeyError: raise ValueError('{} is not a numeric column'.format(column)) return [np.array(kde[key]) for key in ['x', 'y']]
def _tdigest_report(self, column): """ Return the list of tdigest centroids and means from report """ self._check_column_name(column) try: tdigest_list = self._report['column_summary'][column]['tdigest'] except KeyError: raise ValueError('{} is not a numeric column'.format(column)) return tdigest_list
[docs] def tdigest_centroids(self, column): """Get TDigest centroids and counts for column. Parameters ---------- column : str Name of the column. Returns ------- :class:`numpy.array` Means of the TDigest centroids. :class:`numpy.array` Counts for each of the TDigest centroids. """ tdigest_list = self._tdigest_report(column) xs, counts = zip(*tdigest_list) return np.array(xs), np.array(counts)
[docs] def pdf(self, column): """ Approximate pdf for `column` This returns a function representing the pdf of a numeric column. Examples -------- >>> pdf = summary.pdf('chlorides') >>> min_value = summary.details('chlorides')['min'] >>> max_value = summary.details('chlorides')['max'] >>> xs = np.linspace(min_value, max_value, 200) >>> plt.plot(xs, pdf(xs)) Parameters ---------- column : str Name of the column. Returns ------- pdf: function Function representing the pdf. """ xs, counts = self.tdigest_centroids(column) return scipy.interpolate.interp1d(xs, counts)
[docs] def tdigest(self, column): """Return a TDigest object approximating the distribution of a column Documentation for the TDigest class can be found at https://github.com/CamDavidsonPilon/tdigest. Parameters ---------- column : str Name of the column. Returns ------- :class:`tdigest.TDigest` TDigest instance computed from the values of the column. """ return tdigest_from_centroids(self._tdigest_report(column))
[docs] def cdf(self, column): """ Approximate cdf for `column` This returns a function representing the cdf of a numeric column. Examples -------- >>> cdf = summary.cdf('chlorides') >>> min_value = summary.details('chlorides')['min'] >>> max_value = summary.details('chlorides')['max'] >>> xs = np.linspace(min_value, max_value, 200) >>> plt.plot(xs, cdf(xs)) Parameters ---------- column : str Name of the column. Returns ------- cdf: function Function representing the cdf. """ tdigest = self.tdigest(column) return tdigest.quantile
[docs] def correlation_matrix(self, include=None, exclude=None): """ Correlation matrix for numeric columns Parameters ---------- include: list of strings, optional List of numeric columns to include. Includes all columns by default. exclude: list of strings, optional List of numeric columns to exclude. Includes all columns by default. Returns ------- columns: list of strings List of column names correlation_matrix: 2D array of floats The correlation matrix, ordered such that ``correlation_matrix[i, j]`` is the correlation between ``columns[i]`` and ``columns[j]`` Notes ----- The columns are ordered through hierarchical clustering. Thus, neighbouring columns in the output will be more correlated. """ if include is not None and exclude is not None: raise ValueError( "Either 'include' or 'exclude' should be defined, " "but not both") available_columns = self._report['correlation']['_columns'] if include is not None: non_numeric_includes = set(include) - set(available_columns) if non_numeric_includes: raise ValueError( "Only numeric columns can be included in the " "correlation plot. Columns {} are not " "numeric".format(non_numeric_includes)) columns = include elif exclude is not None: columns = set(available_columns) - set(exclude) else: columns = available_columns columns = list(columns) # Filter the correlation matrix to select only the above columns correlation_report = self._report['correlation'] idx = [correlation_report['_columns'].index(col) for col in columns] correlation_matrix = (np.array(correlation_report['spearman']) [idx][:, idx]) return hierarchical_ordering(columns, correlation_matrix)
def _check_column_name(self, column): if column not in self.columns: raise KeyError(column)
[docs]def summarise(df, scheduler='multiprocessing', num_workers=None, size=None, pairdensities=True): """Create a Lens Summary for a Pandas DataFrame. This creates a :class:`~lens.Summary` instance containing many quantities of interest to a data scientist. Examples -------- Let's explore the wine quality dataset. >>> import pandas as pd >>> import lens >>> url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv" # noqa >>> wines_df = pd.read_csv(url, sep=';') >>> summary = lens.summarise(wines_df) Now that we have a :class:`~lens.Summary` instance we can inspect the shape of the dataset >>> summary.columns ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'] >>> summary.rows 4898 So far, nothing groundbreaking. Let's look at the ``quality`` column: >>> summary.summary('quality') {'desc': 'categorical', 'dtype': 'int64', 'name': 'quality', 'notnulls': 4898, 'nulls': 0, 'unique': 7} This tells us that there are seven unique values in the quality columns, and zero null values. It also tells us that lens will treat this column as categorical. Let's look at this in more details: >>> summary.details('quality') {'desc': 'categorical', 'frequencies': {3: 20, 4: 163, 5: 1457, 6: 2198, 7: 880, 8: 175, 9: 5}, 'iqr': 1.0, 'max': 9, 'mean': 5.8779093507554103, 'median': 6.0, 'min': 3, 'name': 'quality', 'std': 0.88563857496783116, 'sum': 28790} This tells us that the median wine quality is 6 and the standard deviation is less than one. Let's now get the correlation between the ``quality`` column and the ``alcohol`` column: >>> summary.pair_detail('quality', 'alcohol')['correlation'] {'pearson': 0.4355747154613688, 'spearman': 0.4403691816246831} Thus, the Spearman Rank Correlation coefficient between these two columns is 0.44. Parameters ---------- df : pd.DataFrame DataFrame to be analysed. scheduler : str, optional Dask scheduler to use. Must be one of ['multiprocessing', 'threaded', 'sync']. num_workers : int or None, optional Number of workers in the pool. If the environment variable `NUM_CPUS` is set that number will be used, otherwise it will use as many workers as CPUs available in the machine. size : int, optional DataFrame size on disk, which will be added to the report. pairdensities : bool, optional Whether to compute the pairdensity estimation between all pairs of numerical columns. For most datasets, this is the most expensive computation. Default is True. Returns ------- summary : :class:`~lens.Summary` The computed data summary. """ if not isinstance(df, pd.DataFrame): raise TypeError('Can only summarise a Pandas DataFrame') if len(df.columns) == 0: raise EmptyDataFrameError('The DataFrame has no columns') if num_workers is None: try: num_workers = int(os.environ['NUM_CPUS']) logger.debug('Number of workers read from environment: {}' .format(num_workers)) except ValueError: # Set to None if NUM_CPUS cannot be cast to an integer logger.warning('Environment variable NUM_CPUS={} cannot be' ' interpreted as an integer, defaulting to' ' number of cores in system' .format(os.environ.get('NUM_CPUS'))) num_workers = None except KeyError: # NUM_CPUS not in environment num_workers = None schedulers = {'multiprocessing': get_multiprocessing, 'threaded': get_threaded, 'sync': get_sync} try: kwargs = {'get': schedulers[scheduler]} except KeyError: raise KeyError('`scheduler` must be one of {}' .format(schedulers.keys())) if num_workers is not None: kwargs['num_workers'] = num_workers tstart = time.time() report = create_dask_graph(df, pairdensities=pairdensities).compute(**kwargs) report['_run_time'] = time.time() - tstart report['_lens_version'] = __version__ if size is not None: report['size'] = size return Summary(report)