Source code for hypercluster.visualize

from typing import List, Optional
import logging
from collections import Counter
from itertools import cycle
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
from hypercluster.constants import param_delim
from hypercluster.utilities import convert_to_multiind, evaluate_one

# Write text into saved PDF/PS files using font type 42.
matplotlib.rcParams.update({"pdf.fonttype": 42, "ps.fonttype": 42})
# Global seaborn theme; applied before the savefig setting below, as in the
# original statement order.
sns.set(font="arial", style="white", color_codes=True, font_scale=1.3)
matplotlib.rcParams["savefig.bbox"] = "tight"

# Default colormap shared by the heatmap helpers in this module.
cmap = sns.cubehelix_palette(
    start=0, rot=0.4, gamma=1.0, hue=0.82, light=1, dark=0, reverse=False, as_cmap=True
)
# Colors used for values above the range, below the range, and for
# invalid (e.g. NaN / masked) cells, respectively.
cmap.set_over('black')
cmap.set_under('white')
cmap.set_bad("#DAE0E6")


def zscore(df):
    """Standardize every row of a DataFrame by its own mean and standard deviation.

    Row statistics are computed with pandas' ``mean``/``std``, which skip np.nan.

    Args:
        df (DataFrame): DataFrame to z-score.

    Returns (DataFrame): Row-zscored DataFrame.
    """
    row_means = df.mean(axis=1)
    row_stds = df.std(axis=1)
    # axis=0 aligns the per-row statistics with the row index.
    centered = df.sub(row_means, axis=0)
    return centered.div(row_stds, axis=0)
def compute_order(
    df,
    dist_method: str = "euclidean",
    cluster_method: str = "average"
):
    """Gives hierarchical clustering order for the rows of a DataFrame

    Pairwise row distances are computed, the rows are hierarchically clustered,
    and the resulting tree is passed through scipy's optimal leaf ordering
    before the leaf order is read off.

    Args:
        df (DataFrame): DataFrame with rows to order.
        dist_method (str): Distance metric to pass to scipy.spatial.distance.pdist.
        cluster_method (str): Linkage method to pass to scipy.cluster.hierarchy.linkage.

    Returns (pandas.Index): Ordered row index.

    Note:
        Errors raised by scipy (for example a ValueError from ``linkage`` when the
        distances cannot be clustered) are propagated to the caller.
    """
    dist_mat = pdist(df, metric=dist_method)
    link_mat = hierarchy.linkage(dist_mat, method=cluster_method)
    ordered_link = hierarchy.optimal_leaf_ordering(link_mat, dist_mat)
    return df.index[hierarchy.leaves_list(ordered_link)]
def visualize_evaluations(
    evaluations_df: DataFrame,
    savefig: bool = False,
    output_prefix: str = "evaluations",
    **heatmap_kws
) -> List[matplotlib.axes.Axes]:
    """Makes a z-scored visualization of all evaluations.

    One heatmap panel is drawn per clusterer, plus a final panel holding the
    shared colorbar. The text before the first ``param_delim`` in each column
    name is treated as the clusterer name.

    Args:
        evaluations_df (DataFrame): Evaluations dataframe from clustering.optimize_clustering
        savefig (bool): Whether to save a pdf
        output_prefix (str): If saving a figure, file prefix to use.
        **heatmap_kws: Additional keyword arguments to pass to seaborn.heatmap. ``cmap``,
            ``vmin`` and ``vmax`` default to the module colormap and the 10%/90% quantiles
            of the z-scored data. A ``cbar_kws`` dict is merged over the default
            ``{"label": "z-score"}``.

    Returns (List[matplotlib.axes.Axes]): List of all matplotlib axes.
    """
    # Count columns per clusterer once; the counts double as relative panel widths.
    columns_per_clusterer = Counter(
        col.split(param_delim, 1)[0] for col in evaluations_df.columns
    )
    clusterers = sorted(columns_per_clusterer)
    width_ratios = [columns_per_clusterer[clus] for clus in clusterers]

    evaluations_df = zscore(evaluations_df)

    width = 0.18 * (len(evaluations_df.columns) + 2 + (0.01 * (len(clusterers) - 1)))
    height = 0.22 * len(evaluations_df)
    fig, axs = plt.subplots(
        figsize=(width, height),
        nrows=1,
        ncols=(len(clusterers) + 1),
        gridspec_kw=dict(
            # The trailing 2 is the width of the colorbar panel.
            width_ratios=width_ratios + [2],
            wspace=0.01,
            left=0,
            right=1,
            top=1,
            bottom=0,
        ),
    )

    heatmap_kws.setdefault('cmap', cmap)
    heatmap_kws.setdefault('vmin', np.nanquantile(evaluations_df, 0.1))
    heatmap_kws.setdefault('vmax', np.nanquantile(evaluations_df, 0.9))
    # Pull cbar_kws out of the pass-through kwargs so it cannot collide with the
    # explicit cbar_kws argument below; caller-supplied entries take precedence.
    cbar_kws = {"label": "z-score", **(heatmap_kws.pop("cbar_kws", None) or {})}

    for i, clus in enumerate(clusterers):
        temp = convert_to_multiind(clus, evaluations_df)
        ax = axs[i]
        sns.heatmap(
            temp,
            ax=ax,
            yticklabels=temp.index,
            xticklabels=["-".join([str(level) for level in col]) for col in temp.columns],
            cbar_ax=axs[-1],
            cbar_kws=cbar_kws,
            **heatmap_kws
        )
        ax.set_ylabel("")
        ax.set_title(clus)
        ax.set_yticklabels([])

    # Only the left-most panel keeps its row labels.
    axs[0].set_ylabel("evaluation method")
    axs[0].set_yticklabels(temp.index, rotation=0)
    if savefig:
        plt.savefig("%s.pdf" % output_prefix)
    return axs
def visualize_pairwise(
    df: DataFrame,
    savefig: bool = False,
    output_prefix: Optional[str] = None,
    method: Optional[str] = None,
    **heatmap_kws
) -> List[matplotlib.axes.Axes]:
    """Visualize symmetrical square DataFrames.

    Rows and columns are reordered by hierarchical clustering when that
    succeeds; otherwise the existing index order is kept.

    Args:
        df (DataFrame): DataFrame to visualize.
        savefig (bool): Whether to save a pdf.
        output_prefix (str): If saving a pdf, file prefix to use. Defaults to
            "heatmap.pairwise" when saving.
        method (str): Label for cbar, if relevant.
        **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_. ``cmap``,
            ``vmin`` and ``vmax`` default to the module colormap and the 10%/90% quantiles
            of ``df``. A supplied ``cbar_kws`` dict is copied, not modified.

    Returns (List[matplotlib.axes.Axes]): List of matplotlib axes for figure.

    .. _seaborn.heatmap: https://seaborn.pydata.org/generated/seaborn.heatmap.html
    """
    heatmap_kws = dict(heatmap_kws)
    heatmap_kws.setdefault('cmap', cmap)
    heatmap_kws.setdefault('vmin', np.nanquantile(df, 0.1))
    heatmap_kws.setdefault('vmax', np.nanquantile(df, 0.9))

    # Copy before adding the label so the caller's cbar_kws dict is left untouched.
    cbar_kws = dict(heatmap_kws.get('cbar_kws') or {})
    cbar_kws.setdefault('label', method)
    heatmap_kws['cbar_kws'] = cbar_kws

    cbar_ratio = 2
    wspace = 0.01
    height = 0.18 * len(df)
    width = 0.18 * (len(df.columns) + cbar_ratio + wspace)
    fig, axs = plt.subplots(
        figsize=(width, height),
        nrows=1,
        ncols=2,
        gridspec_kw=dict(
            width_ratios=[len(df.columns), cbar_ratio],
            wspace=wspace,
            left=0,
            right=1,
            top=1,
            bottom=0,
        )
    )

    try:
        # Missing values are filled with column medians only for computing the order.
        order = compute_order(df.fillna(df.median()))
    except ValueError:
        order = df.index
    df = df.loc[order, order]

    sns.heatmap(
        df,
        xticklabels=order,
        yticklabels=order,
        ax=axs[0],
        cbar_ax=axs[1],
        **heatmap_kws
    )
    if savefig:
        if output_prefix is None:
            output_prefix = "heatmap.pairwise"
        plt.savefig('%s.pdf' % output_prefix)
    return axs
def visualize_label_agreement(
    labels: DataFrame,
    method: Optional[str] = None,
    savefig: bool = False,
    output_prefix: Optional[str] = None,
    **heatmap_kws
) -> List[matplotlib.axes.Axes]:
    """Plot how similar the label sets of different clustering results are.

    Every pair of columns in ``labels`` is scored with ``method`` via
    ``evaluate_one`` and the resulting square table is drawn with
    ``visualize_pairwise``.

    Args:
        labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \
        AutoClusterer.labels_
        method (str): Method with which to compare labels. Must be a metric like the ones in \
        constants.need_ground_truth, which takes two sets of labels. Defaults to \
        'adjusted_rand_score'.
        savefig (bool): Whether to save a pdf.
        output_prefix (str): If saving a pdf, file prefix to use. Defaults to \
        'heatmap.labels.pairwise' when saving.
        **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_

    Returns (List[matplotlib.axes.Axes]): List of matplotlib axes

    .. _seaborn.heatmap: https://seaborn.pydata.org/generated/seaborn.heatmap.html
    """
    metric = 'adjusted_rand_score' if method is None else method
    prefix = output_prefix
    if prefix is None and savefig:
        prefix = 'heatmap.labels.pairwise'

    def _score_pair(x, y):
        # The second label set plays the role of the gold standard.
        return evaluate_one(x, method=metric, gold_standard=y)

    agreement = labels.astype(float).corr(method=_score_pair)
    return visualize_pairwise(agreement, savefig, prefix, method=metric, **heatmap_kws)
def _count_shared_labels(x, y):
    """Count positions where two label vectors agree, skipping positions where both are -1."""
    either_assigned = (x != -1) | (y != -1)
    return np.equal(x[either_assigned], y[either_assigned]).sum()


def visualize_sample_label_consistency(
    labels: DataFrame,
    savefig: bool = False,
    output_prefix: Optional[str] = None,
    **heatmap_kws
) -> List[matplotlib.axes.Axes]:
    """Visualize how often two samples are labeled in the same group across conditions.

    Interpret with care--if you use more conditions for some type of clusterers, e.g. more
    n_clusters for KMeans, those cluster more similarly across conditions than between
    clusterers. This means that more agreement in labeling could be due to the choice of
    clusterers rather than true similarity between samples.

    Conditions in which both samples carry the label -1 are not counted as agreement.
    The diagonal holds each sample's agreement with itself, i.e. the number of
    conditions in which it has a label other than -1.

    Args:
        labels (DataFrame): Labels DataFrame, e.g. from optimize_clustering or \
        AutoClusterer.labels_
        savefig (bool): Whether to save a pdf.
        output_prefix (str): If saving a pdf, file prefix to use. Defaults to \
        "heatmap.sample.pairwise" when saving.
        **heatmap_kws: Additional keywords to pass to `seaborn.heatmap`_

    Returns (List[matplotlib.axes.Axes]): List of matplotlib axes

    .. _seaborn.heatmap: https://seaborn.pydata.org/generated/seaborn.heatmap.html
    """
    if savefig and output_prefix is None:
        output_prefix = "heatmap.sample.pairwise"

    # corr() compares columns pairwise, so transpose to get one column per sample.
    # TODO change this to much faster matmult
    per_sample = labels.transpose().astype(float)
    consistency = per_sample.corr(_count_shared_labels)

    # DataFrame.corr writes 1.0 on the diagonal for callable methods without calling
    # the callable, which would show self-agreement as 1 next to off-diagonal counts.
    # Replace it with the value the counting rule gives for a sample against itself.
    observed = per_sample.notna()
    self_counts = (observed & per_sample.ne(-1)).sum(axis=0).astype(float)
    # Samples with no observed labels stay NaN, matching their off-diagonal cells.
    self_counts[~observed.any(axis=0)] = np.nan
    filled = consistency.values.copy()
    np.fill_diagonal(filled, self_counts.values)
    consistency = DataFrame(
        filled, index=consistency.index, columns=consistency.columns
    )

    return visualize_pairwise(
        consistency, savefig, output_prefix, method='# same label', **heatmap_kws
    )
def visualize_for_picking_labels(
    evaluation_df: DataFrame,
    method: Optional[str] = None,
    savefig_prefix: Optional[str] = None
):
    """Generates graphs similar to a `scree graph`_ for PCA for each parameter and each clusterer.

    One row of panels is drawn per clusterer and one panel per parameter. Clusterers
    with no scores for ``method``, or whose parameter values cannot be converted to
    floats, are logged and skipped.

    Args:
        evaluation_df (DataFrame): DataFrame of evaluations to visualize. Clusterer.evaluation_df.
        method (str): Which metric to visualize. Defaults to "silhouette_score".
        savefig_prefix (str): If not None, save a figure with give prefix.

    Returns:
        matplotlib axes, or None when no clusterer could be visualized.

    .. _scree graph: https://en.wikipedia.org/wiki/Scree_plot
    """
    if method is None:
        method = "silhouette_score"

    # The text before the first param_delim in a column name is the clusterer name.
    candidates = {col.split(param_delim, 1)[0] for col in evaluation_df.columns}

    # First pass: keep the clusterers that can be plotted, cache their data, and
    # find how many parameter panels the widest one needs.
    plot_data = {}
    ncols = 0
    for clus in sorted(candidates):
        scores = convert_to_multiind(
            clus, evaluation_df.loc[[method], :]
        ).transpose().dropna(how='any')
        if len(scores) == 0:
            logging.error(
                'Score %s is missing for clusterer %s, skipping visualization', method, clus
            )
            continue

        indep = scores.index.to_frame().reset_index(drop=True)
        try:
            # Keep the converted frame so the regression gets numeric x values.
            indep = indep.astype(float)
        except (ValueError, AssertionError):
            logging.error('Cannot convert %s data to floats, skipping visualization', clus)
            continue

        plot_data[clus] = (scores, indep)
        ncols = max(ncols, scores.index.nlevels)

    if not plot_data:
        logging.error('No valid clusterers, cannot visualize. \n')
        return None

    cluss = sorted(plot_data)

    # Shared y-range: full span of the metric, padded by |5% quantile| on both sides.
    method_scores = evaluation_df.loc[method]
    ybuff = np.abs(np.nanquantile(method_scores, 0.05))
    ylim = (method_scores.min() - ybuff, method_scores.max() + ybuff)

    colors = cycle(sns.color_palette('twilight', n_colors=len(cluss) * ncols))
    fig = plt.figure(figsize=(5 * ncols, 5 * len(cluss)))
    gs = plt.GridSpec(nrows=len(cluss), ncols=ncols, wspace=0.25, hspace=0.25)

    for ploti, clus in enumerate(cluss):
        scores, indep = plot_data[clus]
        for whcol, col in enumerate(indep.columns):
            if whcol == 0:
                # The first panel in each row owns the y-axis; the rest share it.
                saveax = plt.subplot(gs[ploti, whcol])
                ax = saveax
                ax.set_ylim(ylim)
                ax.set_ylabel(clus)
            else:
                ax = plt.subplot(gs[ploti, whcol], sharey=saveax)
            color = next(colors)

            # plot eval results; x/y are passed by keyword because newer seaborn
            # releases no longer accept them positionally.
            sns.regplot(
                x=indep[col],
                y=scores[method].values,
                color=color,
                ax=ax,
                logistic=True,
            )

    axs = fig.get_axes()
    axs[0].set_title('%s results per parameter' % method, ha='left')

    if savefig_prefix:
        plt.savefig('%s.pdf' % savefig_prefix)
    return axs