Source code for hypercluster.classes

from typing import List, Union
from pandas import DataFrame
from .utilities import *
from .visualize import *
from itertools import product
from .constants import *


class Clusterer:
    """Base class with the methods shared by AutoClusterer and MultiAutoClusterer.

    Each method hands the instance's ``labels_df`` and/or ``evaluation_df`` attribute to the
    module-level function of the same name and returns that function's result unchanged.
    """

    def pick_best_labels(self, method: Optional[str] = None, min_or_max: Optional[str] = None):
        """Call the module-level ``pick_best_labels`` on this object's results.

        Args:
            method (Optional[str]): Forwarded unchanged as the third positional argument.
            min_or_max (Optional[str]): Forwarded unchanged as the fourth positional argument.

        Returns:
            Whatever the module-level ``pick_best_labels`` returns.
        """
        evaluations = self.evaluation_df
        labels = self.labels_df
        return pick_best_labels(evaluations, labels, method, min_or_max)

    def visualize_evaluations(
            self,
            savefig: bool = False,
            output_prefix: str = "evaluations",
            **heatmap_kws
    ) -> List[matplotlib.axes.Axes]:
        """Call the module-level ``visualize_evaluations`` on ``self.evaluation_df``.

        Args:
            savefig (bool): Forwarded unchanged. Default False.
            output_prefix (str): Forwarded unchanged. Default "evaluations".
            **heatmap_kws: Extra keyword arguments, forwarded unchanged.

        Returns (List[matplotlib.axes.Axes]):
            Whatever the module-level ``visualize_evaluations`` returns.
        """
        forwarded = (self.evaluation_df, savefig, output_prefix)
        return visualize_evaluations(*forwarded, **heatmap_kws)

    def visualize_sample_label_consistency(
            self,
            savefig: bool = False,
            output_prefix: Optional[str] = None,
            **heatmap_kws
    ) -> List[matplotlib.axes.Axes]:
        """Call the module-level ``visualize_sample_label_consistency`` on ``self.labels_df``.

        Args:
            savefig (bool): Forwarded unchanged. Default False.
            output_prefix (Optional[str]): Forwarded unchanged. Default None.
            **heatmap_kws: Extra keyword arguments, forwarded unchanged.

        Returns (List[matplotlib.axes.Axes]):
            Whatever the module-level ``visualize_sample_label_consistency`` returns.
        """
        forwarded = (self.labels_df, savefig, output_prefix)
        return visualize_sample_label_consistency(*forwarded, **heatmap_kws)

    def visualize_label_agreement(
            self,
            method: Optional[str] = None,
            savefig: bool = False,
            output_prefix: Optional[str] = None,
            **heatmap_kws
    ) -> List[matplotlib.axes.Axes]:
        """Call the module-level ``visualize_label_agreement`` on ``self.labels_df``.

        Args:
            method (Optional[str]): Forwarded unchanged. Default None.
            savefig (bool): Forwarded unchanged. Default False.
            output_prefix (Optional[str]): Forwarded unchanged. Default None.
            **heatmap_kws: Extra keyword arguments, forwarded unchanged.

        Returns (List[matplotlib.axes.Axes]):
            Whatever the module-level ``visualize_label_agreement`` returns.
        """
        forwarded = (self.labels_df, method, savefig, output_prefix)
        return visualize_label_agreement(*forwarded, **heatmap_kws)

    def visualize_for_picking_labels(
            self,
            method: Optional[str] = None,
            savefig_prefix: Optional[str] = None
    ):
        """Call the module-level ``visualize_for_picking_labels`` on ``self.evaluation_df``.

        Args:
            method (Optional[str]): Forwarded unchanged. Default None.
            savefig_prefix (Optional[str]): Forwarded unchanged. Default None.

        Returns:
            Whatever the module-level ``visualize_for_picking_labels`` returns.
        """
        evaluations = self.evaluation_df
        return visualize_for_picking_labels(evaluations, method, savefig_prefix)

    def fit_predict(self, data: Optional[DataFrame], parameter_set_name, method, min_of_max):
        """Placeholder with no implementation: ignores its arguments and returns None."""
        return None


class AutoClusterer(Clusterer):
    """Main hypercluster object.

    Attributes:
        clusterer_name (str): String name of clusterer.
        params_to_optimize (dict): Dictionary with possibilities for different parameters. Ex \
        format - {'parameter_name':[1, 2, 3, 4, 5]}. If None, will optimize default \
        selection, given in hypercluster.constants.variables_to_optimize. Default None.
        random_search (bool): Whether to search a random selection of possible parameters or \
        all possibilities. Default False.
        random_search_fraction (float): If random_search is True, what fraction of the \
        possible parameters to search. Default 0.5.
        param_weights (dict): Dictionary of str: dictionaries. Ex format - { \
        'parameter_name':{'param_option_1':0.5, 'param_option_2':0.5}}. Default None, which \
        is stored as an empty dictionary.
        clus_kwargs (dict): Additional kwargs to pass into given clusterer, but not to be \
        optimized. Default None.
        labels_ (Optional[DataFrame]): If already fit, labels DataFrame fit to data.
        evaluation_ (Optional[DataFrame]): If already fit and evaluated, evaluations per label.
        data (Optional[DataFrame]): Data to fit, will not fit by default even if passed data.
        labels_df (Optional[DataFrame]): Flattened form of labels_. Built with \
        generate_flattened_df from labels_ when not passed explicitly.
        evaluation_df (Optional[DataFrame]): Flattened form of evaluation_. Built with \
        generate_flattened_df from evaluation_ when not passed explicitly.
        static_kwargs (dict): Set by generate_param_sets; parameters that were given exactly \
        one possibility, mapped to the one-element collection that was given.
        total_possible_conditions (int): Set by generate_param_sets; number of combinations of \
        the parameters that were given more than one possibility.
        param_sets (DataFrame): Set by generate_param_sets; one row per parameter set to try.
    """

    def __init__(
        self,
        clusterer_name: Optional[str] = "KMeans",
        params_to_optimize: Optional[dict] = None,
        random_search: bool = False,
        random_search_fraction: Optional[float] = 0.5,
        param_weights: Optional[dict] = None,
        clus_kwargs: Optional[dict] = None,
        labels_: Optional[DataFrame] = None,
        evaluation_: Optional[DataFrame] = None,
        data: Optional[DataFrame] = None,
        labels_df: Optional[DataFrame] = None,
        evaluation_df: Optional[DataFrame] = None
    ):

        self.clusterer_name = clusterer_name
        self.params_to_optimize = params_to_optimize
        self.random_search = random_search
        self.random_search_fraction = random_search_fraction
        # A fresh dict per instance: a `{}` default in the signature would be shared by every
        # instance created without this argument.
        self.param_weights = {} if param_weights is None else param_weights
        self.clus_kwargs = clus_kwargs

        if self.params_to_optimize is None:
            self.params_to_optimize = variables_to_optimize[clusterer_name]
        if self.clus_kwargs is None:
            self.clus_kwargs = {}

        if labels_df is None and labels_ is not None:
            labels_df = generate_flattened_df({clusterer_name: labels_})
        self.labels_df = labels_df

        if evaluation_df is None and evaluation_ is not None:
            # Wrapped as {name: frame}, matching how labels_ is flattened above and how
            # evaluate() flattens its result.
            evaluation_df = generate_flattened_df({clusterer_name: evaluation_})
        self.evaluation_df = evaluation_df

        self.labels_ = labels_
        self.evaluation_ = evaluation_
        self.data = data

        self.static_kwargs = None
        self.total_possible_conditions = None
        self.param_sets = None
        self.generate_param_sets()

    def generate_param_sets(self):
        """Uses info from init to make a DataFrame of all parameter sets that will be tried.

        Parameters with exactly one possibility are held constant in every set, parameters
        with several possibilities are combined exhaustively, and parameters with no
        possibilities are reported with logging.error and left out. When random_search is
        set and more than one combination exists, only a sample of the combinations is kept.

        Returns (AutoClusterer):
            self
        """
        conditions = 1
        vars_to_optimize = {}
        static_kwargs = {}
        for parameter_name, possible_values in self.params_to_optimize.items():
            n_values = len(possible_values)
            if n_values == 1:
                static_kwargs[parameter_name] = possible_values
            elif n_values > 1:
                vars_to_optimize[parameter_name] = possible_values
                # Size of the cartesian product: multiply by this parameter's option count.
                conditions *= n_values
            else:
                logging.error(
                    "Parameter %s was given no possibilities. Will continue with default "
                    "parameters.",
                    parameter_name,
                )

        self.static_kwargs = static_kwargs
        self.total_possible_conditions = conditions

        searched_names = list(vars_to_optimize)
        if searched_names:
            # Built in one go (row-by-row DataFrame.append is quadratic and no longer exists
            # in pandas 2). dtype=object keeps each value exactly as the caller supplied it.
            parameters = pd.DataFrame(
                [
                    dict(zip(searched_names, combination))
                    for combination in product(*vars_to_optimize.values())
                ],
                columns=searched_names,
                dtype=object,
            )
        else:
            parameters = pd.DataFrame()

        if self.random_search and len(parameters) > 1:
            will_search = int(conditions * self.random_search_fraction)

            # calculates probability of getting a particular set of parameters, given the probs of
            # all the individual params. If a prob isn't set, give uniform probability to each
            # parameter.
            if self.param_weights:
                weights = parameters.apply(
                    lambda param_set: calculate_row_weights(
                        param_set, self.param_weights, vars_to_optimize
                    ),
                    axis=1,
                )
            else:
                weights = None
            parameters = parameters.sample(will_search, weights=weights)

        if static_kwargs:
            if parameters.shape[1] == 0:
                # Nothing is searched, so the static values form the single parameter set.
                parameters = pd.DataFrame(index=[0])
            for col, single_option in static_kwargs.items():
                # Repeat the lone value on every row; assigning the one-element collection
                # itself fails with a length mismatch once there is more than one row.
                value = next(iter(single_option))
                parameters[col] = pd.Series(
                    [value] * len(parameters), index=parameters.index, dtype=object
                )

        logging.info(
            "For clusterer %s, testing %s out of %s possible conditions",
            self.clusterer_name, len(parameters), conditions
        )

        self.param_sets = parameters
        return self

    def fit(self, data: DataFrame):
        """Fits clusterer to data with each parameter set.

        Stores the data on self.data, the labels on self.labels_ (samples as rows, one column
        per parameter set) and their flattened form on self.labels_df. When there are no
        parameters at all, the clusterer is run once without parameters and the single
        column is named "default_parameters".

        Args:
            data (DataFrame): DataFrame with elements to cluster as index and features as columns.

        Returns (AutoClusterer):
            self
        """
        self.data = data
        if self.param_sets.shape[1] == 0:
            label_results = pd.DataFrame(
                cluster(self.clusterer_name, data).labels_,
                columns=["default_parameters"],
                index=data.index,
            )
            self.labels_ = label_results
            self.labels_df = generate_flattened_df({self.clusterer_name: label_results})
            return self

        n_sets = len(self.param_sets)
        label_rows = []
        # Count completed sets explicitly: the row labels are arbitrary after random sampling.
        for n_done, (_, row) in enumerate(self.param_sets.iterrows(), start=1):
            single_params = row.to_dict()
            labels = cluster(self.clusterer_name, data, single_params).labels_

            label_row = dict(zip(data.index, labels))
            label_row.update(single_params)
            label_rows.append(label_row)
            logging.info("%s - %s of conditions done", n_done, n_done / n_sets)

        label_results = pd.DataFrame(
            label_rows, columns=self.param_sets.columns.union(data.index)
        )
        # Parameter values become the column labels and samples become the rows.
        label_results = label_results.set_index(
            list(self.param_sets.columns)
        ).transpose()

        if isinstance(data.index, pd.MultiIndex):
            label_results.index = pd.MultiIndex.from_tuples(label_results.index)
        self.labels_ = label_results
        self.labels_df = generate_flattened_df({self.clusterer_name: label_results})
        return self

    def evaluate(
            self,
            methods: Optional[Iterable[str]] = None,
            metric_kwargs: Optional[dict] = None,
            gold_standard: Optional[Iterable] = None
    ):
        """Evaluate labels with given metrics.

        Args:
            methods (Optional[Iterable[str]]): List of evaluation methods to use. If None, \
            inherent_metrics is used.
            metric_kwargs (Optional[dict]): Additional kwargs per evaluation metric. Structure of \
            {'metric_name':{'param1':value, 'param2':val2}.
            gold_standard (Optional[Iterable]): Gold standard labels, if available. Only needed \
            if using a metric that needs ground truth.

        Returns (AutoClusterer):
            self with attribute .evaluation_; a DataFrame with all eval values per labels.

        Raises:
            ValueError: If there are no labels to evaluate because fit has not been run.
        """
        if self.labels_ is None:
            message = 'Cannot evaluate model, need to fit first.'
            logging.error(message)
            # Fail here with a clear message instead of on the attribute access below.
            raise ValueError(message)
        if methods is None:
            methods = inherent_metrics
        if metric_kwargs is None:
            metric_kwargs = {}

        evaluation_df = pd.DataFrame({"methods": methods})
        for col in self.labels_.columns:
            evaluation_df[col] = evaluation_df.apply(
                lambda row: evaluate_one(
                    self.labels_[col],
                    method=row["methods"],
                    data=self.data,
                    gold_standard=gold_standard,
                    metric_kwargs=metric_kwargs.get(row["methods"], None),
                ),
                axis=1,
            )
        evaluation_df = evaluation_df.set_index('methods')
        # Reuse the labels' column index so the evaluation columns line up with the labels.
        evaluation_df.columns = self.labels_.columns
        self.evaluation_ = evaluation_df
        self.evaluation_df = generate_flattened_df({self.clusterer_name: evaluation_df})
        return self


class MultiAutoClusterer(Clusterer):
    """Object for training multiple clustering algorithms.

    Attributes:
        algorithm_names (Optional[Union[Iterable, str]]): List of algorithm names to test OR \
        name of category of clusterers from hypercluster.constants.categories, OR None. If None, \
        default is hypercluster.constants.variables_to_optimize.keys(). A string that is not a \
        category name is treated as a single algorithm name.
        algorithm_parameters (Optional[Dict[str, dict]]):  Dictionary of hyperparameters to \
        optimize. Example format: {'clusterer_name1':{'hyperparam1':[val1, val2]}}.
        random_search (bool): Whether to search a random subsample of possible conditions.
        random_search_fraction (float): If random_search, what fraction of conditions to search.
        algorithm_param_weights (Dict[str, Dict[str, dict]]): If random_search, and you want to \
        give probability weights to certain parameters, dictionary of probability weights. \
        Example format: {'clusterer1': {'hyperparam1':{val1:probability1, val2:probability2}}}.
        algorithm_clus_kwargs (Dict[str, dict]): Dictionary of additional keyword args for any \
        clusterer. Example format: {'clusterer1':{'param1':val1}}.
        data (Optional[DataFrame]): Optional, data to fit. Will not fit even if passed, \
        need to call fit method.
        evaluation_methods (Optional[List[str]]): List of metrics with which to evaluate. If \
        None, will use hypercluster.constants.inherent_metrics. Default is None.
        metric_kwargs (Optional[Dict[str, dict]]): Additional keyword args for any metric \
        function. Example format: {'metric1':{'param1':value}}.
        gold_standard (Optional[Iterable]): If using methods that need ground truth, vector of \
        correct labels. Can also pass in during evaluate.
        autoclusterers (Iterable[AutoClusterer]): If building from initialized AutoClusterer \
        objects, can give a list of them here. If these are given, it will override anything \
        passed to labels\\_ and evaluation\\_.
        labels_ (Optional[Dict[str, DataFrame]]): Dictionary of label DataFrames per clusterer, \
        if already fit.  Example format: {'clusterer1': labels_df}.
        evaluation_ (Optional[Dict[str, DataFrame]]): Dictionary of evaluation DataFrames per \
        clusterer, if already fit and evaluated.  Example format: {'clusterer1': evaluation_df}.
        labels_df (Optional[DataFrame]): Combined DataFrame of all labeling results.
        evaluation_df (Optional[DataFrame]): Combined DataFrame of all evaluation results.
    """
    def __init__(
            self,
            algorithm_names: Optional[Union[Iterable, str]] = None,
            algorithm_parameters: Optional[Dict[str, dict]] = None,
            random_search: bool = False,
            random_search_fraction: Optional[float] = 0.5,
            algorithm_param_weights: Optional[dict] = None,
            algorithm_clus_kwargs: Optional[dict] = None,
            data: Optional[DataFrame] = None,
            evaluation_methods: Optional[List[str]] = None,
            metric_kwargs: Optional[Dict[str, dict]] = None,
            gold_standard: Optional[Iterable] = None,
            autoclusterers: Optional[Iterable[AutoClusterer]] = None,
            labels_: Optional[Dict[str, DataFrame]] = None,
            evaluation_: Optional[Dict[str, DataFrame]] = None,
            labels_df: Optional[DataFrame] = None,
            evaluation_df: Optional[DataFrame] = None
    ):

        self.random_search = random_search
        self.random_search_fraction = random_search_fraction

        if autoclusterers is None:
            if algorithm_names is None:
                algorithm_names = variables_to_optimize.keys()
            elif isinstance(algorithm_names, str):
                if algorithm_names in categories.keys():
                    algorithm_names = categories[algorithm_names]
                else:
                    # A lone clusterer name; iterating the bare string would yield characters.
                    algorithm_names = [algorithm_names]
            # Materialised because the names are iterated more than once below.
            self.algorithm_names = list(algorithm_names)

            if algorithm_parameters is None:
                algorithm_parameters = {
                    clus_name: variables_to_optimize[clus_name] for clus_name in
                    self.algorithm_names
                }
            self.algorithm_parameters = algorithm_parameters

            if algorithm_param_weights is None:
                algorithm_param_weights = {}
            self.algorithm_param_weights = algorithm_param_weights

            # Always set the attribute: it is read below even when the caller supplied a dict.
            if algorithm_clus_kwargs is None:
                algorithm_clus_kwargs = {}
            self.algorithm_clus_kwargs = algorithm_clus_kwargs

            if labels_ is None:
                labels_ = {}
            else:
                labels_df = generate_flattened_df(labels_)
            self.labels_ = labels_
            self.labels_df = labels_df

            if evaluation_ is None:
                evaluation_ = {}
            else:
                evaluation_df = generate_flattened_df(evaluation_)
            self.evaluation_ = evaluation_
            self.evaluation_df = evaluation_df

            autoclusterers = [
                AutoClusterer(
                    clus_name,
                    params_to_optimize=self.algorithm_parameters.get(clus_name, {}),
                    random_search=self.random_search,
                    random_search_fraction=self.random_search_fraction,
                    data=data,
                    param_weights=self.algorithm_param_weights.get(clus_name, {}),
                    clus_kwargs=self.algorithm_clus_kwargs.get(clus_name, {}),
                    labels_=self.labels_.get(clus_name, None),
                    evaluation_=self.evaluation_.get(clus_name, None)
                )
                for clus_name in self.algorithm_names
            ]

        else:
            # Materialised because the clusterers are iterated several times below.
            autoclusterers = list(autoclusterers)
            self.algorithm_names = [ac.clusterer_name for ac in autoclusterers]
            self.algorithm_parameters = {
                ac.clusterer_name: ac.params_to_optimize for ac in autoclusterers
            }
            self.algorithm_param_weights = {
                ac.clusterer_name: ac.param_weights for ac in autoclusterers
            }
            self.algorithm_clus_kwargs = {
                ac.clusterer_name: ac.clus_kwargs for ac in autoclusterers
            }
            self.labels_ = {
                ac.clusterer_name: ac.labels_ for ac in autoclusterers if ac.labels_ is not None
            }
            self.evaluation_ = {
                ac.clusterer_name: ac.evaluation_
                for ac in autoclusterers if ac.evaluation_ is not None
            }

            self.labels_df = generate_flattened_df(self.labels_)
            self.evaluation_df = generate_flattened_df(self.evaluation_)

        self.autoclusterers = autoclusterers
        self.data = data
        self.evaluation_methods = evaluation_methods
        self.metric_kwargs = metric_kwargs
        self.gold_standard = gold_standard

    def fit(self, data: Optional[DataFrame] = None):
        """Fits every AutoClusterer in self.autoclusterers to the data.

        Updates self.data, self.autoclusterers, self.labels_ (labels per clusterer name) and
        self.labels_df (their flattened combination).

        Args:
            data (Optional[DataFrame]): Data to fit. If None, the data given at initialization \
            is used.

        Returns (MultiAutoClusterer):
            self

        Raises:
            ValueError: If no data was passed here or at initialization.
        """
        if data is None:
            data = self.data
        if data is None:
            raise ValueError('Must initialize with data or pass data in function to fit.')
        self.data = data

        # TODO: each AutoClusterer also keeps its own reference to the data.
        self.autoclusterers = [clusterer.fit(data) for clusterer in self.autoclusterers]
        self.labels_ = {
            ac.clusterer_name: ac.labels_ for ac in self.autoclusterers
        }
        self.labels_df = generate_flattened_df(self.labels_)
        return self

    def evaluate(
            self,
            evaluation_methods: Optional[list] = None,
            metric_kwargs: Optional[dict] = None,
            gold_standard: Optional[Iterable] = None
    ):
        """Evaluates the labels of every AutoClusterer in self.autoclusterers.

        Any argument left as None falls back to the value stored on the instance; if that is
        also None, evaluation_methods falls back to inherent_metrics and metric_kwargs to an
        empty dictionary. The values actually used are stored back on the instance, together
        with self.evaluation_ (evaluations per clusterer name) and self.evaluation_df (their
        flattened combination).

        Args:
            evaluation_methods (Optional[list]): Metrics with which to evaluate.
            metric_kwargs (Optional[dict]): Additional keyword args per metric. Example \
            format: {'metric1':{'param1':value}}.
            gold_standard (Optional[Iterable]): Ground truth labels, for metrics that need them.

        Returns (MultiAutoClusterer):
            self
        """
        if evaluation_methods is None:
            evaluation_methods = (
                inherent_metrics if self.evaluation_methods is None
                else self.evaluation_methods
            )

        if metric_kwargs is None:
            metric_kwargs = {} if self.metric_kwargs is None else self.metric_kwargs

        if gold_standard is None:
            gold_standard = self.gold_standard

        evaluated_clusterers = [
            ac.evaluate(
                methods=evaluation_methods,
                metric_kwargs=metric_kwargs,
                gold_standard=gold_standard
            )
            for ac in self.autoclusterers
        ]

        self.gold_standard = gold_standard
        self.metric_kwargs = metric_kwargs
        self.evaluation_methods = evaluation_methods

        self.autoclusterers = evaluated_clusterers
        self.evaluation_ = {
            ac.clusterer_name: ac.evaluation_ for ac in self.autoclusterers
        }
        self.evaluation_df = generate_flattened_df(self.evaluation_)
        return self