Commit 5cee7ef

metrics and usability
1 parent 0a36157 commit 5cee7ef

File tree

6 files changed: +217 -16 lines changed

CompStats/interface.py

Lines changed: 36 additions & 3 deletions

@@ -47,8 +47,12 @@ class Perf(object):
     :type n_jobs: int
     :param num_samples: Number of bootstrap samples, default=500.
     :type num_samples: int
+    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True.
+    :type use_tqdm: bool
+
 
     >>> from sklearn.svm import LinearSVC
+    >>> from sklearn.linear_model import LogisticRegression
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> from sklearn.datasets import load_iris
     >>> from sklearn.model_selection import train_test_split
@@ -65,22 +69,36 @@ class Perf(object):
     <Perf>
     Prediction statistics with standard error
     alg-1 = 1.000 (0.000)
-    forest = 0.978 (0.019)
+    forest = 0.946 (0.038)
+
+    If an algorithm's prediction is missing, it can be added by calling the instance, as shown in the following instruction. Note that the algorithm's name can also be given with the keyword :py:attr:`name`.
+
+    >>> lr = LogisticRegression().fit(X_train, y_train)
+    >>> perf(lr.predict(X_val), name='Log. Reg.')
+    <Perf>
+    Prediction statistics with standard error
+    alg-1 = 1.000 (0.000)
+    forest = 0.946 (0.038)
+    Log. Reg. = 0.946 (0.038)
+
+    The performance function used to compare the algorithms can be changed; if the instance is cloned, the same bootstrap samples are kept, so the values are computed on identical samples, as the following example shows.
 
     >>> perf_error = clone(perf)
     >>> perf_error.error_func = lambda y, hy: (y != hy).mean()
     >>> perf_error
     <Perf>
     Prediction statistics with standard error
     alg-1 = 0.000 (0.000)
-    forest = 0.022 (0.018)
+    forest = 0.044 (0.030)
+    Log. Reg. = 0.044 (0.030)
 
     """
     def __init__(self, y_true, *args,
                  score_func=macro(f1_score),
                  error_func=None,
                  num_samples: int=500,
                  n_jobs: int=-1,
+                 use_tqdm=True,
                  **kwargs):
         assert (score_func is None) ^ (error_func is None)
         self.score_func = score_func
@@ -93,6 +111,7 @@ def __init__(self, y_true, *args,
         self.y_true = y_true
         self.num_samples = num_samples
         self.n_jobs = n_jobs
+        self.use_tqdm = use_tqdm
         self._init()
 
     def _init(self):
@@ -140,6 +159,20 @@ def __str__(self):
             output.append(f'{key} = {value:0.3f} ({se[key]:0.3f})')
         return "\n".join(output)
 
+    def __call__(self, y_pred, name=None):
+        """Add predictions"""
+        if name is None:
+            k = len(self.predictions) + 1
+            if k == 0:
+                k = 1
+            name = f'alg-{k}'
+        self.predictions[name] = np.asanyarray(y_pred)
+        samples = self._statistic_samples
+        calls = samples.calls
+        if name in calls:
+            del calls[name]
+        return self
+
     def difference(self, wrt_to: str=None):
         """Compute the difference w.r.t any algorithm by default is the best
 
@@ -285,7 +318,7 @@ def statistic_samples(self):
         algs = set(samples.calls.keys())
         algs = set(self.predictions.keys()) - algs
         if len(algs):
-            for key in progress_bar(algs):
+            for key in progress_bar(algs, use_tqdm=self.use_tqdm):
                 samples(self.y_true, self.predictions[key], name=key)
         return self._statistic_samples
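Taken together, the interface changes let a Perf be built incrementally and run silently. A minimal sketch of the new calling pattern, not part of the commit, assuming the iris split and the fitted models (m, ens, lr) from the docstring example above:

    from CompStats.interface import Perf

    # use_tqdm=False suppresses the progress bar while bootstrapping
    perf = Perf(y_val, m.predict(X_val), use_tqdm=False)
    perf(ens.predict(X_val), name='forest')  # __call__ adds predictions after construction
    perf(lr.predict(X_val))                  # without name=, a default alg-<k> label is generated
    print(perf)  # statistics are computed lazily, and only for the newly added predictions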
CompStats/metrics.py

Lines changed: 127 additions & 5 deletions

@@ -15,22 +15,143 @@
 from sklearn import metrics
 
 
+def accuracy_score(y_true, *args,
+                   normalize=True, sample_weight=None,
+                   num_samples: int=500,
+                   n_jobs: int=-1,
+                   use_tqdm=True,
+                   **kwargs):
+    """:py:class:`~CompStats.interface.Perf` with :py:func:`sklearn.metrics.accuracy_score` as :py:attr:`score_func`.
+
+    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
+    :type y_true: numpy.ndarray or pandas.DataFrame
+    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args`.
+    :type args: numpy.ndarray
+    :param kwargs: Predictions, the algorithms will be identified using the keyword
+    :type kwargs: numpy.ndarray
+    :param num_samples: Number of bootstrap samples, default=500.
+    :type num_samples: int
+    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
+    :type n_jobs: int
+    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True.
+    :type use_tqdm: bool
+    :param normalize: see sklearn.metrics.accuracy_score
+    :param sample_weight: see sklearn.metrics.accuracy_score
+
+    >>> from sklearn.svm import LinearSVC
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> from sklearn.datasets import load_iris
+    >>> from sklearn.model_selection import train_test_split
+    >>> from sklearn.base import clone
+    >>> from CompStats.metrics import accuracy_score
+    >>> X, y = load_iris(return_X_y=True)
+    >>> _ = train_test_split(X, y, test_size=0.3)
+    >>> X_train, X_val, y_train, y_val = _
+    >>> m = LinearSVC().fit(X_train, y_train)
+    >>> hy = m.predict(X_val)
+    >>> ens = RandomForestClassifier().fit(X_train, y_train)
+    >>> score = accuracy_score(y_val, hy,
+                               forest=ens.predict(X_val))
+    >>> score
+    <Perf>
+    Prediction statistics with standard error
+    forest = 0.978 (0.023)
+    alg-1 = 0.956 (0.030)
+    >>> diff = score.difference()
+    >>> diff
+    <Difference>
+    difference p-values w.r.t forest
+    alg-1 0.252
+    """
+
+    def inner(y, hy):
+        return metrics.accuracy_score(y, hy,
+                                      normalize=normalize,
+                                      sample_weight=sample_weight)
+    return Perf(y_true, *args, score_func=inner,
+                num_samples=num_samples, n_jobs=n_jobs,
+                use_tqdm=use_tqdm,
+                **kwargs)
+
+
+def balanced_accuracy_score(y_true, *args,
+                            sample_weight=None, adjusted=False,
+                            num_samples: int=500,
+                            n_jobs: int=-1,
+                            use_tqdm=True,
+                            **kwargs):
+    """:py:class:`~CompStats.interface.Perf` with :py:func:`sklearn.metrics.balanced_accuracy_score` as :py:attr:`score_func`.
+
+    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
+    :type y_true: numpy.ndarray or pandas.DataFrame
+    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args`.
+    :type args: numpy.ndarray
+    :param kwargs: Predictions, the algorithms will be identified using the keyword
+    :type kwargs: numpy.ndarray
+    :param num_samples: Number of bootstrap samples, default=500.
+    :type num_samples: int
+    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
+    :type n_jobs: int
+    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True.
+    :type use_tqdm: bool
+    :param sample_weight: see sklearn.metrics.balanced_accuracy_score
+    :param adjusted: see sklearn.metrics.balanced_accuracy_score
+
+    >>> from sklearn.svm import LinearSVC
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> from sklearn.datasets import load_iris
+    >>> from sklearn.model_selection import train_test_split
+    >>> from sklearn.base import clone
+    >>> from CompStats.metrics import balanced_accuracy_score
+    >>> X, y = load_iris(return_X_y=True)
+    >>> _ = train_test_split(X, y, test_size=0.3)
+    >>> X_train, X_val, y_train, y_val = _
+    >>> m = LinearSVC().fit(X_train, y_train)
+    >>> hy = m.predict(X_val)
+    >>> ens = RandomForestClassifier().fit(X_train, y_train)
+    >>> score = balanced_accuracy_score(y_val, hy,
+                                        forest=ens.predict(X_val))
+    >>> score
+    <Perf>
+    Prediction statistics with standard error
+    forest = 0.957 (0.031)
+    alg-1 = 0.935 (0.037)
+    >>> diff = score.difference()
+    >>> diff
+    <Difference>
+    difference p-values w.r.t forest
+    alg-1 0.254
+    """
+
+    def inner(y, hy):
+        return metrics.balanced_accuracy_score(y, hy,
+                                               adjusted=adjusted,
+                                               sample_weight=sample_weight)
+    return Perf(y_true, *args, score_func=inner,
+                num_samples=num_samples, n_jobs=n_jobs,
+                use_tqdm=use_tqdm,
+                **kwargs)
+
+
 def f1_score(y_true, *args, labels=None, pos_label=1,
              average='binary', sample_weight=None,
             zero_division='warn', num_samples: int=500,
-             n_jobs: int=-1, **kwargs):
-    """:py:class:`~CompStats.interface.Perf` with :py:func:`~sklearn.metrics.f1_score` as :py:attr:`score_func.`
+             n_jobs: int=-1, use_tqdm=True,
+             **kwargs):
+    """:py:class:`~CompStats.interface.Perf` with :py:func:`sklearn.metrics.f1_score` as :py:attr:`score_func`.
 
     :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
     :type y_true: numpy.ndarray or pandas.DataFrame
     :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
     :type args: numpy.ndarray
     :param kwargs: Predictions, the algorithms will be identified using the keyword
     :type args: numpy.ndarray
-    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
-    :type n_jobs: int
     :param num_samples: Number of bootstrap samples, default=500.
     :type num_samples: int
+    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
+    :type n_jobs: int
+    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True.
+    :type use_tqdm: bool
     :param labels: see sklearn.metrics.f1_score
     :param pos_label: see sklearn.metrics.f1_score
     :param average: see sklearn.metrics.f1_score
@@ -72,4 +193,5 @@ def inner(y):
                       zero_division=zero_division)
     return Perf(y_true, *args, score_func=inner,
                 num_samples=num_samples, n_jobs=n_jobs,
-                **kwargs)
+                use_tqdm=use_tqdm,
+                **kwargs)
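
Both new wrappers follow the same template: the metric's own keyword arguments are captured in a closure (inner), while the predictions and the bootstrap settings are forwarded to Perf. A sketch of how a further metric could be wrapped in the same style; this recall_score wrapper is hypothetical and not part of the commit:

    from sklearn import metrics
    from CompStats.interface import Perf


    def recall_score(y_true, *args, average='binary',
                     num_samples: int=500, n_jobs: int=-1,
                     use_tqdm=True, **kwargs):
        """Hypothetical Perf wrapper for sklearn.metrics.recall_score."""

        def inner(y, hy):
            # close over the metric's keyword arguments
            return metrics.recall_score(y, hy, average=average)

        # predictions and bootstrap settings go straight to Perf
        return Perf(y_true, *args, score_func=inner,
                    num_samples=num_samples, n_jobs=n_jobs,
                    use_tqdm=use_tqdm, **kwargs)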

CompStats/tests/test_interface.py

Lines changed: 21 additions & 1 deletion

@@ -130,4 +130,24 @@ def test_Perf_dataframe():
 
     df = pd.read_csv(DATA)
     perf = Perf(df, num_samples=50)
-    assert 'INGEOTEC' in perf.statistic()
+    assert 'INGEOTEC' in perf.statistic()
+
+
+def test_Perf_call():
+    """Test Perf call"""
+    from CompStats.interface import Perf
+
+    X, y = load_iris(return_X_y=True)
+    _ = train_test_split(X, y, test_size=0.3)
+    X_train, X_val, y_train, y_val = _
+    m = LinearSVC().fit(X_train, y_train)
+    hy = m.predict(X_val)
+    ens = RandomForestClassifier().fit(X_train, y_train)
+    hy2 = ens.predict(X_val)
+    perf = Perf(y_val, num_samples=50)
+    for xx in [hy, hy2]:
+        _ = perf(xx)
+        print(_)
+    perf(hy, name='alg-2')
+    assert 'alg-2' not in perf._statistic_samples.calls
+    assert 'alg-1' in perf._statistic_samples.calls

CompStats/tests/test_metrics.py

Lines changed: 27 additions & 1 deletion

@@ -27,4 +27,30 @@ def test_f1_score():
     ens = RandomForestClassifier().fit(X_train, y_train)
     perf = f1_score(y_val, forest=ens.predict(X_val),
                     num_samples=50, average='macro')
-    assert 'forest' in perf.statistic()
+    assert 'forest' in perf.statistic()
+
+
+def test_accuracy_score():
+    """Test accuracy_score"""
+    from CompStats.metrics import accuracy_score
+
+    X, y = load_iris(return_X_y=True)
+    _ = train_test_split(X, y, test_size=0.3)
+    X_train, X_val, y_train, y_val = _
+    ens = RandomForestClassifier().fit(X_train, y_train)
+    perf = accuracy_score(y_val, forest=ens.predict(X_val),
+                          num_samples=50)
+    assert 'forest' in perf.statistic()
+
+
+def test_balanced_accuracy_score():
+    """Test balanced_accuracy_score"""
+    from CompStats.metrics import balanced_accuracy_score
+
+    X, y = load_iris(return_X_y=True)
+    _ = train_test_split(X, y, test_size=0.3)
+    X_train, X_val, y_train, y_val = _
+    ens = RandomForestClassifier().fit(X_train, y_train)
+    perf = balanced_accuracy_score(y_val, forest=ens.predict(X_val),
+                                   num_samples=50)
+    assert 'forest' in perf.statistic()

CompStats/tests/test_performance.py

Lines changed: 2 additions & 2 deletions

@@ -73,7 +73,7 @@ def test_performance_multiple_metrics():
         {"func": mean_absolute_error, 'BiB': False}
     ]
     perf = performance_multiple_metrics(df, "y", metrics)
-    ins = plot_performance_multiple(perf)
+    plot_performance_multiple(perf)
     assert 'accuracy_score' in perf['samples']
     assert 'y' not in perf['samples']['accuracy_score']
     assert 'INGEOTEC' in perf['samples']['accuracy_score']
@@ -90,7 +90,7 @@ def test_difference_multiple():
     ]
     perf = performance_multiple_metrics(df, "y", metrics)
     diff = difference_multiple(perf)
-    ins = plot_difference_multiple(diff)
+    plot_difference_multiple(diff)
     assert diff['winner']['accuracy_score']['best'] == 'BoW'
     assert 'BoW' not in diff['winner']['accuracy_score']['diff'].keys()
     # assert isinstance(ins, sns.FacetGrid)

CompStats/utils.py

Lines changed: 4 additions & 4 deletions

@@ -18,8 +18,8 @@
 USE_TQDM = False
 
 
-def progress_bar(arg, **kwargs):
+def progress_bar(arg, use_tqdm: bool=True, **kwargs):
     """Progress bar using tqdm"""
-    if USE_TQDM:
-        return tqdm(arg, **kwargs)
-    return arg
+    if not USE_TQDM or not use_tqdm:
+        return arg
+    return tqdm(arg, **kwargs)
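
The rewritten progress_bar is effectively an AND gate: the bar appears only when both the module-level USE_TQDM flag and the new per-call use_tqdm argument are truthy, which is how Perf can switch the bar off per instance without touching the global. A small sketch of the behaviour, assuming the module is importable as CompStats.utils:

    from CompStats import utils

    utils.USE_TQDM = True
    it = utils.progress_bar(range(10))                  # tqdm bar: both switches on
    it = utils.progress_bar(range(10), use_tqdm=False)  # plain iterable: per-call switch wins
    utils.USE_TQDM = False
    it = utils.progress_bar(range(10))                  # plain iterable: global switch off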
