
Commit 2571f06

Merge pull request #15 from INGEOTEC/docs
Docs
2 parents: b81f50e + c7fb5bc

File tree

10 files changed (+636, -92 lines)


CompStats/interface.py

Lines changed: 4 additions & 4 deletions
@@ -31,8 +31,8 @@ class Perf(object):
     :type score_func: Function where the first argument is :math:`y` and the second is :math:`\\hat{y}.`
     :param error_func: Function to measure the performance where the best algorithm has the lowest value.
     :type error_func: Function where the first argument is :math:`y` and the second is :math:`\\hat{y}.`
-    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
-    :type args: numpy.ndarray
+    :param y_pred: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
+    :type y_pred: numpy.ndarray
     :param kwargs: Predictions, the algorithms will be identified using the keyword
     :type kwargs: numpy.ndarray
     :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
@@ -85,7 +85,7 @@ class Perf(object):
     Log. Reg. = 0.044 (0.030)

     """
-    def __init__(self, y_true, *args,
+    def __init__(self, y_true, *y_pred,
                  score_func=balanced_accuracy_score,
                  error_func=None,
                  num_samples: int=500,
@@ -96,7 +96,7 @@ def __init__(self, y_true, *args,
         self.score_func = score_func
         self.error_func = error_func
         algs = {}
-        for k, v in enumerate(args):
+        for k, v in enumerate(y_pred):
             algs[f'alg-{k+1}'] = np.asanyarray(v)
         algs.update(**kwargs)
         self.predictions = algs
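
For context, a minimal sketch of how the renamed constructor is called; the toy labels and predictions below are illustrative and not part of this commit. Positional predictions are registered as alg-1, alg-2, ..., while keyword arguments name the algorithm explicitly (per `algs.update(**kwargs)` above).

>>> import numpy as np
>>> from CompStats.interface import Perf
>>> y_true = np.array([0, 1, 1, 0, 1])      # gold labels (toy data)
>>> svm_hy = np.array([0, 1, 0, 0, 1])      # positional prediction, stored as 'alg-1'
>>> forest_hy = np.array([0, 1, 1, 0, 0])   # keyword prediction, stored as 'forest'
>>> perf = Perf(y_true, svm_hy, forest=forest_hy, num_samples=500, n_jobs=-1)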

CompStats/metrics.py

Lines changed: 15 additions & 62 deletions
@@ -11,33 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from CompStats.interface import Perf
 from sklearn import metrics
+from CompStats.interface import Perf
+from CompStats.utils import perf_docs


-def accuracy_score(y_true, *args,
+@perf_docs
+def accuracy_score(y_true, *y_pred,
                    normalize=True, sample_weight=None,
                    num_samples: int=500,
                    n_jobs: int=-1,
                    use_tqdm=True,
                    **kwargs):
-    """:py:class:`~CompStats.interface.Perf` with :py:func:`sklearn.metrics.accuracy_score` as :py:attr:`score_func.`
-
-    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
-    :type y_true: numpy.ndarray or pandas.DataFrame
-    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
-    :type args: numpy.ndarray
-    :param kwargs: Predictions, the algorithms will be identified using the keyword
-    :type args: numpy.ndarray
-    :param num_samples: Number of bootstrap samples, default=500.
-    :type num_samples: int
-    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
-    :type n_jobs: int
-    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True
-    :type use_tqdm: bool
-    :param normalize: see sklearn.metrics.f1_score
-    :param sample_weight: see sklearn.metrics.f1_score
-
+    """
     >>> from sklearn.svm import LinearSVC
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> from sklearn.datasets import load_iris
@@ -61,42 +47,27 @@ def accuracy_score(y_true, *args,
     >>> diff
     <Difference>
     difference p-values w.r.t forest
-    alg-1 0.252
+    alg-1 0.252
     """

     def inner(y, hy):
         return metrics.accuracy_score(y, hy,
                                       normalize=normalize,
                                       sample_weight=sample_weight)
-    return Perf(y_true, *args, score_func=inner,
+    return Perf(y_true, *y_pred, score_func=inner,
                 num_samples=num_samples, n_jobs=n_jobs,
                 use_tqdm=use_tqdm,
                 **kwargs)


-def balanced_accuracy_score(y_true, *args,
+@perf_docs
+def balanced_accuracy_score(y_true, *y_pred,
                             sample_weight=None, adjusted=False,
                             num_samples: int=500,
                             n_jobs: int=-1,
                             use_tqdm=True,
                             **kwargs):
-    """:py:class:`~CompStats.interface.Perf` with :py:func:`sklearn.metrics.balanced_accuracy_score` as :py:attr:`score_func.`
-
-    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
-    :type y_true: numpy.ndarray or pandas.DataFrame
-    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
-    :type args: numpy.ndarray
-    :param kwargs: Predictions, the algorithms will be identified using the keyword
-    :type args: numpy.ndarray
-    :param num_samples: Number of bootstrap samples, default=500.
-    :type num_samples: int
-    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
-    :type n_jobs: int
-    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True
-    :type use_tqdm: bool
-    :param sample_weight: see sklearn.metrics.f1_score
-    :param adjusted: see sklearn.metrics.f1_score
-
+    """
     >>> from sklearn.svm import LinearSVC
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> from sklearn.datasets import load_iris
@@ -127,37 +98,19 @@ def inner(y, hy):
         return metrics.balanced_accuracy_score(y, hy,
                                                adjusted=adjusted,
                                                sample_weight=sample_weight)
-    return Perf(y_true, *args, score_func=inner,
+    return Perf(y_true, *y_pred, score_func=inner,
                 num_samples=num_samples, n_jobs=n_jobs,
                 use_tqdm=use_tqdm,
                 **kwargs)


-def f1_score(y_true, *args, labels=None, pos_label=1,
+@perf_docs
+def f1_score(y_true, *y_pred, labels=None, pos_label=1,
              average='binary', sample_weight=None,
              zero_division='warn', num_samples: int=500,
              n_jobs: int=-1, use_tqdm=True,
              **kwargs):
-    """:py:class:`~CompStats.interface.Perf` with :py:func:`sklearn.metrics.f1_score` as :py:attr:`score_func.`
-
-    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
-    :type y_true: numpy.ndarray or pandas.DataFrame
-    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
-    :type args: numpy.ndarray
-    :param kwargs: Predictions, the algorithms will be identified using the keyword
-    :type args: numpy.ndarray
-    :param num_samples: Number of bootstrap samples, default=500.
-    :type num_samples: int
-    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
-    :type n_jobs: int
-    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True
-    :type use_tqdm: bool
-    :param labels: see sklearn.metrics.f1_score
-    :param pos_label: see sklearn.metrics.f1_score
-    :param average: see sklearn.metrics.f1_score
-    :param sample_weight: see sklearn.metrics.f1_score
-    :param zero_division: see sklearn.metrics.f1_score
-
+    """
     >>> from sklearn.svm import LinearSVC
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> from sklearn.datasets import load_iris
@@ -191,7 +144,7 @@ def inner(y, hy):
                                  average=average,
                                  sample_weight=sample_weight,
                                  zero_division=zero_division)
-    return Perf(y_true, *args, score_func=inner,
+    return Perf(y_true, *y_pred, score_func=inner,
                 num_samples=num_samples, n_jobs=n_jobs,
                 use_tqdm=use_tqdm,
                 **kwargs)
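
A minimal usage sketch of the refactored wrappers (the dataset split is illustrative and the bootstrap output is omitted): each function keeps the `sklearn.metrics` call signature but returns a `Perf` instance instead of a float.

>>> from sklearn.svm import LinearSVC
>>> from sklearn.datasets import load_iris
>>> from sklearn.model_selection import train_test_split
>>> from CompStats.metrics import accuracy_score
>>> X, y = load_iris(return_X_y=True)
>>> X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)
>>> hy = LinearSVC().fit(X_train, y_train).predict(X_val)
>>> perf = accuracy_score(y_val, hy)   # Perf instance; its repr reports the score with its standard error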

CompStats/performance.py

Lines changed: 29 additions & 20 deletions
@@ -32,8 +32,7 @@ def performance(data: pd.DataFrame,
                 n_jobs: int=-1,
                 BiB: bool=True,
                 statistic_samples: StatisticSamples=None) -> StatisticSamples:
-    """
-    Calculate bootstrap samples of a performance score for a given dataset.
+    """Calculate bootstrap samples of a performance score for a given dataset.

     Parameters:
     data (pd.DataFrame): Input dataset.
@@ -48,6 +47,7 @@ def performance(data: pd.DataFrame,
     StatisticSamples: Object containing the bootstrap samples of the performance score.

     Example usage:
+
     >>> from sklearn.metrics import accuracy_score
     >>> import pandas as pd
     >>> from CompStats import performance
@@ -86,6 +86,7 @@ def difference(statistic_samples: StatisticSamples): #, best_index: int=-1):
     6. Returns a new StatisticSamples instance with the computed differences and the name of the best performing algorithm.

     Example usage:
+
     >>> from CompStats import performance, difference
     >>> from CompStats.tests.test_performance import DATA
     >>> from sklearn.metrics import f1_score
@@ -129,6 +130,7 @@ def all_differences(statistic_samples: StatisticSamples):
     5. Returns a new StatisticSamples instance with the computed differences.

     Example usage:
+
     >>> from CompStats import performance, all_differences
     >>> from CompStats.tests.test_performance import DATA
     >>> from sklearn.metrics import f1_score
@@ -163,30 +165,34 @@ def plot_performance(statistic_samples: StatisticSamples, CI: float=0.05,
                      var_name='Algorithm', value_name='Score',
                      capsize=0.2, linestyle='none', kind='point',
                      sharex=False, **kwargs):
-    """
-    Plots the performance of algorithms with confidence intervals.
-
-    Parameters:
-    statistic_samples (StatisticSamples or pd.DataFrame): An instance of StatisticSamples containing the performance data,
-    or a DataFrame in long format.
-    CI (float): Confidence interval level (default is 0.05).
-    var_name (str): Variable name for algorithms (default is 'Algorithm').
-    value_name (str): Variable name for scores (default is 'Score').
-    capsize (float): Size of the caps on error bars (default is 0.2).
-    linestyle (str): Line style for the plot (default is 'none').
-    kind (str): Type of plot (default is 'point').
-    sharex (bool): Whether to share the x-axis among subplots (default is False).
-    **kwargs: Additional keyword arguments passed to seaborn's catplot function.
-
-    Returns:
-    sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot.
+    """Plots the performance of algorithms with confidence intervals.
+
+    :param statistic_samples: An instance of StatisticSamples containing the performance data, or a DataFrame in long format.
+    :type statistic_samples: StatisticSamples or pd.DataFrame
+    :param CI: Confidence interval level (default is 0.05).
+    :type CI: float
+    :param var_name: Variable name for algorithms (default is 'Algorithm').
+    :type var_name: str
+    :param value_name: Variable name for scores (default is 'Score').
+    :type value_name: str
+    :param capsize: Size of the caps on error bars (default is 0.2).
+    :type capsize: float
+    :param linestyle: Line style for the plot (default is 'none').
+    :type linestyle: str
+    :param kind: Type of plot (default is 'point').
+    :type kind: str
+    :param sharex: Whether to share the x-axis among subplots (default is False).
+    :type sharex: bool
+    :param kwargs: Additional keyword arguments passed to seaborn's catplot function.
+
+    :returns: A seaborn FacetGrid object containing the plot.
+    :rtype: sns.axisgrid.FacetGrid

     The function works as follows:
     1. If statistic_samples is an instance of StatisticSamples, it extracts and sorts the performance data.
     2. Converts the data into a long format DataFrame.
     3. Computes the confidence intervals if CI is provided as a float.
     4. Plots the performance data with confidence intervals using seaborn's catplot.
-

     >>> from CompStats import performance, plot_performance
     >>> from CompStats.tests.test_performance import DATA
@@ -316,6 +322,7 @@ def performance_multiple_metrics(data: pd.DataFrame, gold: str,
     6. Compiles the results into a dictionary and returns it.

     Example usage:
+
     >>> from sklearn.metrics import accuracy_score, f1_score
     >>> import pandas as pd
     >>> from CompStats import performance_multiple_metrics
@@ -433,6 +440,7 @@ def difference_multiple(results_dict, CI: float=0.05,):
     7. Returns a dictionary with these calculated differences and additional information.

     Example usage:
+
     >>> from CompStats import performance, difference_multiple
     >>> from CompStats.tests.test_performance import DATA
     >>> from sklearn.metrics import f1_score
@@ -542,6 +550,7 @@ def plot_performance_multiple(results_dict: dict, CI: float = 0.05, capsize: flo
     3. Sets the title of each plot to the metric name and the best performing algorithm.

     Example usage:
+
     >>> from CompStats import plot_performance_multiple
     >>> results = {
     >>> 'accuracy': {
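
The docstring examples above are truncated by the hunk context; a sketch of the documented pattern follows, assuming the score function is passed through a `score` keyword argument (that part of the signature is not visible in these hunks, so treat it as an assumption).

>>> from sklearn.metrics import f1_score
>>> from CompStats import performance, plot_performance
>>> from CompStats.tests.test_performance import DATA
>>> score = lambda y, hy: f1_score(y, hy, average='weighted')
>>> perf = performance(DATA, score=score)      # 'score' keyword assumed; see the full docstring
>>> grid = plot_performance(perf, CI=0.05)     # seaborn FacetGrid, per the new :returns: entry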

CompStats/utils.py

Lines changed: 30 additions & 0 deletions
@@ -23,3 +23,33 @@ def progress_bar(arg, use_tqdm: bool=True, **kwargs):
     if not USE_TQDM or not use_tqdm:
         return arg
     return tqdm(arg, **kwargs)
+
+
+from functools import wraps
+
+
+def perf_docs(func):
+    """Decorator to Perf with any write :py:class:`~sklearn.metrics` documentation
+    """
+
+    func.__doc__ = f""":py:class:`~CompStats.interface.Perf` with :py:func:`~sklearn.metrics.{func.__name__}` as :py:attr:`score_func.` The parameters not described can be found in :py:func:`~sklearn.metrics.{func.__name__}`.
+
+    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
+    :type y_true: numpy.ndarray or pandas.DataFrame
+    :param y_pred: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`y_pred.`
+    :type y_pred: numpy.ndarray
+    :param kwargs: Predictions, the algorithms will be identified using the keyword
+    :type kwargs: numpy.ndarray
+    :param num_samples: Number of bootstrap samples, default=500.
+    :type num_samples: int
+    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
+    :type n_jobs: int
+    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True
+    :type use_tqdm: bool
+    """ + func.__doc__
+
+    @wraps(func)
+    def inner(*args, **kwargs):
+        return func(*args, **kwargs)
+
+    return inner
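
To illustrate the new decorator, a hypothetical toy wrapper follows (not part of this commit); `perf_docs` rewrites `func.__doc__` by prepending the shared parameter block before the wrapper's own doctest.

>>> from CompStats.utils import perf_docs
>>> @perf_docs
... def f1_score(y_true, *y_pred, **kwargs):
...     """
...     >>> # metric-specific doctest lines go here
...     """
>>> f1_score.__doc__.startswith(':py:class:')
True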

README.rst

Lines changed: 67 additions & 2 deletions
@@ -23,6 +23,71 @@ CompStats
    :target: https://compstats.readthedocs.io/en/latest/?badge=latest

 .. image:: https://colab.research.google.com/assets/colab-badge.svg
-    :target: https://colab.research.google.com/github/INGEOTEC/CompStats/blob/docs/docs/CompStats.ipynb
+    :target: https://colab.research.google.com/github/INGEOTEC/CompStats/blob/docs/docs/CompStats_metrics.ipynb

-Collaborative competitions have gained popularity in the scientific and technological fields. These competitions involve defining tasks, selecting evaluation scores, and devising result verification methods. In the standard scenario, participants receive a training set and are expected to provide a solution for a held-out dataset kept by organizers. An essential challenge for organizers arises when comparing algorithms' performance, assessing multiple participants, and ranking them. Statistical tools are often used for this purpose; however, traditional statistical methods often fail to capture decisive differences between systems' performance. CompStats implements an evaluation methodology for statistically analyzing competition results and competition. CompStats offers several advantages, including off-the-shell comparisons with correction mechanisms and the inclusion of confidence intervals.
+Collaborative competitions have gained popularity in the scientific and technological fields. These competitions involve defining tasks, selecting evaluation scores, and devising result verification methods. In the standard scenario, participants receive a training set and are expected to provide a solution for a held-out dataset kept by organizers. An essential challenge for organizers arises when comparing algorithms' performance, assessing multiple participants, and ranking them. Statistical tools are often used for this purpose; however, traditional statistical methods often fail to capture decisive differences between systems' performance. CompStats implements an evaluation methodology for statistically analyzing competition results. CompStats offers several advantages, including off-the-shelf comparisons with correction mechanisms and the inclusion of confidence intervals.
+
+To illustrate the use of `CompStats`, the following snippets show an example. The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), three different classifiers, and the last line is the score used to measure the performance and compare the algorithms.
+
+>>> from sklearn.svm import LinearSVC
+>>> from sklearn.naive_bayes import GaussianNB
+>>> from sklearn.ensemble import RandomForestClassifier
+>>> from sklearn.datasets import load_digits
+>>> from sklearn.model_selection import train_test_split
+>>> from sklearn.base import clone
+>>> from CompStats.metrics import f1_score
+
+The first step is to load the digits problem and split the dataset into training and validation sets. The second step is to estimate the parameters of a linear Support Vector Machine and predict the validation set's classes. The predictions are stored in the variable `hy`.
+
+>>> X, y = load_digits(return_X_y=True)
+>>> _ = train_test_split(X, y, test_size=0.3)
+>>> X_train, X_val, y_train, y_val = _
+>>> m = LinearSVC().fit(X_train, y_train)
+>>> hy = m.predict(X_val)
+
+Once the predictions are available, it is time to measure the algorithm's performance, as seen in the following code. It is essential to note that the API used in `sklearn.metrics` is followed; the difference is that the function returns an instance with different methods that can be used to estimate different performance statistics and compare algorithms.
+
+>>> score = f1_score(y_val, hy, average='macro')
+>>> score
+<Perf>
+Prediction statistics with standard error
+alg-1 = 0.936 (0.010)
+
+The previous code shows the macro-f1 score and, in parentheses, its standard error. The actual performance value is stored in the `statistic` function.
+
+>>> score.statistic()
+{'alg-1': 0.9355476018466147}
+
+Continuing with the example, let us assume that one wants to test another classifier on the same problem, in this case, a random forest, as can be seen in the following two lines. The second line predicts the validation set and adds it to the analysis.
+
+>>> ens = RandomForestClassifier().fit(X_train, y_train)
+>>> score(ens.predict(X_val), name='Random Forest')
+<Perf>
+Prediction statistics with standard error
+Random Forest = 0.970 (0.008)
+alg-1 = 0.936 (0.010)
+
+Let us incorporate another prediction, now with the Naive Bayes classifier, as seen below.
+
+>>> nb = GaussianNB().fit(X_train, y_train)
+>>> score(nb.predict(X_val), name='Naive Bayes')
+<Perf>
+Prediction statistics with standard error
+Random Forest = 0.970 (0.008)
+alg-1 = 0.936 (0.010)
+Naive Bayes = 0.821 (0.016)
+
+The final step is to compare the performance of the three classifiers, which can be done with the `difference` method, as seen next.
+
+>>> diff = score.difference()
+>>> diff
+<Difference>
+difference p-values w.r.t Random Forest
+alg-1          0.0
+Naive Bayes    0.0
+
+The class `Difference` has the `plot` method that can be used to depict the difference with respect to the best.
+
+>>> diff.plot()
+
+.. image:: https://github.com/INGEOTEC/CompStats/raw/docs/docs/source/digits_difference.png
