
Commit 2571f06

Merge pull request #15 from INGEOTEC/docs
Docs
2 parents: b81f50e + c7fb5bc

File tree

10 files changed (+636, -92 lines)


CompStats/interface.py

Lines changed: 4 additions & 4 deletions
@@ -31,8 +31,8 @@ class Perf(object):
     :type score_func: Function where the first argument is :math:`y` and the second is :math:`\\hat{y}.`
     :param error_func: Function to measure the performance where the best algorithm has the lowest value.
     :type error_func: Function where the first argument is :math:`y` and the second is :math:`\\hat{y}.`
-    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
-    :type args: numpy.ndarray
+    :param y_pred: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
+    :type y_pred: numpy.ndarray
     :param kwargs: Predictions, the algorithms will be identified using the keyword
     :type kwargs: numpy.ndarray
     :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
@@ -85,7 +85,7 @@ class Perf(object):
     Log. Reg. = 0.044 (0.030)

     """
-    def __init__(self, y_true, *args,
+    def __init__(self, y_true, *y_pred,
                  score_func=balanced_accuracy_score,
                  error_func=None,
                  num_samples: int=500,
@@ -96,7 +96,7 @@ def __init__(self, y_true, *args,
         self.score_func = score_func
         self.error_func = error_func
         algs = {}
-        for k, v in enumerate(args):
+        for k, v in enumerate(y_pred):
             algs[f'alg-{k+1}'] = np.asanyarray(v)
         algs.update(**kwargs)
         self.predictions = algs
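
For context, a minimal sketch of how the renamed constructor is called; the toy labels and predictions below are illustrative and not part of this commit. Positional predictions are registered as alg-1, alg-2, ..., while keyword arguments name the algorithm explicitly (per `algs.update(**kwargs)` above).

>>> import numpy as np
>>> from CompStats.interface import Perf
>>> y_true = np.array([0, 1, 1, 0, 1])      # gold labels (toy data)
>>> svm_hy = np.array([0, 1, 0, 0, 1])      # positional prediction, stored as 'alg-1'
>>> forest_hy = np.array([0, 1, 1, 0, 0])   # keyword prediction, stored as 'forest'
>>> perf = Perf(y_true, svm_hy, forest=forest_hy, num_samples=500, n_jobs=-1)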

CompStats/metrics.py

Lines changed: 15 additions & 62 deletions
@@ -11,33 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from CompStats.interface import Perf
 from sklearn import metrics
+from CompStats.interface import Perf
+from CompStats.utils import perf_docs


-def accuracy_score(y_true, *args,
+@perf_docs
+def accuracy_score(y_true, *y_pred,
                    normalize=True, sample_weight=None,
                    num_samples: int=500,
                    n_jobs: int=-1,
                    use_tqdm=True,
                    **kwargs):
-    """:py:class:`~CompStats.interface.Perf` with :py:func:`sklearn.metrics.accuracy_score` as :py:attr:`score_func.`
-
-    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
-    :type y_true: numpy.ndarray or pandas.DataFrame
-    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
-    :type args: numpy.ndarray
-    :param kwargs: Predictions, the algorithms will be identified using the keyword
-    :type args: numpy.ndarray
-    :param num_samples: Number of bootstrap samples, default=500.
-    :type num_samples: int
-    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
-    :type n_jobs: int
-    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True
-    :type use_tqdm: bool
-    :param normalize: see sklearn.metrics.f1_score
-    :param sample_weight: see sklearn.metrics.f1_score
-
+    """
     >>> from sklearn.svm import LinearSVC
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> from sklearn.datasets import load_iris
@@ -61,42 +47,27 @@ def accuracy_score(y_true, *args,
     >>> diff
     <Difference>
     difference p-values w.r.t forest
-    alg-1 0.252
+    alg-1 0.252
     """

     def inner(y, hy):
         return metrics.accuracy_score(y, hy,
                                       normalize=normalize,
                                       sample_weight=sample_weight)
-    return Perf(y_true, *args, score_func=inner,
+    return Perf(y_true, *y_pred, score_func=inner,
                 num_samples=num_samples, n_jobs=n_jobs,
                 use_tqdm=use_tqdm,
                 **kwargs)


-def balanced_accuracy_score(y_true, *args,
+@perf_docs
+def balanced_accuracy_score(y_true, *y_pred,
                             sample_weight=None, adjusted=False,
                             num_samples: int=500,
                             n_jobs: int=-1,
                             use_tqdm=True,
                             **kwargs):
-    """:py:class:`~CompStats.interface.Perf` with :py:func:`sklearn.metrics.balanced_accuracy_score` as :py:attr:`score_func.`
-
-    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
-    :type y_true: numpy.ndarray or pandas.DataFrame
-    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
-    :type args: numpy.ndarray
-    :param kwargs: Predictions, the algorithms will be identified using the keyword
-    :type args: numpy.ndarray
-    :param num_samples: Number of bootstrap samples, default=500.
-    :type num_samples: int
-    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
-    :type n_jobs: int
-    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True
-    :type use_tqdm: bool
-    :param sample_weight: see sklearn.metrics.f1_score
-    :param adjusted: see sklearn.metrics.f1_score
-
+    """
     >>> from sklearn.svm import LinearSVC
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> from sklearn.datasets import load_iris
@@ -127,37 +98,19 @@ def inner(y, hy):
         return metrics.balanced_accuracy_score(y, hy,
                                                adjusted=adjusted,
                                                sample_weight=sample_weight)
-    return Perf(y_true, *args, score_func=inner,
+    return Perf(y_true, *y_pred, score_func=inner,
                 num_samples=num_samples, n_jobs=n_jobs,
                 use_tqdm=use_tqdm,
                 **kwargs)


-def f1_score(y_true, *args, labels=None, pos_label=1,
+@perf_docs
+def f1_score(y_true, *y_pred, labels=None, pos_label=1,
              average='binary', sample_weight=None,
              zero_division='warn', num_samples: int=500,
              n_jobs: int=-1, use_tqdm=True,
              **kwargs):
-    """:py:class:`~CompStats.interface.Perf` with :py:func:`sklearn.metrics.f1_score` as :py:attr:`score_func.`
-
-    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
-    :type y_true: numpy.ndarray or pandas.DataFrame
-    :param args: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
-    :type args: numpy.ndarray
-    :param kwargs: Predictions, the algorithms will be identified using the keyword
-    :type args: numpy.ndarray
-    :param num_samples: Number of bootstrap samples, default=500.
-    :type num_samples: int
-    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
-    :type n_jobs: int
-    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True
-    :type use_tqdm: bool
-    :param labels: see sklearn.metrics.f1_score
-    :param pos_label: see sklearn.metrics.f1_score
-    :param average: see sklearn.metrics.f1_score
-    :param sample_weight: see sklearn.metrics.f1_score
-    :param zero_division: see sklearn.metrics.f1_score
-
+    """
     >>> from sklearn.svm import LinearSVC
     >>> from sklearn.ensemble import RandomForestClassifier
     >>> from sklearn.datasets import load_iris
@@ -191,7 +144,7 @@ def inner(y, hy):
                                  average=average,
                                  sample_weight=sample_weight,
                                  zero_division=zero_division)
-    return Perf(y_true, *args, score_func=inner,
+    return Perf(y_true, *y_pred, score_func=inner,
                 num_samples=num_samples, n_jobs=n_jobs,
                 use_tqdm=use_tqdm,
                 **kwargs)
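
A minimal usage sketch of the refactored wrappers (the dataset split is illustrative and the bootstrap output is omitted): each function keeps the `sklearn.metrics` call signature but returns a `Perf` instance instead of a float.

>>> from sklearn.svm import LinearSVC
>>> from sklearn.datasets import load_iris
>>> from sklearn.model_selection import train_test_split
>>> from CompStats.metrics import accuracy_score
>>> X, y = load_iris(return_X_y=True)
>>> X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)
>>> hy = LinearSVC().fit(X_train, y_train).predict(X_val)
>>> perf = accuracy_score(y_val, hy)   # Perf instance; its repr reports the score with its standard error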

CompStats/performance.py

Lines changed: 29 additions & 20 deletions
@@ -32,8 +32,7 @@ def performance(data: pd.DataFrame,
                 n_jobs: int=-1,
                 BiB: bool=True,
                 statistic_samples: StatisticSamples=None) -> StatisticSamples:
-    """
-    Calculate bootstrap samples of a performance score for a given dataset.
+    """Calculate bootstrap samples of a performance score for a given dataset.

     Parameters:
     data (pd.DataFrame): Input dataset.
@@ -48,6 +47,7 @@ def performance(data: pd.DataFrame,
     StatisticSamples: Object containing the bootstrap samples of the performance score.

     Example usage:
+
     >>> from sklearn.metrics import accuracy_score
     >>> import pandas as pd
     >>> from CompStats import performance
@@ -86,6 +86,7 @@ def difference(statistic_samples: StatisticSamples): #, best_index: int=-1):
     6. Returns a new StatisticSamples instance with the computed differences and the name of the best performing algorithm.

     Example usage:
+
     >>> from CompStats import performance, difference
     >>> from CompStats.tests.test_performance import DATA
     >>> from sklearn.metrics import f1_score
@@ -129,6 +130,7 @@ def all_differences(statistic_samples: StatisticSamples):
     5. Returns a new StatisticSamples instance with the computed differences.

     Example usage:
+
     >>> from CompStats import performance, all_differences
     >>> from CompStats.tests.test_performance import DATA
     >>> from sklearn.metrics import f1_score
@@ -163,30 +165,34 @@ def plot_performance(statistic_samples: StatisticSamples, CI: float=0.05,
                      var_name='Algorithm', value_name='Score',
                      capsize=0.2, linestyle='none', kind='point',
                      sharex=False, **kwargs):
-    """
-    Plots the performance of algorithms with confidence intervals.
-
-    Parameters:
-    statistic_samples (StatisticSamples or pd.DataFrame): An instance of StatisticSamples containing the performance data,
-    or a DataFrame in long format.
-    CI (float): Confidence interval level (default is 0.05).
-    var_name (str): Variable name for algorithms (default is 'Algorithm').
-    value_name (str): Variable name for scores (default is 'Score').
-    capsize (float): Size of the caps on error bars (default is 0.2).
-    linestyle (str): Line style for the plot (default is 'none').
-    kind (str): Type of plot (default is 'point').
-    sharex (bool): Whether to share the x-axis among subplots (default is False).
-    **kwargs: Additional keyword arguments passed to seaborn's catplot function.
-
-    Returns:
-    sns.axisgrid.FacetGrid: A seaborn FacetGrid object containing the plot.
+    """Plots the performance of algorithms with confidence intervals.
+
+    :param statistic_samples: An instance of StatisticSamples containing the performance data, or a DataFrame in long format.
+    :type statistic_samples: StatisticSamples or pd.DataFrame
+    :param CI: Confidence interval level (default is 0.05).
+    :type CI: float
+    :param var_name: Variable name for algorithms (default is 'Algorithm').
+    :type var_name: str
+    :param value_name: Variable name for scores (default is 'Score').
+    :type value_name: str
+    :param capsize: Size of the caps on error bars (default is 0.2).
+    :type capsize: float
+    :param linestyle: Line style for the plot (default is 'none').
+    :type linestyle: str
+    :param kind: Type of plot (default is 'point').
+    :type kind: str
+    :param sharex: Whether to share the x-axis among subplots (default is False).
+    :type sharex: bool
+    :param kwargs: Additional keyword arguments passed to seaborn's catplot function.
+
+    :returns: A seaborn FacetGrid object containing the plot.
+    :rtype: sns.axisgrid.FacetGrid

     The function works as follows:
     1. If statistic_samples is an instance of StatisticSamples, it extracts and sorts the performance data.
     2. Converts the data into a long format DataFrame.
     3. Computes the confidence intervals if CI is provided as a float.
     4. Plots the performance data with confidence intervals using seaborn's catplot.
-

     >>> from CompStats import performance, plot_performance
     >>> from CompStats.tests.test_performance import DATA
@@ -316,6 +322,7 @@ def performance_multiple_metrics(data: pd.DataFrame, gold: str,
     6. Compiles the results into a dictionary and returns it.

     Example usage:
+
     >>> from sklearn.metrics import accuracy_score, f1_score
     >>> import pandas as pd
     >>> from CompStats import performance_multiple_metrics
@@ -433,6 +440,7 @@ def difference_multiple(results_dict, CI: float=0.05,):
     7. Returns a dictionary with these calculated differences and additional information.

     Example usage:
+
     >>> from CompStats import performance, difference_multiple
     >>> from CompStats.tests.test_performance import DATA
     >>> from sklearn.metrics import f1_score
@@ -542,6 +550,7 @@ def plot_performance_multiple(results_dict: dict, CI: float = 0.05, capsize: flo
     3. Sets the title of each plot to the metric name and the best performing algorithm.

     Example usage:
+
     >>> from CompStats import plot_performance_multiple
     >>> results = {
     >>> 'accuracy': {
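
The docstring examples above are truncated by the hunk context; a sketch of the documented pattern follows, assuming the score function is passed through a `score` keyword argument (that part of the signature is not visible in these hunks, so treat it as an assumption).

>>> from sklearn.metrics import f1_score
>>> from CompStats import performance, plot_performance
>>> from CompStats.tests.test_performance import DATA
>>> score = lambda y, hy: f1_score(y, hy, average='weighted')
>>> perf = performance(DATA, score=score)      # 'score' keyword assumed; see the full docstring
>>> grid = plot_performance(perf, CI=0.05)     # seaborn FacetGrid, per the new :returns: entry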

CompStats/utils.py

Lines changed: 30 additions & 0 deletions
@@ -23,3 +23,33 @@ def progress_bar(arg, use_tqdm: bool=True, **kwargs):
     if not USE_TQDM or not use_tqdm:
         return arg
     return tqdm(arg, **kwargs)
+
+
+from functools import wraps
+
+
+def perf_docs(func):
+    """Decorator to Perf with any write :py:class:`~sklearn.metrics` documentation
+    """
+
+    func.__doc__ = f""":py:class:`~CompStats.interface.Perf` with :py:func:`~sklearn.metrics.{func.__name__}` as :py:attr:`score_func.` The parameters not described can be found in :py:func:`~sklearn.metrics.{func.__name__}`.
+
+    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
+    :type y_true: numpy.ndarray or pandas.DataFrame
+    :param y_pred: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`y_pred.`
+    :type y_pred: numpy.ndarray
+    :param kwargs: Predictions, the algorithms will be identified using the keyword
+    :type kwargs: numpy.ndarray
+    :param num_samples: Number of bootstrap samples, default=500.
+    :type num_samples: int
+    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
+    :type n_jobs: int
+    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True
+    :type use_tqdm: bool
+    """ + func.__doc__
+
+    @wraps(func)
+    def inner(*args, **kwargs):
+        return func(*args, **kwargs)
+
+    return inner
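
To illustrate the new decorator, a hypothetical toy wrapper follows (not part of this commit); `perf_docs` rewrites `func.__doc__` by prepending the shared parameter block before the wrapper's own doctest.

>>> from CompStats.utils import perf_docs
>>> @perf_docs
... def f1_score(y_true, *y_pred, **kwargs):
...     """
...     >>> # metric-specific doctest lines go here
...     """
>>> f1_score.__doc__.startswith(':py:class:')
True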

README.rst

Lines changed: 67 additions & 2 deletions
@@ -23,6 +23,71 @@ CompStats
    :target: https://compstats.readthedocs.io/en/latest/?badge=latest

 .. image:: https://colab.research.google.com/assets/colab-badge.svg
-    :target: https://colab.research.google.com/github/INGEOTEC/CompStats/blob/docs/docs/CompStats.ipynb
+    :target: https://colab.research.google.com/github/INGEOTEC/CompStats/blob/docs/docs/CompStats_metrics.ipynb

-Collaborative competitions have gained popularity in the scientific and technological fields. These competitions involve defining tasks, selecting evaluation scores, and devising result verification methods. In the standard scenario, participants receive a training set and are expected to provide a solution for a held-out dataset kept by organizers. An essential challenge for organizers arises when comparing algorithms' performance, assessing multiple participants, and ranking them. Statistical tools are often used for this purpose; however, traditional statistical methods often fail to capture decisive differences between systems' performance. CompStats implements an evaluation methodology for statistically analyzing competition results and competition. CompStats offers several advantages, including off-the-shell comparisons with correction mechanisms and the inclusion of confidence intervals.
+Collaborative competitions have gained popularity in the scientific and technological fields. These competitions involve defining tasks, selecting evaluation scores, and devising result verification methods. In the standard scenario, participants receive a training set and are expected to provide a solution for a held-out dataset kept by organizers. An essential challenge for organizers arises when comparing algorithms' performance, assessing multiple participants, and ranking them. Statistical tools are often used for this purpose; however, traditional statistical methods often fail to capture decisive differences between systems' performance. CompStats implements an evaluation methodology for statistically analyzing competition results. CompStats offers several advantages, including off-the-shelf comparisons with correction mechanisms and the inclusion of confidence intervals.
+
+To illustrate the use of `CompStats`, the following snippets show an example. The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), three different classifiers, and the last line is the score used to measure the performance and compare the algorithms.
+
+>>> from sklearn.svm import LinearSVC
+>>> from sklearn.naive_bayes import GaussianNB
+>>> from sklearn.ensemble import RandomForestClassifier
+>>> from sklearn.datasets import load_digits
+>>> from sklearn.model_selection import train_test_split
+>>> from sklearn.base import clone
+>>> from CompStats.metrics import f1_score
+
+The first step is to load the digits problem and split the dataset into training and validation sets. The second step is to estimate the parameters of a linear Support Vector Machine and predict the validation set's classes. The predictions are stored in the variable `hy`.
+
+>>> X, y = load_digits(return_X_y=True)
+>>> _ = train_test_split(X, y, test_size=0.3)
+>>> X_train, X_val, y_train, y_val = _
+>>> m = LinearSVC().fit(X_train, y_train)
+>>> hy = m.predict(X_val)
+
+Once the predictions are available, it is time to measure the algorithm's performance, as seen in the following code. It is essential to note that the API used in `sklearn.metrics` is followed; the difference is that the function returns an instance with different methods that can be used to estimate different performance statistics and compare algorithms.
+
+>>> score = f1_score(y_val, hy, average='macro')
+>>> score
+<Perf>
+Prediction statistics with standard error
+alg-1 = 0.936 (0.010)
+
+The previous code shows the macro-f1 score and, in parentheses, its standard error. The actual performance value is stored in the `statistic` function.
+
+>>> score.statistic()
+{'alg-1': 0.9355476018466147}
+
+Continuing with the example, let us assume that one wants to test another classifier on the same problem, in this case, a random forest, as can be seen in the following two lines. The second line predicts the validation set and adds it to the analysis.
+
+>>> ens = RandomForestClassifier().fit(X_train, y_train)
+>>> score(ens.predict(X_val), name='Random Forest')
+<Perf>
+Prediction statistics with standard error
+Random Forest = 0.970 (0.008)
+alg-1 = 0.936 (0.010)
+
+Let us incorporate another prediction, now with the Naive Bayes classifier, as seen below.
+
+>>> nb = GaussianNB().fit(X_train, y_train)
+>>> score(nb.predict(X_val), name='Naive Bayes')
+<Perf>
+Prediction statistics with standard error
+Random Forest = 0.970 (0.008)
+alg-1 = 0.936 (0.010)
+Naive Bayes = 0.821 (0.016)
+
+The final step is to compare the performance of the three classifiers, which can be done with the `difference` method, as seen next.
+
+>>> diff = score.difference()
+>>> diff
+<Difference>
+difference p-values w.r.t Random Forest
+alg-1          0.0
+Naive Bayes    0.0
+
+The class `Difference` has the `plot` method that can be used to depict the difference with respect to the best.
+
+>>> diff.plot()
+
+.. image:: https://github.com/INGEOTEC/CompStats/raw/docs/docs/source/digits_difference.png
