
Commit ad6eb81

Merge pull request #25 from INGEOTEC/develop
Version - 0.1.11
2 parents 05f948d + 2bf538b commit ad6eb81

8 files changed: +231 -144 lines

CompStats/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = '0.1.10'
+__version__ = '0.1.11'
 from CompStats.bootstrap import StatisticSamples
 from CompStats.measurements import CI, SE, difference_p_value
 from CompStats.performance import performance, difference, all_differences, plot_performance, plot_difference

CompStats/interface.py

Lines changed: 61 additions & 9 deletions
@@ -20,7 +20,6 @@
 from CompStats.utils import progress_bar
 from CompStats import measurements
 from CompStats.measurements import SE
-from CompStats.performance import plot_performance, plot_difference
 from CompStats.utils import dataframe
 
 
@@ -248,7 +247,7 @@ def best(self):
             else:
                 self._best = np.array([key] * value.shape[1])
             return self._best
-        BiB = True if self.statistic_samples.BiB else False
+        BiB = bool(self.statistic_samples.BiB)
         keys = np.array(list(self.statistic.keys()))
         data = np.asanyarray([self.statistic[k]
                               for k in keys])
@@ -338,6 +337,12 @@ def plot(self, value_name:str=None,
              CI:float=0.05,
              kind:str='point', linestyle:str='none',
              col_wrap:int=3, capsize:float=0.2,
+             comparison:bool=True,
+             right:bool=True,
+             comp_legend:str='Comparison',
+             winner_legend:str='Best',
+             tie_legend:str='Equivalent',
+             loser_legend:str='Different',
              **kwargs):
         """plot with seaborn
 
@@ -363,32 +368,79 @@
             value_name = 'Score'
         else:
             value_name = 'Error'
+        if not isinstance(self.statistic, dict):
+            comparison = False
+        best = self.best
+        if isinstance(best, np.ndarray):
+            if best.shape[0] < col_wrap:
+                col_wrap = best.shape[0]
         df = self.dataframe(value_name=value_name, var_name=var_name,
-                            alg_legend=alg_legend, perf_names=perf_names)
+                            alg_legend=alg_legend, perf_names=perf_names,
+                            comparison=comparison, alpha=CI, right=right,
+                            comp_legend=comp_legend,
+                            winner_legend=winner_legend,
+                            tie_legend=tie_legend,
+                            loser_legend=loser_legend)
         if var_name not in df.columns:
             var_name = None
             col_wrap = None
         ci = lambda x: measurements.CI(x, alpha=CI)
+        if comparison:
+            kwargs.update(dict(hue=comp_legend))
         f_grid = sns.catplot(df, x=value_name, errorbar=ci,
                              y=alg_legend, col=var_name,
                              kind=kind, linestyle=linestyle,
                              col_wrap=col_wrap, capsize=capsize, **kwargs)
         return f_grid
 
-
-    def dataframe(self, value_name:str='Score',
+    def dataframe(self, comparison:bool=False,
+                  right:bool=True,
+                  alpha:float=0.05,
+                  value_name:str='Score',
                   var_name:str='Performance',
                   alg_legend:str='Algorithm',
+                  comp_legend:str='Comparison',
+                  winner_legend:str='Best',
+                  tie_legend:str='Equivalent',
+                  loser_legend:str='Different',
                   perf_names:str=None):
         """Dataframe"""
         if perf_names is None and isinstance(self.best, np.ndarray):
             func_name = self.statistic_func.__name__
             perf_names = [f'{func_name}({i})'
                           for i, k in enumerate(self.best)]
-        return dataframe(self, value_name=value_name,
-                         var_name=var_name,
-                         alg_legend=alg_legend,
-                         perf_names=perf_names)
+        df = dataframe(self, value_name=value_name,
+                       var_name=var_name,
+                       alg_legend=alg_legend,
+                       perf_names=perf_names)
+        if not comparison:
+            return df
+        df[comp_legend] = tie_legend
+        diff = self.difference()
+        best = self.best
+        if isinstance(best, str):
+            for name, p in diff.p_value(right=right).items():
+                if p >= alpha:
+                    continue
+                df.loc[df[alg_legend] == name, comp_legend] = loser_legend
+            df.loc[df[alg_legend] == best, comp_legend] = winner_legend
+        else:
+            p_values = diff.p_value(right=right)
+            systems = list(p_values.keys())
+            p_values = np.array([p_values[k] for k in systems])
+            for name, p_value, winner in zip(perf_names,
+                                             p_values.T,
+                                             best):
+                mask = df[var_name] == name
+                for alg, p in zip(systems, p_value):
+                    if p >= alpha and winner != alg:
+                        continue
+                    _ = mask & (df[alg_legend] == alg)
+                    if winner == alg:
+                        df.loc[_, comp_legend] = winner_legend
+                    else:
+                        df.loc[_, comp_legend] = loser_legend
+        return df
 
     @property
     def n_jobs(self):

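The new keyword arguments introduced above can be exercised with the following sketch. It is not part of the commit: the dataset, classifiers, `average`, and `num_samples` choices are illustrative assumptions; only the `Perf.plot` and `Perf.dataframe` keyword names (`comparison`, `right`, `alpha`/`CI`, and the legend labels) come from the diff itself.

# Minimal sketch (not part of the commit) exercising the comparison-aware
# plot()/dataframe() added above; dataset and classifiers are illustrative.
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from CompStats.metrics import f1_score

X, y = load_digits(return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3)
forest = RandomForestClassifier().fit(X_train, y_train)
nb = GaussianNB().fit(X_train, y_train)
score = f1_score(y_val, forest.predict(X_val), average='macro', num_samples=50)
score(nb.predict(X_val), name='Naive Bayes')

# dataframe() now adds a comparison column: each system is labeled
# 'Best', 'Equivalent', or 'Different' based on p-values at the alpha level.
df = score.dataframe(comparison=True, alpha=0.05, right=True)

# plot() forwards the same comparison; with comparison=True the column is
# used as the seaborn hue, so points are colored by the comparison outcome.
grid = score.plot(comparison=True, CI=0.05)
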
CompStats/tests/test_interface.py

Lines changed: 19 additions & 1 deletion
@@ -17,12 +17,30 @@
 from sklearn.svm import LinearSVC
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.naive_bayes import GaussianNB
-from sklearn.datasets import load_iris, load_digits
+from sklearn.datasets import load_iris, load_digits, load_breast_cancer
 from sklearn.model_selection import train_test_split
 import pandas as pd
 from CompStats.tests.test_performance import DATA
 
 
+def test_Perf_plot_col_wrap():
+    """Test plot when 2 classes"""
+    from CompStats.metrics import f1_score
+
+    X, y = load_breast_cancer(return_X_y=True)
+    _ = train_test_split(X, y, test_size=0.3)
+    X_train, X_val, y_train, y_val = _
+    ens = RandomForestClassifier().fit(X_train, y_train)
+    nb = GaussianNB().fit(X_train, y_train)
+    svm = LinearSVC().fit(X_train, y_train)
+    score = f1_score(y_val, ens.predict(X_val),
+                     average=None,
+                     num_samples=50)
+    score(nb.predict(X_val))
+    score(svm.predict(X_val))
+    score.plot()
+
+
 def test_Difference_dataframe():
     """Test Difference dataframe"""
     from CompStats.metrics import f1_score

README.rst

Lines changed: 21 additions & 13 deletions
@@ -27,7 +27,7 @@ CompStats
 
 Collaborative competitions have gained popularity in the scientific and technological fields. These competitions involve defining tasks, selecting evaluation scores, and devising result verification methods. In the standard scenario, participants receive a training set and are expected to provide a solution for a held-out dataset kept by organizers. An essential challenge for organizers arises when comparing algorithms' performance, assessing multiple participants, and ranking them. Statistical tools are often used for this purpose; however, traditional statistical methods often fail to capture decisive differences between systems' performance. CompStats implements an evaluation methodology for statistically analyzing competition results and competition. CompStats offers several advantages, including off-the-shelf comparisons with correction mechanisms and the inclusion of confidence intervals.
 
-To illustrate the use of `CompStats`, the following snippets show an example. The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), three different classifiers, and the last line is the score used to measure the performance and compare the algorithm.
+To illustrate the use of `CompStats`, the following snippets show an example. The instructions load the necessary libraries, including the one to obtain the problem (e.g., digits), four different classifiers, and, in the last line, the score used to measure the performance and compare the algorithms.
 
 >>> from sklearn.svm import LinearSVC
 >>> from sklearn.naive_bayes import GaussianNB
@@ -51,10 +51,10 @@ Once the predictions are available, it is time to measure the algorithm's perfor
 >>> score
 <Perf(score_func=f1_score, statistic=0.9435, se=0.0099)>
 
-The previous code shows the macro-f1 score and, in parenthesis, its standard error. The actual performance value is stored in the `statistic` function.
+The previous code shows the macro-f1 score and, in parentheses, its standard error. The actual performance value and its standard error are stored in the attributes `statistic` and `se`.
 
->>> score.statistic
-0.9434834454375508
+>>> score.statistic, score.se
+(0.9521479775366307, 0.009717884979482313)
 
 Continuing with the example, let us assume that one wants to test another classifier on the same problem, in this case, a random forest, as can be seen in the following two lines. The second line predicts the validation set and sets it to the analysis.
 
@@ -63,28 +63,36 @@ Continuing with the example, let us assume that one wants to test another classi
 <Perf(score_func=f1_score)>
 Statistic with its standard error (se)
 statistic (se)
-0.9655 (0.0077) <= Random Forest
-0.9435 (0.0099) <= alg-1
+0.9720 (0.0076) <= Random Forest
+0.9521 (0.0097) <= alg-1
 
-Let us incorporate another prediction, now with the Naive Bayes classifier, as seen below.
+Let us incorporate two more predictions, now with the Naive Bayes classifier and Histogram Gradient Boosting, as seen below.
 
 >>> nb = GaussianNB().fit(X_train, y_train)
 >>> score(nb.predict(X_val), name='Naive Bayes')
 <Perf(score_func=f1_score)>
 Statistic with its standard error (se)
 statistic (se)
-0.9655 (0.0077) <= Random Forest
-0.9435 (0.0099) <= alg-1
-0.8549 (0.0153) <= Naive Bayes
+0.9759 (0.0068) <= Hist. Grad. Boost. Tree
+0.9720 (0.0076) <= Random Forest
+0.9521 (0.0097) <= alg-1
+0.8266 (0.0159) <= Naive Bayes
 
-The final step is to compare the performance of the three classifiers, which can be done with the `difference` method, as seen next.
+The performance, its confidence interval (5%), and a statistical comparison (5%) between the best-performing system and the rest of the algorithms are depicted in the following figure.
+
+>>> score.plot()
+
+.. image:: https://github.com/INGEOTEC/CompStats/raw/docs/docs/source/digits_perf.png
+
+The final step is to compare the performance of the four classifiers, which can be done with the `difference` method, as seen next.
 
 >>> diff = score.difference()
 >>> diff
 <Difference>
-difference p-values w.r.t Random Forest
+difference p-values w.r.t Hist. Grad. Boost. Tree
 0.0000 <= Naive Bayes
-0.0120 <= alg-1
+0.0100 <= alg-1
+0.3240 <= Random Forest
 
 The class `Difference` has the `plot` method that can be used to depict the difference with respect to the best.
 

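For completeness, the `Difference` object created in the README example also exposes the per-system p-values used by the comparison machinery in the interface.py diff above, alongside the `plot` method mentioned in its closing line. A minimal doctest-style sketch, assuming the `diff` object from the README example (`right=True` is the default taken as an assumption from the diff; output omitted):

>>> p_values = diff.p_value(right=True)  # dict: system name -> p-value
>>> diff.plot()                          # depict differences w.r.t. the best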