# Adjusted Rand scores
# scikit-learn and back-of-the-envelope calculations
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, datasets, cluster
from scipy.special import binom
import pandas as pd


def calcadjrand(df, ct):
    """Print the adjusted Rand index (ARI) from scikit-learn and from a contingency table.

    The table-based value uses the pair-counting formula::

        ARI = (sum_ij C(n_ij, 2) - E) / ((sum_i C(a_i, 2) + sum_j C(b_j, 2)) / 2 - E)
        E   = sum_i C(a_i, 2) * sum_j C(b_j, 2) / C(n, 2)

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose the columns ``y`` and ``p``; both are handed to
        ``metrics.adjusted_rand_score``.
    ct : numpy.ndarray
        Two-dimensional contingency table *including margins*, e.g.
        ``pd.crosstab(df.y, df.p, margins=True).values``.  The last column is
        read as the row totals ``a_i``, the last row as the column totals
        ``b_j`` and the bottom-right cell as the grand total ``n``.

    Returns
    -------
    None
        Both scores are printed; nothing is returned.
    """
    # binom is a ufunc, so each sum of "n choose 2" terms is one vectorized call.
    sum_nij = binom(ct[:-1, :-1], 2).sum()
    sum_ai = binom(ct[:-1, -1], 2).sum()
    sum_bj = binom(ct[-1, :-1], 2).sum()
    n2 = binom(ct[-1, -1], 2)

    # With fewer than two samples there are no pairs at all (n2 == 0); the
    # expected-agreement term is then 0 rather than 0/0.
    expected = sum_ai * sum_bj / n2 if n2 else 0.0
    numerator = sum_nij - expected
    denominator = (sum_ai + sum_bj) / 2 - expected

    # The denominator vanishes only when both labelings are a single cluster or
    # both consist solely of singletons.  The two partitions are then identical
    # and scikit-learn reports 1.0, so do the same instead of printing nan.
    ARI = numerator / denominator if denominator else 1.0

    print('Adjusted rand score calculated using scikit-learn')
    print(metrics.adjusted_rand_score(df.y, df.p))
    print('\nAdjusted rand score calculated semi-automatically')
    print(ARI)


# Example #1
# perfect match: the predicted labels reproduce the true labels exactly
y = [0, 0, 1, 1]
p = [0, 0, 1, 1]
df = pd.DataFrame({'y': y, 'p': p})
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
pd.crosstab(df['y'], df['p'], margins=True)  # notebook cell output; the export shows only "Loading..."

calcadjrand(df, ct)

# The same score worked out by hand from the contingency table.
sum_nij = sum(binom(n, 2) for n in (2, 0,
                                    0, 2))    # inner cells, row by row
sum_ai = sum(binom(n, 2) for n in (2, 2))     # row totals
sum_bj = sum(binom(n, 2) for n in (2, 2))     # column totals
numerator = sum_nij - sum_ai * sum_bj / binom(4, 2)
denominator = 0.5 * (sum_ai + sum_bj) - sum_ai * sum_bj / binom(4, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI, sep='\n')

# Recorded output:
# Adjusted rand score calculated using scikit-learn
# 1.0
# Adjusted rand score calculated semi-automatically
# 1.0
# Adjusted rand score calculated manually back-of-the-envelop
# 1.0
# Example #2
# half match: every true class is split evenly across the two predicted labels
y = [0, 0, 1, 1]
p = [0, 1, 0, 1]
df = pd.DataFrame({'y': y, 'p': p})
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
pd.crosstab(df['y'], df['p'], margins=True)  # notebook cell output; the export shows only "Loading..."

calcadjrand(df, ct)

# The same score worked out by hand from the contingency table.
sum_nij = sum(binom(n, 2) for n in (1, 1,
                                    1, 1))    # inner cells, row by row
sum_ai = sum(binom(n, 2) for n in (2, 2))     # row totals
sum_bj = sum(binom(n, 2) for n in (2, 2))     # column totals
numerator = sum_nij - sum_ai * sum_bj / binom(4, 2)
denominator = 0.5 * (sum_ai + sum_bj) - sum_ai * sum_bj / binom(4, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI, sep='\n')

# Recorded output:
# Adjusted rand score calculated using scikit-learn
# -0.49999999999999994
# Adjusted rand score calculated semi-automatically
# -0.49999999999999994
# Adjusted rand score calculated manually back-of-the-envelop
# -0.49999999999999994
# Example #3
# three true classes against three predicted labels
y = [0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
p = [0, 1, 0, 1, 1, 2, 2, 2, 2, 2]
df = pd.DataFrame({'y': y, 'p': p})
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
pd.crosstab(df['y'], df['p'], margins=True)  # notebook cell output; the export shows only "Loading..."

calcadjrand(df, ct)

# The same score worked out by hand from the contingency table.
sum_nij = sum(binom(n, 2) for n in (1, 1, 0,
                                    1, 2, 1,
                                    0, 0, 4))   # inner cells, row by row
sum_ai = sum(binom(n, 2) for n in (2, 4, 4))    # row totals
sum_bj = sum(binom(n, 2) for n in (2, 3, 5))    # column totals
numerator = sum_nij - sum_ai * sum_bj / binom(10, 2)
denominator = 0.5 * (sum_ai + sum_bj) - sum_ai * sum_bj / binom(10, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI, sep='\n')

# Recorded output:
# Adjusted rand score calculated using scikit-learn
# 0.31257344300822565
# Adjusted rand score calculated semi-automatically
# 0.31257344300822565
# Adjusted rand score calculated manually back-of-the-envelop
# 0.31257344300822565
# Example #4
# k-means on two concentric circles: the clusters cut across the true classes,
# so the adjusted Rand score ends up close to zero.
np.random.seed(0)
n_samples = 1500
X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the builtin directly.
p = cluster.MiniBatchKMeans(n_clusters=2).fit(X).labels_.astype(int)
df = pd.DataFrame(list(zip(y, p)), columns=['y', 'p'])
ct = pd.crosstab(df.y, df.p, margins=True).values
pd.crosstab(df.y, df.p, margins=True)  # notebook cell output; the export shows only "Loading..."

calcadjrand(df, ct)

# The same score worked out by hand.
# NOTE(review): the counts below are hard-coded, presumably copied from the
# contingency table of the recorded run; a different scikit-learn/NumPy version
# may cluster differently, in which case they must be updated from `ct`.
sum_nij = binom(380, 2) + binom(370, 2) + binom(383, 2) + binom(367, 2)
sum_ai = binom(750, 2) + binom(750, 2)
sum_bj = binom(763, 2) + binom(737, 2)
numerator = sum_nij - (sum_ai * sum_bj) / binom(1500, 2)
denominator = (sum_ai + sum_bj) / 2 - (sum_ai * sum_bj) / binom(1500, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop')
print(ARI)

# Recorded output:
# Adjusted rand score calculated using scikit-learn
# -0.0006513453670644906
# Adjusted rand score calculated semi-automatically
# -0.0006513453670644906
# Adjusted rand score calculated manually back-of-the-envelop
# -0.0006513453670644906
# Example #5
# four true classes against four predicted labels
y = [0, 3, 3, 1, 1, 3, 3, 2, 2, 2]
p = [3, 3, 0, 1, 1, 2, 3, 3, 2, 2]
df = pd.DataFrame({'y': y, 'p': p})
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
pd.crosstab(df['y'], df['p'], margins=True)  # notebook cell output; the export shows only "Loading..."

calcadjrand(df, ct)

# The same score worked out by hand from the contingency table.
sum_nij = sum(binom(n, 2) for n in (0, 0, 0, 1,
                                    0, 2, 0, 0,
                                    0, 0, 2, 1,
                                    1, 0, 1, 2))   # inner cells, row by row
sum_ai = sum(binom(n, 2) for n in (1, 2, 3, 4))    # row totals
sum_bj = sum(binom(n, 2) for n in (1, 2, 3, 4))    # column totals
numerator = sum_nij - sum_ai * sum_bj / binom(10, 2)
denominator = 0.5 * (sum_ai + sum_bj) - sum_ai * sum_bj / binom(10, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI, sep='\n')

# Recorded output:
# Adjusted rand score calculated using scikit-learn
# 0.09999999999999999
# Adjusted rand score calculated semi-automatically
# 0.09999999999999999
# Adjusted rand score calculated manually back-of-the-envelop
# 0.09999999999999999
# Example #6
# three true classes of sizes 6, 15 and 24
y = [0] * 6 + [1] * 15 + [2] * 24
p = [
    0, 1, 1, 2, 2, 2,                               # predictions for class 0
    0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,    # predictions for class 1
    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,    # predictions for class 2
    2, 2, 2, 2, 2, 2, 2, 2, 2,
]
df = pd.DataFrame({'y': y, 'p': p})
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
pd.crosstab(df['y'], df['p'], margins=True)  # notebook cell output; the export shows only "Loading..."

calcadjrand(df, ct)

# The same score worked out by hand from the contingency table.
sum_nij = sum(binom(n, 2) for n in (1, 2, 3,
                                    4, 5, 6,
                                    7, 8, 9))      # inner cells, row by row
sum_ai = sum(binom(n, 2) for n in (6, 15, 24))     # row totals
sum_bj = sum(binom(n, 2) for n in (12, 15, 18))    # column totals
numerator = sum_nij - (sum_ai * sum_bj) / binom(45, 2)
denominator = 0.5 * (sum_ai + sum_bj) - (sum_ai * sum_bj) / binom(45, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI, sep='\n')

# Recorded output:
# Adjusted rand score calculated using scikit-learn
# -0.041666666666666644
# Adjusted rand score calculated semi-automatically
# -0.041666666666666644
# Adjusted rand score calculated manually back-of-the-envelop
# -0.041666666666666644