Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Adjusted rand scores

scikit-learn and back-of-the-envelope calculations

import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, datasets, cluster
from scipy.special import binom
import pandas as pd
def calcadjrand(df, ct):
    """Print the adjusted Rand index computed by scikit-learn and by hand.

    The hand computation uses the contingency-table form of the index:
    sums of "n choose 2" over the table cells, the row totals and the
    column totals, corrected by the value expected under chance.

    Parameters
    ----------
    df : object exposing ``y`` and ``p``
        The two labelings, passed as ``df.y`` and ``df.p`` to
        ``metrics.adjusted_rand_score``.
    ct : 2-D numpy array
        Contingency table with margins: ``ct[:-1, :-1]`` holds the cell
        counts, ``ct[:-1, -1]`` the row totals, ``ct[-1, :-1]`` the column
        totals and ``ct[-1, -1]`` the grand total.

    Returns
    -------
    None
        Both scores are printed, nothing is returned.
    """
    # Number of sample pairs that fall together in a cell / row / column.
    sum_nij = binom(ct[:-1, :-1], 2).sum()
    sum_ai = binom(ct[:-1, -1], 2).sum()
    sum_bj = binom(ct[-1, :-1], 2).sum()
    n2 = binom(ct[-1, -1], 2)

    if n2 == 0:
        # Fewer than two samples: there are no pairs to disagree on.
        ARI = 1.0
    else:
        expected = sum_ai * sum_bj / n2
        numerator = sum_nij - expected
        denominator = (sum_ai + sum_bj) / 2 - expected
        # The denominator vanishes only when both labelings are trivial
        # (one single cluster each, or every sample in its own cluster).
        # Those partitions are identical, so report a perfect score rather
        # than the nan that 0 / 0 would give.
        ARI = numerator / denominator if denominator != 0 else 1.0

    print('Adjusted rand score calculated using scikit-learn')
    print(metrics.adjusted_rand_score(df.y, df.p))
    print('\nAdjusted rand score calculated semi-automatically')
    print(ARI)

Example #1

# Example 1 data: the predicted labels reproduce the true labels exactly.
y = [0, 0, 1, 1]
p = [0, 0, 1, 1]
df = pd.DataFrame({'y': y, 'p': p})
# Contingency table with margins, kept as a plain array for the calculation.
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
# Same table again as the cell's displayed value.
pd.crosstab(df['y'], df['p'], margins=True)
Loading...
calcadjrand(df, ct)
# Hand calculation: one "n choose 2" term per entry of the 2x2 table
# (row by row), then per row total, then per column total.
sum_nij = sum(binom(n, 2) for n in (2, 0,
                                    0, 2))
sum_ai = sum(binom(n, 2) for n in (2, 2))
sum_bj = sum(binom(n, 2) for n in (2, 2))
# 4 samples in total, hence binom(4, 2) pairs.
numerator = sum_nij - sum_ai * sum_bj / binom(4, 2)
denominator = (sum_ai + sum_bj) / 2 - sum_ai * sum_bj / binom(4, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI,
      sep='\n')
Adjusted rand score calculated using scikit-learn
1.0

Adjusted rand score calculated semi-automatically
1.0

Adjusted rand score calculated manually back-of-the-envelop
1.0

Example #2

# Example 2 data: each true class is split evenly across both predicted labels.
y = [0, 0, 1, 1]
p = [0, 1, 0, 1]
df = pd.DataFrame({'y': y, 'p': p})
# Contingency table with margins, kept as a plain array for the calculation.
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
# Same table again as the cell's displayed value.
pd.crosstab(df['y'], df['p'], margins=True)
Loading...
calcadjrand(df, ct)
# Hand calculation: one "n choose 2" term per entry of the 2x2 table
# (row by row), then per row total, then per column total.
sum_nij = sum(binom(n, 2) for n in (1, 1,
                                    1, 1))
sum_ai = sum(binom(n, 2) for n in (2, 2))
sum_bj = sum(binom(n, 2) for n in (2, 2))
# 4 samples in total, hence binom(4, 2) pairs.
numerator = sum_nij - sum_ai * sum_bj / binom(4, 2)
denominator = (sum_ai + sum_bj) / 2 - sum_ai * sum_bj / binom(4, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI,
      sep='\n')
Adjusted rand score calculated using scikit-learn
-0.49999999999999994

Adjusted rand score calculated semi-automatically
-0.49999999999999994

Adjusted rand score calculated manually back-of-the-envelop
-0.49999999999999994

Example #3

# Example 3 data: three true classes against three predicted labels.
y = [0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
p = [0, 1, 0, 1, 1, 2, 2, 2, 2, 2]
df = pd.DataFrame({'y': y, 'p': p})
# Contingency table with margins, kept as a plain array for the calculation.
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
# Same table again as the cell's displayed value.
pd.crosstab(df['y'], df['p'], margins=True)
Loading...
calcadjrand(df, ct)
# Hand calculation: one "n choose 2" term per entry of the 3x3 table
# (laid out row by row), then per row total, then per column total.
sum_nij = sum(binom(n, 2) for n in (1, 1, 0,
                                    1, 2, 1,
                                    0, 0, 4))
sum_ai = sum(binom(n, 2) for n in (2, 4, 4))
sum_bj = sum(binom(n, 2) for n in (2, 3, 5))
# 10 samples in total, hence binom(10, 2) pairs.
numerator = sum_nij - sum_ai * sum_bj / binom(10, 2)
denominator = (sum_ai + sum_bj) / 2 - sum_ai * sum_bj / binom(10, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI,
      sep='\n')
Adjusted rand score calculated using scikit-learn
0.31257344300822565

Adjusted rand score calculated semi-automatically
0.31257344300822565

Adjusted rand score calculated manually back-of-the-envelop
0.31257344300822565

Example #4

# Example 4 data: labels from make_circles against a 2-cluster
# MiniBatchKMeans fit. The global NumPy seed is fixed first so the cell is
# repeatable.
np.random.seed(0)
n_samples = 1500
X, y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use `int` directly.
p = cluster.MiniBatchKMeans(n_clusters=2).fit(X).labels_.astype(int)
df = pd.DataFrame(list(zip(y, p)), columns=['y', 'p'])
# Contingency table with margins, kept as a plain array for the calculation.
ct = pd.crosstab(df.y, df.p, margins=True).values
pd.crosstab(df.y, df.p, margins=True)
Loading...
calcadjrand(df, ct)
# Hand calculation. NOTE(review): the counts below are typed in by hand,
# presumably copied from the crosstab of this example -- they must be
# updated if the clustering result changes.
sum_nij = sum(binom(n, 2) for n in (380, 370,
                                    383, 367))
sum_ai = sum(binom(n, 2) for n in (750, 750))
sum_bj = sum(binom(n, 2) for n in (763, 737))
# 1500 samples in total, hence binom(1500, 2) pairs.
numerator = sum_nij - sum_ai * sum_bj / binom(1500, 2)
denominator = (sum_ai + sum_bj) / 2 - sum_ai * sum_bj / binom(1500, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI,
      sep='\n')
Adjusted rand score calculated using scikit-learn
-0.0006513453670644906

Adjusted rand score calculated semi-automatically
-0.0006513453670644906

Adjusted rand score calculated manually back-of-the-envelop
-0.0006513453670644906

Example #5

# Example 5 data: four true classes against four predicted labels.
y = [0, 3, 3, 1, 1, 3, 3, 2, 2, 2]
p = [3, 3, 0, 1, 1, 2, 3, 3, 2, 2]
df = pd.DataFrame({'y': y, 'p': p})
# Contingency table with margins, kept as a plain array for the calculation.
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
# Same table again as the cell's displayed value.
pd.crosstab(df['y'], df['p'], margins=True)
Loading...
calcadjrand(df, ct)
# Hand calculation: one "n choose 2" term per entry of the 4x4 table
# (laid out row by row), then per row total, then per column total.
sum_nij = sum(binom(n, 2) for n in (0, 0, 0, 1,
                                    0, 2, 0, 0,
                                    0, 0, 2, 1,
                                    1, 0, 1, 2))
sum_ai = sum(binom(n, 2) for n in (1, 2, 3, 4))
sum_bj = sum(binom(n, 2) for n in (1, 2, 3, 4))
# 10 samples in total, hence binom(10, 2) pairs.
numerator = sum_nij - sum_ai * sum_bj / binom(10, 2)
denominator = (sum_ai + sum_bj) / 2 - sum_ai * sum_bj / binom(10, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI,
      sep='\n')
Adjusted rand score calculated using scikit-learn
0.09999999999999999

Adjusted rand score calculated semi-automatically
0.09999999999999999

Adjusted rand score calculated manually back-of-the-envelop
0.09999999999999999

Example #6

# Example 6 data: true classes of size 6, 15 and 24 (45 samples in total).
y = [0] * 6 + [1] * 15 + [2] * 24
p = [
    0, 1, 1, 2, 2, 2,                                # paired with y == 0
    0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,     # paired with y == 1
    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,     # paired with y == 2
    2, 2, 2, 2, 2, 2, 2, 2, 2,
]
df = pd.DataFrame({'y': y, 'p': p})
# Contingency table with margins, kept as a plain array for the calculation.
ct = pd.crosstab(df['y'], df['p'], margins=True).to_numpy()
# Same table again as the cell's displayed value.
pd.crosstab(df['y'], df['p'], margins=True)
Loading...
calcadjrand(df, ct)
# Hand calculation: one "n choose 2" term per entry of the 3x3 table
# (laid out row by row), then per row total, then per column total.
sum_nij = sum(binom(n, 2) for n in (1, 2, 3,
                                    4, 5, 6,
                                    7, 8, 9))
sum_ai = sum(binom(n, 2) for n in (6, 15, 24))
sum_bj = sum(binom(n, 2) for n in (12, 15, 18))
# 45 samples in total, hence binom(45, 2) pairs.
numerator = sum_nij - (sum_ai * sum_bj) / binom(45, 2)
denominator = (sum_ai + sum_bj) / 2 - (sum_ai * sum_bj) / binom(45, 2)
ARI = numerator / denominator
print('\nAdjusted rand score calculated manually back-of-the-envelop', ARI,
      sep='\n')
Adjusted rand score calculated using scikit-learn
-0.041666666666666644

Adjusted rand score calculated semi-automatically
-0.041666666666666644

Adjusted rand score calculated manually back-of-the-envelop
-0.041666666666666644