Bibliotecas¶

In [94]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import ruptures as rpt
import warnings

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_samples, silhouette_score

#warnings.filterwarnings("ignore")

Funções¶

In [97]:
def gini(x):
    total = 0
    
    for xi in x:
        for xj in x:
            total += np.abs(xi - xj)

    return total / ((len(x) ** 2) * 2 * np.mean(x))
In [99]:
def theil(x):
    u = np.mean(x)
    t = 0

    for xi in x:
        t += np.log(xi/u) * (xi / (u * len(x)))

    return t
In [101]:
def mld(x):  # não tolera dados negativos ou nulos
    u = np.mean(x)
    t = 0

    for xi in x:
        t += np.log(u/xi)

    return t
In [103]:
def changepoint(data, x, y):  # a série tem o "patamar novo" do ponto detectado em diante
    model = "rbf"
    algo = rpt.Pelt(model=model).fit(data[y].values)
    result = algo.predict(pen = 2 * np.log(len(data[x])))

    fig = px.line(data, x=x, y=y)
    fig.add_vline(x=data.iloc[0][x])

    for resul in result:
        fig.add_vline(x=data.iloc[resul-1][x])

    fig.show()
In [105]:
def cluster(data):
    fig = go.Figure(data=go.Scatter(x=data[data['Labels'] == 0]["points"], mode="markers", text=data[data["Labels"] == 0]["driverId"]))

    for label in data["Labels"].unique():
        if label == 0:
            continue

        else:
            fig.add_trace(go.Scatter(x=data[data['Labels'] == label]["points"], mode="markers", text=data[data["Labels"] == label]["driverId"]))

    fig.show()
In [107]:
def clusterc(data):
    fig = go.Figure(data=go.Scatter(x=data[data['Labels'] == 0]["points"], mode="markers", text=data[data["Labels"] == 0]["constructorId"]))

    for label in data["Labels"].unique():
        if label == 0:
            continue

        else:
            fig.add_trace(go.Scatter(x=data[data['Labels'] == label]["points"], mode="markers", text=data[data["Labels"] == label]["constructorId"]))

    fig.show()
In [109]:
def clusterd(data, medida):
    fig = go.Figure(data=go.Scatter(x=data[data['Labels'] == 0][medida], mode="markers", text=data[data["Labels"] == 0]["Ano"]))

    for label in data["Labels"].unique():
        if label == 0:
            continue

        else:
            fig.add_trace(go.Scatter(x=data[data['Labels'] == label][medida], mode="markers", text=data[data["Labels"] == label]["Ano"]))

    fig.show()

Pré-Processamento¶

In [112]:
df = pd.read_csv("D:\\Esportes\\F1\\F1 db\\f1db-csv\\f1db-seasons-driver-standings.csv")
df0 = df[df["points"] != 0]
dfc = pd.read_csv("D:\\Esportes\\F1\\F1 db\\f1db-csv\\f1db-seasons-constructor-standings.csv")
dfc0 = dfc[dfc["points"] != 0]
dfc
Out[112]:
year positionDisplayOrder positionNumber positionText constructorId engineManufacturerId points
0 1958 1 1.0 1 vanwall vanwall 48.0
1 1958 2 2.0 2 ferrari ferrari 40.0
2 1958 3 3.0 3 cooper climax 31.0
3 1958 4 4.0 4 brm brm 18.0
4 1958 5 5.0 5 maserati maserati 6.0
... ... ... ... ... ... ... ...
705 2025 6 6.0 6 aston-martin mercedes 52.0
706 2025 7 7.0 7 kick-sauber ferrari 51.0
707 2025 8 8.0 8 racing-bulls honda-rbpt 45.0
708 2025 9 9.0 9 haas ferrari 35.0
709 2025 10 10.0 10 alpine renault 20.0

710 rows × 7 columns

Clusterização¶

Pilotos (incluindo 0)¶

In [14]:
for ano in df["year"].unique():
    temp = df[df["year"] == ano]
    silhouette_avg = []
    print(ano)
    for n_clusters in range(2, 11):
        clusterer = KMeans(n_clusters = n_clusters)
        cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
        silhouette_avg.append(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    
    melhor = silhouette_avg.index(max(silhouette_avg)) + 2
    clusterer = KMeans(n_clusters = melhor)
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    cluster(temp)
1950
0.8313141495980811
1951
0.7487690497861516
1952
0.804082211580146
1953
0.7181759129434699
1954
0.7856025347535867
1955
0.801459428143493
1956
0.8832655406458562
1957
0.7632246125420362
1958
0.759830185378558
1959
0.688067126766979
1960
0.8037198415838049
1961
0.6448768273015044
1962
0.669737979292082
1963
0.603627345094753
1964
0.7301544876746514
1965
0.7069735566455253
1966
0.6538917407784206
1967
0.7960691555141212
1968
0.7396815998339378
1969
0.6693530042306297
1970
0.7336670067720708
1971
0.7021022622359122
1972
0.7256953091705947
1973
0.7806645271534784
1974
0.7457928483705798
1975
0.6440933410268554
1976
0.7113192260738181
1977
0.7202175301774412
1978
0.7000591106657774
1979
0.7064053577248651
1980
0.8057744023703516
1981
0.7653484431208817
1982
0.7332290936692427
1983
0.7423034480943249
1984
0.5968679972251407
1985
0.5651625825931543
1986
0.8379585314417138
1987
0.7466464409504723
1988
0.832035279350325
1989
0.8209756465962681
1990
0.7444951965359489
1991
0.7589677194292314
1992
0.8114535164613662
1993
0.849233475556196
1994
0.8762410970464458
1995
0.6720820060917254
1996
0.7299817419560448
1997
0.7364856023452371
1998
0.7545217057058412
1999
0.7650531089010569
2000
0.8532849076267064
2001
0.7840485677984427
2002
0.7714220031268767
2003
0.7816415989299511
2004
0.6992532869193855
2005
0.7583685562049832
2006
0.7894409207631783
2007
0.821850120638549
2008
0.7230256128413611
2009
0.685633085815124
2010
0.8033224275679746
2011
0.8410579918293389
2012
0.7695702056350417
2013
0.7163565802373008
2014
0.6941204232706766
2015
0.7729614298618767
2016
0.7945618277873422
2017
0.7685148902496494
2018
0.7913168004226975
2019
0.8111168703843543
2020
0.8104201350581502
2021
0.7216377958849519
2022
0.8025994826695553
2023
0.7621242262273097
2024
0.8051870925014329
2025
0.7949899243449562
In [15]:
for ano in df["year"].unique():
    temp = df[df["year"] == ano]
    silhouette_avg = []
    print(ano)
    for n_clusters in range(2, 11):
        clusterer = GaussianMixture(n_components = n_clusters)
        cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
        silhouette_avg.append(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    
    melhor = silhouette_avg.index(max(silhouette_avg)) + 2
    clusterer = GaussianMixture(n_components = melhor)
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    cluster(temp)
1950
0.8313141495980811
1951
0.7487690497861516
1952
0.804082211580146
1953
0.7115215716881396
1954
0.7464253688834682
1955
0.801459428143493
1956
0.8832655406458562
1957
0.7632246125420362
1958
0.759830185378558
1959
0.6629953922261406
1960
0.8037198415838049
1961
0.7659053953000394
1962
0.669737979292082
1963
0.6848672366550106
1964
0.6935923269914391
1965
0.7086731609457321
1966
0.6424661320505736
1967
0.7960691555141212
1968
0.7218582228418958
1969
0.6693530042306297
1970
0.7111788746316079
1971
0.766546343786521
1972
0.7256953091705947
1973
0.7546452066043982
1974
0.7457928483705798
1975
0.6843630351594416
1976
0.7113192260738181
1977
0.7202175301774412
1978
0.7000591106657774
1979
0.7053222043892818
1980
0.8057744023703516
1981
0.7653484431208817
1982
0.7316234974662237
1983
0.7423034480943249
1984
0.7857862606547609
1985
0.6233503401360543
1986
0.8379585314417138
1987
0.7527923437191689
1988
0.832035279350325
1989
0.8209756465962681
1990
0.7699085263901749
1991
0.685689809601219
1992
0.8114535164613662
1993
0.849233475556196
1994
0.8762410970464458
1995
0.6870111670697203
1996
0.6715484362237036
1997
0.7364856023452371
1998
0.642841951219443
1999
0.7650531089010569
2000
0.8532849076267064
2001
0.7840485677984427
2002
0.7372831615832602
2003
0.7656471332239531
2004
0.6577212768859666
2005
0.7583685562049832
2006
0.7894409207631783
2007
0.821850120638549
2008
0.7051621873956923
2009
0.6921365180267233
2010
0.6590503559917744
2011
0.7486020062502805
2012
0.7695702056350417
2013
0.7163565802373008
2014
0.6700502907941074
2015
0.7729614298618767
2016
0.7945618277873422
2017
0.6816291047536793
2018
0.7913168004226975
2019
0.8111168703843543
2020
0.8104201350581502
2021
0.6612377907896354
2022
0.6705723585254465
2023
0.7621242262273097
2024
0.7697446283893066
2025
0.7949899243449562
In [16]:
for ano in df["year"].unique():
    temp = df[df["year"] == ano]
    print(ano)
    clusterer = Birch()
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    cluster(temp)
1950
0.7623876636280248
1951
0.6722252829315847
1952
0.7644382101126403
1953
0.7051863591354304
1954
0.7245351963523796
1955
0.7005619432558534
1956
0.7759388976060635
1957
0.5713550405955361
1958
0.6021865246847107
1959
0.6348435278839822
1960
0.7179677179893312
1961
0.6754950097781932
1962
0.6562025145791508
1963
0.6619696830330211
1964
0.6962928307397505
1965
0.6410690764934392
1966
0.6231758856737503
1967
0.6910517200955202
1968
0.6653469034152771
1969
0.6693530042306297
1970
0.719265777326842
1971
0.618320306808646
1972
0.5980017316644533
1973
0.6691524256006519
1974
0.6881490100224322
1975
0.6632074416433728
1976
0.6740039020383763
1977
0.6605812895544333
1978
0.6161226622589479
1979
0.6238800171880501
1980
0.7631626414829408
1981
0.7515476102947917
1982
0.686551442415382
1983
0.6162350710742236
1984
0.7485011507388687
1985
0.5973703615732937
1986
0.7709438615918525
1987
0.6807902374196813
1988
0.5810703502315716
1989
0.8195375812151494
1990
0.7311057368658382
1991
0.7499407460683831
1992
0.8114535164613662
1993
0.8037286464108541
1994
0.6686980270994577
1995
0.6870111670697203
1996
0.6698130571724606
1997
0.5777719172765445
1998
0.7545217057058412
1999
0.7650531089010569
2000
0.7986657051702922
2001
0.7840485677984427
2002
0.7650756035067715
2003
0.7656471332239531
2004
0.6743807705317139
2005
0.7366562902100184
2006
0.7894409207631783
2007
0.7351866798683951
2008
0.7063863950436645
2009
0.6812531632829318
2010
0.8154929123407347
2011
0.8126496506751478
2012
0.7178944662592158
2013
0.7163565802373008
2014
0.71259172139879
2015
0.6016269921705187
2016
0.7945618277873422
2017
0.7668942888342545
2018
0.7342062055033767
2019
0.7492468708756187
2020
0.7321709713073892
2021
0.6705370684867674
2022
0.773712263049222
2023
0.7621242262273097
2024
0.7697446283893066
2025
0.7755101044945143

Pilotos (excluindo 0)¶

In [18]:
for ano in df0["year"].unique():
    temp = df0[df0["year"] == ano]
    silhouette_avg = []
    print(ano)
    for n_clusters in range(2, 11):
        clusterer = KMeans(n_clusters = n_clusters)
        cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
        silhouette_avg.append(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    
    melhor = silhouette_avg.index(max(silhouette_avg)) + 2
    clusterer = KMeans(n_clusters = melhor)
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    cluster(temp)
1950
0.8313141495980811
1951
0.7487690497861516
1952
0.804082211580146
1953
0.7115215716881396
1954
0.7933154786406731
1955
0.801459428143493
1956
0.8832655406458562
1957
0.7632246125420362
1958
0.8064024533798089
1959
0.688067126766979
1960
0.735159200649674
1961
0.7601080601626755
1962
0.7016326617301747
1963
0.6619696830330211
1964
0.7301544876746514
1965
0.7069735566455253
1966
0.6327383975391095
1967
0.7960691555141212
1968
0.7396815998339378
1969
0.6693530042306297
1970
0.7336670067720708
1971
0.766546343786521
1972
0.696618383697969
1973
0.7806645271534784
1974
0.7457928483705798
1975
0.6843630351594416
1976
0.6863429691174074
1977
0.7202175301774412
1978
0.7000591106657774
1979
0.6940142796807226
1980
0.8057744023703516
1981
0.7653484431208817
1982
0.7332290936692427
1983
0.7423034480943249
1984
0.7485011507388687
1985
0.630393709383325
1986
0.8379585314417138
1987
0.7466464409504723
1988
0.832035279350325
1989
0.8209756465962681
1990
0.7699085263901749
1991
0.7589677194292314
1992
0.8114535164613662
1993
0.849233475556196
1994
0.8762410970464458
1995
0.6720820060917254
1996
0.7299817419560448
1997
0.6104148911289635
1998
0.7545217057058412
1999
0.7650531089010569
2000
0.8225806499505692
2001
0.7359371637776506
2002
0.7366310508809067
2003
0.7586210909084679
2004
0.66394535067155
2005
0.7554598636794895
2006
0.7688868415204976
2007
0.7956873947793417
2008
0.6989010955459967
2009
0.6635183883718384
2010
0.7980531841320546
2011
0.8035388126123046
2012
0.7168470532285139
2013
0.6646856908776734
2014
0.6161479872938135
2015
0.7697278037871605
2016
0.7812847461972414
2017
0.7552926664919701
2018
0.7913168004226975
2019
0.8100426879740733
2020
0.7985286704289817
2021
0.6809393893815345
2022
0.7975871911739396
2023
0.7555500777517361
2024
0.7860094619799421
2025
0.7957478489704258
In [19]:
for ano in df0["year"].unique():
    temp = df0[df0["year"] == ano]
    silhouette_avg = []
    print(ano)
    for n_clusters in range(2, 11):
        clusterer = GaussianMixture(n_components = n_clusters)
        cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
        silhouette_avg.append(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    
    melhor = silhouette_avg.index(max(silhouette_avg)) + 2
    clusterer = GaussianMixture(n_components = melhor)
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    cluster(temp)
1950
0.8313141495980811
1951
0.7487690497861516
1952
0.804082211580146
1953
0.7115215716881396
1954
0.7464253688834682
1955
0.801459428143493
1956
0.8832655406458562
1957
0.7632246125420362
1958
0.759830185378558
1959
0.6629953922261406
1960
0.7469135802469137
1961
0.7659053953000394
1962
0.669737979292082
1963
0.57840248839927
1964
0.721615867439218
1965
0.7086731609457321
1966
0.6424661320505736
1967
0.7960691555141212
1968
0.7218582228418958
1969
0.7007594701086397
1970
0.7212361868402236
1971
0.7021022622359122
1972
0.7256953091705947
1973
0.7546452066043982
1974
0.7457928483705798
1975
0.6518624180290866
1976
0.7113192260738181
1977
0.7202175301774412
1978
0.7000591106657774
1979
0.7053222043892818
1980
0.8057744023703516
1981
0.7653484431208817
1982
0.7316234974662237
1983
0.7423034480943249
1984
0.7485011507388687
1985
0.5694386288998359
1986
0.8379585314417138
1987
0.7527923437191689
1988
0.832035279350325
1989
0.8209756465962681
1990
0.7699085263901749
1991
0.7120345204733832
1992
0.8114535164613662
1993
0.849233475556196
1994
0.8762410970464458
1995
0.671177820521707
1996
0.6715484362237036
1997
0.7514857659662035
1998
0.7545217057058412
1999
0.7650531089010569
2000
0.8225806499505692
2001
0.7359371637776506
2002
0.7107597746718697
2003
0.7465029702280287
2004
0.6621848486559307
2005
0.7554598636794895
2006
0.7688868415204976
2007
0.7956873947793417
2008
0.6330442973558915
2009
0.5587909369683122
2010
0.7980531841320546
2011
0.8035388126123046
2012
0.738017786536927
2013
0.6646856908776734
2014
0.6322722678434924
2015
0.7697278037871605
2016
0.7812847461972414
2017
0.5550088963073192
2018
0.7913168004226975
2019
0.8100426879740733
2020
0.7708595438294658
2021
0.7411309095361658
2022
0.7685994794455835
2023
0.7555500777517361
2024
0.7735392869288198
2025
0.7957478489704258
In [20]:
for ano in df0["year"].unique():
    temp = df0[df0["year"] == ano]
    print(ano)
    clusterer = Birch()
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    cluster(temp)
1950
0.7623876636280248
1951
0.6722252829315847
1952
0.7644382101126403
1953
0.7051863591354304
1954
0.7245351963523796
1955
0.7005619432558534
1956
0.7759388976060635
1957
0.5713550405955361
1958
0.6021865246847107
1959
0.6348435278839822
1960
0.7179677179893312
1961
0.6754950097781932
1962
0.6562025145791508
1963
0.6619696830330211
1964
0.6962928307397505
1965
0.6410690764934392
1966
0.6231758856737503
1967
0.6910517200955202
1968
0.6653469034152771
1969
0.6693530042306297
1970
0.719265777326842
1971
0.618320306808646
1972
0.5980017316644533
1973
0.6691524256006519
1974
0.6881490100224322
1975
0.6632074416433728
1976
0.6740039020383763
1977
0.6605812895544333
1978
0.6161226622589479
1979
0.6238800171880501
1980
0.7631626414829408
1981
0.7515476102947917
1982
0.686551442415382
1983
0.6162350710742236
1984
0.7485011507388687
1985
0.5973703615732937
1986
0.7709438615918525
1987
0.6807902374196813
1988
0.5810703502315716
1989
0.8195375812151494
1990
0.7311057368658382
1991
0.7499407460683831
1992
0.8114535164613662
1993
0.8037286464108541
1994
0.6686980270994577
1995
0.6870111670697203
1996
0.6698130571724606
1997
0.5777719172765445
1998
0.7545217057058412
1999
0.7650531089010569
2000
0.7580361860188398
2001
0.7359371637776506
2002
0.7366310508809067
2003
0.7465029702280287
2004
0.6282528309001825
2005
0.7298337140798888
2006
0.7688868415204976
2007
0.7080009170561645
2008
0.7057561047165647
2009
0.5725747316077942
2010
0.7980531841320546
2011
0.7710408961519429
2012
0.738017786536927
2013
0.6646856908776734
2014
0.6322722678434924
2015
0.5725790185009448
2016
0.7812847461972414
2017
0.7552926664919701
2018
0.7342062055033767
2019
0.7469915337799137
2020
0.6972460107091962
2021
0.6583672945183826
2022
0.7685994794455835
2023
0.7555500777517361
2024
0.7481735857075167
2025
0.7749195921961307

Construtores (incluindo 0)¶

In [22]:
for ano in dfc["year"].unique():
    temp = dfc[dfc["year"] == ano]
    silhouette_avg = []
    print(ano)
    for n_clusters in range(2, 11):
        if n_clusters > len(temp["points"].values.tolist()) - 1:
            continue
        clusterer = KMeans(n_clusters = n_clusters)
        cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
        silhouette_avg.append(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    
    melhor = silhouette_avg.index(max(silhouette_avg)) + 2
    clusterer = KMeans(n_clusters = melhor)
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    clusterc(temp)
1958
0.6205134657386334
1959
0.5468693205607777
1960
0.7295528104884147
1961
0.5387666984934388
1962
0.6073329658615393
1963
0.7061953212267857
1964
0.7325163183785673
1965
0.7089715779370951
1966
0.739916865593341
1967
0.5816287781713932
1968
0.5218590204215204
1969
0.7723656347904572
1970
0.7155684389140271
1971
0.6540311141165849
1972
0.7014329021532026
1973
0.8109369219393421
1974
0.7333596176765114
1975
0.7504966371968371
1976
0.8314739925177196
1977
0.7194642127618693
1978
0.6889890308375258
1979
0.676293362186755
1980
0.7214994553339834
1981
0.6106744959884122
1982
0.7839553839973534
1983
0.7748631510841398
1984
0.7394279254445267
1985
0.8191778085420871
1986
0.6906056067865778
1987
0.7293234188830732
1988
0.7796357306025132
1989
0.7541227190384332
1990
0.824261025523462
1991
0.8322267465697631
1992
0.8499246302736363
1993
0.7494250485650468
1994
0.7401118214906854
1995
0.7120981796457498
1996
0.672734501240078
1997
0.6033431152239606
1998
0.8441586115489756
1999
0.7814527567868658
2000
0.9003621689161402
2001
0.7643882023407752
2002
0.7536697288292676
2003
0.8059618197980839
2004
0.662284225608234
2005
0.7216435982436564
2006
0.787296484499919
2007
0.7151287700062361
2008
0.6527998551262011
2009
0.7677711829212724
2010
0.7646991004501721
2011
0.7934936638387944
2012
0.7572515700812978
2013
0.762894160329348
2014
0.7069035403032113
2015
0.6758693552107161
2016
0.7364560589718832
2017
0.7492166408210685
2018
0.8338841451574155
2019
0.7810338251825559
2020
0.6602304706165107
2021
0.7327854154921877
2022
0.809951522809007
2023
0.6066933566100532
2024
0.8696062626511665
2025
0.7583743351367509
In [23]:
for ano in dfc["year"].unique():
    temp = dfc[dfc["year"] == ano]
    silhouette_avg = []
    print(ano)
    for n_clusters in range(2, 11):
        if n_clusters > len(temp["points"].values.tolist()) - 1:
            continue
        clusterer = GaussianMixture(n_components = n_clusters)
        cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
        silhouette_avg.append(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    
    melhor = silhouette_avg.index(max(silhouette_avg)) + 2
    clusterer = GaussianMixture(n_components = melhor)
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    clusterc(temp)
1958
0.6205134657386334
1959
0.2662337662337662
1960
0.7295528104884147
1961
0.5387666984934388
1962
0.6832273038566634
1963
0.7061953212267857
1964
0.7325163183785673
1965
0.7089715779370951
1966
0.65763668189505
1967
0.6752451230053201
1968
0.6539720233470233
1969
0.7723656347904572
1970
0.7155684389140271
1971
0.6540311141165849
1972
0.6942522829183483
1973
0.8109369219393421
1974
0.7333596176765114
1975
0.7504966371968371
1976
0.8314739925177196
1977
0.7194642127618693
1978
0.6878448512770136
1979
0.6077514543704003
1980
0.7214994553339834
1981
0.5940654007731843
1982
0.7839553839973534
1983
0.7748631510841398
1984
0.7394279254445267
1985
0.8191778085420871
1986
0.6906056067865778
1987
0.7293234188830732
1988
0.7796357306025132
1989
0.7396850940920026
1990
0.824261025523462
1991
0.8322267465697631
1992
0.8499246302736363
1993
0.7494250485650468
1994
0.7401118214906854
1995
0.7388464524081874
1996
0.672734501240078
1997
0.6727813102035528
1998
0.8441586115489756
1999
0.7814527567868658
2000
0.9003621689161402
2001
0.7643882023407752
2002
0.7536697288292676
2003
0.8059618197980839
2004
0.662284225608234
2005
0.7216435982436564
2006
0.787296484499919
2007
0.7151287700062361
2008
0.6527998551262011
2009
0.7677711829212724
2010
0.7646991004501721
2011
0.7934936638387944
2012
0.7572515700812978
2013
0.762894160329348
2014
0.7069035403032113
2015
0.6758693552107161
2016
0.7364560589718832
2017
0.6337612954876486
2018
0.8338841451574155
2019
0.7810338251825559
2020
0.6602304706165107
2021
0.7127786988012879
2022
0.809951522809007
2023
0.7510757235763615
2024
0.8696062626511665
2025
0.7583743351367509
In [24]:
for ano in dfc["year"].unique():
    temp = dfc[dfc["year"] == ano]
    print(ano)
    clusterer = Birch()
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    clusterc(temp)
1958
0.4797975711634977
1959
0.2662337662337662
1960
0.6440118599396103
1961
0.28277777777777774
1962
0.6073329658615393
1963
0.7061953212267857
1964
0.5744878280994015
1965
0.6806695622321944
1966
0.65763668189505
1967
0.6146557931180187
1968
0.6564627453443242
1969
0.6618384195783577
1970
0.547155479277905
1971
0.6540311141165849
1972
0.584287285164478
1973
0.7182909655108125
1974
0.7254325924247221
1975
0.7461565156032726
1976
0.7413168436964739
1977
0.6941601619327394
1978
0.6422255033532988
1979
0.6016070141674944
1980
0.7214994553339834
1981
0.5536208893265224
1982
0.7839553839973534
1983
0.7480854775355593
1984
0.6282248394590504
1985
0.621730151695781
1986
0.5368682060357981
1987
0.7293234188830732
1988
0.5171855836800645
1989
0.7396850940920026
1990
0.824261025523462
1991
0.8137187153175907
1992
0.8423206415201547
1993
0.7494250485650468
1994
0.6155685319544092
1995
0.6288322243323343
1996
0.672734501240078
1997
0.7076156032842146
1998
0.8441586115489756
1999
0.7441312866894263
2000
0.7101390860339293
2001
0.7643882023407752
2002
0.7536697288292676
2003
0.7658046159855417
2004
0.662284225608234
2005
0.7216435982436564
2006
0.787296484499919
2007
0.605210137220662
2008
0.5874118379185781
2009
0.7171997411828008
2010
0.7340728395069062
2011
0.6905665870631686
2012
0.7228851551560114
2013
0.7540376700382171
2014
0.6130612194772429
2015
0.5206582482739704
2016
0.7079227370800851
2017
0.6182123421035763
2018
0.7263903860888302
2019
0.7341253008291466
2020
0.5763392024081047
2021
0.6999114796427401
2022
0.7477264051366055
2023
0.7065305239184747
2024
0.7402759082121249
2025
0.7583743351367509

Construtores (excluindo 0)¶

In [26]:
for ano in dfc0["year"].unique():
    temp = dfc0[dfc0["year"] == ano]
    silhouette_avg = []
    print(ano)
    for n_clusters in range(2, 11):
        if n_clusters > len(temp["points"].values.tolist()) - 1:
            continue
        clusterer = KMeans(n_clusters = n_clusters)
        cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
        silhouette_avg.append(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    
    melhor = silhouette_avg.index(max(silhouette_avg)) + 2
    clusterer = KMeans(n_clusters = melhor)
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    clusterc(temp)
1958
0.6205134657386334
1959
0.5468693205607777
1960
0.7295528104884147
1961
0.5387666984934388
1962
0.6832273038566634
1963
0.7061953212267857
1964
0.7325163183785673
1965
0.7089715779370951
1966
0.739916865593341
1967
0.6752451230053201
1968
0.6564627453443242
1969
0.7723656347904572
1970
0.7155684389140271
1971
0.6540311141165849
1972
0.7014329021532026
1973
0.8109369219393421
1974
0.7333596176765114
1975
0.7504966371968371
1976
0.8314739925177196
1977
0.7194642127618693
1978
0.6878448512770136
1979
0.676293362186755
1980
0.7214994553339834
1981
0.6106744959884122
1982
0.7839553839973534
1983
0.7748631510841398
1984
0.7311651026671115
1985
0.8191778085420871
1986
0.6201518876682878
1987
0.7293234188830732
1988
0.7796357306025132
1989
0.7396850940920026
1990
0.824261025523462
1991
0.8322267465697631
1992
0.8499246302736363
1993
0.7494250485650468
1994
0.7401118214906854
1995
0.7388464524081874
1996
0.672734501240078
1997
0.6727813102035528
1998
0.8441586115489756
1999
0.7814527567868658
2000
0.9008802083844601
2001
0.7565425365852112
2002
0.7536697288292676
2003
0.7835252899663839
2004
0.662284225608234
2005
0.7216435982436564
2006
0.7815951125846695
2007
0.6972932589893626
2008
0.7362054161248404
2009
0.7677711829212724
2010
0.7448941492376941
2011
0.7541194723250224
2012
0.7579597367445928
2013
0.7353964848039121
2014
0.6484876856103428
2015
0.662253439082864
2016
0.7364560589718832
2017
0.7492166408210685
2018
0.8296869499617265
2019
0.7810338251825559
2020
0.6281573352687118
2021
0.729325175141065
2022
0.809951522809007
2023
0.7510757235763615
2024
0.8696062626511665
2025
0.7583743351367509
In [27]:
for ano in dfc0["year"].unique():
    temp = dfc0[dfc0["year"] == ano]
    silhouette_avg = []
    print(ano)
    for n_clusters in range(2, 11):
        if n_clusters > len(temp["points"].values.tolist()) - 1:
            continue
        clusterer = GaussianMixture(n_components = n_clusters)
        cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
        silhouette_avg.append(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    
    melhor = silhouette_avg.index(max(silhouette_avg)) + 2
    clusterer = GaussianMixture(n_components = melhor)
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    clusterc(temp)
1958
0.6205134657386334
1959
0.5468693205607777
1960
0.7295528104884147
1961
0.5387666984934388
1962
0.6832273038566634
1963
0.7061953212267857
1964
0.7325163183785673
1965
0.7089715779370951
1966
0.6155334561954411
1967
0.6752451230053201
1968
0.6564627453443242
1969
0.7723656347904572
1970
0.7155684389140271
1971
0.6540311141165849
1972
0.6942522829183483
1973
0.8109369219393421
1974
0.7333596176765114
1975
0.7504966371968371
1976
0.8314739925177196
1977
0.7194642127618693
1978
0.6878448512770136
1979
0.6077514543704003
1980
0.7214994553339834
1981
0.5940654007731843
1982
0.7839553839973534
1983
0.7748631510841398
1984
0.7311651026671115
1985
0.8191778085420871
1986
0.6906056067865778
1987
0.7293234188830732
1988
0.7796357306025132
1989
0.7396850940920026
1990
0.824261025523462
1991
0.8322267465697631
1992
0.8499246302736363
1993
0.7494250485650468
1994
0.7401118214906854
1995
0.7388464524081874
1996
0.672734501240078
1997
0.6033431152239606
1998
0.8441586115489756
1999
0.7814527567868658
2000
0.9008802083844601
2001
0.7565425365852112
2002
0.7536697288292676
2003
0.8064817910672281
2004
0.662284225608234
2005
0.7216435982436564
2006
0.7815951125846695
2007
0.6972932589893626
2008
0.7362054161248404
2009
0.7677711829212724
2010
0.7448941492376941
2011
0.7541194723250224
2012
0.7579597367445928
2013
0.7353964848039121
2014
0.6484876856103428
2015
0.662253439082864
2016
0.7364560589718832
2017
0.6337612954876486
2018
0.8296869499617265
2019
0.7810338251825559
2020
0.6281573352687118
2021
0.729325175141065
2022
0.809951522809007
2023
0.7510757235763615
2024
0.8696062626511665
2025
0.7583743351367509
In [28]:
for ano in dfc0["year"].unique():
    temp = dfc0[dfc0["year"] == ano]
    print(ano)
    clusterer = Birch()
    cluster_labels = clusterer.fit_predict(temp["points"].values.reshape(-1, 1).tolist())
    temp["Labels"] = cluster_labels
    print(silhouette_score(temp["points"].values.reshape(-1, 1).tolist(), cluster_labels))
    clusterc(temp)
1958
0.4797975711634977
1959
0.2662337662337662
1960
0.6440118599396103
1961
0.28277777777777774
1962
0.6073329658615393
1963
0.7061953212267857
1964
0.5744878280994015
1965
0.6806695622321944
1966
0.65763668189505
1967
0.6146557931180187
1968
0.6564627453443242
1969
0.6618384195783577
1970
0.547155479277905
1971
0.6540311141165849
1972
0.584287285164478
1973
0.7182909655108125
1974
0.7254325924247221
1975
0.7461565156032726
1976
0.7413168436964739
1977
0.6941601619327394
1978
0.6422255033532988
1979
0.6016070141674944
1980
0.7214994553339834
1981
0.5536208893265224
1982
0.7839553839973534
1983
0.7480854775355593
1984
0.6048529908881884
1985
0.621730151695781
1986
0.5368682060357981
1987
0.7293234188830732
1988
0.5171855836800645
1989
0.7396850940920026
1990
0.824261025523462
1991
0.8137187153175907
1992
0.8423206415201547
1993
0.7494250485650468
1994
0.6155685319544092
1995
0.6288322243323343
1996
0.672734501240078
1997
0.7076156032842146
1998
0.8441586115489756
1999
0.7441312866894263
2000
0.585676058605942
2001
0.7565425365852112
2002
0.7536697288292676
2003
0.7835252899663839
2004
0.662284225608234
2005
0.7216435982436564
2006
0.7815951125846695
2007
0.5829965272122505
2008
0.6097859158292004
2009
0.7171997411828008
2010
0.7254055801677428
2011
0.6316876959862899
2012
0.5629471395178869
2013
0.7353964848039121
2014
0.5249630371384821
2015
0.48399076284288484
2016
0.7079227370800851
2017
0.6182123421035763
2018
0.7179853690019219
2019
0.7341253008291466
2020
0.5288446983966547
2021
0.7033474831228658
2022
0.7477264051366055
2023
0.7065305239184747
2024
0.7402759082121249
2025
0.7583743351367509

Medidas de desigualdade¶

Campeonato de pilotos¶

Gini¶

K-Means¶
In [38]:
dados = []

for ano in df["year"].unique():
    dados.append([ano, gini(df[df["year"] == ano]["points"].values.tolist())])

dados = pd.DataFrame(dados, columns=["Ano", "Gini"])
    
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = KMeans(n_clusters = n_clusters)
    cluster_labels = clusterer.fit_predict(dados["Gini"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dados["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = KMeans(n_clusters = melhor)
cluster_labels = clusterer.fit_predict(dados["Gini"].values.reshape(-1, 1).tolist())
dados["Labels"] = cluster_labels
print(silhouette_score(dados["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados, "Gini")  # mudar função
0.5560353956728895
In [40]:
dados0 = []

for ano in df["year"].unique():
    #print(df[df["year"] == ano]["points"].values.tolist())
    dados0.append([ano, gini(df0[df0["year"] == ano]["points"].values.tolist()), mld(df0[df0["year"] == ano]["points"].values.tolist()), theil(df0[df0["year"] == ano]["points"].values.tolist())])

dados0 = pd.DataFrame(dados0, columns=["Ano", "Gini", "MLD", "Theil"])
    
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = KMeans(n_clusters = n_clusters)
    cluster_labels = clusterer.fit_predict(dados0["Gini"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dados0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = KMeans(n_clusters = melhor)
cluster_labels = clusterer.fit_predict(dados0["Gini"].values.reshape(-1, 1).tolist())
dados0["Labels"] = cluster_labels
print(silhouette_score(dados0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados0, "Gini")  # mudar função
0.5387554076587593
Mistura Gaussiana¶
In [43]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = GaussianMixture(n_components = n_clusters)
    cluster_labels = clusterer.fit_predict(dados["Gini"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dados["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = GaussianMixture(n_components = melhor)
cluster_labels = clusterer.fit_predict(dados["Gini"].values.reshape(-1, 1).tolist())
dados["Labels"] = cluster_labels
print(silhouette_score(dados["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados, "Gini")  # mudar função
0.5474988113575222
In [45]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = GaussianMixture(n_components = n_clusters)
    cluster_labels = clusterer.fit_predict(dados0["Gini"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dados0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = GaussianMixture(n_components = melhor)
cluster_labels = clusterer.fit_predict(dados0["Gini"].values.reshape(-1, 1).tolist())
dados0["Labels"] = cluster_labels
print(silhouette_score(dados0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados0, "Gini")  # mudar função
0.5851349158327567
Birch¶
In [48]:
clusterer = Birch()
cluster_labels = clusterer.fit_predict(dados0["Gini"].values.reshape(-1, 1).tolist())
dados0["Labels"] = cluster_labels
print(silhouette_score(dados0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados0, "Gini")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[48], line 4
      2 cluster_labels = clusterer.fit_predict(dados0["Gini"].values.reshape(-1, 1).tolist())
      3 dados0["Labels"] = cluster_labels
----> 4 print(silhouette_score(dados0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
      5 clusterd(dados0, "Gini")

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:141, in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
    139     else:
    140         X, labels = X[indices], labels[indices]
--> 141 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:186, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    184 global_skip_validation = get_config()["skip_parameter_validation"]
    185 if global_skip_validation:
--> 186     return func(*args, **kwargs)
    188 func_sig = signature(func)
    190 # Map *args/**kwargs to the function signature

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:299, in silhouette_samples(X, labels, metric, **kwds)
    297 n_samples = len(labels)
    298 label_freqs = np.bincount(labels)
--> 299 check_number_of_labels(len(le.classes_), n_samples)
    301 kwds["metric"] = metric
    302 reduce_func = functools.partial(
    303     _silhouette_reduce, labels=labels, label_freqs=label_freqs
    304 )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:38, in check_number_of_labels(n_labels, n_samples)
     27 """Check that number of labels are valid.
     28 
     29 Parameters
   (...)
     35     Number of samples.
     36 """
     37 if not 1 < n_labels < n_samples:
---> 38     raise ValueError(
     39         "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
     40         % n_labels
     41     )

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)
In [50]:
clusterer = Birch()
cluster_labels = clusterer.fit_predict(dados["Gini"].values.reshape(-1, 1).tolist())
dados["Labels"] = cluster_labels
print(silhouette_score(dados["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados, "Gini")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[50], line 4
      2 cluster_labels = clusterer.fit_predict(dados["Gini"].values.reshape(-1, 1).tolist())
      3 dados["Labels"] = cluster_labels
----> 4 print(silhouette_score(dados["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
      5 clusterd(dados, "Gini")

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:141, in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
    139     else:
    140         X, labels = X[indices], labels[indices]
--> 141 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:186, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    184 global_skip_validation = get_config()["skip_parameter_validation"]
    185 if global_skip_validation:
--> 186     return func(*args, **kwargs)
    188 func_sig = signature(func)
    190 # Map *args/**kwargs to the function signature

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:299, in silhouette_samples(X, labels, metric, **kwds)
    297 n_samples = len(labels)
    298 label_freqs = np.bincount(labels)
--> 299 check_number_of_labels(len(le.classes_), n_samples)
    301 kwds["metric"] = metric
    302 reduce_func = functools.partial(
    303     _silhouette_reduce, labels=labels, label_freqs=label_freqs
    304 )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:38, in check_number_of_labels(n_labels, n_samples)
     27 """Check that number of labels are valid.
     28 
     29 Parameters
   (...)
     35     Number of samples.
     36 """
     37 if not 1 < n_labels < n_samples:
---> 38     raise ValueError(
     39         "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
     40         % n_labels
     41     )

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

Theil¶

K-Means¶
In [54]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = KMeans(n_clusters = n_clusters)
    cluster_labels = clusterer.fit_predict(dados0["Theil"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dados0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = KMeans(n_clusters = melhor)
cluster_labels = clusterer.fit_predict(dados0["Theil"].values.reshape(-1, 1).tolist())
dados0["Labels"] = cluster_labels
print(silhouette_score(dados0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados0, "Theil")  # mudar função
0.6062141561768923
Mistura Gaussiana¶
In [57]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = GaussianMixture(n_components = n_clusters)
    cluster_labels = clusterer.fit_predict(dados0["Theil"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dados0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = GaussianMixture(n_components = melhor)
cluster_labels = clusterer.fit_predict(dados0["Theil"].values.reshape(-1, 1).tolist())
dados0["Labels"] = cluster_labels
print(silhouette_score(dados0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados0, "Theil")  # mudar função
0.5484690385725198
Birch¶
In [60]:
clusterer = Birch()
cluster_labels = clusterer.fit_predict(dados0["Theil"].values.reshape(-1, 1).tolist())
dados0["Labels"] = cluster_labels
print(silhouette_score(dados0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados0, "Theil")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[60], line 4
      2 cluster_labels = clusterer.fit_predict(dados0["Theil"].values.reshape(-1, 1).tolist())
      3 dados0["Labels"] = cluster_labels
----> 4 print(silhouette_score(dados0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
      5 clusterd(dados0, "Theil")

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:141, in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
    139     else:
    140         X, labels = X[indices], labels[indices]
--> 141 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:186, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    184 global_skip_validation = get_config()["skip_parameter_validation"]
    185 if global_skip_validation:
--> 186     return func(*args, **kwargs)
    188 func_sig = signature(func)
    190 # Map *args/**kwargs to the function signature

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:299, in silhouette_samples(X, labels, metric, **kwds)
    297 n_samples = len(labels)
    298 label_freqs = np.bincount(labels)
--> 299 check_number_of_labels(len(le.classes_), n_samples)
    301 kwds["metric"] = metric
    302 reduce_func = functools.partial(
    303     _silhouette_reduce, labels=labels, label_freqs=label_freqs
    304 )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:38, in check_number_of_labels(n_labels, n_samples)
     27 """Check that number of labels are valid.
     28 
     29 Parameters
   (...)
     35     Number of samples.
     36 """
     37 if not 1 < n_labels < n_samples:
---> 38     raise ValueError(
     39         "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
     40         % n_labels
     41     )

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

MLD¶

K-Means¶
In [64]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = KMeans(n_clusters = n_clusters)
    cluster_labels = clusterer.fit_predict(dados0["MLD"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dados0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = KMeans(n_clusters = melhor)
cluster_labels = clusterer.fit_predict(dados0["MLD"].values.reshape(-1, 1).tolist())
dados0["Labels"] = cluster_labels
print(silhouette_score(dados0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados0, "MLD")  # mudar função
0.590304205410129
Mistura Gaussiana¶
In [67]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = GaussianMixture(n_components = n_clusters)
    cluster_labels = clusterer.fit_predict(dados0["MLD"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dados0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = GaussianMixture(n_components = melhor)
cluster_labels = clusterer.fit_predict(dados0["MLD"].values.reshape(-1, 1).tolist())
dados0["Labels"] = cluster_labels
print(silhouette_score(dados0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados0, "MLD")  # mudar função
0.5973601424126799
Birch¶
In [70]:
clusterer = Birch()
cluster_labels = clusterer.fit_predict(dados0["MLD"].values.reshape(-1, 1).tolist())
dados0["Labels"] = cluster_labels
print(silhouette_score(dados0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dados0, "MLD")
0.5429833160513684

Campeonato de construtores¶

Gini¶

K-Means¶
In [132]:
dadosc = []

for ano in dfc["year"].unique():
    dadosc.append([ano, gini(dfc[dfc["year"] == ano]["points"].values.tolist())])

dadosc = pd.DataFrame(dadosc, columns=["Ano", "Gini"])
    
silhouette_avg = []

for n_clusters in range(2, 11):
    clusterer = KMeans(n_clusters = n_clusters)
    cluster_labels = clusterer.fit_predict(dadosc["Gini"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dadosc["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = KMeans(n_clusters = melhor)
cluster_labels = clusterer.fit_predict(dadosc["Gini"].values.reshape(-1, 1).tolist())
dadosc["Labels"] = cluster_labels
print(silhouette_score(dadosc["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc, "Gini")  # mudar função
0.6383484414359761
In [134]:
dadosc0 = []

for ano in dfc0["year"].unique():
    #print(df[df["year"] == ano]["points"].values.tolist())
    #print(gini(dfc0[dfc0["year"] == ano]["points"].values.tolist()))
    dadosc0.append([ano, gini(dfc0[dfc0["year"] == ano]["points"].values.tolist()), mld(dfc0[dfc0["year"] == ano]["points"].values.tolist()), theil(dfc0[dfc0["year"] == ano]["points"].values.tolist())])

dadosc0 = pd.DataFrame(dadosc0, columns=["Ano", "Gini", "MLD", "Theil"])
    
silhouette_avg = []

for n_clusters in range(2, 11):
    #print(dadosc0["Gini"])
    clusterer = KMeans(n_clusters = n_clusters)
    cluster_labels = clusterer.fit_predict(dadosc0["Gini"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dadosc0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = KMeans(n_clusters = melhor)
cluster_labels = clusterer.fit_predict(dadosc0["Gini"].values.reshape(-1, 1).tolist())
dadosc0["Labels"] = cluster_labels
print(silhouette_score(dadosc0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc0, "Gini")  # mudar função
0.6075436166495269
Mistura Gaussiana¶
In [138]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = GaussianMixture(n_components = n_clusters)
    cluster_labels = clusterer.fit_predict(dadosc["Gini"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dadosc["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = GaussianMixture(n_components = melhor)
cluster_labels = clusterer.fit_predict(dadosc["Gini"].values.reshape(-1, 1).tolist())
dadosc["Labels"] = cluster_labels
print(silhouette_score(dadosc["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc, "Gini")  # mudar função
0.6372312141241268
In [136]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = GaussianMixture(n_components = n_clusters)
    cluster_labels = clusterer.fit_predict(dadosc0["Gini"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dadosc0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = GaussianMixture(n_components = melhor)
cluster_labels = clusterer.fit_predict(dadosc0["Gini"].values.reshape(-1, 1).tolist())
dadosc0["Labels"] = cluster_labels
print(silhouette_score(dadosc0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc0, "Gini")  # mudar função
0.6051809852295769
Birch¶
In [141]:
clusterer = Birch()
cluster_labels = clusterer.fit_predict(dadosc["Gini"].values.reshape(-1, 1).tolist())
dadosc["Labels"] = cluster_labels
print(silhouette_score(dadosc["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc, "Gini")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[141], line 4
      2 cluster_labels = clusterer.fit_predict(dadosc["Gini"].values.reshape(-1, 1).tolist())
      3 dadosc["Labels"] = cluster_labels
----> 4 print(silhouette_score(dadosc["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
      5 clusterd(dadosc, "Gini")

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:141, in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
    139     else:
    140         X, labels = X[indices], labels[indices]
--> 141 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:186, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    184 global_skip_validation = get_config()["skip_parameter_validation"]
    185 if global_skip_validation:
--> 186     return func(*args, **kwargs)
    188 func_sig = signature(func)
    190 # Map *args/**kwargs to the function signature

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:299, in silhouette_samples(X, labels, metric, **kwds)
    297 n_samples = len(labels)
    298 label_freqs = np.bincount(labels)
--> 299 check_number_of_labels(len(le.classes_), n_samples)
    301 kwds["metric"] = metric
    302 reduce_func = functools.partial(
    303     _silhouette_reduce, labels=labels, label_freqs=label_freqs
    304 )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:38, in check_number_of_labels(n_labels, n_samples)
     27 """Check that number of labels are valid.
     28 
     29 Parameters
   (...)
     35     Number of samples.
     36 """
     37 if not 1 < n_labels < n_samples:
---> 38     raise ValueError(
     39         "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
     40         % n_labels
     41     )

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)
In [143]:
clusterer = Birch()
cluster_labels = clusterer.fit_predict(dadosc0["Gini"].values.reshape(-1, 1).tolist())
dadosc0["Labels"] = cluster_labels
print(silhouette_score(dadosc0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc0, "Gini")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[143], line 4
      2 cluster_labels = clusterer.fit_predict(dadosc0["Gini"].values.reshape(-1, 1).tolist())
      3 dadosc0["Labels"] = cluster_labels
----> 4 print(silhouette_score(dadosc0["Gini"].values.reshape(-1, 1).tolist(), cluster_labels))
      5 clusterd(dadosc0, "Gini")

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:141, in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
    139     else:
    140         X, labels = X[indices], labels[indices]
--> 141 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:186, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    184 global_skip_validation = get_config()["skip_parameter_validation"]
    185 if global_skip_validation:
--> 186     return func(*args, **kwargs)
    188 func_sig = signature(func)
    190 # Map *args/**kwargs to the function signature

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:299, in silhouette_samples(X, labels, metric, **kwds)
    297 n_samples = len(labels)
    298 label_freqs = np.bincount(labels)
--> 299 check_number_of_labels(len(le.classes_), n_samples)
    301 kwds["metric"] = metric
    302 reduce_func = functools.partial(
    303     _silhouette_reduce, labels=labels, label_freqs=label_freqs
    304 )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:38, in check_number_of_labels(n_labels, n_samples)
     27 """Check that number of labels are valid.
     28 
     29 Parameters
   (...)
     35     Number of samples.
     36 """
     37 if not 1 < n_labels < n_samples:
---> 38     raise ValueError(
     39         "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
     40         % n_labels
     41     )

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

Theil¶

K-Means¶
In [145]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = KMeans(n_clusters = n_clusters)
    cluster_labels = clusterer.fit_predict(dadosc0["Theil"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dadosc0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = KMeans(n_clusters = melhor)
cluster_labels = clusterer.fit_predict(dadosc0["Theil"].values.reshape(-1, 1).tolist())
dadosc0["Labels"] = cluster_labels
print(silhouette_score(dadosc0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc0, "Theil")  # mudar função
0.6048100180202782
Mistura Gaussiana¶
In [147]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = GaussianMixture(n_components = n_clusters)
    cluster_labels = clusterer.fit_predict(dadosc0["Theil"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dadosc0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = GaussianMixture(n_components = melhor)
cluster_labels = clusterer.fit_predict(dadosc0["Theil"].values.reshape(-1, 1).tolist())
dadosc0["Labels"] = cluster_labels
print(silhouette_score(dadosc0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc0, "Theil")  # mudar função
0.586211982453521
Birch¶
In [149]:
clusterer = Birch()
cluster_labels = clusterer.fit_predict(dadosc0["Theil"].values.reshape(-1, 1).tolist())
dadosc0["Labels"] = cluster_labels
print(silhouette_score(dadosc0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc0, "Theil")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[149], line 4
      2 cluster_labels = clusterer.fit_predict(dadosc0["Theil"].values.reshape(-1, 1).tolist())
      3 dadosc0["Labels"] = cluster_labels
----> 4 print(silhouette_score(dadosc0["Theil"].values.reshape(-1, 1).tolist(), cluster_labels))
      5 clusterd(dadosc0, "Theil")

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:141, in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
    139     else:
    140         X, labels = X[indices], labels[indices]
--> 141 return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))

File ~\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py:186, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    184 global_skip_validation = get_config()["skip_parameter_validation"]
    185 if global_skip_validation:
--> 186     return func(*args, **kwargs)
    188 func_sig = signature(func)
    190 # Map *args/**kwargs to the function signature

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:299, in silhouette_samples(X, labels, metric, **kwds)
    297 n_samples = len(labels)
    298 label_freqs = np.bincount(labels)
--> 299 check_number_of_labels(len(le.classes_), n_samples)
    301 kwds["metric"] = metric
    302 reduce_func = functools.partial(
    303     _silhouette_reduce, labels=labels, label_freqs=label_freqs
    304 )

File ~\anaconda3\Lib\site-packages\sklearn\metrics\cluster\_unsupervised.py:38, in check_number_of_labels(n_labels, n_samples)
     27 """Check that number of labels are valid.
     28 
     29 Parameters
   (...)
     35     Number of samples.
     36 """
     37 if not 1 < n_labels < n_samples:
---> 38     raise ValueError(
     39         "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)"
     40         % n_labels
     41     )

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

MLD¶

K-Means¶
In [151]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = KMeans(n_clusters = n_clusters)
    cluster_labels = clusterer.fit_predict(dadosc0["MLD"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dadosc0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = KMeans(n_clusters = melhor)
cluster_labels = clusterer.fit_predict(dadosc0["MLD"].values.reshape(-1, 1).tolist())
dadosc0["Labels"] = cluster_labels
print(silhouette_score(dadosc0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc0, "MLD")  # mudar função
0.6553926483957443
Mistura Gaussiana¶
In [153]:
silhouette_avg = []

for n_clusters in range(2, 11):
    
    clusterer = GaussianMixture(n_components = n_clusters)
    cluster_labels = clusterer.fit_predict(dadosc0["MLD"].values.reshape(-1, 1).tolist())
    silhouette_avg.append(silhouette_score(dadosc0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
    
melhor = silhouette_avg.index(max(silhouette_avg)) + 2
clusterer = GaussianMixture(n_components = melhor)
cluster_labels = clusterer.fit_predict(dadosc0["MLD"].values.reshape(-1, 1).tolist())
dadosc0["Labels"] = cluster_labels
print(silhouette_score(dadosc0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc0, "MLD")  # mudar função
0.6514478127463159
Birch¶
In [155]:
clusterer = Birch()
cluster_labels = clusterer.fit_predict(dadosc0["MLD"].values.reshape(-1, 1).tolist())
dadosc0["Labels"] = cluster_labels
print(silhouette_score(dadosc0["MLD"].values.reshape(-1, 1).tolist(), cluster_labels))
clusterd(dadosc0, "MLD")
0.5539655299922572
In [ ]: