Objective:
Things to keep in mind:
Upload your notebook's .ipynb file. (This assignment can be done in one or two notebooks; the choice is up to you!)
Important: Make sure you provide complete and thorough explanations for all of your analysis. You need to defend your thought processes and reasoning.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
sns.set()
df = pd.read_csv('assign_wk7/student-mat.csv', low_memory=False, sep=';')
df.head()
school | sex | age | address | famsize | Pstatus | Medu | Fedu | Mjob | Fjob | ... | famrel | freetime | goout | Dalc | Walc | health | absences | G1 | G2 | G3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GP | F | 18 | U | GT3 | A | 4 | 4 | at_home | teacher | ... | 4 | 3 | 4 | 1 | 1 | 3 | 6 | 5 | 6 | 6 |
1 | GP | F | 17 | U | GT3 | T | 1 | 1 | at_home | other | ... | 5 | 3 | 3 | 1 | 1 | 3 | 4 | 5 | 5 | 6 |
2 | GP | F | 15 | U | LE3 | T | 1 | 1 | at_home | other | ... | 4 | 3 | 2 | 2 | 3 | 3 | 10 | 7 | 8 | 10 |
3 | GP | F | 15 | U | GT3 | T | 4 | 2 | health | services | ... | 3 | 2 | 2 | 1 | 1 | 5 | 2 | 15 | 14 | 15 |
4 | GP | F | 16 | U | GT3 | T | 3 | 3 | other | other | ... | 4 | 3 | 2 | 1 | 2 | 5 | 4 | 6 | 10 | 10 |
5 rows × 33 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 395 entries, 0 to 394 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 school 395 non-null object 1 sex 395 non-null object 2 age 395 non-null int64 3 address 395 non-null object 4 famsize 395 non-null object 5 Pstatus 395 non-null object 6 Medu 395 non-null int64 7 Fedu 395 non-null int64 8 Mjob 395 non-null object 9 Fjob 395 non-null object 10 reason 395 non-null object 11 guardian 395 non-null object 12 traveltime 395 non-null int64 13 studytime 395 non-null int64 14 failures 395 non-null int64 15 schoolsup 395 non-null object 16 famsup 395 non-null object 17 paid 395 non-null object 18 activities 395 non-null object 19 nursery 395 non-null object 20 higher 395 non-null object 21 internet 395 non-null object 22 romantic 395 non-null object 23 famrel 395 non-null int64 24 freetime 395 non-null int64 25 goout 395 non-null int64 26 Dalc 395 non-null int64 27 Walc 395 non-null int64 28 health 395 non-null int64 29 absences 395 non-null int64 30 G1 395 non-null int64 31 G2 395 non-null int64 32 G3 395 non-null int64 dtypes: int64(16), object(17) memory usage: 102.0+ KB
df.isnull().sum()
school 0 sex 0 age 0 address 0 famsize 0 Pstatus 0 Medu 0 Fedu 0 Mjob 0 Fjob 0 reason 0 guardian 0 traveltime 0 studytime 0 failures 0 schoolsup 0 famsup 0 paid 0 activities 0 nursery 0 higher 0 internet 0 romantic 0 famrel 0 freetime 0 goout 0 Dalc 0 Walc 0 health 0 absences 0 G1 0 G2 0 G3 0 dtype: int64
df.nunique()
school 2 sex 2 age 8 address 2 famsize 2 Pstatus 2 Medu 5 Fedu 5 Mjob 5 Fjob 5 reason 4 guardian 3 traveltime 4 studytime 4 failures 4 schoolsup 2 famsup 2 paid 2 activities 2 nursery 2 higher 2 internet 2 romantic 2 famrel 5 freetime 5 goout 5 Dalc 5 Walc 5 health 5 absences 34 G1 17 G2 17 G3 18 dtype: int64
df.age.unique()
array([18, 17, 15, 16, 19, 22, 20, 21], dtype=int64)
After a little EDA, there appear to be no null values at first glance, though I am still mildly suspicious given the previous datasets in this course. We will read the accompanying .txt file for clues and break our columns down into categories. We will also look to encode our dataset for the ML model. UPDATE: we are not converting columns to the category dtype after all, since our ML models will eventually require all-integer inputs. Looking back, the providers of the data did much of the encoding work for us already. My category calls are commented out below.
df
school | sex | age | address | famsize | Pstatus | Medu | Fedu | Mjob | Fjob | ... | famrel | freetime | goout | Dalc | Walc | health | absences | G1 | G2 | G3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | GP | F | 18 | U | GT3 | A | 4 | 4 | at_home | teacher | ... | 4 | 3 | 4 | 1 | 1 | 3 | 6 | 5 | 6 | 6 |
1 | GP | F | 17 | U | GT3 | T | 1 | 1 | at_home | other | ... | 5 | 3 | 3 | 1 | 1 | 3 | 4 | 5 | 5 | 6 |
2 | GP | F | 15 | U | LE3 | T | 1 | 1 | at_home | other | ... | 4 | 3 | 2 | 2 | 3 | 3 | 10 | 7 | 8 | 10 |
3 | GP | F | 15 | U | GT3 | T | 4 | 2 | health | services | ... | 3 | 2 | 2 | 1 | 1 | 5 | 2 | 15 | 14 | 15 |
4 | GP | F | 16 | U | GT3 | T | 3 | 3 | other | other | ... | 4 | 3 | 2 | 1 | 2 | 5 | 4 | 6 | 10 | 10 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
390 | MS | M | 20 | U | LE3 | A | 2 | 2 | services | services | ... | 5 | 5 | 4 | 4 | 5 | 4 | 11 | 9 | 9 | 9 |
391 | MS | M | 17 | U | LE3 | T | 3 | 1 | services | services | ... | 2 | 4 | 5 | 3 | 4 | 2 | 3 | 14 | 16 | 16 |
392 | MS | M | 21 | R | GT3 | T | 1 | 1 | other | other | ... | 5 | 5 | 3 | 3 | 3 | 3 | 3 | 10 | 8 | 7 |
393 | MS | M | 18 | R | LE3 | T | 3 | 2 | services | other | ... | 4 | 4 | 1 | 3 | 4 | 5 | 0 | 11 | 12 | 10 |
394 | MS | M | 19 | U | LE3 | T | 1 | 1 | other | at_home | ... | 3 | 2 | 3 | 3 | 3 | 5 | 5 | 8 | 9 | 9 |
395 rows × 33 columns
##df[].apply(lambda x: x.astype('category'))
#cat_cols = ['school','sex','age','address','famsize','Pstatus','Medu','Fedu','Mjob','Fjob',
# 'reason','guardian','traveltime','studytime','failures','schoolsup','famsup','paid','activities','nursery','higher',
# 'internet','romantic','famrel','freetime','goout','Dalc','Walc','health']
#df[cat_cols] = df[cat_cols].astype('category')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 395 entries, 0 to 394 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 school 395 non-null object 1 sex 395 non-null object 2 age 395 non-null int64 3 address 395 non-null object 4 famsize 395 non-null object 5 Pstatus 395 non-null object 6 Medu 395 non-null int64 7 Fedu 395 non-null int64 8 Mjob 395 non-null object 9 Fjob 395 non-null object 10 reason 395 non-null object 11 guardian 395 non-null object 12 traveltime 395 non-null int64 13 studytime 395 non-null int64 14 failures 395 non-null int64 15 schoolsup 395 non-null object 16 famsup 395 non-null object 17 paid 395 non-null object 18 activities 395 non-null object 19 nursery 395 non-null object 20 higher 395 non-null object 21 internet 395 non-null object 22 romantic 395 non-null object 23 famrel 395 non-null int64 24 freetime 395 non-null int64 25 goout 395 non-null int64 26 Dalc 395 non-null int64 27 Walc 395 non-null int64 28 health 395 non-null int64 29 absences 395 non-null int64 30 G1 395 non-null int64 31 G2 395 non-null int64 32 G3 395 non-null int64 dtypes: int64(16), object(17) memory usage: 102.0+ KB
df.corr()
age | Medu | Fedu | traveltime | studytime | failures | famrel | freetime | goout | Dalc | Walc | health | absences | G1 | G2 | G3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
age | 1.000000 | -0.163658 | -0.163438 | 0.070641 | -0.004140 | 0.243665 | 0.053940 | 0.016434 | 0.126964 | 0.131125 | 0.117276 | -0.062187 | 0.175230 | -0.064081 | -0.143474 | -0.161579 |
Medu | -0.163658 | 1.000000 | 0.623455 | -0.171639 | 0.064944 | -0.236680 | -0.003914 | 0.030891 | 0.064094 | 0.019834 | -0.047123 | -0.046878 | 0.100285 | 0.205341 | 0.215527 | 0.217147 |
Fedu | -0.163438 | 0.623455 | 1.000000 | -0.158194 | -0.009175 | -0.250408 | -0.001370 | -0.012846 | 0.043105 | 0.002386 | -0.012631 | 0.014742 | 0.024473 | 0.190270 | 0.164893 | 0.152457 |
traveltime | 0.070641 | -0.171639 | -0.158194 | 1.000000 | -0.100909 | 0.092239 | -0.016808 | -0.017025 | 0.028540 | 0.138325 | 0.134116 | 0.007501 | -0.012944 | -0.093040 | -0.153198 | -0.117142 |
studytime | -0.004140 | 0.064944 | -0.009175 | -0.100909 | 1.000000 | -0.173563 | 0.039731 | -0.143198 | -0.063904 | -0.196019 | -0.253785 | -0.075616 | -0.062700 | 0.160612 | 0.135880 | 0.097820 |
failures | 0.243665 | -0.236680 | -0.250408 | 0.092239 | -0.173563 | 1.000000 | -0.044337 | 0.091987 | 0.124561 | 0.136047 | 0.141962 | 0.065827 | 0.063726 | -0.354718 | -0.355896 | -0.360415 |
famrel | 0.053940 | -0.003914 | -0.001370 | -0.016808 | 0.039731 | -0.044337 | 1.000000 | 0.150701 | 0.064568 | -0.077594 | -0.113397 | 0.094056 | -0.044354 | 0.022168 | -0.018281 | 0.051363 |
freetime | 0.016434 | 0.030891 | -0.012846 | -0.017025 | -0.143198 | 0.091987 | 0.150701 | 1.000000 | 0.285019 | 0.209001 | 0.147822 | 0.075733 | -0.058078 | 0.012613 | -0.013777 | 0.011307 |
goout | 0.126964 | 0.064094 | 0.043105 | 0.028540 | -0.063904 | 0.124561 | 0.064568 | 0.285019 | 1.000000 | 0.266994 | 0.420386 | -0.009577 | 0.044302 | -0.149104 | -0.162250 | -0.132791 |
Dalc | 0.131125 | 0.019834 | 0.002386 | 0.138325 | -0.196019 | 0.136047 | -0.077594 | 0.209001 | 0.266994 | 1.000000 | 0.647544 | 0.077180 | 0.111908 | -0.094159 | -0.064120 | -0.054660 |
Walc | 0.117276 | -0.047123 | -0.012631 | 0.134116 | -0.253785 | 0.141962 | -0.113397 | 0.147822 | 0.420386 | 0.647544 | 1.000000 | 0.092476 | 0.136291 | -0.126179 | -0.084927 | -0.051939 |
health | -0.062187 | -0.046878 | 0.014742 | 0.007501 | -0.075616 | 0.065827 | 0.094056 | 0.075733 | -0.009577 | 0.077180 | 0.092476 | 1.000000 | -0.029937 | -0.073172 | -0.097720 | -0.061335 |
absences | 0.175230 | 0.100285 | 0.024473 | -0.012944 | -0.062700 | 0.063726 | -0.044354 | -0.058078 | 0.044302 | 0.111908 | 0.136291 | -0.029937 | 1.000000 | -0.031003 | -0.031777 | 0.034247 |
G1 | -0.064081 | 0.205341 | 0.190270 | -0.093040 | 0.160612 | -0.354718 | 0.022168 | 0.012613 | -0.149104 | -0.094159 | -0.126179 | -0.073172 | -0.031003 | 1.000000 | 0.852118 | 0.801468 |
G2 | -0.143474 | 0.215527 | 0.164893 | -0.153198 | 0.135880 | -0.355896 | -0.018281 | -0.013777 | -0.162250 | -0.064120 | -0.084927 | -0.097720 | -0.031777 | 0.852118 | 1.000000 | 0.904868 |
G3 | -0.161579 | 0.217147 | 0.152457 | -0.117142 | 0.097820 | -0.360415 | 0.051363 | 0.011307 | -0.132791 | -0.054660 | -0.051939 | -0.061335 | 0.034247 | 0.801468 | 0.904868 | 1.000000 |
_ = sns.heatmap(df.corr())
sns.displot(data=df, x="G3",multiple="stack")
<seaborn.axisgrid.FacetGrid at 0x12effe3bc10>
sns.displot(data=df, x="G3",hue='Walc',kind='kde')
sns.displot(data=df, x='G3',hue='Dalc',kind='kde')
<seaborn.axisgrid.FacetGrid at 0x12e80121130>
sns.displot(data=df,x = 'G3', col = 'Dalc',kind = 'kde')
sns.displot(data=df,x = 'G3', col = 'Walc',kind = 'kde')
#Comparing impacts of Dalc (Workday alcohol consumption) & Walc (Weekend alcohol consumption)
<seaborn.axisgrid.FacetGrid at 0x12e804ee910>
While many of the columns are integers, inspection of what they contain, both in Python and in the .txt file, shows that the vast majority of those integers are still categorical.
See the markdown comment above for more (all category calls were commented out). We will now encode our dataframe to then plug into our models.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 395 entries, 0 to 394 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 school 395 non-null object 1 sex 395 non-null object 2 age 395 non-null int64 3 address 395 non-null object 4 famsize 395 non-null object 5 Pstatus 395 non-null object 6 Medu 395 non-null int64 7 Fedu 395 non-null int64 8 Mjob 395 non-null object 9 Fjob 395 non-null object 10 reason 395 non-null object 11 guardian 395 non-null object 12 traveltime 395 non-null int64 13 studytime 395 non-null int64 14 failures 395 non-null int64 15 schoolsup 395 non-null object 16 famsup 395 non-null object 17 paid 395 non-null object 18 activities 395 non-null object 19 nursery 395 non-null object 20 higher 395 non-null object 21 internet 395 non-null object 22 romantic 395 non-null object 23 famrel 395 non-null int64 24 freetime 395 non-null int64 25 goout 395 non-null int64 26 Dalc 395 non-null int64 27 Walc 395 non-null int64 28 health 395 non-null int64 29 absences 395 non-null int64 30 G1 395 non-null int64 31 G2 395 non-null int64 32 G3 395 non-null int64 dtypes: int64(16), object(17) memory usage: 102.0+ KB
needs_enc=['school','sex','address','famsize','Pstatus','Mjob','Fjob','reason','guardian','schoolsup','famsup',
'paid','activities','nursery','higher','internet','romantic']
df_encoded = pd.get_dummies(df, columns=needs_enc,
prefix=needs_enc)
df_encoded
age | Medu | Fedu | traveltime | studytime | failures | famrel | freetime | goout | Dalc | ... | activities_no | activities_yes | nursery_no | nursery_yes | higher_no | higher_yes | internet_no | internet_yes | romantic_no | romantic_yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18 | 4 | 4 | 2 | 2 | 0 | 4 | 3 | 4 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
1 | 17 | 1 | 1 | 1 | 2 | 0 | 5 | 3 | 3 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
2 | 15 | 1 | 1 | 1 | 2 | 3 | 4 | 3 | 2 | 2 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
3 | 15 | 4 | 2 | 1 | 3 | 0 | 3 | 2 | 2 | 1 | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
4 | 16 | 3 | 3 | 1 | 2 | 0 | 4 | 3 | 2 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
390 | 20 | 2 | 2 | 1 | 2 | 2 | 5 | 5 | 4 | 4 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
391 | 17 | 3 | 1 | 2 | 1 | 0 | 2 | 4 | 5 | 3 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
392 | 21 | 1 | 1 | 1 | 1 | 3 | 5 | 5 | 3 | 3 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 |
393 | 18 | 3 | 2 | 3 | 1 | 0 | 4 | 4 | 1 | 3 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
394 | 19 | 1 | 1 | 1 | 1 | 0 | 3 | 2 | 3 | 3 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
395 rows × 59 columns
df_encoded.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 395 entries, 0 to 394 Data columns (total 59 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 395 non-null int64 1 Medu 395 non-null int64 2 Fedu 395 non-null int64 3 traveltime 395 non-null int64 4 studytime 395 non-null int64 5 failures 395 non-null int64 6 famrel 395 non-null int64 7 freetime 395 non-null int64 8 goout 395 non-null int64 9 Dalc 395 non-null int64 10 Walc 395 non-null int64 11 health 395 non-null int64 12 absences 395 non-null int64 13 G1 395 non-null int64 14 G2 395 non-null int64 15 G3 395 non-null int64 16 school_GP 395 non-null uint8 17 school_MS 395 non-null uint8 18 sex_F 395 non-null uint8 19 sex_M 395 non-null uint8 20 address_R 395 non-null uint8 21 address_U 395 non-null uint8 22 famsize_GT3 395 non-null uint8 23 famsize_LE3 395 non-null uint8 24 Pstatus_A 395 non-null uint8 25 Pstatus_T 395 non-null uint8 26 Mjob_at_home 395 non-null uint8 27 Mjob_health 395 non-null uint8 28 Mjob_other 395 non-null uint8 29 Mjob_services 395 non-null uint8 30 Mjob_teacher 395 non-null uint8 31 Fjob_at_home 395 non-null uint8 32 Fjob_health 395 non-null uint8 33 Fjob_other 395 non-null uint8 34 Fjob_services 395 non-null uint8 35 Fjob_teacher 395 non-null uint8 36 reason_course 395 non-null uint8 37 reason_home 395 non-null uint8 38 reason_other 395 non-null uint8 39 reason_reputation 395 non-null uint8 40 guardian_father 395 non-null uint8 41 guardian_mother 395 non-null uint8 42 guardian_other 395 non-null uint8 43 schoolsup_no 395 non-null uint8 44 schoolsup_yes 395 non-null uint8 45 famsup_no 395 non-null uint8 46 famsup_yes 395 non-null uint8 47 paid_no 395 non-null uint8 48 paid_yes 395 non-null uint8 49 activities_no 395 non-null uint8 50 activities_yes 395 non-null uint8 51 nursery_no 395 non-null uint8 52 nursery_yes 395 non-null uint8 53 higher_no 395 non-null uint8 54 higher_yes 395 non-null uint8 55 internet_no 395 non-null uint8 56 internet_yes 395 non-null uint8 57 romantic_no 395 non-null uint8 58 romantic_yes 395 non-null uint8 dtypes: int64(16), uint8(43) memory usage: 66.1 KB
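A side note on the encoding: get_dummies creates two perfectly complementary columns for every binary feature (e.g. schoolsup_no and schoolsup_yes), which are redundant with each other. A minimal sketch of how drop_first=True would thin those out, using a small made-up frame rather than our actual dataset:

```python
import pandas as pd

# Hypothetical toy frame with two binary categorical columns
toy = pd.DataFrame({'schoolsup': ['yes', 'no'],
                    'guardian': ['mother', 'father']})

full = pd.get_dummies(toy)                   # one dummy column per level
slim = pd.get_dummies(toy, drop_first=True)  # first level of each column dropped

print(list(full.columns))  # 4 dummy columns
print(list(slim.columns))  # 2 dummy columns; the dropped level is implied by all-zeros
```

Keeping both levels, as done above, still works for KMeans; drop_first mainly matters for models that are sensitive to collinear inputs.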
X = df_encoded.iloc[:,df_encoded.columns != 'G3']
y = df_encoded.iloc[:,df_encoded.columns == 'G3']
from sklearn.cluster import KMeans
sum_sq = []
for n in range(2, 30):
    print('Calculating for', n, 'clusters')
    # random_state makes the results reproducible
    # n_jobs=-1 means run with all machine processors
    model = KMeans(n_clusters=n, random_state=42, n_jobs=-1)
    model.fit(X)
    sum_sq.append(-model.score(X))
Calculating for 2 clusters Calculating for 3 clusters Calculating for 4 clusters Calculating for 5 clusters Calculating for 6 clusters Calculating for 7 clusters Calculating for 8 clusters Calculating for 9 clusters Calculating for 10 clusters Calculating for 11 clusters Calculating for 12 clusters Calculating for 13 clusters Calculating for 14 clusters Calculating for 15 clusters Calculating for 16 clusters Calculating for 17 clusters Calculating for 18 clusters Calculating for 19 clusters Calculating for 20 clusters Calculating for 21 clusters Calculating for 22 clusters Calculating for 23 clusters Calculating for 24 clusters Calculating for 25 clusters Calculating for 26 clusters Calculating for 27 clusters Calculating for 28 clusters Calculating for 29 clusters
C:\Users\musta\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:938: FutureWarning: 'n_jobs' was deprecated in version 0.23 and will be removed in 0.25. warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
The warnings tell us that the n_jobs parameter was deprecated in sklearn 0.23 and will be removed in 0.25, so we will drop it from later calls. We will now try to find our best k at the 'elbow'.
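For what it's worth, the negated score we are collecting is the same quantity sklearn exposes directly as inertia_ (the within-cluster sum of squared distances). A quick sketch on synthetic data, not our student dataset:

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
pts = rng.normal(size=(100, 3))  # synthetic stand-in for X

model = KMeans(n_clusters=4, random_state=42, n_init=10).fit(pts)

# score(X) returns the negative within-cluster sum of squares, so -score
# evaluated on the training data matches the fitted inertia_ attribute
print(-model.score(pts))
print(model.inertia_)
```

So `sum_sq.append(model.inertia_)` would have been an equivalent, slightly cheaper way to build the elbow curve.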
plt.plot(range(2, 30), sum_sq, 'bx-')
[<matplotlib.lines.Line2D at 0x12e89559130>]
plt.plot(range(3, 30), np.diff(sum_sq), 'bx-')
[<matplotlib.lines.Line2D at 0x12e8a577340>]
At a quick glance, we see our last real improvement at what looks like 13 clusters. Let's keep going.
model = KMeans(n_clusters=13, random_state=42, n_jobs=-1)
model.fit(X)
preds = model.predict(X)
C:\Users\musta\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:938: FutureWarning: 'n_jobs' was deprecated in version 0.23 and will be removed in 0.25. warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
from sklearn import metrics
score = metrics.silhouette_score(X, preds)
score
0.12767015799537987
Ew
I have done some plug-and-play, and 5 clusters gets us up to .25, but let's use some PCA to take a closer look.
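One likely reason the silhouette is poor: KMeans is distance-based, and our features sit on very different scales (absences ranges into the dozens while the dummies are 0/1), so the wide columns dominate. A sketch of standardizing before clustering, shown on synthetic data shaped roughly like ours:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
# Synthetic stand-in: one wide-range column next to narrow 0/1 dummy columns
raw = np.column_stack([rng.integers(0, 75, 200),       # absences-like
                       rng.integers(0, 2, (200, 5))])  # dummy-like

scaled = StandardScaler().fit_transform(raw)

# After scaling, every column has mean ~0 and unit variance, so no single
# feature dominates the Euclidean distances that KMeans minimizes
print(scaled.mean(axis=0).round(6))
print(scaled.std(axis=0).round(6))
```

Fitting KMeans on the scaled matrix instead of X would be a one-line change and may shift which k looks best.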
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(X)
data_reduced = pd.DataFrame(data_reduced)
ax = data_reduced.plot(kind='scatter', x=0, y=1, c=preds, cmap='rainbow')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Projection of the clustering on the axis of the PCA')
Text(0.5, 1.0, 'Projection of the clustering on the axis of the PCA')
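Before trusting this 2-D picture, it is worth checking how much of the variance the first two components actually capture; explained_variance_ratio_ would be inspected the same way on our X. A sketch on synthetic data:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(1)
# Synthetic matrix whose columns have growing variance
data = rng.normal(size=(150, 10)) * np.arange(1, 11)

pca = PCA(n_components=2).fit(data)

# Fraction of total variance captured by each of the two components;
# if the sum is small, the 2-D scatter plot is hiding most of the structure
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())
```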
for n in range(2, 30):
    # random_state makes the results reproducible
    # n_jobs is deprecated (parallelism is handled automatically now),
    # so we drop it to get rid of the ugly warning
    model = KMeans(n_clusters=n, random_state=42)
    model.fit(X)
    # gather the predictions
    preds = model.predict(X)
    score = metrics.silhouette_score(X, preds)
    print('Silhouette score for', n, 'clusters:', score)
Silhouette score for 2 clusters: 0.42209431823156734 Silhouette score for 3 clusters: 0.3394734695134711 Silhouette score for 4 clusters: 0.25276599147970397 Silhouette score for 5 clusters: 0.2231951804669021 Silhouette score for 6 clusters: 0.1652943701994924 Silhouette score for 7 clusters: 0.16519786417666815 Silhouette score for 8 clusters: 0.15760085549729622 Silhouette score for 9 clusters: 0.13278603147624848 Silhouette score for 10 clusters: 0.13365734276663413 Silhouette score for 11 clusters: 0.13292138415526694 Silhouette score for 12 clusters: 0.1393770622094747 Silhouette score for 13 clusters: 0.12767015799537987 Silhouette score for 14 clusters: 0.11997525158947846 Silhouette score for 15 clusters: 0.11451487899109687 Silhouette score for 16 clusters: 0.11318621396954952 Silhouette score for 17 clusters: 0.10882478770850627 Silhouette score for 18 clusters: 0.11006680671285829 Silhouette score for 19 clusters: 0.10826234248515464 Silhouette score for 20 clusters: 0.10737692025331733 Silhouette score for 21 clusters: 0.10632462034892456 Silhouette score for 22 clusters: 0.09838863913409718 Silhouette score for 23 clusters: 0.10162674589002149 Silhouette score for 24 clusters: 0.10194708542527331 Silhouette score for 25 clusters: 0.09143433046821624 Silhouette score for 26 clusters: 0.09292515473945777 Silhouette score for 27 clusters: 0.09034010363082175 Silhouette score for 28 clusters: 0.08938173617462963 Silhouette score for 29 clusters: 0.09093317573338854
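Rather than eyeballing the printout, the best-scoring k can be read off programmatically by collecting the scores and taking the argmax. A small sketch with hypothetical scores shaped like ours (k=2 winning, decaying afterwards):

```python
import numpy as np

ks = range(2, 30)
# Hypothetical silhouette scores mirroring the pattern above: best at k=2
scores = [0.42, 0.34, 0.25, 0.22] + [0.15 - 0.002 * i for i in range(24)]

best_k = list(ks)[int(np.argmax(scores))]
print(best_k)  # → 2
```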
model = KMeans(n_clusters=2, random_state=42)
model.fit(X)
preds = model.predict(X)
score = metrics.silhouette_score(X, preds)
score
0.42209431823156734
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(X)
data_reduced = pd.DataFrame(data_reduced)
ax = data_reduced.plot(kind='scatter', x=0, y=1, c=preds, cmap='rainbow')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Projection of the clustering on the axes of the PCA')
Text(0.5, 1.0, 'Projection of the clustering on the axes of the PCA')
So, much like the walkthrough, it does seem as though 2 clusters was the way to go, though the PCA is still less than convincing, and honestly a bit of a let down.
Looking at our data quickly, as well as at the correlation plot, we can see that the G1 and G2 columns appear to be highly correlated with G3. This makes logical sense, as these are the first- and second-period grades, while G3 is the final grade, conceivably a combination of the two.
Let's copy our df_encoded dataframe, drop the G1 & G2 columns, then run it again.
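The drop can also be motivated numerically: sort the correlations against G3, and anything sitting near the top (besides G3 itself) is a leakage candidate. A sketch on a tiny made-up frame, not the real data:

```python
import pandas as pd
import numpy as np

rng = np.random.default_rng(7)
g3 = rng.normal(10, 3, 100)
# Hypothetical columns: G1/G2 built from G3 plus noise, goout unrelated
toy = pd.DataFrame({'G1': g3 + rng.normal(0, 1, 100),
                    'G2': g3 + rng.normal(0, 0.5, 100),
                    'goout': rng.normal(3, 1, 100),
                    'G3': g3})

corr_with_g3 = toy.corr()['G3'].drop('G3').sort_values(ascending=False)
print(corr_with_g3)
# G1 and G2 land near 1.0 and goout near 0, matching what df.corr() showed
```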
df_enc2 = df_encoded.copy()
df_enc2
age | Medu | Fedu | traveltime | studytime | failures | famrel | freetime | goout | Dalc | ... | activities_no | activities_yes | nursery_no | nursery_yes | higher_no | higher_yes | internet_no | internet_yes | romantic_no | romantic_yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18 | 4 | 4 | 2 | 2 | 0 | 4 | 3 | 4 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
1 | 17 | 1 | 1 | 1 | 2 | 0 | 5 | 3 | 3 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
2 | 15 | 1 | 1 | 1 | 2 | 3 | 4 | 3 | 2 | 2 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
3 | 15 | 4 | 2 | 1 | 3 | 0 | 3 | 2 | 2 | 1 | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
4 | 16 | 3 | 3 | 1 | 2 | 0 | 4 | 3 | 2 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
390 | 20 | 2 | 2 | 1 | 2 | 2 | 5 | 5 | 4 | 4 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
391 | 17 | 3 | 1 | 2 | 1 | 0 | 2 | 4 | 5 | 3 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
392 | 21 | 1 | 1 | 1 | 1 | 3 | 5 | 5 | 3 | 3 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 |
393 | 18 | 3 | 2 | 3 | 1 | 0 | 4 | 4 | 1 | 3 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
394 | 19 | 1 | 1 | 1 | 1 | 0 | 3 | 2 | 3 | 3 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
395 rows × 59 columns
df_enc2.drop(columns=['G1','G2'], inplace=True)
df_enc2
age | Medu | Fedu | traveltime | studytime | failures | famrel | freetime | goout | Dalc | ... | activities_no | activities_yes | nursery_no | nursery_yes | higher_no | higher_yes | internet_no | internet_yes | romantic_no | romantic_yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18 | 4 | 4 | 2 | 2 | 0 | 4 | 3 | 4 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
1 | 17 | 1 | 1 | 1 | 2 | 0 | 5 | 3 | 3 | 1 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
2 | 15 | 1 | 1 | 1 | 2 | 3 | 4 | 3 | 2 | 2 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
3 | 15 | 4 | 2 | 1 | 3 | 0 | 3 | 2 | 2 | 1 | ... | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
4 | 16 | 3 | 3 | 1 | 2 | 0 | 4 | 3 | 2 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
390 | 20 | 2 | 2 | 1 | 2 | 2 | 5 | 5 | 4 | 4 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
391 | 17 | 3 | 1 | 2 | 1 | 0 | 2 | 4 | 5 | 3 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
392 | 21 | 1 | 1 | 1 | 1 | 3 | 5 | 5 | 3 | 3 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 |
393 | 18 | 3 | 2 | 3 | 1 | 0 | 4 | 4 | 1 | 3 | ... | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
394 | 19 | 1 | 1 | 1 | 1 | 0 | 3 | 2 | 3 | 3 | ... | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
395 rows × 57 columns
X = df_enc2.iloc[:,df_enc2.columns != 'G3']
y = df_enc2.iloc[:,df_enc2.columns == 'G3']
sum_sq = []
for n in range(2, 30):
    print('Calculating for', n, 'clusters')
    # random_state makes the results reproducible
    model = KMeans(n_clusters=n, random_state=42)
    model.fit(X)
    sum_sq.append(-model.score(X))
Calculating for 2 clusters Calculating for 3 clusters Calculating for 4 clusters Calculating for 5 clusters Calculating for 6 clusters Calculating for 7 clusters Calculating for 8 clusters Calculating for 9 clusters Calculating for 10 clusters Calculating for 11 clusters Calculating for 12 clusters Calculating for 13 clusters Calculating for 14 clusters Calculating for 15 clusters Calculating for 16 clusters Calculating for 17 clusters Calculating for 18 clusters Calculating for 19 clusters Calculating for 20 clusters Calculating for 21 clusters Calculating for 22 clusters Calculating for 23 clusters Calculating for 24 clusters Calculating for 25 clusters Calculating for 26 clusters Calculating for 27 clusters Calculating for 28 clusters Calculating for 29 clusters
plt.plot(range(2, 30), sum_sq, 'bx-')
[<matplotlib.lines.Line2D at 0x12e8c212280>]
plt.plot(range(3, 30), np.diff(sum_sq), 'bx-')
[<matplotlib.lines.Line2D at 0x12e8c25af40>]
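Plotting the first difference makes the bend easier to see, but the elbow can also be located programmatically: the curve bends most sharply where the second difference of the inertia values peaks. A small sketch on made-up inertia values (mine, for illustration only):

```python
import numpy as np

# hypothetical inertia values for k = 2..9, flattening out after the elbow
sum_sq_demo = [1000.0, 800.0, 300.0, 290.0, 285.0, 282.0, 280.0, 279.0]

# the second difference peaks where the curve bends most sharply
second_diff = np.diff(sum_sq_demo, n=2)
# second_diff[j] is centered on sum_sq_demo[j + 1], and index 0 corresponds to k = 2
elbow_k = int(np.argmax(second_diff)) + 1 + 2
print(elbow_k)  # → 4
```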
from sklearn import metrics

for n in range(2, 30):
    # random_state makes the results reproducible
    # n_jobs now defaults to using all cores, so it is omitted to avoid the deprecation warning
    model = KMeans(n_clusters=n, random_state=42)
    model.fit(X)
    # gather the cluster assignments
    preds = model.predict(X)
    score = metrics.silhouette_score(X, preds)
    print('Silhouette score for', n, 'clusters:', score)
Silhouette score for 2 clusters: 0.5119665918136315 Silhouette score for 3 clusters: 0.4401237106613952 Silhouette score for 4 clusters: 0.21383674649294312 Silhouette score for 5 clusters: 0.18306692899141733 Silhouette score for 6 clusters: 0.17894202482162702 Silhouette score for 7 clusters: 0.12392229650993603 Silhouette score for 8 clusters: 0.10598354151012931 Silhouette score for 9 clusters: 0.09927466335132894 Silhouette score for 10 clusters: 0.0980296977130901 Silhouette score for 11 clusters: 0.08681294106902995 Silhouette score for 12 clusters: 0.08571075360045469 Silhouette score for 13 clusters: 0.08577453960906262 Silhouette score for 14 clusters: 0.08566459222277091 Silhouette score for 15 clusters: 0.08538764262419113 Silhouette score for 16 clusters: 0.08232215717598297 Silhouette score for 17 clusters: 0.0777859368149924 Silhouette score for 18 clusters: 0.07593622748472036 Silhouette score for 19 clusters: 0.0780492919095828 Silhouette score for 20 clusters: 0.07377596491207171 Silhouette score for 21 clusters: 0.06905837674003754 Silhouette score for 22 clusters: 0.07814289952834202 Silhouette score for 23 clusters: 0.07683003284484262 Silhouette score for 24 clusters: 0.06184731800009618 Silhouette score for 25 clusters: 0.05938879795550189 Silhouette score for 26 clusters: 0.05988406051580081 Silhouette score for 27 clusters: 0.05810482116243873 Silhouette score for 28 clusters: 0.06581525672444744 Silhouette score for 29 clusters: 0.0649524150667437
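One possible reason the silhouette scores fall off so quickly is that the features sit on very different scales: `absences` runs into the dozens while the encoded dummy columns are all 0/1, so Euclidean distance is dominated by the wide-range columns. A minimal sketch of standardizing before clustering (on synthetic data here, since this is only an illustration, not a rerun of the assignment):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(42)
# synthetic stand-in for the encoded data: one wide-range column plus 0/1 dummies
wide = np.concatenate([rng.normal(0, 1, 100), rng.normal(40, 1, 100)])
dummies = rng.integers(0, 2, size=(200, 5)).astype(float)
X_demo = np.column_stack([wide, dummies])

# standardize so every column contributes comparably to the distance metric
X_scaled = StandardScaler().fit_transform(X_demo)

model = KMeans(n_clusters=2, random_state=42, n_init=10)
labels = model.fit_predict(X_scaled)
print(silhouette_score(X_scaled, labels))
```

Whether scaling helps or hurts the real silhouette numbers would need to be checked empirically, but it removes the arbitrary dependence on column units.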
model = KMeans(n_clusters=2, random_state=42)
model.fit(X)
preds = model.predict(X)
score = metrics.silhouette_score(X, preds)
score
0.5119665918136315
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
data_reduced = pca.fit_transform(X)
data_reduced = pd.DataFrame(data_reduced)
ax = data_reduced.plot(kind='scatter', x=0, y=1, c=preds, cmap='rainbow')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Projection of the clusters onto the first two principal components')
Text(0.5, 1.0, 'Projection of the clusters onto the first two principal components')
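Before reading too much into the 2-D scatter, it is worth checking how much of the total variance the first two components actually capture; if it is small, the projection hides most of the structure. A short sketch (on a random stand-in matrix, since this is only to show the attribute):

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 10))  # stand-in for the encoded feature matrix

pca = PCA(n_components=2)
pca.fit(X_demo)
# fraction of the total variance captured by each of the two components
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())
```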
I originally submitted this assignment with this as the ending. Below, I try TPOT to see whether an automated search over supervised models can do a better job with this data, since clustering alone has not separated it well.
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
C:\Users\musta\anaconda3\lib\site-packages\tpot\builtins\__init__.py:36: UserWarning: Warning: optional dependency `torch` is not available. - skipping import of NN models. warnings.warn("Warning: optional dependency `torch` is not available. - skipping import of NN models.")
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)
%%time
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, n_jobs=-1, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
Generation 1 - Current best internal CV score: 0.2470734126984127 Generation 2 - Current best internal CV score: 0.2470734126984127 Generation 3 - Current best internal CV score: 0.2564484126984127 Generation 4 - Current best internal CV score: 0.2564484126984127 Generation 5 - Current best internal CV score: 0.25957341269841266 Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.6000000000000001, min_samples_leaf=3, min_samples_split=17, n_estimators=100) 0.1518987341772152 Wall time: 2min 29s
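Part of why the classifier scores so poorly (~15% test accuracy) may be that treating each raw G3 grade on the 0-20 scale as its own class gives TPOT a dozen or more tiny classes to separate. One alternative worth trying (my assumption, not part of the assignment) is to bin G3 into pass/fail at the commonly used cutoff of 10 before classifying:

```python
import pandas as pd

# hypothetical illustration: bin the 0-20 final grade into a binary pass/fail target
g3 = pd.Series([5, 6, 10, 15, 14, 9, 20, 0])
passed = (g3 >= 10).astype(int)  # 1 = pass (>= 10), 0 = fail
print(passed.tolist())  # → [0, 0, 1, 1, 1, 0, 1, 0]
```

The same `train_test_split` and TPOT call could then be rerun with `passed` as the target.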
from tpot import TPOTRegressor
%%time
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, n_jobs=-1, scoring='r2', random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
Generation 1 - Current best internal CV score: 0.2673513906772262 Generation 2 - Current best internal CV score: 0.28111326765763855 Generation 3 - Current best internal CV score: 0.28111326765763855 Generation 4 - Current best internal CV score: 0.28509114116828316 Generation 5 - Current best internal CV score: 0.28509114116828316 Best pipeline: AdaBoostRegressor(MinMaxScaler(Normalizer(input_matrix, norm=max)), learning_rate=0.1, loss=square, n_estimators=100) 0.1773028586865094 Wall time: 1min 36s
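For reference, the winning pipeline can be saved with `tpot.export('best_pipeline.py')`, or rebuilt directly in plain scikit-learn. A sketch of the pipeline TPOT reported above, fit on synthetic data (the score here is illustrative only, since the real X and y are not reproduced):

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.ensemble import AdaBoostRegressor

rng = np.random.default_rng(42)
X_demo = rng.normal(size=(200, 5))
y_demo = X_demo @ np.array([1.0, -2.0, 0.5, 0.0, 3.0]) + rng.normal(0, 0.1, 200)

# the pipeline TPOT reported: Normalizer(norm='max') -> MinMaxScaler -> AdaBoost
pipe = make_pipeline(
    Normalizer(norm='max'),
    MinMaxScaler(),
    AdaBoostRegressor(learning_rate=0.1, loss='square',
                      n_estimators=100, random_state=42),
)
pipe.fit(X_demo, y_demo)
print(pipe.score(X_demo, y_demo))  # training R^2 on the synthetic data
```

Rebuilding it this way makes the model easy to inspect and retrain without rerunning the whole TPOT search.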
Well, we still didn't get anything good. I've thrown up my hands a couple of times with this data set at this point, and I refuse to believe this was the intention; I know I'm missing something. I'm submitting this on the last Sunday of the class just to show the continued attempts, but rest assured I will be revisiting it once I'm less frustrated.