k-means cluster analysis of speech and language data
Here we want to carry out a k-means cluster analysis to identify different profiles of participants who have similar speech, language, and cognitive abilities. The data collected consist of two tests of phonological skills, working memory, and perception of familiar and unfamiliar speech sounds.
First, we will import packages and functions we need:
import subprocess
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as shc
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
Next, we need to run the R script that will pre-process the speech data. The script fits linear or non-linear hierarchical models for each of the data sets and extracts the participant-level effects that will serve as our estimates for each participant. The script will write a csv file to our current directory.
call_Rscript = subprocess.call('Rscript extract_estimates.R', shell=True)
call_Rscript
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.6 ✔ purrr 0.3.4
✔ tibble 3.1.7 ✔ dplyr 1.0.9
✔ tidyr 1.2.0 ✔ stringr 1.4.0
✔ readr 2.1.2 ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
Attaching package: ‘nlme’
The following object is masked from ‘package:dplyr’:
collapse
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
The following object is masked from ‘package:purrr’:
transpose
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Attaching package: ‘lme4’
The following object is masked from ‘package:nlme’:
lmList
0
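The 0 returned by subprocess.call tells us the script exited without an error. As a small defensive sketch (assuming, as described above, that the script writes speech_data.csv to the current directory), we could confirm the exit status and the presence of the file before trying to read it:
import os
# Stop early if the R script failed or did not produce the expected file
if call_Rscript != 0:
    raise RuntimeError('extract_estimates.R exited with a non-zero status')
if not os.path.exists('speech_data.csv'):
    raise FileNotFoundError('speech_data.csv was not found in the current directory')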
Now we will read in the file that we just wrote as a data frame and look at the first few rows:
speech_data = pd.read_csv('speech_data.csv', sep=',', header=0)
speech_data.head()
|  | Unnamed: 0 | Subject | slope_ba_da | consistency_ba_da | slope_s_sh | consistency_s_sh | dprime |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 101 | 0.792550 | -1.124428 | 0.917763 | -0.702372 | 0.611972 |
| 1 | 2 | 102 | 0.464940 | -1.240745 | 0.450287 | -1.211086 | 1.067259 |
| 2 | 3 | 103 | 1.215431 | -1.179974 | 0.692098 | -0.799731 | 0.691741 |
| 3 | 4 | 104 | 0.506354 | -0.978700 | 0.867693 | -1.885947 | 0.057793 |
| 4 | 5 | 106 | 0.374636 | -0.451488 | 0.646777 | -1.194057 | 1.189263 |
It looks like it has an extra column called "Unnamed: 0" that we don't need, so we'll get rid of that:
speech_data = speech_data.drop(columns=['Unnamed: 0'])
speech_data.head()
|  | Subject | slope_ba_da | consistency_ba_da | slope_s_sh | consistency_s_sh | dprime |
|---|---|---|---|---|---|---|
| 0 | 101 | 0.792550 | -1.124428 | 0.917763 | -0.702372 | 0.611972 |
| 1 | 102 | 0.464940 | -1.240745 | 0.450287 | -1.211086 | 1.067259 |
| 2 | 103 | 1.215431 | -1.179974 | 0.692098 | -0.799731 | 0.691741 |
| 3 | 104 | 0.506354 | -0.978700 | 0.867693 | -1.885947 | 0.057793 |
| 4 | 106 | 0.374636 | -0.451488 | 0.646777 | -1.194057 | 1.189263 |
This looks better. Now we'll read in the rest of the data from a different file and rename the column "subject_nr" to "Subject" to match the other data frame. This column name needs to match so we can merge the two data frames on it.
tests = pd.read_csv('phon_wm_scores.csv',sep=',', header=0)
tests = tests.rename({'subject_nr': 'Subject'}, axis='columns')
tests.head()
|  | Subject | nonword_repetition | sound_blending | num_reversed | aud_wm |
|---|---|---|---|---|---|
| 0 | 101 | 8 | 29 | 12 | 29 |
| 1 | 102 | 7 | 25 | 15 | 35 |
| 2 | 103 | 7 | 32 | 11 | 25 |
| 3 | 104 | 7 | 29 | 18 | 35 |
| 4 | 106 | 7 | 25 | 13 | 29 |
Now we merge the data frames and set the Subject column as the index column:
all_measures = pd.merge(tests, speech_data, on='Subject')
all_measures = all_measures.set_index('Subject')
all_measures.head()
| Subject | nonword_repetition | sound_blending | num_reversed | aud_wm | slope_ba_da | consistency_ba_da | slope_s_sh | consistency_s_sh | dprime |
|---|---|---|---|---|---|---|---|---|---|
| 101 | 8 | 29 | 12 | 29 | 0.792550 | -1.124428 | 0.917763 | -0.702372 | 0.611972 |
| 102 | 7 | 25 | 15 | 35 | 0.464940 | -1.240745 | 0.450287 | -1.211086 | 1.067259 |
| 103 | 7 | 32 | 11 | 25 | 1.215431 | -1.179974 | 0.692098 | -0.799731 | 0.691741 |
| 104 | 7 | 29 | 18 | 35 | 0.506354 | -0.978700 | 0.867693 | -1.885947 | 0.057793 |
| 106 | 7 | 25 | 13 | 29 | 0.374636 | -0.451488 | 0.646777 | -1.194057 | 1.189263 |
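Because pd.merge defaults to an inner join, any participant who appears in only one of the two files is dropped silently. A quick check (just a sketch) is to compare the row counts before and after the merge:
# If all_measures has fewer rows than either input data frame,
# some participants were present in only one file and were dropped
print(len(tests), len(speech_data), len(all_measures))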
Cluster analyses work better with scaled data, so that measures on very different scales don't dominate the distance calculations. We will standardize the scores with the formula y = (y - mean) / sd, using the StandardScaler function.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(all_measures)
all_measures_scaled = pd.DataFrame(scaled_features, index=all_measures.index, columns=all_measures.columns)
all_measures_scaled.head()
| Subject | nonword_repetition | sound_blending | num_reversed | aud_wm | slope_ba_da | consistency_ba_da | slope_s_sh | consistency_s_sh | dprime |
|---|---|---|---|---|---|---|---|---|---|
| 101 | -0.501848 | 0.448155 | -1.269763 | -0.959053 | 0.642975 | -0.053508 | 1.638610 | 1.021420 | -0.198564 |
| 102 | -0.898832 | -0.682902 | -0.449063 | 0.402460 | -0.405124 | -0.369450 | -1.824150 | -0.297725 | 0.457181 |
| 103 | -0.898832 | 1.296447 | -1.543330 | -1.866728 | 1.995866 | -0.204383 | -0.032967 | 0.768960 | -0.083674 |
| 104 | -0.898832 | 0.448155 | 0.371638 | 0.402460 | -0.272631 | 0.342319 | 1.267726 | -2.047707 | -0.996743 |
| 106 | -0.898832 | -0.682902 | -0.996196 | -0.959053 | -0.694028 | 1.774340 | -0.368678 | -0.253567 | 0.632902 |
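As a quick sanity check (a sketch, not part of the analysis itself), each scaled column should now have a mean of about 0 and a standard deviation of about 1; note that StandardScaler divides by the population standard deviation (ddof=0):
# Column means should be ~0 and standard deviations ~1 after scaling
print(all_measures_scaled.mean().round(3))
print(all_measures_scaled.std(ddof=0).round(3))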
To determine the number of clusters we want, we'll use the elbow method. To do this, we will first run a principal components analysis and plot the variance explained by each component in a scree plot. The point where the line bends and levels off (i.e., where additional components explain little extra variance) tells us how many components to select.
#define PCA model to use
pca = PCA(n_components=9)
#fit PCA model to data
pca_fit = pca.fit(all_measures_scaled)
PC_values = np.arange(pca.n_components_) + 1
plt.style.use("fivethirtyeight")
plt.figure(figsize=(5, 4))
plt.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()
From the scree plot, it looks like we don't explain much more of the variance with more than four components, though we also see a bend at two. Since our data set is pretty small, we'll go with two clusters so we don't end up with too few participants in a cluster.
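A complementary way to pick the number of clusters (shown here only as a sketch) is the classic k-means elbow: fit the model for a range of values of k and plot the within-cluster sum of squares (the model's inertia_), again looking for the point where the curve levels off. The random_state below is an arbitrary choice added for reproducibility.
# Fit k-means for k = 1..8 and record the within-cluster sum of squares
inertias = []
k_values = range(1, 9)
for k in k_values:
    km = KMeans(init='random', n_clusters=k, n_init=10, random_state=42)
    km.fit(all_measures_scaled)
    inertias.append(km.inertia_)
plt.figure(figsize=(5, 4))
plt.plot(k_values, inertias, 'o-', linewidth=2, color='blue')
plt.title('K-means elbow plot')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Within-cluster sum of squares')
plt.show()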
# Create a k-means clustering model
kmeans = KMeans(init='random', n_clusters=2, n_init=10)
# Fit the data to the model
kmeans.fit(all_measures_scaled)
# Determine which cluster each data point belongs to:
clusters = kmeans.predict(all_measures_scaled)
# Add cluster number to the original data
all_measures_scaled_clustered = pd.DataFrame(all_measures_scaled, columns=all_measures_scaled.columns, index=all_measures_scaled.index)
all_measures_scaled_clustered['cluster'] = clusters
all_measures_scaled_clustered.head()
| Subject | nonword_repetition | sound_blending | num_reversed | aud_wm | slope_ba_da | consistency_ba_da | slope_s_sh | consistency_s_sh | dprime | cluster |
|---|---|---|---|---|---|---|---|---|---|---|
| 101 | -0.501848 | 0.448155 | -1.269763 | -0.959053 | 0.642975 | -0.053508 | 1.638610 | 1.021420 | -0.198564 | 1 |
| 102 | -0.898832 | -0.682902 | -0.449063 | 0.402460 | -0.405124 | -0.369450 | -1.824150 | -0.297725 | 0.457181 | 1 |
| 103 | -0.898832 | 1.296447 | -1.543330 | -1.866728 | 1.995866 | -0.204383 | -0.032967 | 0.768960 | -0.083674 | 1 |
| 104 | -0.898832 | 0.448155 | 0.371638 | 0.402460 | -0.272631 | 0.342319 | 1.267726 | -2.047707 | -0.996743 | 1 |
| 106 | -0.898832 | -0.682902 | -0.996196 | -0.959053 | -0.694028 | 1.774340 | -0.368678 | -0.253567 | 0.632902 | 1 |
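One quick way to check that a two-cluster solution is reasonable (again, just a sketch) is the average silhouette score from sklearn.metrics, which ranges from -1 to 1, with higher values indicating better-separated clusters:
from sklearn.metrics import silhouette_score
# Average silhouette width across all participants for the k = 2 solution
sil = silhouette_score(all_measures_scaled, clusters)
print(f'Mean silhouette score for k = 2: {sil:.3f}')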
# Create a PCA model to reduce our data to 2 dimensions for visualisation
pca = PCA(n_components=2)
pca.fit(all_measures_scaled)
# Transform the scaled data into the new PCA space
measures_reduced = pca.transform(all_measures_scaled)
# Convert to a data frame
measures_reduceddf = pd.DataFrame(measures_reduced, index=all_measures_scaled.index, columns=['PC1','PC2'])
measures_reduceddf['cluster'] = clusters
measures_reduceddf.head()
| Subject | PC1 | PC2 | cluster |
|---|---|---|---|
| 101 | -1.116364 | -0.440510 | 1 |
| 102 | -0.405248 | -0.648170 | 1 |
| 103 | -1.681884 | -1.272908 | 1 |
| 104 | -0.932461 | 1.146899 | 1 |
| 106 | -1.133984 | -2.090004 | 1 |
plt.style.use("fivethirtyeight")
plt.figure(figsize=(5, 4))
scat = sns.scatterplot(
data=measures_reduceddf,
x="PC1",
y="PC2",
hue="cluster",
style="cluster",
palette="Set2",
)
Here we can see the clusters of participants who show similar patterns on the cognitive, speech, and language measures we gathered.
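To get a sense of what distinguishes the two profiles, one option (a sketch that goes beyond the plot above) is to compare the mean scaled score on each measure by cluster; positive values are above the sample mean and negative values below it:
# Mean of each standardized measure within each cluster
cluster_profiles = all_measures_scaled_clustered.groupby('cluster').mean()
print(cluster_profiles.round(2))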