Data Science tutorial by Seth Gregory
Heart disease is one of the deadliest diseases in the world. While it encompasses a wide variety of illnesses, it is the leading cause of death for both men and women, and for most demographic groups, in the United States. Roughly 1 in 4 American deaths every year is from cardiovascular disease, and one person dies from it about every 36 seconds. So why try to predict heart disease? The more quickly and reliably we can catch heart disease, the sooner it can be treated and monitored to reduce the risk of further injury or death. While heart disease seems to strike indiscriminately, there are often common factors among heart disease patients that may be used to predict its onset. Many of these are lifestyle choices, but in this project we will consider primarily numeric medical measurements and how they might indicate the presence of heart disease. Source
Luckily, there are many open-access online sources of data on heart disease patients at our disposal. Since I would like to focus our analysis on factors other than direct lifestyle choices, I chose to use a dataset from the UCI Machine Learning Repository, which obtained the data from four hospitals: Cleveland, Hungary, Switzerland, and Long Beach, VA.
The dataset was downloaded from Kaggle, giving the file heart.csv. Pandas was used to place the data into a dataframe for our analysis.
# Necessary imports for data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import sklearn
from sklearn.linear_model import LinearRegression
# Set style of matplotlib
plt.style.use("seaborn-darkgrid")
# Input the dataset
data = pd.read_csv('heart.csv')
data.head()
| | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
| 1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
| 2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
| 3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
| 4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
The dataset contains the attributes listed below, with descriptions provided by the source. We'll use a dictionary to keep track of the associated variable names:
variable_names = {
'age': "Age (years)",
'sex': "Sex (0=female, 1=male)",
'cp': "Chest Pain Type",
'trestbps': "Resting blood pressure (mm Hg)",
'chol': "Serum cholestoral (mg/dl)",
'fbs': "Fasting blood sugar > 120 mg/dl",
'restecg': "Resting ECG results",
'thalach': "Maximum heart rate",
'exang': "Exercise induced angina",
'oldpeak': "ST depression induced by exercise related to rest",
'slope': "Peak ST segment slope",
'ca': 'Number of major vessels colored by fluoroscopy',
'thal': 'Thalassemia',
'target': "Has heart disease",
}
We would like to create some sort of model to predict the presence of heart disease in a patient based on the other values in our table. In the context of this data, that means training a model on a subset of the data so that it can accurately categorize any hypothetical new data point as having a 'target' value of either '1' or '0' based on the values of its other attributes.
First, however, we'd like to get a better handle on the data we have and see which attributes tend to correlate with one another. Ideally, we want to find attributes that correlate strongly with 'target' (i.e., that will be good indicators of the presence of heart disease). For our analysis, we have:
Independent Variables: the medical attributes other than 'target', i.e. 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', and 'thal'
Dependent Variable: 'target', the presence of heart disease
However, this classification may prove a little misleading, as it does not capture the full relationship between the variables. More accurately, there may be relationships among the 'independent' variables that suggest they are not entirely independent of one another. In fact, we may suspect this intuitively: for example, 'slope' and 'oldpeak' both relate to the ST segment of the ECG, and we might expect a patient's maximum heart rate to vary with age. We will explore these possibilities below.
We'll begin our exploratory analysis by plotting some histograms to get a basic idea of how patients with and without heart disease are distributed across each of the variables. It is important to distinguish between 'categorical' (discrete) and 'continuous' variables for the purposes of this analysis, which we'll touch on again later in the project.
# Split the variables into 'categorical' and 'continuous'
# Categorical variables
categ_data = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']
# Continuous variables
contin_data = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# Make the histograms for categorical data
plt.figure(figsize=(15,15))
for i, column in enumerate(categ_data, 1):
    plt.subplot(3, 3, i)
    data[data["target"] == 0][column].hist(bins=5, color='blue', label='Does not have Heart Disease', alpha=0.6)
    data[data["target"] == 1][column].hist(bins=5, color='red', label='Has Heart Disease', alpha=0.6)
    plt.legend()
    plt.xlabel(variable_names[column])
# Make the continuous histograms, with a larger number of bins
plt.figure(figsize=(15, 15))
for i, column in enumerate(contin_data, 1):
    plt.subplot(3, 2, i)
    data[data["target"] == 0][column].hist(bins=50, color='blue', label='Does not have Heart Disease', alpha=0.6)
    data[data["target"] == 1][column].hist(bins=50, color='red', label='Has Heart Disease', alpha=0.6)
    plt.legend()
    plt.xlabel(variable_names[column])
In these histograms, we are looking for meaningful 'separations' in each graph between patients with and without heart disease, which would suggest that the variable in question has some correlation with the presence of heart disease. There are several observations:
First, we should note the demographic distribution of our data. We have roughly an equal number of subjects with and without heart disease, so our data is even in that regard. However, our data skews toward male patients over female patients, with a substantially greater percentage of male patients having heart disease than female patients. This would seem to suggest that male patients have a significantly greater rate of heart disease than female patients, but this could be merely a feature of our dataset.
As for patterns we notice: a significant majority of patients with heart disease are separated from patients without heart disease on several attributes. This would suggest that, based on our sample, the variables 'exang', 'oldpeak', 'thalach', 'thal', 'ca', and 'slope' might be good indicators of heart disease in a patient. Before moving forward with our analysis, let's check whether any of these variables correlate with one another by using a heatmap.
corr = data.corr()
plt.subplots(figsize=(13,10))
hm = sn.heatmap(corr, annot=True)
hm.set_title("Correlation Matrix between Columns")
hm.set_xticklabels(hm.get_xticklabels(), rotation=45, horizontalalignment='right')
hm.set_yticklabels(hm.get_yticklabels(), rotation=45, horizontalalignment='right')
plt.show()
The correlation matrix corroborates some observations we already made: the variables that most correlate with 'target' are 'cp', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', and 'thal'. The correlation values aren't particularly large in absolute terms, but they are large relative to the rest of the dataset. So, we would like to examine the correlation between these variables and 'target' further with R² tests.
First, however, we must take note of the correlation between the 'independent' variables: 'cp' and 'thal' do not correlate particularly strongly with variables other than 'target', but 'ca', 'thalach', 'exang', and 'slope' all have relatively high correlation with one another, equal to or even greater than their correlation with 'target', so we should consider examining these together with 'target'.
For further analysis, we would like to examine the strength of the relationship between each independent variable and the dependent variable. Using a significance level of 0.05, we will determine from each test whether we can claim a statistically significant relationship between the pair of variables.
First, we will examine the relationship between each pair of ('cp', 'thalach', 'exang', 'oldpeak') to verify that we should consider these together in our test with 'target'.
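Before walking through the individual fits, a compact way to survey all of the pairs is to loop over them and collect the R² and p-value from each regression. This is just a sketch: it uses the same intercept-free sm.OLS fit as the individual summaries below (so the R² values are the 'uncentered' ones), and the loop itself is not part of the original analysis.
import itertools
import statsmodels.api as sm
# Collect the uncentered R-squared and coefficient p-value for every ordered pair
pair_vars = ['cp', 'thalach', 'exang', 'oldpeak']
rows = []
for x_col, y_col in itertools.permutations(pair_vars, 2):
    fit = sm.OLS(data[y_col], data[x_col]).fit()   # no intercept, matching the fits below
    rows.append({'independent': x_col, 'dependent': y_col,
                 'r_squared': fit.rsquared, 'p_value': fit.pvalues[x_col]})
pd.DataFrame(rows).sort_values('r_squared', ascending=False)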
import statsmodels.api as sm
# Independent
X = data['cp']
# Dependent
y = data['exang']
model = sm.OLS(y,X).fit()
model.summary()
Dep. Variable: | exang | R-squared (uncentered): | 0.024 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.021 |
Method: | Least Squares | F-statistic: | 7.461 |
Date: | Mon, 17 May 2021 | Prob (F-statistic): | 0.00668 |
Time: | 20:18:32 | Log-Likelihood: | -256.77 |
No. Observations: | 303 | AIC: | 515.5 |
Df Residuals: | 302 | BIC: | 519.3 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| cp | 0.0628 | 0.023 | 2.731 | 0.007 | 0.018 | 0.108 |
Omnibus: | 4406.815 | Durbin-Watson: | 1.311 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 51.572 |
Skew: | 0.716 | Prob(JB): | 6.33e-12 |
Kurtosis: | 1.573 | Cond. No. | 1.00 |
import statsmodels.api as sm
# Independent
X = data['cp']
# Dependent
y = data['oldpeak']
model = sm.OLS(y,X).fit()
model.summary()
Dep. Variable: | oldpeak | R-squared (uncentered): | 0.141 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.138 |
Method: | Least Squares | F-statistic: | 49.70 |
Date: | Mon, 17 May 2021 | Prob (F-statistic): | 1.22e-11 |
Time: | 20:18:32 | Log-Likelihood: | -541.03 |
No. Observations: | 303 | AIC: | 1084. |
Df Residuals: | 302 | BIC: | 1088. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| cp | 0.4142 | 0.059 | 7.050 | 0.000 | 0.299 | 0.530 |
Omnibus: | 55.440 | Durbin-Watson: | 1.189 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 83.289 |
Skew: | 1.123 | Prob(JB): | 8.21e-19 |
Kurtosis: | 4.247 | Cond. No. | 1.00 |
import statsmodels.api as sm
# Independent
X = data['exang']
# Dependent
y = data['oldpeak']
model = sm.OLS(y,X).fit()
model.summary()
Dep. Variable: | oldpeak | R-squared (uncentered): | 0.311 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.309 |
Method: | Least Squares | F-statistic: | 136.3 |
Date: | Mon, 17 May 2021 | Prob (F-statistic): | 3.02e-26 |
Time: | 20:18:32 | Log-Likelihood: | -507.66 |
No. Observations: | 303 | AIC: | 1017. |
Df Residuals: | 302 | BIC: | 1021. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| exang | 1.5192 | 0.130 | 11.676 | 0.000 | 1.263 | 1.775 |
Omnibus: | 50.583 | Durbin-Watson: | 1.561 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 95.334 |
Skew: | 0.897 | Prob(JB): | 1.99e-21 |
Kurtosis: | 5.081 | Cond. No. | 1.00 |
import statsmodels.api as sm
# Independent
X = data['oldpeak']
# Dependent
y = data['thalach']
model = sm.OLS(y,X).fit()
model.summary()
Dep. Variable: | thalach | R-squared (uncentered): | 0.386 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.384 |
Method: | Least Squares | F-statistic: | 189.9 |
Date: | Mon, 17 May 2021 | Prob (F-statistic): | 7.54e-34 |
Time: | 20:18:32 | Log-Likelihood: | -1877.0 |
No. Observations: | 303 | AIC: | 3756. |
Df Residuals: | 302 | BIC: | 3760. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| oldpeak | 60.4062 | 4.384 | 13.779 | 0.000 | 51.780 | 69.033 |
Omnibus: | 39.033 | Durbin-Watson: | 0.722 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 50.086 |
Skew: | -0.951 | Prob(JB): | 1.33e-11 |
Kurtosis: | 3.589 | Cond. No. | 1.00 |
import statsmodels.api as sm
# Independent
X = data['thalach']
# Dependent
y = data['cp']
model = sm.OLS(y,X).fit()
model.summary()
Dep. Variable: | cp | R-squared (uncentered): | 0.503 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.501 |
Method: | Least Squares | F-statistic: | 305.4 |
Date: | Mon, 17 May 2021 | Prob (F-statistic): | 9.78e-48 |
Time: | 20:18:32 | Log-Likelihood: | -428.85 |
No. Observations: | 303 | AIC: | 859.7 |
Df Residuals: | 302 | BIC: | 863.4 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| thalach | 0.0066 | 0.000 | 17.474 | 0.000 | 0.006 | 0.007 |
Omnibus: | 89.179 | Durbin-Watson: | 1.786 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 28.345 |
Skew: | 0.529 | Prob(JB): | 7.00e-07 |
Kurtosis: | 1.938 | Cond. No. | 1.00 |
import statsmodels.api as sm
# Independent
X = data['thalach']
# Dependent
y = data['exang']
model = sm.OLS(y,X).fit()
model.summary()
Dep. Variable: | exang | R-squared (uncentered): | 0.268 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.266 |
Method: | Least Squares | F-statistic: | 110.8 |
Date: | Mon, 17 May 2021 | Prob (F-statistic): | 2.80e-22 |
Time: | 20:18:32 | Log-Likelihood: | -213.12 |
No. Observations: | 303 | AIC: | 428.2 |
Df Residuals: | 302 | BIC: | 431.9 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| thalach | 0.0020 | 0.000 | 10.526 | 0.000 | 0.002 | 0.002 |
Omnibus: | 15133.636 | Durbin-Watson: | 1.691 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 52.972 |
Skew: | 0.731 | Prob(JB): | 3.14e-12 |
Kurtosis: | 1.565 | Cond. No. | 1.00 |
From our R² analysis of the relationships between 'cp', 'exang', 'oldpeak', and 'thalach', all but two of the pairs had R² values over 25%, indicating statistically significant relationships between these variables that we should account for when calculating their correlation with the presence of heart disease. The reason we do this is so that we can more confidently say how much of the variation in heart disease is due to variation in our input variables. If we did not account for these interactions between the 'independent' variables, this would be much more uncertain. For example, if we only considered the correlation between chest pain and heart disease, we would not account for how much chest pain may in turn be explained by maximum heart rate, which our tests suggest may be as much as 50%. That would hurt our ability to truly understand the relationship between any one of these independent variables and the presence of heart disease. Thus, by considering all of these variables together with heart disease, we can more generally analyze how much of the variation in the presence of heart disease is due to all of these factors together, taking the interactions between the input variables into account as well.
import statsmodels.api as sm
# Independent
X = data[['cp', 'exang', 'oldpeak', 'thalach']]
# Dependent
y = data['target']
model = sm.OLS(y,X).fit()
model.summary()
Dep. Variable: | target | R-squared (uncentered): | 0.722 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.719 |
Method: | Least Squares | F-statistic: | 194.5 |
Date: | Mon, 17 May 2021 | Prob (F-statistic): | 6.80e-82 |
Time: | 20:18:32 | Log-Likelihood: | -143.71 |
No. Observations: | 303 | AIC: | 295.4 |
Df Residuals: | 299 | BIC: | 310.3 |
Df Model: | 4 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| cp | 0.1292 | 0.024 | 5.360 | 0.000 | 0.082 | 0.177 |
| exang | -0.1928 | 0.052 | -3.698 | 0.000 | -0.295 | -0.090 |
| oldpeak | -0.1177 | 0.020 | -5.975 | 0.000 | -0.156 | -0.079 |
| thalach | 0.0040 | 0.000 | 14.678 | 0.000 | 0.003 | 0.005 |
Omnibus: | 12.000 | Durbin-Watson: | 0.831 |
---|---|---|---|
Prob(Omnibus): | 0.002 | Jarque-Bera (JB): | 12.388 |
Skew: | -0.469 | Prob(JB): | 0.00204 |
Kurtosis: | 2.679 | Cond. No. | 359. |
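As an aside, the interactions discussed above could also be modeled explicitly. Below is a minimal sketch using the statsmodels formula API; the specific formula (an interaction between 'cp' and 'thalach') and the intercept it adds are my own choices for illustration, not part of the analysis above.
import statsmodels.formula.api as smf
# 'cp * thalach' expands to cp + thalach + cp:thalach (their interaction term);
# unlike the fits above, the formula API also includes an intercept by default
interaction_model = smf.ols('target ~ cp * thalach + exang + oldpeak', data=data).fit()
interaction_model.summary()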
Finally, we would also like to consider the correlations between ca and target, and thal and target:
import statsmodels.api as sm
# Independent
X = data['ca']
# Dependent
y = data['target']
model = sm.OLS(y,X).fit()
model.summary()
Dep. Variable: | target | R-squared (uncentered): | 0.046 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.043 |
Method: | Least Squares | F-statistic: | 14.48 |
Date: | Mon, 17 May 2021 | Prob (F-statistic): | 0.000172 |
Time: | 20:18:32 | Log-Likelihood: | -330.77 |
No. Observations: | 303 | AIC: | 663.5 |
Df Residuals: | 302 | BIC: | 667.2 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| ca | 0.1258 | 0.033 | 3.805 | 0.000 | 0.061 | 0.191 |
Omnibus: | 1664.957 | Durbin-Watson: | 0.051 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 43.452 |
Skew: | -0.207 | Prob(JB): | 3.67e-10 |
Kurtosis: | 1.192 | Cond. No. | 1.00 |
import statsmodels.api as sm
# Independent
X = data['thal']
# Dependent
y = data['target']
model = sm.OLS(y,X).fit()
model.summary()
Dep. Variable: | target | R-squared (uncentered): | 0.428 |
---|---|---|---|
Model: | OLS | Adj. R-squared (uncentered): | 0.426 |
Method: | Least Squares | F-statistic: | 225.9 |
Date: | Mon, 17 May 2021 | Prob (F-statistic): | 1.66e-38 |
Time: | 20:18:32 | Log-Likelihood: | -253.25 |
No. Observations: | 303 | AIC: | 508.5 |
Df Residuals: | 302 | BIC: | 512.2 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| thal | 0.2017 | 0.013 | 15.030 | 0.000 | 0.175 | 0.228 |
Omnibus: | 1647.199 | Durbin-Watson: | 0.087 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 43.904 |
Skew: | -0.207 | Prob(JB): | 2.93e-10 |
Kurtosis: | 1.182 | Cond. No. | 1.00 |
Now that we have done a bit of analysis on factors that might help us predict the presence of heart disease, let's turn to several machine learning models to implement this prediction. Machine learning is, generally speaking, a term for models that learn from data and make decisions with minimal human interaction. For the purposes of 'teaching' the models, we provide a randomly selected subset of our data as "training data" and use the unselected data as "test data." This is to ensure that the model isn't "overfitted" to our specific training data, i.e. a model that works great for our sample but doesn't generalize well to other data sets. More info on overfitting here. For the purposes of our models, we'll use 80% of the sample as a training set and 20% as a test set.
One thing we must first consider in building our models is how to handle our categorical data. For the purposes of a regression model, we need to change each of our categorical variables into "dummy" variables. Though we may represent something like chest pain type as a number between 0 and 3, there is no mathematical reason why, for example, 2 * chest_pain_1 should equal chest_pain_2; yet our regression model will infer this sort of numerical relationship between the values unless we change the way the data is stored. We do this by adding a "dummy" column for each possible value of each categorical variable, containing a '1' or '0' to indicate whether that category applies. This is displayed below.
Furthermore, since each of our variables has different scales, and we primarily want to compare the variations between them, we should also normalize our data before training our models on it.
from sklearn.model_selection import train_test_split
data2 = data.copy()
# Make dummy variable columns
cp_dummies = pd.get_dummies(data2['cp'], prefix='cp')
thal_dummies = pd.get_dummies(data2['thal'], prefix='thal')
slope_dummies = pd.get_dummies(data2['slope'], prefix='slope')
dummy_columns = [data2, cp_dummies, thal_dummies, slope_dummies]
data_cp = pd.concat(dummy_columns, axis=1)
data2 = data_cp.drop(columns=['cp','thal','slope'])
# Determine input and output variables
# (keep the full dataframe as y so that 'target' and the original 'cp' column can be recovered for the test rows later)
y = data_cp
x = data2.copy().drop(['target'], axis=1)
# Normalize the data
x = (x - np.min(x)) / (np.max(x) - np.min(x)).values
# Split data into test set and train set
x_train, x_test, y_train_t, y_test_t = train_test_split(x, y, test_size = 0.2, random_state=0)
y_train = y_train_t['target'].values
y_test = y_test_t['target'].values
# Show new dummy-ized dataset
data2.head()
| | age | sex | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | ca | ... | cp_1 | cp_2 | cp_3 | thal_0 | thal_1 | thal_2 | thal_3 | slope_0 | slope_1 | slope_2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63 | 1 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 1 | 37 | 1 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 2 | 41 | 0 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 3 | 56 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 4 | 57 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
5 rows × 22 columns
First, we'll use a simple model called logistic regression. This model optimizes an equation that takes in a set of input variables and uses them to predict our output variable, attempting to minimize 'loss': the difference between what our model predicts the values to be and what they actually are. In this context, it will be a measure of how often we correctly identify whether or not a patient has heart disease.
from sklearn.linear_model import LogisticRegression
import seaborn as sns
model = LogisticRegression().fit(x_train, y_train)
y_predict = model.predict(x_test)
residuals = y_predict - y_test
plt.figure(figsize=(13,7))
reg_violin_plot = sns.violinplot(x=y_test_t['cp'], y=residuals, palette='rocket')
reg_violin_plot.set(xlabel="Chest pain type",
ylabel="Heart Disease logistic regression model residual",
title='Heart Disease Logistic Regression Model Residuals by Chest Pain Type')
plt.show()
Above is a violin plot showing the range of our residuals (the difference between actual and predicted values) for the logistic regression model, separated by the type of chest pain. The average residual is around zero, but there are a few interesting observations. First, this model has a number of outliers, which suggests that our logistic regression prediction could be better. Second, for patients who displayed chest pain of type 3 (asymptomatic chest pain, unrelated to disease), our model was much less accurate at predicting heart disease. Intuitively this makes some sense: we would expect patients who already demonstrate chest pain indicative of some sort of angina to be more clearly at risk of heart disease.
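To quantify that last observation, we could compute the accuracy within each chest pain type on the test set. This is a quick sketch, not part of the model comparison below; it reuses the untransformed 'cp' column kept in y_test_t.
# Accuracy of the logistic regression predictions within each chest pain type
per_cp = pd.DataFrame({'cp': y_test_t['cp'].values, 'correct': (y_predict == y_test)})
print(per_cp.groupby('cp')['correct'].mean())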
# Imports
from sklearn.metrics import accuracy_score
test_score = accuracy_score(y_test, y_predict) * 100
train_score = accuracy_score(y_train, model.predict(x_train)) * 100
# Analyze the accuracy of our model
results = pd.DataFrame(data=[["Logistic Regression", train_score, test_score]], columns=['Model', 'Train Accuracy %', 'Test Accuracy %'])
results
| | Model | Train Accuracy % | Test Accuracy % |
|---|---|---|---|
| 0 | Logistic Regression | 85.950413 | 86.885246 |
According to our accuracy score, our model had an 85.95% accuracy rate on the training set, and an 86.89% accuracy rate on the test set. This suggests that our model was fairly effective at predicting the presence of heart disease in patients. However, let's see if we can come up with an even better model, using some other machine learning models:
K-Nearest Neighbors is a classification model that tries to predict the value of an output based on similar "points" in the dataset "close-by" to it. While it's a bit difficult to visualize what this might look like in higher dimensions, in theory a multi-dimensional scatter plot of our input data will demonstrate some type of grouping that separates patients with heart disease from those without, and the k-nearest neighbors model will use a measure of "distance" to determine the class of each test input.
from sklearn.neighbors import KNeighborsClassifier
# Initialize and train the model
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
# Calculate residuals
y_predict = knn.predict(x_test)
residuals = y_predict - y_test
# Make the violin plot
plt.figure(figsize=(13,7))
reg_violin_plot = sns.violinplot(x=y_test_t['cp'], y=residuals, palette='rocket')
reg_violin_plot.set(xlabel="Chest pain type",
ylabel="Heart Disease k-nearest neighbors model residual",
title='Heart Disease K-Nearest Neighbors Model Residuals by Chest Pain Type')
plt.show()
# Calculate accuracy score
test_score = accuracy_score(y_test, y_predict) * 100
train_score = accuracy_score(y_train, knn.predict(x_train)) * 100
results = results.append(pd.DataFrame(data=[["K-nearest neighbors", train_score, test_score]], columns=['Model', 'Train Accuracy %', 'Test Accuracy %']), ignore_index=True)
results
| | Model | Train Accuracy % | Test Accuracy % |
|---|---|---|---|
| 0 | Logistic Regression | 85.950413 | 86.885246 |
| 1 | K-nearest neighbors | 83.057851 | 85.245902 |
According to our accuracy score, the k-nearest neighbors model performed slightly worse than the logistic regression model, with an accuracy of 85.25% on the test set. One reason for this might be the large number of input variables in our dataset; with more input variables (i.e. more dimensions), "distance" becomes a less meaningful way to separate points.
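One knob worth checking is the number of neighbors, which scikit-learn defaults to 5. The sweep below is only a sketch, not part of the comparison above; the candidate values and the use of 5-fold cross-validation on the training set are arbitrary choices.
from sklearn.model_selection import cross_val_score
# Compare cross-validated training accuracy for a few neighbor counts
for k in [3, 5, 7, 9, 11, 15]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train, y_train, cv=5)
    print(f"k={k}: mean CV accuracy = {scores.mean():.3f}")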
Next we will consider a decision tree model. A decision tree is made up of nodes branching out from a single root node; at each node, we decide which way to move down the tree based on the values of the input variables. In this way, the decision tree repeatedly "splits" the data by "asking questions" until it classifies the input as either having heart disease or not. For this decision tree, we choose a max_depth of 10, meaning the tree will grow at most 10 levels deep before deciding on a classification.
# Imports
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# Build the decision tree model and train it
dtree = DecisionTreeClassifier(random_state=0,max_depth=10)
dtree.fit(x_train, y_train)
# Display the tree
fig, ax = plt.subplots(figsize=(30,12))
tree.plot_tree(dtree, fontsize=12, feature_names=list(x_train.columns), max_depth=3)
plt.show()
# Calculate the residuals
y_predict = dtree.predict(x_test)
residuals = y_predict - y_test
# Make a violin plot of the residuals
plt.figure(figsize=(13,7))
reg_violin_plot = sns.violinplot(x=y_test_t['cp'], y=residuals, palette='rocket')
reg_violin_plot.set(xlabel="Chest pain type",
ylabel="Heart Disease decision tree model residual",
title='Heart Disease Decision Tree Model Residuals by Chest Pain Type')
plt.show()
# Calculate the accuracy score
test_score = accuracy_score(y_test, y_predict) * 100
train_score = accuracy_score(y_train, dtree.predict(x_train)) * 100
results = results.append(pd.DataFrame(data=[["Decision Tree Classifier", train_score, test_score]], columns=['Model', 'Train Accuracy %', 'Test Accuracy %']), ignore_index=True)
results
| | Model | Train Accuracy % | Test Accuracy % |
|---|---|---|---|
| 0 | Logistic Regression | 85.950413 | 86.885246 |
| 1 | K-nearest neighbors | 83.057851 | 85.245902 |
| 2 | Decision Tree Classifier | 100.000000 | 78.688525 |
In this case, our model achieved 100% accuracy on the training data but only about 78% accuracy on the test data. The residuals also seem to include more outliers. This may suggest that our model is overfitted to the training data, though it may also suggest that a decision tree is simply not the best choice of model in this case, since lowering the number of levels did not improve the accuracy.
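To check how the depth affects this, we could sweep max_depth directly and compare training and test accuracy. This is a sketch; the specific depths are arbitrary and this check is not part of the comparison table.
# Compare train vs. test accuracy as the tree is allowed to grow deeper
for depth in [2, 3, 4, 5, 7, 10]:
    t = DecisionTreeClassifier(random_state=0, max_depth=depth).fit(x_train, y_train)
    print(f"max_depth={depth}: train={t.score(x_train, y_train):.3f}, test={t.score(x_test, y_test):.3f}")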
Next we will try another model, the support vector machine (SVM) classifier. This classification method focuses on finding a 'dividing' boundary that separates the classes. In particular, this model is effective at dealing with high-dimensional data, so we might expect it to perform better.
from sklearn.svm import SVC
# Build and train the SVM model
svm = SVC(kernel='rbf', gamma=0.1, C=1.0)
svm.fit(x_train, y_train)
# Calculate the residuals
y_predict = svm.predict(x_test)
residuals = y_predict - y_test
# Make the violin plot of residuals
plt.figure(figsize=(13,7))
reg_violin_plot = sns.violinplot(x=y_test_t['cp'], y=residuals, palette='rocket')
reg_violin_plot.set(xlabel="Chest pain type",
ylabel="Heart Disease support vector machine model residual",
title='Heart Disease Support Vector Machine Model Residuals by Chest Pain Type')
plt.show()
# Calculate the accuracy score
test_score = accuracy_score(y_test, y_predict) * 100
train_score = accuracy_score(y_train, svm.predict(x_train)) * 100
results = results.append(pd.DataFrame(data=[["Support Vector Classifier", train_score, test_score]], columns=['Model', 'Train Accuracy %', 'Test Accuracy %']), ignore_index=True)
results
| | Model | Train Accuracy % | Test Accuracy % |
|---|---|---|---|
| 0 | Logistic Regression | 85.950413 | 86.885246 |
| 1 | K-nearest neighbors | 83.057851 | 85.245902 |
| 2 | Decision Tree Classifier | 100.000000 | 78.688525 |
| 3 | Support Vector Classifier | 85.537190 | 86.885246 |
According to our accuracy score, the SVM model performed about as well as the logistic regression model, with a test accuracy of 86.89%. The residuals also look much the same as in the regression model. This similarity may be due to the relatively small size of the dataset (fewer than 500 patients).
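Note that the gamma and C values above were fixed by hand. A small grid search could show whether a different kernel width or regularization strength helps; the sketch below uses arbitrary candidate values and is not part of the comparison table.
from sklearn.model_selection import GridSearchCV
# Cross-validated search over a few candidate (C, gamma) pairs for the RBF kernel
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)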
Lastly, we'll try the random forest classifier. A random forest consists of a "forest" of decision trees, each built from randomly selected observations and features rather than the full input, with the results of all of the trees averaged at the end. In this way, it lessens the risk of overfitting that a single decision tree brings. For our forest, we'll choose an n_estimators value of 1000, which means we'll build 1000 decision trees and average their results. Generally speaking, increasing this parameter improves the result but increases the computation time.
from sklearn.ensemble import RandomForestClassifier
# Build the random forest model and train
rf = RandomForestClassifier(n_estimators=1000, random_state=18)
rf.fit(x_train, y_train)
# Calculate the residuals
y_predict = rf.predict(x_test)
residuals = y_predict - y_test
# Make violin plot of the residuals
plt.figure(figsize=(13,7))
reg_violin_plot = sns.violinplot(x=y_test_t['cp'], y=residuals, palette='rocket')
reg_violin_plot.set(xlabel="Chest pain type",
ylabel="Heart Disease random forest model residual",
title='Heart Disease Random Forest Model Residuals by Chest Pain Type')
plt.show()
# Calculate the accuracy score
test_score = accuracy_score(y_test, y_predict) * 100
train_score = accuracy_score(y_train, rf.predict(x_train)) * 100
results = results.append(pd.DataFrame(data=[["Random Forest Classifier", train_score, test_score]], columns=['Model', 'Train Accuracy %', 'Test Accuracy %']), ignore_index=True)
results
| | Model | Train Accuracy % | Test Accuracy % |
|---|---|---|---|
| 0 | Logistic Regression | 85.950413 | 86.885246 |
| 1 | K-nearest neighbors | 83.057851 | 85.245902 |
| 2 | Decision Tree Classifier | 100.000000 | 78.688525 |
| 3 | Support Vector Classifier | 85.537190 | 86.885246 |
| 4 | Random Forest Classifier | 100.000000 | 90.163934 |
The random forest classifier has our best accuracy, 90.16% on the test data. This can likely be attributed to the randomized nature of the forest and the large number of estimators used in building the model.
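Since the random forest performed best, its built-in feature importances give one rough view of which inputs the model leaned on. This is only a sketch: the importances are impurity-based, were not part of the analysis above, and should be read cautiously with correlated inputs like ours.
# Impurity-based importance of each (normalized, dummy-encoded) input column
importances = pd.Series(rf.feature_importances_, index=x_train.columns).sort_values(ascending=False)
print(importances.head(10))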
Predicting the presence of heart disease is a difficult task, reliant on many different factors that our analysis was not able to consider. For one, heart disease refers to a wide variety of diseases, and it is likely that many of the variables we considered in this project affect different sets of these diseases in different ways. There are also many commonly considered risk factors that this project did not take into account: some of the best-known indicators are smoking, diabetes, weight, diet, physical activity, and alcohol use, none of which we touched on in this project. Furthermore, our dataset considered only patients from four particular hospitals, and a small set of patients at that. We do not know the demographics of these individuals, so we cannot know for sure how skewed the data might be in that regard. To know more, we would have to find another dataset, make sure the data is comparable, and run our models on it.
However, we did find some interesting insights from our data: namely, at least for our dataset, the random forest classifier did a fairly accurate job of predicting heart disease purely from the factors we considered. We established a significant correlation between several of our variables and the presence of heart disease, and we noticed that it was easier to predict the presence of heart disease in patients whose chest pain was not asymptomatic.