Application Of Regression Techniques To Predict A Country's Happiness Index
This practice problem uses the World Happiness Report dataset from Kaggle.
The programming language is Python.
———
Dataset Information¶
The World Happiness Report is a landmark survey of the state of global happiness. The happiness scores and rankings use data from the Gallup World Poll. The scores are based on answers to the main life evaluation question asked in the poll. This question, known as the Cantril ladder, asks respondents to think of a ladder with the best possible life for them being a 10 and the worst possible life being a 0 and to rate their own current lives on that scale. The scores use the Gallup weights to make the estimates representative. The columns following the happiness score estimate the extent to which each of six factors – economic production, social support, life expectancy, freedom, absence of corruption, and generosity – contribute to making life evaluations higher in each country than they are in Dystopia, a hypothetical country that has values equal to the world’s lowest national averages for each of the six factors. They have no impact on the total score reported for each country, but they do explain why some countries rank higher than others.
Variable | Description |
---|---|
Overall rank | Rank of the country based on the Happiness Score |
Country or region | Name of the country |
Score | A metric measured in 2019 by asking the sampled people the question: 'How would you rate your happiness on a scale of 0 to 10, with 0 being the worst and 10 being the best?' |
GDP per capita | The extent to which GDP contributes to the calculation of the Happiness Score |
Social support | The extent to which Social Support contributes to the calculation of the Happiness Score |
Healthy life expectancy | The extent to which Healthy Life Expectancy contributes to the calculation of the Happiness Score |
Freedom to make life choices | The extent to which Freedom contributes to the calculation of the Happiness Score |
Generosity | Generosity is the residual of regressing the national average of GWP responses to the question “Have you donated money to a charity in the past month?” on GDP per capita |
Perceptions of corruption | The extent to which Perception of Corruption contributes to Happiness Score |
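To make the 'contribution' framing above concrete, the short sketch below (an illustrative check, assuming the same 2019.csv file used in this notebook) sums the six factor columns and compares the total with Score; the gap corresponds to the Dystopia baseline plus a residual, which this file does not report as separate columns.
# Sanity check: Score minus the sum of the six factor contributions
# equals the Dystopia baseline plus an unexplained residual.
import pandas as pd

factor_cols = ['GDP per capita', 'Social support', 'Healthy life expectancy',
               'Freedom to make life choices', 'Generosity',
               'Perceptions of corruption']
check = pd.read_csv('2019.csv')
check['Factor sum'] = check[factor_cols].sum(axis=1)
check['Dystopia + residual'] = check['Score'] - check['Factor sum']
print(check[['Country or region', 'Score', 'Factor sum', 'Dystopia + residual']].head())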
### DataFrame
import pandas as pd
### For working with arrays.
import numpy as np
### Functions creating iterators for efficient looping.
import itertools
### Creating visualizations.
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')
### Train Test Split
from sklearn.model_selection import train_test_split
### Model Building
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
### Visualize a Decision Tree
from sklearn import tree
df = pd.read_csv('2019.csv')
print(df.shape)
(156, 9)
df.head()
Overall rank | Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | Finland | 7.769 | 1.340 | 1.587 | 0.986 | 0.596 | 0.153 | 0.393 |
1 | 2 | Denmark | 7.600 | 1.383 | 1.573 | 0.996 | 0.592 | 0.252 | 0.410 |
2 | 3 | Norway | 7.554 | 1.488 | 1.582 | 1.028 | 0.603 | 0.271 | 0.341 |
3 | 4 | Iceland | 7.494 | 1.380 | 1.624 | 1.026 | 0.591 | 0.354 | 0.118 |
4 | 5 | Netherlands | 7.488 | 1.396 | 1.522 | 0.999 | 0.557 | 0.322 | 0.298 |
df.info()
# there is no null value
# there is no categorical value except 'Country or region'
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   Overall rank                  156 non-null    int64
 1   Country or region             156 non-null    object
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB
# statistical info
df.describe()
Overall rank | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|---|---|
count | 156.000000 | 156.000000 | 156.000000 | 156.000000 | 156.000000 | 156.000000 | 156.000000 | 156.000000 |
mean | 78.500000 | 5.407096 | 0.905147 | 1.208814 | 0.725244 | 0.392571 | 0.184846 | 0.110603 |
std | 45.177428 | 1.113120 | 0.398389 | 0.299191 | 0.242124 | 0.143289 | 0.095254 | 0.094538 |
min | 1.000000 | 2.853000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 39.750000 | 4.544500 | 0.602750 | 1.055750 | 0.547750 | 0.308000 | 0.108750 | 0.047000 |
50% | 78.500000 | 5.379500 | 0.960000 | 1.271500 | 0.789000 | 0.417000 | 0.177500 | 0.085500 |
75% | 117.250000 | 6.184500 | 1.232500 | 1.452500 | 0.881750 | 0.507250 | 0.248250 | 0.141250 |
max | 156.000000 | 7.769000 | 1.684000 | 1.624000 | 1.141000 | 0.631000 | 0.566000 | 0.453000 |
# check unique values in dataset
df.apply(lambda x: len(x.unique()))
Overall rank                    156
Country or region               156
Score                           155
GDP per capita                  146
Social support                  145
Healthy life expectancy         119
Freedom to make life choices    130
Generosity                      118
Perceptions of corruption       113
dtype: int64
Exploratory Data Analysis¶
# numerical attributes visualization
fig,ax = plt.subplots(2, 3, figsize=(20,10))
# GDP_per_capita
ax1=plt.subplot(2,3,1)
sns.histplot(df["GDP per capita"], kde=True)
# Social_support
ax2=plt.subplot(2,3,2)
sns.histplot(df["Social support"], kde=True)
# Healthy_life_expectancy
ax3=plt.subplot(2,3,3)
sns.histplot(df["Healthy life expectancy"], kde=True)
# Life_choices
ax4=plt.subplot(2,3,4)
sns.histplot(df["Freedom to make life choices"], kde=True)
# Generosity
ax5=plt.subplot(2,3,5)
sns.histplot(df["Generosity"], kde=True)
# Corruption
ax6=plt.subplot(2,3,6)
sns.histplot(df["Perceptions of corruption"], kde=True)
plt.show()
Perceptions of corruption is slightly skewed to the right.
I will apply a square root transformation, a logarithmic transformation, and a power (exponential) transformation, and keep whichever makes Perceptions of corruption look most normal.
Perceptions of corruption looks most normal after the square root transform.
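A minimal sketch of how the three candidate transforms could be compared numerically, using sample skewness (an illustrative check, not the exact cells used in the notebook; log1p is used rather than log because the column contains zeros):
# Compare candidate transforms for 'Perceptions of corruption' by skewness
# (values closer to 0 indicate a more symmetric distribution).
candidates = {
    'original': df['Perceptions of corruption'],
    'sqrt':     np.sqrt(df['Perceptions of corruption']),
    'log1p':    np.log1p(df['Perceptions of corruption']),
    'square':   np.power(df['Perceptions of corruption'], 2),
}
for name, values in candidates.items():
    print(f"{name:>8}: skewness = {values.skew():.3f}")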
Feature Engineering¶
# Square root transform on 'Perceptions of corruption'.
df['Perceptions of corruption'] = np.sqrt(df['Perceptions of corruption'])
# numerical attributes visualization
fig,ax = plt.subplots(2, 3, figsize=(20,10))
# GDP_per_capita
ax1=plt.subplot(2,3,1)
sns.histplot(df["GDP per capita"], kde=True)
# Social_support
ax2=plt.subplot(2,3,2)
sns.histplot(df["Social support"], kde=True)
# Healthy_life_expectancy
ax3=plt.subplot(2,3,3)
sns.histplot(df["Healthy life expectancy"], kde=True)
# Life_choices
ax4=plt.subplot(2,3,4)
sns.histplot(df["Freedom to make life choices"], kde=True)
# Generosity
ax5=plt.subplot(2,3,5)
sns.histplot(df["Generosity"], kde=True)
# Corruption
ax6=plt.subplot(2,3,6)
sns.histplot(df["Perceptions of corruption"], kde=True)
plt.show()
Train-Test Split¶
# Split the df into train(80%) and test(20%) datasets
train, test = train_test_split(df, test_size=0.2, random_state=2)
print('Shape of train',train.shape)
print('Shape of test',test.shape)
Shape of train (124, 9)
Shape of test (32, 9)
#Drop unnecessary columns:
test.drop(['Overall rank'],axis=1,inplace=True)
train.drop(['Overall rank'],axis=1,inplace=True)
# Preparation for plotting Correlation Matrix and Scatter Charts
data = pd.concat([train,test], ignore_index=True)
print(train.shape)
train.head()
(124, 8)
Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|---|---|
130 | Myanmar | 4.360 | 0.710 | 1.181 | 0.555 | 0.525 | 0.566 | 0.414729 |
71 | Libya | 5.525 | 1.044 | 1.303 | 0.673 | 0.416 | 0.133 | 0.389872 |
65 | Portugal | 5.693 | 1.221 | 1.431 | 0.999 | 0.508 | 0.047 | 0.158114 |
123 | Tunisia | 4.461 | 0.921 | 1.000 | 0.815 | 0.167 | 0.059 | 0.234521 |
99 | Nepal | 4.913 | 0.446 | 1.226 | 0.677 | 0.439 | 0.285 | 0.298329 |
print(test.shape)
test.head()
(32, 8)
Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|---|---|
12 | Israel | 7.139 | 1.276 | 1.455 | 1.029 | 0.371 | 0.261 | 0.286356 |
3 | Iceland | 7.494 | 1.380 | 1.624 | 1.026 | 0.591 | 0.354 | 0.343511 |
98 | Ivory Coast | 4.944 | 0.569 | 0.808 | 0.232 | 0.352 | 0.154 | 0.300000 |
6 | Sweden | 7.343 | 1.387 | 1.487 | 1.009 | 0.574 | 0.267 | 0.610737 |
142 | Madagascar | 3.933 | 0.274 | 0.916 | 0.555 | 0.148 | 0.169 | 0.202485 |
print(data.shape)
data.head()
(156, 8)
Country or region | Score | GDP per capita | Social support | Healthy life expectancy | Freedom to make life choices | Generosity | Perceptions of corruption | |
---|---|---|---|---|---|---|---|---|
0 | Myanmar | 4.360 | 0.710 | 1.181 | 0.555 | 0.525 | 0.566 | 0.414729 |
1 | Libya | 5.525 | 1.044 | 1.303 | 0.673 | 0.416 | 0.133 | 0.389872 |
2 | Portugal | 5.693 | 1.221 | 1.431 | 0.999 | 0.508 | 0.047 | 0.158114 |
3 | Tunisia | 4.461 | 0.921 | 1.000 | 0.815 | 0.167 | 0.059 | 0.234521 |
4 | Nepal | 4.913 | 0.446 | 1.226 | 0.677 | 0.439 | 0.285 | 0.298329 |
Correlation Matrix¶
corr = data.corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot = True, cmap="coolwarm")
GDP per capita, Social support and Healthy life expectancy are all strongly (and similarly) correlated with Score.
Generosity shows only a weak correlation with Score.
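To back this reading with numbers, the correlations with Score can be pulled out of the same matrix and sorted (a small convenience snippet, assuming the data frame built above; the country column is dropped so the snippet also works on pandas versions that no longer ignore non-numeric columns):
# Correlation of each numeric feature with Score, strongest first.
score_corr = (data.drop(columns=['Country or region'])
                  .corr()['Score']
                  .drop('Score')
                  .sort_values(ascending=False))
print(score_corr)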
Scatter Charts¶
fig,ax = plt.subplots(2,3,figsize=(20,10))
# GDP per capita vs Score
ax1=plt.subplot(2,3,1)
sns.regplot(x='GDP per capita',y='Score',color='r',data=data,fit_reg=True) # fit_reg=True shows a regression line
plt.xlabel("GDP per capita")
plt.ylabel('Score')
ax1.set_title("GDP per capita and Score")
# Social support vs Score
ax2=plt.subplot(2,3,2)
sns.regplot(x='Social support', y='Score',color='b', data=data, fit_reg=True)
plt.xlabel("Social support")
plt.ylabel('Score')
ax2.set_title("Social support and Score")
# Healthy life expectancy vs Score
ax3=plt.subplot(2,3,3)
sns.regplot(x='Healthy life expectancy', y='Score',color='g', data=data, fit_reg=True)
plt.xlabel("Healthy life expectancy")
plt.ylabel('Score')
ax3.set_title("Healthy life expectancy and Score")
# Freedom to make life choices vs Score
ax4=plt.subplot(2,3,4)
sns.regplot(x='Freedom to make life choices', y='Score',color='purple', data=data, fit_reg=True)
plt.xlabel("Freedom to make life choice")
plt.ylabel('Score')
ax4.set_title("Freedom to make life choices and Score")
# Generosity vs Score
ax5=plt.subplot(2,3,5)
sns.regplot(x='Generosity', y='Score',color='y', data=data, fit_reg=True)
plt.xlabel("Generosity")
plt.ylabel('Score')
ax5.set_title("Generosity and Score")
# Perceptions of corruption vs Score
ax6=plt.subplot(2,3,6)
sns.regplot(x='Perceptions of corruption', y='Score',color='orange', data=data, fit_reg=True)
plt.xlabel("Perceptions of corruption")
plt.ylabel('Score')
ax6.set_title("Perceptions of corruption and Score")
plt.show()
The data does not contain any points that look like obvious outliers.
For Generosity and Perceptions of corruption, however, many points lie far from the trend line.
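One way to quantify 'far from the trend line' (a rough sketch, not part of the original analysis) is to fit a simple one-variable line and inspect the largest absolute residuals, for example for Generosity:
# Fit Score ~ Generosity with a straight line and list the countries whose
# residuals from that line are largest in absolute value.
slope, intercept = np.polyfit(data['Generosity'], data['Score'], deg=1)
residuals = data['Score'] - (slope * data['Generosity'] + intercept)
print(data.assign(abs_residual=residuals.abs())
          .nlargest(5, 'abs_residual')[['Country or region', 'Score',
                                        'Generosity', 'abs_residual']])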
Model Building¶
#Define target and ID columns:
target = 'Score'
IDcol = ['Country or region']
def modelfit(alg, dtrain, dtest, predictors, target, IDcol, filename):
    # Note: IDcol and filename are accepted for a uniform interface but are not used inside this function.
    # Fit the algorithm on the training data
    alg.fit(dtrain[predictors], dtrain[target])
    # Predict on the training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    # Predict on the testing set:
    dtest_predictions = alg.predict(dtest[predictors])
    # Perform cross-validation (19 folds; per-fold RMSE is derived from the negative MSE scores):
    cv_score = cross_val_score(alg, dtrain[predictors], dtrain[target], cv=19, scoring='neg_mean_squared_error')
    cv_score = np.sqrt(np.abs(cv_score))
    # Root mean squared error on train and test
    mse_train = mean_squared_error(dtrain[target].values, dtrain_predictions)
    rmse_train = np.sqrt(mse_train)
    mse_test = mean_squared_error(dtest[target].values, dtest_predictions)
    rmse_test = np.sqrt(mse_test)
    # R^2 (coefficient of determination) on train and test
    r_squared_train = r2_score(dtrain[target].values, dtrain_predictions)
    r_squared_test = r2_score(dtest[target].values, dtest_predictions)
    result = (rmse_test,
              rmse_train,
              np.mean(cv_score),
              np.std(cv_score),
              np.min(cv_score),
              np.max(cv_score),
              r_squared_test,
              r_squared_train)
    return result
Linear Regression¶
predictors = [x for x in train.columns if x not in [target]+IDcol]
alg1 = LinearRegression(normalize=True)  # normalize=True requires scikit-learn < 1.2
LR_result = modelfit(alg1, train, test, predictors, target, IDcol, '01_Linear_Regression.csv')
coef1 = pd.Series(alg1.coef_, predictors).sort_values(ascending=False)
plt.figure(figsize=(5,4))
coef1.plot(kind='bar', title='Model Coefficients')
Ridge Regression¶
predictors = [x for x in train.columns if x not in [target]+IDcol]
alg2 = Ridge(alpha=0.05, normalize=True)  # normalize=True requires scikit-learn < 1.2
RR_result = modelfit(alg2, train, test, predictors, target, IDcol, '02_Ridge_Regression.csv')
coef2 = pd.Series(alg2.coef_, predictors).sort_values(ascending=False)
plt.figure(figsize=(5,4))
coef2.plot(kind='bar', title='Model Coefficients')
Decision Tree¶
# Hyperparameter
lis = [0.03, 0.05, 0.07, 0.10, 0.15] # 3% of total train data, 5%,7%,10%, 15%
min_samples_leaf = [round(i*len(train)) for i in lis] # The minimum number of samples required to be at a leaf node.
predictors = [x for x in train.columns if x not in [target]+IDcol]
DT_params = [min_samples_leaf]
DT_params = list(itertools.product(*DT_params))
column_params = ["min_samples_leaf"]
Accuracy_params = ["Root Mean Square Error (RMSE) (testing data)",
"Root Mean Square Error (RMSE) (training data)",
"CV Error-Mean",
"CV Error-Std",
"CV Error-Min",
"CV Error-Max",
"R^2 (testing data)",
"R^2 (training data)",
"Abs(R^2 (training data)-R^2 (testing data))"
]
result_params = Accuracy_params + column_params
DT_Accuracy = pd.DataFrame(columns = result_params)
for i in range(len(DT_params)):
    alg3 = DecisionTreeRegressor(min_samples_leaf=DT_params[i][0],
                                 random_state=2)
    DT_result = modelfit(alg3, train, test, predictors, target, IDcol, '03_Decision_Tree.csv')
    # Feature importance assigns each input feature a score based on how useful
    # it is for predicting the target variable.
    coef3 = pd.Series(alg3.feature_importances_, predictors).sort_values(ascending=False)
    lis_result = [DT_result[0],
                  DT_result[1],
                  DT_result[2],
                  DT_result[3],
                  DT_result[4],
                  DT_result[5],
                  DT_result[6],
                  DT_result[7],
                  abs(DT_result[7] - DT_result[6]),
                  DT_params[i][0]]
    # DataFrame.append was removed in pandas 2.0; on newer pandas use
    # pd.concat([DT_Accuracy, pd.DataFrame([lis_result], columns=result_params)], ignore_index=True)
    DT_Accuracy = DT_Accuracy.append(pd.DataFrame([lis_result], columns=result_params), ignore_index=True)
DT_Accuracy
Root Mean Square Error (RMSE) (testing data) | Root Mean Square Error (RMSE) (training data) | CV Error-Mean | CV Error-Std | CV Error-Min | CV Error-Max | R^2 (testing data) | R^2 (training data) | Abs(R^2 (training data)-R^2 (testing data)) | min_samples_leaf | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.592723 | 0.312973 | 0.613328 | 0.189848 | 0.295954 | 1.010352 | 0.683857 | 0.920894 | 0.237037 | 4 |
1 | 0.610474 | 0.362691 | 0.545310 | 0.156676 | 0.283796 | 0.930885 | 0.664638 | 0.893765 | 0.229127 | 6 |
2 | 0.609496 | 0.426611 | 0.538368 | 0.152717 | 0.292959 | 0.978080 | 0.665711 | 0.853020 | 0.187308 | 9 |
3 | 0.634843 | 0.501519 | 0.604481 | 0.136011 | 0.388272 | 0.911380 | 0.637330 | 0.796871 | 0.159542 | 12 |
4 | 0.611302 | 0.522433 | 0.602206 | 0.151618 | 0.428825 | 0.933219 | 0.663728 | 0.779577 | 0.115849 | 19 |
Random Forest¶
# Hyperparameter
n_estimators = [10, 25, 50] # The number of trees in the forest.
max_depth = list(range(3,10,2)) # The maximum depth of the tree.
lis = [0.03, 0.05, 0.07, 0.10, 0.15] # 3% of total train data, 5%,7%,10%, 15%.
min_samples_leaf = [round(i*len(train)) for i in lis] # The minimum number of samples required to be at a leaf node.
max_features = ['sqrt', 0.5, 0.8] # The number of features to consider when looking for the best split.
predictors = [x for x in train.columns if x not in [target]+IDcol]
RF_params = [n_estimators, max_depth, min_samples_leaf, max_features]
RF_params = list(itertools.product(*RF_params))
column_params = ["n_estimators", "max_depth", "min_samples_leaf", "max_features"]
Accuracy_params = ["Root Mean Square Error (RMSE) (testing data)",
"Root Mean Square Error (RMSE) (training data)",
"CV Error-Mean",
"CV Error-Std",
"CV Error-Min",
"CV Error-Max",
"R^2 (testing data)",
"R^2 (training data)",
"Abs(R^2 (training data)-R^2 (testing data))"
]
result_params = Accuracy_params + column_params
RF_Accuracy = pd.DataFrame(columns = Accuracy_params+column_params)
for i in range(len(RF_params)):
    alg4 = RandomForestRegressor(n_estimators=RF_params[i][0],
                                 max_depth=RF_params[i][1],
                                 min_samples_leaf=RF_params[i][2],
                                 max_features=RF_params[i][3],
                                 random_state=2)
    RF_result = modelfit(alg4, train, test, predictors, target, IDcol, '04_Random_Forest.csv')
    lis_result = [RF_result[0],
                  RF_result[1],
                  RF_result[2],
                  RF_result[3],
                  RF_result[4],
                  RF_result[5],
                  RF_result[6],
                  RF_result[7],
                  abs(RF_result[7] - RF_result[6]),
                  RF_params[i][0],
                  RF_params[i][1],
                  RF_params[i][2],
                  RF_params[i][3]]
    RF_Accuracy = RF_Accuracy.append(pd.DataFrame([lis_result], columns=result_params), ignore_index=True)
RF_Accuracy
Root Mean Square Error (RMSE) (testing data) | Root Mean Square Error (RMSE) (training data) | CV Error-Mean | CV Error-Std | CV Error-Min | CV Error-Max | R^2 (testing data) | R^2 (training data) | Abs(R^2 (training data)-R^2 (testing data)) | n_estimators | max_depth | min_samples_leaf | max_features | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.539372 | 0.429534 | 0.498211 | 0.125418 | 0.258480 | 0.757326 | 0.738208 | 0.850998 | 0.112790 | 10 | 3 | 4 | sqrt |
1 | 0.511561 | 0.432329 | 0.525941 | 0.130198 | 0.258774 | 0.795190 | 0.764509 | 0.849053 | 0.084544 | 10 | 3 | 4 | 0.5 |
2 | 0.489271 | 0.434311 | 0.498537 | 0.139864 | 0.236076 | 0.730333 | 0.784584 | 0.847666 | 0.063082 | 10 | 3 | 4 | 0.8 |
3 | 0.556103 | 0.440985 | 0.500810 | 0.127364 | 0.260181 | 0.735909 | 0.721715 | 0.842948 | 0.121233 | 10 | 3 | 6 | sqrt |
4 | 0.538546 | 0.438232 | 0.526868 | 0.138972 | 0.264192 | 0.810381 | 0.739010 | 0.844903 | 0.105893 | 10 | 3 | 6 | 0.5 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
175 | 0.525119 | 0.478292 | 0.528496 | 0.138389 | 0.338100 | 0.779464 | 0.751861 | 0.815251 | 0.063390 | 50 | 9 | 12 | 0.5 |
176 | 0.524645 | 0.472357 | 0.534704 | 0.141456 | 0.351052 | 0.848859 | 0.752309 | 0.819807 | 0.067498 | 50 | 9 | 12 | 0.8 |
177 | 0.602943 | 0.555686 | 0.590896 | 0.158264 | 0.285481 | 0.908642 | 0.672861 | 0.750624 | 0.077763 | 50 | 9 | 19 | sqrt |
178 | 0.559856 | 0.539287 | 0.565983 | 0.163570 | 0.273607 | 0.922628 | 0.717946 | 0.765125 | 0.047179 | 50 | 9 | 19 | 0.5 |
179 | 0.557086 | 0.536865 | 0.565937 | 0.160036 | 0.274829 | 0.974540 | 0.720730 | 0.767231 | 0.046501 | 50 | 9 | 19 | 0.8 |
180 rows × 13 columns
Output as an Excel File¶
with pd.ExcelWriter('World_Happiness_results.xlsx') as writer:
    DT_Accuracy.to_excel(writer, index=False, sheet_name='DT_Accuracy')
    RF_Accuracy.to_excel(writer, index=False, sheet_name='RF_Accuracy')
Hyperparameter Tuning (Grid Search)¶
The best parameter set is selected with the bias-variance trade-off in mind:
I look for a parameter set that keeps the training R-squared relatively high while keeping
the absolute difference between the training and testing R-squared values relatively low.
The cells below operationalise this by selecting the row that minimises
Abs(R^2 (training data)-R^2 (testing data)).
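A hedged sketch of encoding both parts of the stated criterion explicitly, shown here for the random-forest grid (purely illustrative; the 0.05 gap threshold is an assumption, not a value used in this notebook):
# Illustrative two-step selection: keep parameter sets whose train/test R^2 gap
# is below a chosen threshold, then pick the one with the highest training R^2.
gap_col = 'Abs(R^2 (training data)-R^2 (testing data))'
candidates = RF_Accuracy[RF_Accuracy[gap_col] < 0.05]   # threshold chosen for illustration
best_row = candidates.loc[candidates['R^2 (training data)'].astype(float).idxmax()]
print(best_row)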
# Decision Tree
DT_best_model = DT_Accuracy.loc[DT_Accuracy['Abs(R^2 (training data)-R^2 (testing data))'].idxmin()]
alg5 = DecisionTreeRegressor(min_samples_leaf=DT_best_model['min_samples_leaf'],
                             random_state=2)
modelfit(alg5, train, test, predictors, target, IDcol, '05_Best_Decision_Tree.csv')
coef5 = pd.Series(alg5.feature_importances_, predictors).sort_values(ascending=False)
plt.figure(figsize=(5,4))
coef5.plot(kind='bar', title='Feature Importances')
print(alg5.__class__.__name__)
DT_best_model
DecisionTreeRegressor
Root Mean Square Error (RMSE) (testing data)     0.611302
Root Mean Square Error (RMSE) (training data)    0.522433
CV Error-Mean                                    0.602206
CV Error-Std                                     0.151618
CV Error-Min                                     0.428825
CV Error-Max                                     0.933219
R^2 (testing data)                               0.663728
R^2 (training data)                              0.779577
Abs(R^2 (training data)-R^2 (testing data))      0.115849
min_samples_leaf                                       19
Name: 4, dtype: object
Visualize a Decision Tree with Scikit-Learn and Python¶
# Decision Tree
fig5 = plt.figure(figsize=(60, 40), facecolor='k')
tree.plot_tree(alg5,
               feature_names=predictors,
               fontsize=60)  # class_names applies only to classifiers, so it is not passed for this regressor
plt.show()
# Save the figure to a .png file
fig5.savefig("fig5_Best_Decision_Tree.png")
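As a lighter-weight complement to the rendered figure, scikit-learn can also print the fitted tree as indented text; a minimal sketch, reusing the fitted alg5 and the same predictor names:
from sklearn.tree import export_text

# Text rendering of the tuned decision tree's split rules.
print(export_text(alg5, feature_names=predictors))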
# Random Forest
RF_best_model = RF_Accuracy.loc[RF_Accuracy['Abs(R^2 (training data)-R^2 (testing data))'].idxmin()]
alg6 = RandomForestRegressor(n_estimators=RF_best_model['n_estimators'],
                             max_depth=RF_best_model['max_depth'],
                             min_samples_leaf=RF_best_model['min_samples_leaf'],
                             max_features=RF_best_model['max_features'],
                             random_state=2)
modelfit(alg6, train, test, predictors, target, IDcol, '06_Best_Random_Forest.csv')
coef6 = pd.Series(alg6.feature_importances_, predictors).sort_values(ascending=False)
plt.figure(figsize=(5,4))
coef6.plot(kind='bar', title='Feature Importances')
print(alg6.__class__.__name__)
RF_best_model
RandomForestRegressor
Root Mean Square Error (RMSE) (testing data)     0.523044
Root Mean Square Error (RMSE) (training data)    0.540461
CV Error-Mean                                    0.561344
CV Error-Std                                     0.137066
CV Error-Min                                     0.362534
CV Error-Max                                     0.879787
R^2 (testing data)                               0.753818
R^2 (training data)                              0.764102
Abs(R^2 (training data)-R^2 (testing data))      0.0102838
n_estimators                                            10
max_depth                                                3
min_samples_leaf                                        19
max_features                                           0.8
Name: 14, dtype: object