#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#to ignore warnings
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

# Load data
df = pd.read_csv('predicting_wages.csv')

# See top 5 row in the dataset
df.head()

# Checking info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3835 entries, 0 to 3834
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   female  3835 non-null   int64  
 1   cg      3835 non-null   int64  
 2   sc      3835 non-null   int64  
 3   hsg     3835 non-null   int64  
 4   mw      3835 non-null   int64  
 5   so      3835 non-null   int64  
 6   we      3835 non-null   int64  
 7   ne      3835 non-null   int64  
 8   exp1    3835 non-null   float64
 9   exp2    3835 non-null   float64
 10  exp3    3835 non-null   float64
 11  wage    3835 non-null   float64
dtypes: float64(4), int64(8)
memory usage: 359.7 KB

# Printing the summary statistics for the dataset
df.describe().T

df[['exp1','exp2','exp3','wage']].boxplot(figsize=(20,10))
plt.show()

sns.scatterplot(x='exp1',y='wage',data=df)

<Axes: xlabel='exp1', ylabel='wage'>

cols=df.select_dtypes('int').columns.to_list()
cols

['female', 'cg', 'sc', 'hsg', 'mw', 'so', 'we', 'ne']

sns.kdeplot(x=df.wage,hue=df.female)

<Axes: xlabel='wage', ylabel='Density'>

for i in cols:
    sns.kdeplot(data=df, x=df.wage,hue=df[i])
    plt.show()

####################  Linear and Quadratic specifications ##############################

# Wage linear regression
from Scikit-learn.linear_model import LinearRegression
from Scikit-learn import metrics

Y = df['wage'] # target variable
X = df[['female' , 'sc', 'cg', 'mw' , 'so' , 'we' , 'exp1' , 'exp2' , 'exp3']] #regressors

#defining the model
model = LinearRegression()
# fit the Linear regression to the regressors and target varibale.
results = model.fit(X,Y) # train the model

print("Intercept",results.intercept_) # beta_0

Intercept 4.9154015212414155

#coefficients of other regressors
pd.DataFrame(results.coef_.reshape(1,-1),columns=X.columns)

# compute MSE and R^2
y_pred = results.predict(X) # predictions on same data
print("R-squared",metrics.r2_score(Y,y_pred)) # r-squared
MSE_adj2 =  (3835/(3835-10)) * np.mean(np.square(Y-y_pred))
print("MSE adj:",MSE_adj2) #MSE

R-squared 0.09548800312432548
MSE adj: 165.6801552431276

adj_rsquared = 1 - ((1-0.095488)*(3835-1)/(3835-9-1))
print("Adj Rsquared:",adj_rsquared)# adjusted r-squared

Adj Rsquared: 0.09335973647058826

# Linear regression: Quadratic specification

from Scikit-learn.preprocessing import PolynomialFeatures

X.drop('female',axis = 1,inplace = True)
poly = PolynomialFeatures(interaction_only=True,include_bias=False)
X_poly = poly.fit_transform(X) # creating polynomial features with degree = 2(Quadratic)

X_poly = pd.DataFrame(X_poly, columns= poly.get_feature_names_out(X.columns))

X_poly['female'] = df['female']
X['female'] = df['female']

# train the model
results = model.fit(X_poly,Y)

# beta_0
print("Intercept",results.intercept_)

Intercept 16.552407194298763

pd.DataFrame(results.coef_.reshape(1,-1),columns=X_poly.columns).T# printing the co-efficients in a tabular format

# compute MSE and R^2
y_pred = results.predict(X_poly)
print("R-squared",metrics.r2_score(Y,y_pred))
MSE_adj2 =  (3835/(3835-33)) * np.mean(np.square(Y-y_pred))
print("MSE adj:",MSE_adj2) #MSE

R-squared 0.10397282164398369
MSE adj: 165.11885608922336

adj_rsquared = 1 - ((1-0.10397)*(3835-1)/(3835-33-1))
adj_rsquared

0.09619073401736389

####################  Linear and Quadratic specifications with Sample Splitting ##############################

from Scikit-learn.model_selection import train_test_split #using this we split the data into train and test in python.


#Split without the function train_test_split
X_train = X.iloc[:1918,:]
X_test = X.iloc[1918:,:]
Y_train = Y.iloc[:1918]
Y_test = Y.iloc[1918:]


#Split using the function train_test_split
# X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.5,random_state=101) # splitting data into train and test


results = model.fit(X_train,Y_train)
print("Intercept",results.intercept_)

Intercept 4.184615914173742

pd.DataFrame(results.coef_.reshape(1,-1),columns=X.columns)

y_pred = results.predict(X_test) # predit y values on test data
print("R-squared",metrics.r2_score(Y_test,y_pred)) # r-squared
print("MSE",metrics.mean_squared_error(Y_test,y_pred))# MSE

R-squared 0.10272836637029858
MSE 154.58369290508847

# Linear regression: Quadratic specification
X_train = X_poly.iloc[:1918,:]
X_test = X_poly.iloc[1918:,:]
Y_train = Y.iloc[:1918]
Y_test = Y.iloc[1918:]
# X_train,X_test,Y_train,Y_test = train_test_split(X_poly,Y,test_size=0.5,random_state=101)
results = model.fit(X_train,Y_train)
print("Intercept",results.intercept_)

Intercept 12.364122069227694

pd.DataFrame(results.coef_.reshape(1,-1),columns=X_poly.columns)

y_pred = results.predict(X_test)
print("R-squared",metrics.r2_score(Y_test,y_pred))
print("MSE",metrics.mean_squared_error(Y_test,y_pred))

R-squared 0.10460461287645784
MSE 154.2604495272174

# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/My Drive/Colab Notebooks/Copy of FDS_Project_LearnerNotebook_FullCode.ipynb"

	female	cg	sc	hsg	ne	exp1	exp2	exp3	wage
0	0	0	0	1	1	33.0	10.89	35.937	11.659091
1	0	1	0	0	1	27.0	7.29	19.683	12.825000
2	0	0	1	0	1	13.0	1.69	2.197	5.777027
3	0	1	0	0	1	2.0	0.04	0.008	12.468750
4	1	1	0	0	1	15.0	2.25	3.375	18.525000

	count	mean	std	min	25%	50%	75%	max
female	3835.0	0.417992	0.493293	0.000000	0.00000	0.000000	1.000000	1.000000
cg	3835.0	0.376271	0.484513	0.000000	0.00000	0.000000	1.000000	1.000000
sc	3835.0	0.323859	0.468008	0.000000	0.00000	0.000000	1.000000	1.000000
hsg	3835.0	0.299870	0.458260	0.000000	0.00000	0.000000	1.000000	1.000000
mw	3835.0	0.287614	0.452709	0.000000	0.00000	0.000000	1.000000	1.000000
so	3835.0	0.243546	0.429278	0.000000	0.00000	0.000000	0.000000	1.000000
we	3835.0	0.211734	0.408591	0.000000	0.00000	0.000000	0.000000	1.000000
ne	3835.0	0.257106	0.437095	0.000000	0.00000	0.000000	1.000000	1.000000
exp1	3835.0	13.353194	8.639348	2.000000	6.00000	11.000000	19.500000	35.000000
exp2	3835.0	2.529267	2.910554	0.040000	0.36000	1.210000	3.802500	12.250000
exp3	3835.0	5.812103	9.033207	0.008000	0.21600	1.331000	7.414875	42.875000
wage	3835.0	15.533356	13.518165	0.004275	9.61875	13.028571	17.812500	348.333017

	0
sc	-2.386526e+00
cg	2.240488e+00
mw	-5.519395e+00
so	-2.914391e+00
we	-8.053925e-01
exp1	-1.321456e+00
exp2	1.252185e+01
exp3	-4.838676e-02
sc cg	-2.542411e-14
sc mw	-7.225602e-01
sc so	-6.512599e-01
sc we	-1.046514e-01
sc exp1	8.390751e-01
sc exp2	-4.060847e+00
sc exp3	6.330261e-01
cg mw	-7.609079e-01
cg so	1.704052e+00
cg we	-1.494761e+00
cg exp1	7.858795e-01
cg exp2	-4.897043e-02
cg exp3	-5.950344e-01
mw so	2.176037e-14
mw we	1.759703e-14
mw exp1	1.107607e+00
mw exp2	-6.052683e+00
mw exp3	9.062626e-01
so we	-1.776357e-15
so exp1	3.947042e-01
so exp2	-8.913739e-01
so exp3	-3.515656e-02
we exp1	4.719171e-01
we exp2	-3.918829e+00
we exp3	8.050007e-01
exp1 exp2	-4.838676e-01
exp1 exp3	8.720600e-02
exp2 exp3	-5.798236e-02
female	-1.880013e+00

	p	R-squared_test	MSE_test
basic reg	9	0.1027	154.584
flexi reg	33	0.1046	154.260

PREDICTING WAGES¶

Dataset¶

Importing the necessary libraries and overview of the dataset¶

Loading the data¶

Checking the info of the data¶

Univariate Analysis¶

Checking the summary statistics of the dataset¶

Bivariate Analysis¶

Let's first look the relationship between the experience and Wages¶

Now make a list of dummy columns and check there relationship with wage¶

As now we have done our analysis, lets move to Modeling.¶

Basic Model¶

Flexible model¶

Basic Model on splitted data¶

Flexible model on splitted data¶

Conclusion and recommendations:¶

	p	R-squared_sample	R-squared_adj	MSE_adj
basic reg	9	0.0954	0.093	165.680
flexi reg	33	0.1039	0.096	165.118