GENDER WAGE GAP¶

Here we ask and answer the following question:

What is the difference in predicted wages between men and women with the same job-relevant characteristics?¶

Our data comes from the March Supplement of the U.S. Current Population Survey, year 2012. We focus on single (never married) workers whose highest education level is high school, some college, or a college degree. The sample size is about 4,000.

Our outcome variable Y is hourly wage, and our X’s are various characteristics of workers such as gender, experience, education, and geographical indicators.


Dataset¶


The dataset contains the following variables:

  1. wage : hourly wage
  2. female : female dummy
  3. cg : college graduate dummy
  4. sc : some college dummy
  5. hsg : high school graduate dummy
  6. mw : midwest dummy
  7. so : south dummy
  8. we : west dummy
  9. ne : northeast dummy
  10. exp1 : experience (years)
  11. exp2 : experience squared, divided by 100
  12. exp3 : experience cubed, divided by 1,000

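The scaling of exp2 and exp3 can be reproduced directly from exp1, as a quick sanity check; a minimal sketch on a few illustrative experience values (these match rows shown later in df.head()):

```python
import pandas as pd

# Reproduce the exp2/exp3 scaling from the variable definitions above
exp1 = pd.Series([2.0, 13.0, 33.0])   # experience in years
exp2 = exp1 ** 2 / 100                # experience squared / 100
exp3 = exp1 ** 3 / 1000               # experience cubed / 1000

print(exp2.tolist())  # [0.04, 1.69, 10.89]
print(exp3.tolist())  # [0.008, 2.197, 35.937]
```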
Importing the necessary libraries and overview of the dataset¶

In [ ]:
#importing necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#to ignore warnings
import warnings
warnings.filterwarnings('ignore')
In [1]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

Loading the data¶

In [ ]:
# Load data
df = pd.read_csv('gender_wage_gap.csv')
In [ ]:
# See variables in the dataset
df.head()
Out[ ]:
female cg sc hsg mw so we ne exp1 exp2 exp3 wage
0 0 0 0 1 0 0 0 1 33.0 10.89 35.937 11.659091
1 0 1 0 0 0 0 0 1 27.0 7.29 19.683 12.825000
2 0 0 1 0 0 0 0 1 13.0 1.69 2.197 5.777027
3 0 1 0 0 0 0 0 1 2.0 0.04 0.008 12.468750
4 1 1 0 0 0 0 0 1 15.0 2.25 3.375 18.525000

Checking the info of the data¶

In [ ]:
# Checking info of the dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3835 entries, 0 to 3834
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   female  3835 non-null   int64  
 1   cg      3835 non-null   int64  
 2   sc      3835 non-null   int64  
 3   hsg     3835 non-null   int64  
 4   mw      3835 non-null   int64  
 5   so      3835 non-null   int64  
 6   we      3835 non-null   int64  
 7   ne      3835 non-null   int64  
 8   exp1    3835 non-null   float64
 9   exp2    3835 non-null   float64
 10  exp3    3835 non-null   float64
 11  wage    3835 non-null   float64
dtypes: float64(4), int64(8)
memory usage: 359.7 KB

Observations:

  • The dataset has 3,835 observations and 12 variables.
  • There are no missing values in the dataset.
  • All of the dummy variables (cg, sc, hsg, etc.) have dtype int64. They are binary variables taking values 0 and 1.

Univariate Analysis¶

Checking the summary statistics of the dataset¶

In [ ]:
# Printing the summary statistics for the dataset
df.describe().T
Out[ ]:
count mean std min 25% 50% 75% max
female 3835.0 0.417992 0.493293 0.000000 0.00000 0.000000 1.000000 1.000000
cg 3835.0 0.376271 0.484513 0.000000 0.00000 0.000000 1.000000 1.000000
sc 3835.0 0.323859 0.468008 0.000000 0.00000 0.000000 1.000000 1.000000
hsg 3835.0 0.299870 0.458260 0.000000 0.00000 0.000000 1.000000 1.000000
mw 3835.0 0.287614 0.452709 0.000000 0.00000 0.000000 1.000000 1.000000
so 3835.0 0.243546 0.429278 0.000000 0.00000 0.000000 0.000000 1.000000
we 3835.0 0.211734 0.408591 0.000000 0.00000 0.000000 0.000000 1.000000
ne 3835.0 0.257106 0.437095 0.000000 0.00000 0.000000 1.000000 1.000000
exp1 3835.0 13.353194 8.639348 2.000000 6.00000 11.000000 19.500000 35.000000
exp2 3835.0 2.529267 2.910554 0.040000 0.36000 1.210000 3.802500 12.250000
exp3 3835.0 5.812103 9.033207 0.008000 0.21600 1.331000 7.414875 42.875000
wage 3835.0 15.533356 13.518165 0.004275 9.61875 13.028571 17.812500 348.333017

Observations:

  • The average wage is about 15 dollars per hour, while the maximum is 348 dollars per hour, which is very high.
  • About 42% of workers are women.
  • The average experience is about 13 years, with a minimum of 2 and a maximum of 35 years, indicating that the sample covers a wide range of experience levels.
  • About 38% of the people in the data are college graduates.
  • 32% have attended some college, and 30% hold only a high school diploma.
  • The geographical distribution of workers across the four major regions is fairly even, ranging from about 21% to 29%, suggesting that the data were collected fairly uniformly across regions.
In [ ]:
df[['exp1','exp2','exp3','wage']].boxplot(figsize=(20,10))
plt.show()
[Boxplots of exp1, exp2, exp3, and wage]
  • For the wage variable there are many outliers on the high end, which is expected: wage distributions are typically right-skewed, with a small number of workers earning far above the median.
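The rule a boxplot uses to flag these outliers can be sketched with the interquartile range (IQR); the wage values here are made up for illustration:

```python
import numpy as np

# IQR rule: values above Q3 + 1.5*IQR are flagged as outliers
wage = np.array([5.0, 9.6, 13.0, 17.8, 25.0, 348.3])  # illustrative values
q1, q3 = np.percentile(wage, [25, 75])
iqr = q3 - q1
upper = q3 + 1.5 * iqr
outliers = wage[wage > upper]
print(outliers)  # [348.3]
```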

As the goal of this case study is to analyse the gender wage gap, let's look at the statistics of all columns for females and males separately.¶

In [ ]:
# Mean value of all females
df[df['female'] == 1].mean()
Out[ ]:
female     1.000000
cg         0.406114
sc         0.354336
hsg        0.239551
mw         0.291329
so         0.255147
we         0.198378
ne         0.255147
exp1      13.037118
exp2       2.449453
exp3       5.599297
wage      14.720058
dtype: float64
In [ ]:
# Mean value of all males
df[df['female'] == 0].mean()
Out[ ]:
female     0.000000
cg         0.354839
sc         0.301971
hsg        0.343190
mw         0.284946
so         0.235215
we         0.221326
ne         0.258513
exp1      13.580197
exp2       2.586588
exp3       5.964938
wage      16.117458
dtype: float64

Observations:

  • We compare descriptive statistics for the subsamples of single men and single women, with educational attainment equal to high school, some college, or college.
  • The mean hourly wage is about 16 dollars for men and about 15 dollars for women, so the raw difference is roughly 1.4 dollars, without controlling for job-relevant characteristics.
  • If we take a look at some of these characteristics, we see that on average men have more experience, but women are more likely to have college degrees or some college education.
  • The geographical distributions of men and women are similar.
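The two filtered `.mean()` calls above can be done in one pass with `groupby`; sketched here on a few made-up rows:

```python
import pandas as pd

# Made-up rows (illustrative only) with the dataset's column names
df_demo = pd.DataFrame({
    'female': [1, 0, 1, 0],
    'wage':   [14.0, 16.0, 15.0, 17.0],
    'exp1':   [10.0, 12.0, 14.0, 16.0],
})

# Group means for each gender in a single call
group_means = df_demo.groupby('female')[['wage', 'exp1']].mean()
print(group_means)

# The raw gap is the difference of the group mean wages
raw_gap = group_means.loc[1, 'wage'] - group_means.loc[0, 'wage']
print(raw_gap)  # -2.0
```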

Basic Model¶

In [ ]:
####################  Linear and Quadratic specifications ##############################

# Wage linear regression
import statsmodels.api as sm
Y = df['wage'] # target variable
X = df[['female' , 'sc', 'cg', 'mw' , 'so' , 'we' , 'exp1' , 'exp2' , 'exp3']] #regressors
X = sm.add_constant(X) # adding constant for intercept
model = sm.OLS(Y, X)
results = model.fit() # train the model
print(results.summary()) # summary of the model
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.095
Model:                            OLS   Adj. R-squared:                  0.093
Method:                 Least Squares   F-statistic:                     44.87
Date:                Thu, 08 Jun 2023   Prob (F-statistic):           3.17e-77
Time:                        17:07:11   Log-Likelihood:                -15235.
No. Observations:                3835   AIC:                         3.049e+04
Df Residuals:                    3825   BIC:                         3.055e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.9154      1.299      3.784      0.000       2.368       7.462
female        -1.8264      0.425     -4.302      0.000      -2.659      -0.994
sc             2.4865      0.534      4.654      0.000       1.439       3.534
cg             9.8708      0.562     17.567      0.000       8.769      10.972
mw            -1.2142      0.566     -2.146      0.032      -2.323      -0.105
so             0.4046      0.588      0.688      0.491      -0.748       1.558
we            -0.2508      0.611     -0.410      0.682      -1.449       0.947
exp1           1.0965      0.269      4.077      0.000       0.569       1.624
exp2          -4.0134      1.785     -2.248      0.025      -7.514      -0.513
exp3           0.4603      0.344      1.340      0.180      -0.213       1.134
==============================================================================
Omnibus:                     6626.018   Durbin-Watson:                   1.958
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          8721375.159
Skew:                          11.808   Prob(JB):                         0.00
Kurtosis:                     235.426   Cond. No.                         198.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Observations:

  • Model performance is poor: the R-squared is only about 0.10, so the regressors explain little of the variation in wages.
  • The cg coefficient is large and positive (about 9.9), signifying that college graduates earn substantially higher wages.
  • The exp2 coefficient is negative, consistent with wages being a concave function of experience: the return to an extra year of experience diminishes over time.
  • The coefficient on the female indicator is negative (about −1.83), signifying that women are paid lower wages, holding the other characteristics fixed.
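The 95% confidence interval reported in the summary table is formed as estimate ± (critical value) × (standard error); with roughly 3,800 observations the t critical value is very close to the normal 1.96. A sketch using the female coefficient from the table above:

```python
# Confidence interval = estimate +/- critical value * standard error
estimate = -1.8264   # female coefficient from the summary above
se = 0.425           # its standard error

lower = estimate - 1.96 * se
upper = estimate + 1.96 * se
print(round(lower, 3), round(upper, 3))
# ~ (-2.659, -0.993); the table shows [-2.659, -0.994] because
# statsmodels uses the exact t critical value rather than 1.96.
```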

Flexible model¶

In [ ]:
# import PolynomialFeatures library to create polynomial features
from sklearn.preprocessing import PolynomialFeatures

# create an object of PolynomialFeatures with only the interaction terms
poly = PolynomialFeatures(interaction_only=True,include_bias = False)

# Dropping constant and female as we don't want to create interaction features for them
X.drop(['const','female'],axis = 1,inplace = True)
print(X.shape)

# transform the data to add the polynomial features too
X_poly = poly.fit_transform(X)

# convert to dataframe
X_poly = pd.DataFrame(X_poly,columns= poly.get_feature_names_out(X.columns))

print(X_poly.shape)
X_poly['female'] = df['female']
X['female'] = df['female']
X_poly = sm.add_constant(X_poly)

#defining the model
model = sm.OLS(Y, X_poly) # Linear regression/OLS object

#fitting the model
results = model.fit() # train the model

# summary of the model
print(results.summary())
(3835, 8)
(3835, 36)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   wage   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.096
Method:                 Least Squares   F-statistic:                     13.79
Date:                Thu, 08 Jun 2023   Prob (F-statistic):           5.53e-69
Time:                        17:07:11   Log-Likelihood:                -15217.
No. Observations:                3835   AIC:                         3.050e+04
Df Residuals:                    3802   BIC:                         3.071e+04
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         16.5524      7.175      2.307      0.021       2.486      30.619
sc            -2.3865      5.415     -0.441      0.659     -13.003       8.230
cg             2.2405      5.908      0.379      0.705      -9.342      13.823
mw            -5.5194      3.375     -1.635      0.102     -12.137       1.098
so            -2.9144      3.482     -0.837      0.403      -9.742       3.913
we            -0.8054      3.646     -0.221      0.825      -7.953       6.342
exp1          -1.3215      2.073     -0.638      0.524      -5.386       2.743
exp2          12.5218     24.780      0.505      0.613     -36.062      61.106
exp3          -0.0484      0.151     -0.321      0.748      -0.344       0.247
sc cg       1.555e-13   4.48e-13      0.347      0.729   -7.23e-13    1.03e-12
sc mw         -0.7226      1.445     -0.500      0.617      -3.555       2.110
sc so         -0.6513      1.534     -0.424      0.671      -3.659       2.357
sc we         -0.1047      1.592     -0.066      0.948      -3.227       3.018
sc exp1        0.8391      1.097      0.765      0.444      -1.311       2.989
sc exp2       -4.0608      6.512     -0.624      0.533     -16.827       8.706
sc exp3        0.6330      1.158      0.547      0.585      -1.637       2.904
cg mw         -0.7609      1.536     -0.496      0.620      -3.772       2.250
cg so          1.7041      1.569      1.086      0.278      -1.373       4.781
cg we         -1.4948      1.637     -0.913      0.361      -4.704       1.715
cg exp1        0.7859      1.245      0.631      0.528      -1.654       3.226
cg exp2       -0.0490      7.761     -0.006      0.995     -15.265      15.167
cg exp3       -0.5950      1.462     -0.407      0.684      -3.462       2.272
mw so      -6.971e-15   1.95e-14     -0.358      0.720   -4.51e-14    3.12e-14
mw we      -1.916e-14   2.95e-14     -0.650      0.516    -7.7e-14    3.87e-14
mw exp1        1.1076      0.724      1.530      0.126      -0.312       2.527
mw exp2       -6.0527      4.790     -1.264      0.206     -15.444       3.339
mw exp3        0.9063      0.919      0.987      0.324      -0.895       2.707
so we      -7.762e-15   1.48e-14     -0.523      0.601   -3.68e-14    2.13e-14
so exp1        0.3947      0.748      0.528      0.598      -1.072       1.861
so exp2       -0.8914      4.973     -0.179      0.858     -10.641       8.858
so exp3       -0.0352      0.960     -0.037      0.971      -1.917       1.847
we exp1        0.4719      0.790      0.597      0.550      -1.077       2.021
we exp2       -3.9188      5.300     -0.739      0.460     -14.309       6.472
we exp3        0.8050      1.034      0.778      0.436      -1.223       2.833
exp1 exp2     -0.4839      1.505     -0.321      0.748      -3.435       2.467
exp1 exp3      0.0872      0.451      0.193      0.847      -0.797       0.972
exp2 exp3     -0.0580      0.503     -0.115      0.908      -1.044       0.928
female        -1.8800      0.425     -4.426      0.000      -2.713      -1.047
==============================================================================
Omnibus:                     6588.420   Durbin-Watson:                   1.959
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          8473264.499
Skew:                          11.669   Prob(JB):                         0.00
Kurtosis:                     232.090   Cond. No.                     1.14e+16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.61e-24. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

Observations:

  • The R-squared improves only slightly (0.104 vs. 0.095).
  • The coefficient of cg exp1 is positive, suggesting that the return to experience is somewhat higher for college graduates, though the estimate is not statistically significant here.
  • The coefficient of cg mw is negative, suggesting that college graduates in the midwest earn somewhat less than graduates elsewhere, though this estimate is also not statistically significant.
  • The coefficients of sc cg, mw so, mw we, and so we are essentially zero: these dummy pairs are mutually exclusive, so their products are identically zero and contribute nothing to the model.
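What `PolynomialFeatures(interaction_only=True, include_bias=False)` produces can also be built by hand: the 8 original columns plus all C(8,2) = 28 pairwise products, for 36 columns total, matching the (3835, 36) shape printed above. A sketch on a single made-up row with the same column names:

```python
from itertools import combinations
import pandas as pd

cols = ['sc', 'cg', 'mw', 'so', 'we', 'exp1', 'exp2', 'exp3']
# One illustrative row (made-up values)
W = pd.DataFrame([[0, 1, 0, 0, 0, 13.0, 1.69, 2.197]], columns=cols)

# Start with the original columns, then append every pairwise product
X_inter = W.copy()
for a, b in combinations(cols, 2):
    X_inter[f'{a} {b}'] = W[a] * W[b]

print(X_inter.shape)  # (1, 36) -> 8 originals + 28 interactions
```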

We estimate the linear regression model:

Y = $β_1 D$ + $β_2' W$ + ε.

  1. D is the indicator of being a female (1 if female and 0 otherwise). W ’s are controls.
  2. Basic model: W ’s consist of education and regional indicators, experience, experience squared, and experience cubed.
  3. Flexible model: W ’s consist of controls in the basic model plus all of their two-way interactions.
                Estimate    Standard Error    95% Confidence Interval
basic reg       -1.8264     0.425             [-2.659, -0.994]
flexible reg    -1.8800     0.425             [-2.713, -1.047]
  • In the above table we see the estimated regression coefficient, its standard error, and the 95% confidence interval, for both the basic and flexible regression model.

  • The results for basic and flexible regression models are in a very close agreement.

  • The estimated gender gap in hourly wage is about −2 dollars. This means that women get paid 2 dollars less per hour on average than men, controlling for experience, education, and geographical region.

  • The 95% confidence interval ranges from about −2.7 to −1 dollars. We can therefore conclude that the difference in hourly wage between men and women who have the same recorded characteristics is both statistically and economically significant.

Illustration of Partialling Out: (Linear Specification - Basic Model)¶

“Partialling-out” is an important tool that provides a conceptual understanding of the regression coefficient β1 (this is the Frisch–Waugh–Lovell theorem).
The steps for partialling out are:

  1. Regress Y on W and take the residuals, i.e. remove the part of Y that is explained by W.
  2. Regress D on W and take the residuals, i.e. remove the part of D that is explained by W.
  3. Regress the residuals from step 1 on the residuals from step 2; the resulting coefficient captures how Y depends on D alone, after controlling for W.
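The three steps above can be checked numerically with plain numpy: on synthetic data (made up here, not the notebook's dataset), the coefficient on D from the full regression equals the coefficient from regressing the residualized Y on the residualized D.

```python
import numpy as np

rng = np.random.default_rng(0)
n = 500
# Synthetic controls W (with intercept), treatment D correlated with W, outcome Y
W = np.column_stack([np.ones(n), rng.normal(size=(n, 2))])
D = 0.5 * W[:, 1] + rng.normal(size=n)
Y = -1.8 * D + W @ np.array([1.0, 2.0, -0.5]) + rng.normal(size=n)

# Full regression of Y on [D, W]: coefficient on D
beta_full = np.linalg.lstsq(np.column_stack([D, W]), Y, rcond=None)[0][0]

# Partialling out: residualize Y and D on W, then regress residual on residual
def resid(v):
    return v - W @ np.linalg.lstsq(W, v, rcond=None)[0]

beta_fwl = np.linalg.lstsq(resid(D).reshape(-1, 1), resid(Y), rcond=None)[0][0]

assert np.isclose(beta_full, beta_fwl)  # the two estimates coincide
print(beta_full)
```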

Step 1: Regress Y on W and take the residuals, i.e. remove the part of Y that is explained by W.¶

In [ ]:
# target variable
Y = df['wage']

#regressors W
W = df[['sc', 'cg', 'mw' , 'so' , 'we' , 'exp1' , 'exp2' , 'exp3']]
W = sm.add_constant(W)

# Linear regression/OLS object
model = sm.OLS(Y, W)

# train the model
results = model.fit()

# get the residuals
t_Y = results.resid
t_Y
Out[ ]:
0       -1.686401
1      -10.550023
2       -9.298414
3       -3.466715
4       -4.451256
          ...    
3830     1.148402
3831    -0.049341
3832     2.728471
3833    -5.471606
3834    30.261356
Length: 3835, dtype: float64

Step 2: Regress D on W and take the residuals, i.e. remove the part of D that is explained by W.¶

In [ ]:
# target regressor D, i.e. female in our wage prediction example
D = df['female']

#regressors W
W = df[['sc', 'cg', 'mw' , 'so' , 'we' , 'exp1' , 'exp2' , 'exp3']] #regressors
W = sm.add_constant(W)

# Linear regression/OLS object
model = sm.OLS(D, W)

# train the model
results = model.fit()

# get the residuals
t_D = results.resid
t_D
Out[ ]:
0      -0.323196
1      -0.448263
2      -0.443029
3      -0.485547
4       0.572763
          ...   
3830    0.555462
3831   -0.294333
3832    0.577249
3833    0.575515
3834   -0.419835
Length: 3835, dtype: float64

Step 3: Regress the residuals from step 1 on the residuals from step 2 to obtain the coefficient on D.¶

In [ ]:
# Run OLS to get the coefficient and its 95% confidence interval

# target variable
Y = t_Y

X = t_D
X = sm.add_constant(X)

# Linear regression/OLS object
model = sm.OLS(Y, X)

# train the model
results = model.fit()

# print the summary of the trained model
print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     18.55
Date:                Thu, 08 Jun 2023   Prob (F-statistic):           1.70e-05
Time:                        17:07:11   Log-Likelihood:                -15235.
No. Observations:                3835   AIC:                         3.047e+04
Df Residuals:                    3833   BIC:                         3.049e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -5.281e-15      0.208  -2.54e-14      1.000      -0.407       0.407
0             -1.8264      0.424     -4.307      0.000      -2.658      -0.995
==============================================================================
Omnibus:                     6626.018   Durbin-Watson:                   1.958
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          8721375.159
Skew:                          11.808   Prob(JB):                         0.00
Kurtosis:                     235.426   Cond. No.                         2.04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Illustration of Partialling Out: (Quadratic Specification - Flexible model)¶

In [ ]:
# target variable
Y = df['wage']

#regressors W
W = df[['sc', 'cg', 'mw' , 'so' , 'we' , 'exp1' , 'exp2' , 'exp3']]

# create an object of PolynomialFeatures with only the interaction terms
poly = PolynomialFeatures(interaction_only=True,include_bias = False)

# transform the data to add the polynomial features too
X_poly = poly.fit_transform(W)
X_poly = sm.add_constant(X_poly)

#defining the model
model = sm.OLS(Y, X_poly) # Linear regression/OLS object

 # train the model
results = model.fit()

# get the residuals
t_Y = results.resid
t_Y
Out[ ]:
0       -2.269388
1      -11.721164
2       -8.577609
3       -4.626069
4       -4.974672
          ...    
3830     1.726182
3831     0.775800
3832     3.639956
3833    -4.183105
3834    31.101298
Length: 3835, dtype: float64
In [ ]:
# target regressor
D = df['female']

#Regressors w
W = df[['sc', 'cg', 'mw' , 'so' , 'we' , 'exp1' , 'exp2' , 'exp3']]

# create an object of PolynomialFeatures with only the interaction terms
poly = PolynomialFeatures(interaction_only=True,include_bias = False)
X_poly = poly.fit_transform(W)
X_poly = sm.add_constant(X_poly)


#defining the model
model = sm.OLS(D, X_poly)

# train the model
results = model.fit()

# get the residuals
t_D = results.resid
t_D
Out[ ]:
0      -0.255420
1      -0.443778
2      -0.408862
3      -0.576250
4       0.576618
          ...   
3830    0.539392
3831   -0.267692
3832    0.551484
3833    0.588884
3834   -0.405553
Length: 3835, dtype: float64
In [ ]:
# Run OLS to get the coefficient and its 95% confidence interval
Y = t_Y # target variable
X = t_D
X = sm.add_constant(X)
model = sm.OLS(Y, X)
results = model.fit() # train the model
print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     19.75
Date:                Thu, 08 Jun 2023   Prob (F-statistic):           9.07e-06
Time:                        17:07:11   Log-Likelihood:                -15217.
No. Observations:                3835   AIC:                         3.044e+04
Df Residuals:                    3833   BIC:                         3.045e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.721e-13      0.207  -8.33e-13      1.000      -0.405       0.405
0             -1.8800      0.423     -4.444      0.000      -2.709      -1.051
==============================================================================
Omnibus:                     6588.420   Durbin-Watson:                   1.959
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          8473264.499
Skew:                          11.669   Prob(JB):                         0.00
Kurtosis:                     232.090   Cond. No.                         2.05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
                                    Estimate    Standard Error
basic reg                           -1.8264     0.425
flexible reg                        -1.8800     0.425
basic reg with partialling out      -1.8264     0.424
flexible reg with partialling out   -1.8800     0.423

Conclusion:¶

  • We have applied the ideas we have discussed so far to learn about the gender wage gap.
  • The gender wage gap may partly reflect genuine discrimination against women in the labor market.
  • It may also partly reflect the so-called selection effect, namely that women are more likely to end up in occupations that pay somewhat less (for example, school teachers).
In [ ]:
# Convert notebook to html
!jupyter nbconvert --to html "/content/drive/MyDrive/MIT - Data Sciences/Colab Notebooks/Week Four - Regression and Prediction/Optional Content/Gender Wage GAP in the US/Case_Study_2.ipynb"