## Multiple Linear Regression in Python – Using Quantitative and Categorical Independent Variables

## Results are below.................
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 25 18:29:48 2025

@author: profa
"""

#############################################
## Gates, 2025

## The following tutorial illustrates how to use
## Python to perform and interpret 
## multiple linear regression on data that
## contains both quantitative independent variables
## and a categorical independent variable. 

## The entire and original dataset is HERE
## https://drive.google.com/file/d/1f9FRqo1MaanGURS470xxTK29wQwqXTVt/view?usp=sharing
## The Dependent Variable will be INCOME
###################################################

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
import statsmodels.api as sm

## Show every row and every column when a dataframe is printed
## (None removes the limit, so nothing is abbreviated with "...")
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## Load the dataset from disk
## !! This path points to my machine - be sure to UPDATE THIS to your own location
filepath="C:/Users/profa/Desktop/PT/IncomeDataset_Pretend_Gates.csv"
MyData = pd.read_csv(filepath)
print(MyData)

## y is the dependent variable (the INCOME column)
y = MyData.loc[:, "INCOME"]
print(y)
## X is the set of independent variables (every column except INCOME)
X = MyData.drop(columns=["INCOME"])
print(X)

## One-Hot Encode the Categorical Variable in X
categorical_variables_list = ["EDU_"]  ## We have only one

## Instantiate the column transformer.
##  - drop expects ONE category per encoded column (array-like of shape
##    (n_features,)). We encode a single column, so it is the flat list ["BS"].
##    Dropping "BS" makes it the baseline category: the remaining dummy
##    columns are interpreted relative to it.
##  - remainder='passthrough' keeps the columns that are not encoded as they are.
##  - sparse_threshold=0 forces a dense array. Without it the result may come
##    back as a sparse matrix (when the data is mostly zeros), which cannot be
##    handed to pd.DataFrame directly below.
MyColTransformer = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(drop=["BS"]), categorical_variables_list)],
    remainder='passthrough',
    sparse_threshold=0
)

## Perform the transformation
MyTransformedData = MyColTransformer.fit_transform(X)
print(MyTransformedData)

## Convert to dataframe
## Get the feature names produced by the transformer
column_names = MyColTransformer.get_feature_names_out()

## Create a DataFrame from the transformed data.
## Reuse X's index so the rows keep the same labels as y.
FinalDF = pd.DataFrame(MyTransformedData, columns=column_names, index=X.index)
print(FinalDF)

## Write the final (one-hot encoded) dataset to a csv file
## !! Your file path will be different from mine - be sure to UPDATE THIS
file= "C:/Users/profa/Desktop/PT/IncomeDataset_Pretend_Gates_OneHot.csv"
FinalDF.to_csv(file, sep=',', encoding='utf-8', index=False, header=True)

## Hold out 30% of the rows as testing data (to test the accuracy of the model);
## the remaining rows are the training data (to train the model).
## random_state is fixed at 42 for the shuffling done by the split.
X_train, X_test, y_train, y_test = train_test_split(
    FinalDF,
    y,
    test_size=0.3,
    random_state=42,
)
for held_out_part in (X_test, y_test):
    print(held_out_part)

## Create the scikit-learn Linear Regression model and fit it on the training data
MY_MLR_Model = LinearRegression().fit(X_train, y_train)

## Fit the same regression with statsmodels to get the full summary table.
## statsmodels needs the constant (intercept) column added explicitly.
X_sm = sm.add_constant(X_train)
model_sm = sm.OLS(y_train, X_sm).fit()
print(model_sm.summary())

## Predict the dependent variable for the testing data
y_pred = MY_MLR_Model.predict(X_test)

## Evaluate the model: score() is computed here on the testing data
test_r_squared = MY_MLR_Model.score(X_test, y_test)
print("R-squared:", test_r_squared)

## The regression coefficients
print(MY_MLR_Model.coef_)
## [-43253.95200045  54137.9195231   62314.95555614   -513.60062504    5322.85279307]

## The bias (the intercept)
print(MY_MLR_Model.intercept_)  ## 80705.30  (rounded)

## The final linear model based on the coefs and intercept (rounded)
## y = -43254*x1 + 54138*x2 + 62315*x3 - 514*x4 + 5323*x5 + 80705
## How does it work? Let's plug in some numbers to see if the result makes sense.
## x1 is AA, x2 is MS, x3 is PhD, x4 is AGE, x5 is YRS_EXP_FLD
## (BS was dropped in the one-hot encoding, so a BS is x1 = x2 = x3 = 0)
## Suppose a person has an MS, is 32, and has 4 yrs of experience in the field:
## y = -43254*(0) + 54138*(1) + 62315*(0) - 514*(32) + 5323*(4) + 80705
##   = 139687
## Results