## Results are below
######################################
## Gates, 2025
##
## Multiple Linear Regression
## with both quant and cat indep
## variables.
##
## Drop Choice: we choose which dummy column to drop (not just the first one)
## The entire, original dataset is here:
## https://drive.google.com/file/d/1f9FRqo1MaanGURS470xxTK29wQwqXTVt/view?usp=sharing
## The Dependent Variable will be INCOME
#########################################
## Libraries
library(dplyr)
## Read in the dataset
## NOTE: this is an absolute, machine-specific path -- edit it to point at
## your own copy of the CSV before running the script.
filepath <- "C:/Users/profa/Desktop/PT/IncomeDataset_Pretend_Gates.csv"
## Wrapping an assignment in ( ) makes R print the value just assigned
(MyIncomeData <- read.csv(filepath))
## Check data types
str(MyIncomeData)
## Change EDU_ to type factor so that model.matrix (used later) can expand
## it into one dummy column per level
(MyIncomeData$EDU_ <- as.factor(MyIncomeData$EDU_))
MyIncomeData
str(MyIncomeData)
##----
## Why does the EDU variable have a _ making it EDU_?
## You can give a variable any legal name. Using the "_"
## makes the dummy variables easier to read :)
##----
## Build the dummy (indicator) columns for EDU_ with model.matrix.
## The "- 1" in the formula removes the intercept term, so every level of
## EDU_ gets its own column.
My_dummy_cols <- model.matrix(~ EDU_ - 1, data = MyIncomeData)
My_dummy_cols
## Drop choice: remove whichever dummy column we want (EDU_BS here).
My_dummy_cols <- subset(My_dummy_cols, select = -EDU_BS)
My_dummy_cols
## The original EDU_ column must also be removed from the dataset, since
## the dummy columns now stand in for it.
MyIncomeData <- MyIncomeData[, names(MyIncomeData) != "EDU_", drop = FALSE]
MyIncomeData
## Create the final dataset: remaining columns plus the kept dummy columns
FinalDF <- cbind(MyIncomeData, My_dummy_cols)
FinalDF
## Run Multiple Linear Regression
## "INCOME ~ ." regresses INCOME on every other column of FinalDF
My_MLR_Model <- lm(INCOME ~ ., data = FinalDF)
summary(My_MLR_Model)
## The model that was created:
## x1 is AA, x2 is MS, x3 is PhD, x4 is AGE, x5 is YRS_EXP_FLD
## Why? We can associate any variable x with any variable name as
## long as we are consistent with the use of the coefficients
## (BS has no x because its dummy column, EDU_BS, was the one dropped;
##  x1 = x2 = x3 = 0 therefore describes a person with a BS.)
## y = -44186.8*x1 + 47252.1*x2 + 57371.9*x3 - 810.8*x4 + 5981.2*x5 + 87536.7
## Let's plug in some values....
## Suppose a person has an MS, is 32, has 4yrs experience in the field
## y = -44186.8*(0) + 47252.1*(1) + 57371.9*(0) - 810.8*(32) + 5981.2*(4) + 87536.7
## y = 132768 which is the income estimate from this model
## Results
