# Need to have R 3.4.3 or above installed
# IMF
# For any question contact vguerreiro@imf.org or RPPI@imf.org
############################## Model specification ############################

# Start from an empty global environment.
# NOTE(review): this only removes objects; it does not detach packages or
# reset options, so restarting the R session is the more reliable way to get
# a clean state.
rm(list = ls())

# Install packages --------------------------------------------------------
packages1 <- c(
  "magrittr", "dplyr", "lmtest", "e1071", "robustbase", "gridExtra", "rio",
  "gvlma", "sandwich", "caret", "MASS", "EnvStats", "micEconIndex", "readr",
  "dummies", "broom", "openxlsx", "XLConnect", "xlsx"
)

# Install only the packages that are not already available in the library.
pkgs2inst <- !(packages1 %in% .packages(all.available = TRUE))
if (any(pkgs2inst)) {
  install.packages(packages1[pkgs2inst])
}

# Attach the packages in the listed order. require() returns FALSE instead of
# failing, so collect the results and report by name anything that did not
# load; otherwise the first symptom is a "could not find function" error
# further down the script.
pkgs_loaded <- vapply(packages1, require, logical(1), character.only = TRUE)
if (!all(pkgs_loaded)) {
  warning(
    "Packages that could not be loaded: ",
    paste(packages1[!pkgs_loaded], collapse = ", "),
    call. = FALSE
  )
}
rm(packages1, pkgs2inst, pkgs_loaded)

#### !!! COMPILER NEEDS TO UPDATE THE FOLLOWING: ------------------------------
# !!! Define the period
Period <- "1Q2008"
# !!! Insert address of the folder where the data is
# (must end with "/" because the file name is appended directly to it)
Data_folder <- "C:/Users/vguerreiro/My Local Documents/CSO_Synthetic_data/data set/"
# !!! Insert address of the folder where the outputs should be stored
Output_folder <- "C:/Users/vguerreiro/My Local Documents/CSO_Synthetic_data/data set/"

# Read "<Data_folder><Period>.csv" and convert it to a plain data.frame.
BaseCalcul <- as.data.frame(read_csv(paste0(Data_folder, Period, ".csv")))

# Show the available column names so the lists below can be filled in.
# print() is explicit so the names are also shown when the script is source()d.
print(names(BaseCalcul))

### !!! Select variables to be excluded from the regression (the "" are needed)
exl_var <- c(
  "id", "Year", "Month", "Status", "Period", "Region", "Year_Built", "County"
)

### !!! Select categorical variables to be used in the regression (the "" are needed).
# These variables will be transformed into dummies (binary) -------------------
catg_var <- c(
  "Dwelling_Type", "BER", "Year_Built_Agg", "County_Agg",
  "Neighborhood_Type", "Building_Levels", "Central_Heating"
)
BaseCalcul <- dummy.data.frame(
  BaseCalcul,
  names = catg_var,
  omit.constants = TRUE,
  dummy.classes = getOption("dummy.classes"),
  all = TRUE
)

# Drop the excluded columns. Negative indexing with an empty index vector
# selects zero columns (it would wipe the whole data set), so only subset when
# at least one excluded name is present. drop = FALSE keeps a data.frame even
# if a single column remains.
exl_name <- which(names(BaseCalcul) %in% exl_var)
if (length(exl_name) > 0) {
  BaseCalcul <- BaseCalcul[, -exl_name, drop = FALSE]
}

# OLS regression of log(Price) on every remaining column
olsregS <- lm(log(Price) ~ ., data = BaseCalcul)
Df.olsreg <- tidy(olsregS)

# Model selection process (stepwise, both directions, starting from olsregS)
# NOTE(review): the name `step` masks stats::step() in the global environment;
# it is kept because later code may refer to it.
step <- stepAIC(olsregS, direction = "both")
a <- step$anova
# AIC = Akaike criterion = goodness of fit. Used by ANOVA for comparison of
# different models. The lower the AIC the better.

###############################################################################
# Evaluation of the selected model. To do this, run the script again and
## include in the exl_var list the variables that were ruled out by the ANOVA.
BaseCalcul <- as.data.frame(
  read_csv(paste0(Data_folder, Period, ".csv"), col_types = cols())
)

### !!! Select variables to be excluded from the regression (the "" are needed)
exl_var <- c(
  "id", "Year", "Month", "Status", "Period", "Region", "Year_Built", "County"
)

### !!! Select categorical variables to be used in the regression (the "" are needed).
# These variables will be transformed into dummies (binary) -------------------
catg_var <- c(
  "Dwelling_Type", "BER", "Year_Built_Agg", "County_Agg",
  "Neighborhood_Type", "Building_Levels", "Central_Heating"
)
BaseCalcul <- dummy.data.frame(
  BaseCalcul,
  names = catg_var,
  omit.constants = TRUE,
  dummy.classes = getOption("dummy.classes"),
  all = TRUE
)

# Same guarded exclusion as in the first pass.
exl_name <- which(names(BaseCalcul) %in% exl_var)
if (length(exl_name) > 0) {
  BaseCalcul <- BaseCalcul[, -exl_name, drop = FALSE]
}

# OLS regression
olsregS <- lm(log(Price) ~ ., data = BaseCalcul)
Df.olsreg <- tidy(olsregS)

# Compute the model summary once and take R-squared from it.
Stats_1 <- summary(olsregS)
R2_1 <- Stats_1$r.squared
# print() is explicit so the summary is also shown when the script is source()d.
print(Stats_1)
# p-value: the closer to zero the better.
# The p-value is the chance that the result you're seeing happened due to
# random variation. Commonly a p-value of .05 or less (interpreted roughly as
# "there's a 5% chance or less of this happening just due to random variation")
# is taken to mean that the result is significant.
# Critical value must be higher than p-value
# Critical value = table of F-statistics / DF.
## Used for assessing the fitness of the model - the higher F the better