9 min read

Modeling Pittsburgh House Sales - Linear

In this post I will be modeling house (land parcel) sales in Pittsburgh. The data is from the WPRDC’s Parcels n’at dashboard.

The goal is to use linear modeling to predict the sale price of a house using features of the house and the property.

This code sets up the environment and loads the libraries I will use.

#load the packages used throughout the analysis
#(order preserved so function masking is unchanged)
library(tidyverse)
library(scales)
library(caret)
library(broom)
library(modelr)
library(rsample)
library(janitor)

#configure the session: suppress scientific notation, trim printed digits
options(scipen = 999, digits = 5)

#use a black-and-white ggplot2 theme for every plot
theme_set(theme_bw())

This reads the data and engineers some features.

#read the parcel extract, standardize the column names, strip the geometry
#column, and engineer log10-scaled versions of the size and price features
df <- read_csv("parcel_data.csv", progress = FALSE) %>% 
  clean_names() %>% 
  select(-geom) %>% 
  mutate(munidesc_asmt = str_remove(munidesc_asmt, " - PITTSBURGH"),
         finishedlivingarea_asmt_log10 = log10(finishedlivingarea_asmt),
         lotarea_asmt_log10 = log10(lotarea_asmt),
         price_sales_log10 = log10(price_sales),
         saleprice_asmt_log10 = log10(saleprice_asmt)) %>% 
  select(pin, classdesc_asmt, munidesc_asmt, schooldesc_asmt, neighdesc_asmt, taxdesc_asmt,
         usedesc_asmt, homesteadflag_asmt, farmsteadflag_asmt, styledesc_asmt,
         yearblt_asmt, extfinish_desc_asmt, roofdesc_asmt,  basementdesc_asmt,
         gradedesc_asmt, conditiondesc_asmt, stories_asmt, totalrooms_asmt, bedrooms_asmt,
         fullbaths_asmt, halfbaths_asmt, heatingcoolingdesc_asmt, fireplaces_asmt, 
         bsmtgarage_asmt, finishedlivingarea_asmt, finishedlivingarea_asmt_log10,
         lotarea_asmt, lotarea_asmt_log10, saledate_sales, price_sales, price_sales_log10,
         saleprice_asmt, saleprice_asmt_log10)

#create grade vectors: each base grade expands to "grade -", "grade", "grade +"
expand_grades <- function(bases) {
  unlist(lapply(bases, function(g) c(paste(g, "-"), g, paste(g, "+"))))
}

grades_standard <- expand_grades(c("average", "good", "very good"))

grades_below_average_or_worse <- expand_grades(c("poor", "below average"))

grades_excellent_or_better <- expand_grades(c("excellent", "highest cost"))

#keep residential parcels in Pittsburgh wards with a plausible sale price
#and positive living/lot area, then keep only the columns of interest
df <- df %>% 
  filter(classdesc_asmt == "RESIDENTIAL",
         saleprice_asmt > 100,
         str_detect(munidesc_asmt, "Ward"),
         finishedlivingarea_asmt > 0,
         lotarea_asmt > 0) %>% 
  select(pin, munidesc_asmt, schooldesc_asmt, neighdesc_asmt, taxdesc_asmt,
         usedesc_asmt, homesteadflag_asmt, farmsteadflag_asmt, styledesc_asmt,
         yearblt_asmt, extfinish_desc_asmt, roofdesc_asmt,  basementdesc_asmt, 
         heatingcoolingdesc_asmt, gradedesc_asmt, conditiondesc_asmt, stories_asmt, 
         totalrooms_asmt, bedrooms_asmt, fullbaths_asmt, halfbaths_asmt, fireplaces_asmt, 
         bsmtgarage_asmt, finishedlivingarea_asmt_log10, lotarea_asmt_log10, price_sales_log10, 
         saleprice_asmt_log10, saledate_sales)

#lump rare use/style levels together, then clean up and collapse the
#construction grade into an ordered set of levels
#(grades not matched by case_when become NA and are dropped later by na.omit)
df <- df %>% 
  mutate(usedesc_asmt = fct_lump(usedesc_asmt, n = 5),
         styledesc_asmt = fct_lump(styledesc_asmt, n = 10),
         gradedesc_asmt = str_to_lower(gradedesc_asmt),
         gradedesc_asmt = case_when(gradedesc_asmt %in% grades_below_average_or_worse ~ "below average + or worse",
                                    gradedesc_asmt %in% grades_excellent_or_better ~ "excellent - or better",
                                    gradedesc_asmt %in% grades_standard ~ gradedesc_asmt),
         gradedesc_asmt = fct_relevel(gradedesc_asmt, c("below average + or worse", "average -", "average", "average +",
                                                        "good -", "good", "good +",
                                                        "very good -", "very good", "very good +", "excellent - or better")))

#fill missing values in character columns with "missing", then convert those
#columns to factors in a single pass (equivalent to the two-step version)
df <- df %>% 
  mutate_if(is.character, ~as.factor(replace_na(.x, "missing")))

#name the response and feature columns used by the model
model_vars <- c("munidesc_asmt", "usedesc_asmt", "styledesc_asmt",
                "conditiondesc_asmt", "gradedesc_asmt",
                "finishedlivingarea_asmt_log10", "lotarea_asmt_log10",
                "yearblt_asmt", "bedrooms_asmt", "fullbaths_asmt",
                "halfbaths_asmt", "saleprice_asmt_log10")

#keep only those columns and drop incomplete rows
df <- df %>% 
  select(one_of(model_vars)) %>% 
  na.omit()

#view data
glimpse(df)
## Observations: 64,521
## Variables: 12
## $ munidesc_asmt                 <fct> 29th Ward, 26th Ward, 14th Ward,...
## $ usedesc_asmt                  <fct> SINGLE FAMILY, SINGLE FAMILY, SI...
## $ styledesc_asmt                <fct> OLD STYLE, OLD STYLE, OLD STYLE,...
## $ conditiondesc_asmt            <fct> FAIR, FAIR, GOOD, AVERAGE, AVERA...
## $ gradedesc_asmt                <fct> average, average, average +, ave...
## $ finishedlivingarea_asmt_log10 <dbl> 3.3659, 3.1917, 3.3617, 3.0350, ...
## $ lotarea_asmt_log10            <dbl> 3.7796, 3.7267, 3.4698, 3.8008, ...
## $ yearblt_asmt                  <dbl> 1905, 1930, 1900, 1930, 1930, 19...
## $ bedrooms_asmt                 <dbl> 3, 3, 3, 3, 3, 2, 3, 8, 2, 4, 3,...
## $ fullbaths_asmt                <dbl> 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1,...
## $ halfbaths_asmt                <dbl> 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,...
## $ saleprice_asmt_log10          <dbl> 4.6128, 4.9149, 5.6314, 4.1761, ...

As shown in the data above, the model uses the following features to predict sale price:

  • municipality name
  • primary use of the parcel
  • style of building
  • condition of the structure
  • grade of construction
  • living area in square feet
  • lot area in square feet
  • year the house was built
  • number of bedrooms
  • number of full baths
  • number of half-baths

This code divides the data into training and testing sets.

#set the seed BEFORE drawing the split so the train/test partition is
#reproducible (the original only seeded later, after the split was drawn)
set.seed(42)

#create initial split object: 75% train, 25% test
df_split <- initial_split(df, prop = .75)

#extract training dataframe
training_data_full <- training(df_split)

#extract testing dataframe
testing_data <- testing(df_split)

#find dimensions of training_data_full and testing_data
dim(training_data_full)
## [1] 48391    12
dim(testing_data)
## [1] 16130    12

This code sets up the training data for cross validation.

set.seed(42)

#partition the full training data into 5 cross-validation folds
cv_split <- vfold_cv(training_data_full, v = 5)

#materialize the train and validate dataframes for each fold
cv_data <- cv_split %>% 
  mutate(train = map(splits, training),
         validate = map(splits, testing))

#view df
cv_data
## #  5-fold cross-validation 
## # A tibble: 5 x 4
##   splits               id    train                  validate             
## * <list>               <chr> <list>                 <list>               
## 1 <split [38.7K/9.7K]> Fold1 <tibble [38,712 x 12]> <tibble [9,679 x 12]>
## 2 <split [38.7K/9.7K]> Fold2 <tibble [38,713 x 12]> <tibble [9,678 x 12]>
## 3 <split [38.7K/9.7K]> Fold3 <tibble [38,713 x 12]> <tibble [9,678 x 12]>
## 4 <split [38.7K/9.7K]> Fold4 <tibble [38,713 x 12]> <tibble [9,678 x 12]>
## 5 <split [38.7K/9.7K]> Fold5 <tibble [38,713 x 12]> <tibble [9,678 x 12]>

This builds the model to predict house sale price.

#fit a linear model (all features -> log10 sale price) on each fold's train set
cv_models_lm <- cv_data %>% 
  mutate(model = map(train, ~lm(saleprice_asmt_log10 ~ ., data = .x)))

cv_models_lm
## #  5-fold cross-validation 
## # A tibble: 5 x 5
##   splits             id    train                validate            model  
## * <list>             <chr> <list>               <list>              <list> 
## 1 <split [38.7K/9.7~ Fold1 <tibble [38,712 x 1~ <tibble [9,679 x 1~ <S3: l~
## 2 <split [38.7K/9.7~ Fold2 <tibble [38,713 x 1~ <tibble [9,678 x 1~ <S3: l~
## 3 <split [38.7K/9.7~ Fold3 <tibble [38,713 x 1~ <tibble [9,678 x 1~ <S3: l~
## 4 <split [38.7K/9.7~ Fold4 <tibble [38,713 x 1~ <tibble [9,678 x 1~ <S3: l~
## 5 <split [38.7K/9.7~ Fold5 <tibble [38,713 x 1~ <tibble [9,678 x 1~ <S3: l~
#problem with factors split across training/validation
#https://stats.stackexchange.com/questions/235764/new-factors-levels-not-present-in-training-data

This is where I begin to calculate metrics to judge how well my model is doing.

#for each fold, pull the observed sale price out of the validate set and
#score the validate set with that fold's model
cv_prep_lm <- cv_models_lm %>% 
  mutate(validate_actual = map(validate, "saleprice_asmt_log10"),
         validate_predicted = map2(model, validate, predict))

#View data
cv_prep_lm
## #  5-fold cross-validation 
## # A tibble: 5 x 7
##   splits   id    train    validate   model validate_actual validate_predic~
## * <list>   <chr> <list>   <list>     <lis> <list>          <list>          
## 1 <split ~ Fold1 <tibble~ <tibble [~ <S3:~ <dbl [9,679]>   <dbl [9,679]>   
## 2 <split ~ Fold2 <tibble~ <tibble [~ <S3:~ <dbl [9,678]>   <dbl [9,678]>   
## 3 <split ~ Fold3 <tibble~ <tibble [~ <S3:~ <dbl [9,678]>   <dbl [9,678]>   
## 4 <split ~ Fold4 <tibble~ <tibble [~ <S3:~ <dbl [9,678]>   <dbl [9,678]>   
## 5 <split ~ Fold5 <tibble~ <tibble [~ <S3:~ <dbl [9,678]>   <dbl [9,678]>
#calculate out-of-fold rmse/mae plus in-fold glance() statistics for each model
cv_eval_lm <- cv_prep_lm %>% 
  mutate(validate_rmse = map2_dbl(model, validate, modelr::rmse),
         validate_mae = map2_dbl(model, validate, modelr::mae),
         fit = map(model, glance)) %>% 
  unnest(fit)

#view data
cv_eval_lm %>% 
  select(id, validate_mae, validate_rmse, adj.r.squared)
## # A tibble: 5 x 4
##   id    validate_mae validate_rmse adj.r.squared
##   <chr>        <dbl>         <dbl>         <dbl>
## 1 Fold1        0.296         0.407         0.449
## 2 Fold2        0.294         0.400         0.445
## 3 Fold3        0.296         0.404         0.445
## 4 Fold4        0.297         0.407         0.449
## 5 Fold5        0.297         0.409         0.446

Finally, this calculates how well the model did on the validation set.

#average the validation metrics across the five folds
cv_eval_lm %>% 
  select(validate_mae, validate_rmse, adj.r.squared) %>% 
  summarize_all(mean)
## # A tibble: 1 x 3
##   validate_mae validate_rmse adj.r.squared
##          <dbl>         <dbl>         <dbl>
## 1        0.296         0.406         0.447
#fit model on the full training set
#note: the original unnested cv_data$train, but with 5-fold CV every row sits
#in 4 of the 5 train sets, so that dataframe holds each observation 4 times --
#coefficients survive the uniform duplication, but n, degrees of freedom,
#std. errors, and logLik/AIC/BIC from glance() are all wrong.
#training_data_full is the correct, deduplicated training set.
train_df <- training_data_full

model_train <- lm(formula = saleprice_asmt_log10 ~ ., data = train_df)

#one-row summary of overall model fit
model_train %>% 
  glance()
## # A tibble: 1 x 11
##   r.squared adj.r.squared sigma statistic p.value    df  logLik    AIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <int>   <dbl>  <dbl>
## 1     0.448         0.447 0.405     2239.       0    71 -99661. 1.99e5
## # ... with 3 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>

This is the RMSE on the training set.

#calculate rmse on training set
rmse(model_train, train_df)

#create model object
#model <- lm(saleprice_asmt_log10 ~ ., data = train_data_small)

#coefficient table, largest positive effect first (intercept excluded)
model_train %>% 
  tidy() %>% 
  filter(term != "(Intercept)") %>% 
  arrange(desc(estimate))
## # A tibble: 70 x 5
##    term                              estimate std.error statistic   p.value
##    <chr>                                <dbl>     <dbl>     <dbl>     <dbl>
##  1 gradedesc_asmtexcellent - or bet~    0.455   0.0170       26.7 8.60e-157
##  2 gradedesc_asmtvery good +            0.425   0.0155       27.4 7.21e-165
##  3 gradedesc_asmtvery good              0.375   0.0133       28.2 1.85e-174
##  4 gradedesc_asmtgood +                 0.375   0.00840      44.6 0.       
##  5 gradedesc_asmtgood                   0.369   0.00758      48.7 0.       
##  6 gradedesc_asmtvery good -            0.336   0.0154       21.8 2.78e-105
##  7 gradedesc_asmtgood -                 0.290   0.00669      43.4 0.       
##  8 munidesc_asmt7th Ward                0.268   0.00782      34.3 5.18e-257
##  9 conditiondesc_asmtEXCELLENT          0.261   0.0212       12.3 1.14e- 34
## 10 gradedesc_asmtaverage +              0.253   0.00424      59.7 0.       
## # ... with 60 more rows
#10th ward is the base factor for muni_desc term
#(no coefficient row should match "10th")
tidy(model_train) %>% 
  filter(str_detect(term, "10th"))
## # A tibble: 0 x 5
## # ... with 5 variables: term <chr>, estimate <dbl>, std.error <dbl>,
## #   statistic <dbl>, p.value <dbl>

This shows the impact each term of the model has on the response variable. This is for the training data.

#dot plot of each term's coefficient, sorted by estimate, with a reference
#line at zero (terms above/below zero raise/lower the predicted price)
model_train %>% 
  tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(term = fct_reorder(term, estimate)) %>% 
  ggplot(aes(term, estimate)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  geom_point() +
  coord_flip()

Next, I apply the model to the testing data to see how the model does out-of-sample.

#create dfs for train_data_small and validate_data
#train_data_small <- cv_prep_lm %>% 
#  unnest(train) %>% 
#  select(-id)

#stack the per-fold validate dataframes into one dataframe;
#since vfold_cv partitions the training data, each training row should
#appear exactly once across the validate folds -- so this is drawn from
#the training data, NOT the held-out testing_data from initial_split
validate_df <- cv_prep_lm %>% 
  unnest(validate)

This creates the augmented dataframe and plots the actual price vs. the fitted price.

#score the validate data with the full-training model and recompute the
#residual as observed minus fitted (both on the log10 scale)
augment_validate <- model_train %>% 
  augment(newdata = validate_df) %>% 
  mutate(.resid = saleprice_asmt_log10 - .fitted)

#observed vs. predicted sale price on the validate folds; the 45-degree
#abline marks perfect prediction (layer order kept so drawing is unchanged)
cv_prep_lm %>% 
  unnest(validate_actual, validate_predicted) %>% 
  ggplot(aes(validate_actual, validate_predicted)) +
  geom_abline() +
  stat_density_2d(aes(fill = stat(level)), geom = "polygon") +
  geom_smooth(method = "lm") +
  lims(x = c(2, 7), y = c(2, 7)) +
  coord_equal() +
  scale_fill_viridis_c()

This distribution shows that the model overestimates the prices on many houses.

#density of the validate-set residuals; the dashed line at zero marks
#an unbiased prediction
augment_validate %>% 
  ggplot(aes(x = .resid)) +
  geom_density() +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red")

This shows that the residuals are correlated with the actual price, which indicates that the model is failing to account for some dynamic in the sale process.

#residual vs. observed sale price; structure here means the model misses
#some dynamic that varies with price
augment_validate %>% 
  ggplot(aes(x = .resid, y = saleprice_asmt_log10)) +
  stat_density_2d(aes(fill = stat(level)), geom = "polygon") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  scale_fill_viridis_c()

This calculates how well the model predicted sale price on out-of-sample testing data.

#calculate fit of model on the held-out test set
#note: the original scored validate_df here, but validate_df is rebuilt from
#the cross-validation folds of the TRAINING data (its rmse was identical to
#the training rmse above, confirming it is in-sample); testing_data from
#initial_split() is the true out-of-sample holdout and was never used
rmse(model_train, testing_data)
mae(model_train, testing_data)
rsquare(model_train, testing_data)