10 min read

Exploring 311 data with PCA

Principal Component Analysis in R

Principal Component Analysis is an unsupervised method that reduces the number of dimensions in a dataset and highlights where the data varies. We will use PCA to analyze the 311 dataset from the WPRDC.

Setup

Install packages

install.packages(c("tidyverse", "lubridate", "broom", "ggfortify", "ggrepel", "janitor"))

Set up your environment

library(tidyverse)
library(lubridate)
library(broom)
library(ggfortify)
library(ggrepel)
library(janitor)

options(scipen = 999, digits = 4)
set.seed(1234)

theme_set(theme_bw())

Load the data

read_csv("https://raw.githubusercontent.com/conorotompkins/pittsburgh_311/master/data/pittsburgh_311.csv", progress = FALSE) %>% 
  clean_names() %>% 
  mutate(date = ymd(str_sub(created_on, 1, 10)),
         month = month(date, label = TRUE)) -> df

Prep the data

Create a dataframe of the top request types

(df %>% 
  count(request_type, sort = TRUE) %>% 
  filter(n > 400) -> df_top_requests)
## # A tibble: 84 x 2
##    request_type                             n
##    <chr>                                <int>
##  1 Potholes                             25202
##  2 Weeds/Debris                         16503
##  3 Building Maintenance                 10469
##  4 Snow/Ice removal                      7006
##  5 Refuse Violations                     6515
##  6 Abandoned Vehicle (parked on street)  5877
##  7 Missed Pick Up                        4689
##  8 Replace/Repair a Sign                 4445
##  9 Building Without a Permit             4404
## 10 Litter                                4198
## # ... with 74 more rows

Count the number of requests per month by request type, filter for the top request types, and fill in gaps in the data

(df %>%
  semi_join(df_top_requests) %>% 
  group_by(request_type, month) %>% 
  summarize(n = n()) %>% 
  complete(request_type, month) %>% 
  replace_na(replace = list(n = 0)) -> df_months)
## # A tibble: 1,008 x 3
## # Groups:   request_type [84]
##    request_type                         month     n
##    <chr>                                <ord> <dbl>
##  1 Abandoned Vehicle (parked on street) Jan     523
##  2 Abandoned Vehicle (parked on street) Feb     427
##  3 Abandoned Vehicle (parked on street) Mar     452
##  4 Abandoned Vehicle (parked on street) Apr     417
##  5 Abandoned Vehicle (parked on street) May     488
##  6 Abandoned Vehicle (parked on street) Jun     466
##  7 Abandoned Vehicle (parked on street) Jul     457
##  8 Abandoned Vehicle (parked on street) Aug     596
##  9 Abandoned Vehicle (parked on street) Sep     525
## 10 Abandoned Vehicle (parked on street) Oct     571
## # ... with 998 more rows

Calculate the percentage of a request type for each month

(df_months %>% 
  group_by(request_type) %>% 
  mutate(request_type_total = sum(n),
         month_percentage = n / request_type_total) -> df_months)
## # A tibble: 1,008 x 5
## # Groups:   request_type [84]
##    request_type             month     n request_type_tot~ month_percentage
##    <chr>                    <ord> <dbl>             <dbl>            <dbl>
##  1 Abandoned Vehicle (park~ Jan     523              5877           0.0890
##  2 Abandoned Vehicle (park~ Feb     427              5877           0.0727
##  3 Abandoned Vehicle (park~ Mar     452              5877           0.0769
##  4 Abandoned Vehicle (park~ Apr     417              5877           0.0710
##  5 Abandoned Vehicle (park~ May     488              5877           0.0830
##  6 Abandoned Vehicle (park~ Jun     466              5877           0.0793
##  7 Abandoned Vehicle (park~ Jul     457              5877           0.0778
##  8 Abandoned Vehicle (park~ Aug     596              5877           0.101 
##  9 Abandoned Vehicle (park~ Sep     525              5877           0.0893
## 10 Abandoned Vehicle (park~ Oct     571              5877           0.0972
## # ... with 998 more rows

Check for bad data

df_months %>% 
  filter(is.na(month_percentage))
## # A tibble: 0 x 5
## # Groups:   request_type [0]
## # ... with 5 variables: request_type <chr>, month <ord>, n <dbl>,
## #   request_type_total <dbl>, month_percentage <dbl>
df_months %>% 
  filter(is.nan(month_percentage))
## # A tibble: 0 x 5
## # Groups:   request_type [0]
## # ... with 5 variables: request_type <chr>, month <ord>, n <dbl>,
## #   request_type_total <dbl>, month_percentage <dbl>

Spread the data to turn the months into the columns

(df_months %>% 
  select(request_type, month, month_percentage) %>% 
  spread(month, month_percentage) %>% 
  ungroup() -> df_months)
## # A tibble: 84 x 13
##    request_type     Jan     Feb    Mar    Apr    May    Jun    Jul    Aug
##    <chr>          <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 Abandoned V~ 0.0890  0.0727  0.0769 0.0710 0.0830 0.0793 0.0778 0.101 
##  2 Barking Dog  0.0563  0.0608  0.0608 0.0631 0.104  0.101  0.0788 0.113 
##  3 Board Up (P~ 0.0395  0.0482  0.0658 0.0943 0.114  0.0899 0.110  0.123 
##  4 Broken Side~ 0.0337  0.155   0.148  0.0872 0.105  0.0964 0.0696 0.0735
##  5 Building Ma~ 0.0708  0.0919  0.103  0.0739 0.0842 0.0829 0.0725 0.0919
##  6 Building Wi~ 0.0842  0.0697  0.0636 0.0577 0.105  0.0883 0.0924 0.0815
##  7 Catch Basin~ 0.0636  0.0377  0.0778 0.0748 0.0984 0.132  0.0825 0.127 
##  8 City Source~ 0.00527 0.00246 0.0105 0.0428 0.196  0.213  0.195  0.164 
##  9 City Steps,~ 0.0443  0.0180  0.0148 0.0197 0.116  0.216  0.203  0.146 
## 10 City Steps,~ 0.0265  0.0305  0.0713 0.0509 0.128  0.120  0.136  0.128 
## # ... with 74 more rows, and 4 more variables: Sep <dbl>, Oct <dbl>,
## #   Nov <dbl>, Dec <dbl>

Check that they all add up to 1 across the rows

(df_months %>% 
  select(Jan:Dec) %>% 
  mutate(row_sum = rowSums(.)) %>% 
  select(row_sum, everything()) -> test)
## # A tibble: 84 x 13
##    row_sum     Jan     Feb    Mar    Apr    May    Jun    Jul    Aug    Sep
##      <dbl>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1       1 0.0890  0.0727  0.0769 0.0710 0.0830 0.0793 0.0778 0.101  0.0893
##  2       1 0.0563  0.0608  0.0608 0.0631 0.104  0.101  0.0788 0.113  0.124 
##  3       1 0.0395  0.0482  0.0658 0.0943 0.114  0.0899 0.110  0.123  0.0855
##  4       1 0.0337  0.155   0.148  0.0872 0.105  0.0964 0.0696 0.0735 0.0528
##  5       1 0.0708  0.0919  0.103  0.0739 0.0842 0.0829 0.0725 0.0919 0.0776
##  6       1 0.0842  0.0697  0.0636 0.0577 0.105  0.0883 0.0924 0.0815 0.0829
##  7       1 0.0636  0.0377  0.0778 0.0748 0.0984 0.132  0.0825 0.127  0.105 
##  8       1 0.00527 0.00246 0.0105 0.0428 0.196  0.213  0.195  0.164  0.0808
##  9       1 0.0443  0.0180  0.0148 0.0197 0.116  0.216  0.203  0.146  0.118 
## 10       1 0.0265  0.0305  0.0713 0.0509 0.128  0.120  0.136  0.128  0.108 
## # ... with 74 more rows, and 3 more variables: Oct <dbl>, Nov <dbl>,
## #   Dec <dbl>

Perform basic comparisons

df_months %>% 
  ggplot(aes(Jan, Jul)) +
  geom_point()

Remember that each dot represents a request type, and the month shows what % of that request type occurred that month

df_months %>% 
  ggplot(aes(Apr, Oct)) +
  geom_point()

It is not feasible to plot all the months against each other. PCA can help by condensing the columns and increasing the variance. PCA creates eigenvectors that represents the data in a concentrated way. Eigenvectors and eigenvalues do not represent observed data. They are calculated representations of the data. We will refer to eigenvectors as “principal components”.

In this case, where our data is measured by months in a year, each principal component could loosely be compared to a season.

Prep the data for PCA

The PCA function requires an all-numeric dataframe, so drop the request types into the dataframe metadata

(df_months %>% 
  ungroup() %>% 
  remove_rownames() %>% 
  column_to_rownames(var = "request_type") -> df_months_pca1)
## # A tibble: 84 x 12
##        Jan     Feb    Mar    Apr    May    Jun    Jul    Aug    Sep    Oct
##  *   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 0.0890  0.0727  0.0769 0.0710 0.0830 0.0793 0.0778 0.101  0.0893 0.0972
##  2 0.0563  0.0608  0.0608 0.0631 0.104  0.101  0.0788 0.113  0.124  0.113 
##  3 0.0395  0.0482  0.0658 0.0943 0.114  0.0899 0.110  0.123  0.0855 0.0877
##  4 0.0337  0.155   0.148  0.0872 0.105  0.0964 0.0696 0.0735 0.0528 0.0849
##  5 0.0708  0.0919  0.103  0.0739 0.0842 0.0829 0.0725 0.0919 0.0776 0.0787
##  6 0.0842  0.0697  0.0636 0.0577 0.105  0.0883 0.0924 0.0815 0.0829 0.113 
##  7 0.0636  0.0377  0.0778 0.0748 0.0984 0.132  0.0825 0.127  0.105  0.0931
##  8 0.00527 0.00246 0.0105 0.0428 0.196  0.213  0.195  0.164  0.0808 0.0583
##  9 0.0443  0.0180  0.0148 0.0197 0.116  0.216  0.203  0.146  0.118  0.0557
## 10 0.0265  0.0305  0.0713 0.0509 0.128  0.120  0.136  0.128  0.108  0.0855
## # ... with 74 more rows, and 2 more variables: Nov <dbl>, Dec <dbl>

Create the PCA object

(df_months_pca1 %>% 
  prcomp(scale = TRUE) -> pc)
## Standard deviations (1, .., p=12):
##  [1] 2.0313544303132324842 1.5112299607905623766 1.3677583442481697773
##  [4] 1.0647449915481701499 0.9373153843502736171 0.6612690017981472934
##  [7] 0.6319678449167117629 0.5732234023111663079 0.4666060722915733039
## [10] 0.4192405535100036107 0.3847717270238655840 0.0000000000000003789
## 
## Rotation (n x k) = (12 x 12):
##         PC1     PC2       PC3      PC4      PC5        PC6      PC7
## Jan -0.3509  0.2377  0.036588 -0.40554  0.40259 -0.0878343  0.04648
## Feb -0.2189  0.2230 -0.226391  0.69886 -0.14610  0.1005237  0.15306
## Mar -0.0235 -0.4858 -0.323760  0.11519  0.19761 -0.1724337 -0.62037
## Apr  0.1329 -0.4686 -0.301500 -0.04306  0.07313 -0.5014973  0.54199
## May  0.2339 -0.1448 -0.445597 -0.33914 -0.15507  0.5723879 -0.10942
## Jun  0.4049  0.1866  0.002112 -0.22765 -0.21210 -0.0386388  0.13438
## Jul  0.4322  0.1697  0.095923 -0.10888 -0.01800 -0.1443989  0.02032
## Aug  0.3866  0.1805  0.189907  0.17598 -0.04242 -0.2554247 -0.44835
## Sep  0.2944 -0.1580  0.365255  0.16856  0.42595 -0.0005222  0.04916
## Oct  0.1150 -0.4130  0.389922  0.16708  0.17452  0.5136274  0.18989
## Nov -0.1323 -0.3291  0.333424 -0.05855 -0.69855 -0.1384698 -0.02866
## Dec -0.3720 -0.1311  0.333522 -0.25017 -0.03408 -0.0524049 -0.15824
##          PC8       PC9     PC10      PC11   PC12
## Jan -0.10077  0.119474  0.19207  0.310620 0.5699
## Feb  0.04311 -0.085692 -0.10149 -0.130098 0.5209
## Mar -0.31077 -0.259371 -0.01955  0.035606 0.1620
## Apr  0.09929  0.264116 -0.01396 -0.143422 0.1379
## May  0.40042  0.106727 -0.02849 -0.028027 0.2717
## Jun -0.42038 -0.373786  0.36939 -0.413396 0.2471
## Jul -0.13081 -0.103405 -0.77442  0.256091 0.2205
## Aug  0.13536  0.607417  0.20278 -0.123033 0.1983
## Sep  0.53195 -0.454920  0.16862  0.035475 0.1519
## Oct -0.43757  0.304459 -0.02271 -0.003209 0.1456
## Nov  0.09659 -0.104258  0.14115  0.387838 0.2472
## Dec  0.15902 -0.008971 -0.35391 -0.678722 0.1743

Inspect the PCA object with tidier functions from the broom library. These functions turn the PCA object into a tidy dataframe

pc %>% 
  tidy() %>% 
  head()
## # A tibble: 6 x 3
##   row                                     PC  value
##   <fct>                                <dbl>  <dbl>
## 1 Abandoned Vehicle (parked on street)     1 -0.844
## 2 Barking Dog                              1  0.402
## 3 Board Up (PLI referral to DPW)           1  0.352
## 4 Broken Sidewalk                          1 -0.789
## 5 Building Maintenance                     1 -1.20 
## 6 Building Without a Permit                1 -0.747
pc %>% 
  tidy("pcs")
## # A tibble: 12 x 4
##       PC  std.dev percent cumulative
##    <dbl>    <dbl>   <dbl>      <dbl>
##  1     1 2.03e+ 0  0.344       0.344
##  2     2 1.51e+ 0  0.190       0.534
##  3     3 1.37e+ 0  0.156       0.690
##  4     4 1.06e+ 0  0.0945      0.785
##  5     5 9.37e- 1  0.0732      0.858
##  6     6 6.61e- 1  0.0364      0.894
##  7     7 6.32e- 1  0.0333      0.927
##  8     8 5.73e- 1  0.0274      0.955
##  9     9 4.67e- 1  0.0181      0.973
## 10    10 4.19e- 1  0.0146      0.988
## 11    11 3.85e- 1  0.0123      1    
## 12    12 3.79e-16  0           1
pc %>% 
  augment(data = df_months) -> au

au %>% 
  head()
## # A tibble: 6 x 26
##   .rownames request_type    Jan    Feb    Mar    Apr    May    Jun    Jul
##   <fct>     <chr>         <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 1         Abandoned V~ 0.0890 0.0727 0.0769 0.0710 0.0830 0.0793 0.0778
## 2 2         Barking Dog  0.0563 0.0608 0.0608 0.0631 0.104  0.101  0.0788
## 3 3         Board Up (P~ 0.0395 0.0482 0.0658 0.0943 0.114  0.0899 0.110 
## 4 4         Broken Side~ 0.0337 0.155  0.148  0.0872 0.105  0.0964 0.0696
## 5 5         Building Ma~ 0.0708 0.0919 0.103  0.0739 0.0842 0.0829 0.0725
## 6 6         Building Wi~ 0.0842 0.0697 0.0636 0.0577 0.105  0.0883 0.0924
## # ... with 17 more variables: Aug <dbl>, Sep <dbl>, Oct <dbl>, Nov <dbl>,
## #   Dec <dbl>, .fittedPC1 <dbl>, .fittedPC2 <dbl>, .fittedPC3 <dbl>,
## #   .fittedPC4 <dbl>, .fittedPC5 <dbl>, .fittedPC6 <dbl>,
## #   .fittedPC7 <dbl>, .fittedPC8 <dbl>, .fittedPC9 <dbl>,
## #   .fittedPC10 <dbl>, .fittedPC11 <dbl>, .fittedPC12 <dbl>

Plot how the PCA object explains the variance in the data

pc %>% 
  tidy("pcs") %>%
  select(-std.dev) %>% 
  gather(measure, value, -PC) %>% 
    ggplot(aes(PC, value)) +
    geom_line() +
    geom_point() +
    facet_wrap(~measure) +
    labs(title = "Variance explained by each principal component",
         x = "Principal Component",
         y = NULL) +
    scale_x_continuous(breaks = 1:12)

The first two principal components explain most of the variance

For an in-depth plot we need to create the PCA object a different way

df_months %>% 
  nest() %>% 
  mutate(pca = map(data, ~ prcomp(.x %>% select(-request_type), 
                                  center = TRUE, scale = TRUE)),
         pca_aug = map2(pca, data, ~augment(.x, data = .y))) -> df_months_pca2

Plot the PCA data

df_months_pca2 %>%
  mutate(
    pca_graph = map2(
      .x = pca,
      .y = data,
      ~ autoplot(.x, loadings = TRUE, loadings.label = TRUE,
                 loadings.label.repel = TRUE,
                 data = .y) +
        theme_bw() +
        labs(x = "Principal Component 1",
             y = "Principal Component 2",
             title = "First two principal components of PCA on 311 dataset")
    )
  ) %>%
  pull(pca_graph)
## [[1]]

This shows that summer and winter explain a significant part of the variance

Plot the data to show the outliers

au %>% 
  mutate(outlier = case_when(abs(.fittedPC1) > 2 & abs(.fittedPC2) > 1.5 ~ TRUE),
         pothole = case_when(request_type == "Potholes" ~ "Potholes",
                             request_type != "Potholes" ~ "Other")) -> au

au %>% 
ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point() +
  geom_label_repel(data = au %>% filter(outlier),
             aes(label = request_type)) +
  theme_bw()

au %>% 
ggplot(aes(.fittedPC1, .fittedPC2)) +
  geom_point(aes(color = pothole)) +
  geom_label_repel(data = au %>% filter(request_type == "Potholes"),
             aes(label = request_type)) +
  theme_bw() +
  scale_color_manual(NULL, values = c("black", "red"))