7 min read

Mayor @billpeduto tweets

Pittsburgh Mayor Bill Peduto uses Twitter to communicate with his constituents, express his political opinions, and comment about Pittsburgh sports. I will use various R packages (mainly the Tidyverse, Tidytext, and rtweet) to analyze how the Mayor uses Twitter.

Before getting started, we need to load the packages required for this analysis:

library(tidyverse)
library(tidytext)
library(ggraph)
library(igraph)
library(widyr)
library(SnowballC)
library(lubridate)
library(viridis)
library(scales)
library(knitr)
library(kableExtra)

First, we will download @billpeduto’s recent tweets using rtweet

rtweet::tweets_bill <- get_timelines("BillPeduto", n = 3200)

I have already downloaded the data, so I will load it from my GitHub repo. I will also do some data munging to make the data easier to work with.

url <- "https://raw.githubusercontent.com/conorotompkins/pittsburgh_twitter/master/data/df_billpeduto.csv"

read_csv(url) %>% 
  mutate(created_at = with_tz(created_at, "US/Eastern"),
         date = ymd(str_sub(created_at, 1, 10)),
         year = year(date),
         month = month(date, label = TRUE),
         week = week(date),
         wday = wday(date, label = TRUE),
         hour = hour(created_at),
         month_year = str_c(month, year, sep = "-"),
         month_year = factor(month_year, levels = unique(month_year)),
         wday = factor(wday, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))) -> df_bill

df_bill[1:5, 1:5] %>% 
  kable("html") %>% 
  kable_styling()
status_id created_at source is_quote is_retweet
x892785891055468548 2017-08-02 12:35:18 Twitter for iPhone FALSE TRUE
x892804872390795264 2017-08-02 13:50:43 Twitter for iPhone FALSE TRUE
x892820785823404032 2017-08-02 14:53:57 Twitter for iPhone FALSE TRUE
x892926762933506048 2017-08-02 21:55:04 Twitter for iPhone FALSE TRUE
x892953788679692288 2017-08-02 23:42:28 Twitter for iPhone FALSE FALSE

Exploratory Analysis

This plot shows that the Mayor tweets about 15 times a day, though there is considerable daily variation.

df_bill %>% 
  count(date) %>%
  ggplot(aes(date, n)) +
  geom_jitter(alpha = .5) +
  geom_smooth(size = 2) +
  scale_x_date(date_breaks = "month",
               date_labels = "%b-%Y") +
  labs(title = title, 
       x = "",
       y = "Number of tweets",
       caption = caption)

The Mayor tweets the most on weekdays; Tuesday has the highest median number of tweets. The weekends see fewer tweets, and Saturday and Sunday are very similar.

df_bill %>% 
  count(wday, date) %>% 
  group_by(wday) %>% 
  mutate(median = median(n)) %>% 
  ggplot(aes(wday, n, color = wday)) +
  geom_jitter(alpha = .3, width = .2) +
  geom_point(aes(wday, median), size = 5) +
  scale_color_viridis(name = "Day of the week", discrete = TRUE) +
  labs(title = title,
       subtitle = "One small dot = one day, large dot = median for weekday",
       x = "",
       y = "Number of tweets in a given day",
       caption = caption) +
  theme_bw(base_family = 18)

The Mayor does most of his tweeting around lunchtime and in the late evening.

df_bill %>% 
  count(hour) %>% 
  ggplot(aes(hour, y = 1, fill = n)) +
  geom_tile() +
  coord_equal() +
  scale_y_continuous(expand = c(0,0)) +
  scale_x_continuous(expand = c(0,0),
                     breaks = seq(0, 23, by = 3)) +
  scale_fill_viridis(name = "Number of tweets",
                     option = 3) +
  guides(fill = guide_colorbar(title.position = "top")) +
  labs(title = title,
       x = "Hour",
       y = "",
       caption = caption) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "bottom",
        legend.text = element_text(size = 7))

This view combines the day of the week and hour variables, and confirms what we observed when looking at the variables independently.

df_bill %>% 
  mutate(wday = factor(wday, levels = rev(levels(df_bill$wday)))) %>% 
  count(wday, hour) %>% 
  complete(wday, hour = 0:23) %>% 
  replace_na(list(n = 0)) %>% 
  ggplot(aes(hour, wday, fill = n)) +
  geom_tile() +
  coord_equal() +
  scale_x_continuous(expand = c(0,0),
                  breaks = seq(0, 24, by = 3)) +
  scale_y_discrete(expand = c(0,0)) +
  scale_fill_viridis(option = 3) +
  labs(title = title,
       x = "Hour",
       y = "",
       caption = caption) +
  guides(fill = guide_colorbar("Number of tweets"))

This plot shows the percentage of the Mayor’s tweets that are regular tweets, retweets, quotes, and replies.

df_bill %>% 
  select(date, month_year, month, week, is_retweet, is_quote, reply_to_screen_name) %>% 
  mutate(tweet_type = case_when(is.na(reply_to_screen_name) == FALSE ~ "Reply",
                                is_retweet == FALSE & is_quote == FALSE ~ "Regular tweet",
                                is_retweet == TRUE ~ "Retweet",
                                is_quote == TRUE ~ "Quote")) -> df_bill_tweet_types

df_bill_tweet_types %>% 
  count(month_year, tweet_type) %>% 
  mutate(tweet_type = factor(tweet_type, levels = rev(c("Retweet", "Regular tweet", "Quote", "Reply")))) -> df_bill_tweet_types_month_year

df_bill_tweet_types_month_year %>%  
  ggplot(aes(month_year, n, fill = tweet_type, group = tweet_type)) +
  geom_area(position = "fill") +
  scale_fill_viridis(name = "Tweet type", discrete = TRUE) +
  scale_x_discrete(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0), labels = scales::percent) +
  labs(title = "Types of @billpeduto tweets",
       x = "",
       y = "Percentage of tweets",
       caption = caption)

This heatmap shows that the Mayor replies to tweets most often in the late evening.

df_bill %>% 
  filter(!is.na(reply_to_screen_name), is_quote == FALSE, is_retweet == FALSE) %>% 
  count(wday, hour) %>% 
  complete(wday, hour = 0:23) %>% 
  replace_na(list(n = 0)) %>% 
  ggplot(aes(hour, wday, fill = n)) +
  geom_tile() +
  coord_equal() +
  scale_x_continuous(expand = c(0,0),
                  breaks = seq(0, 24, by = 3)) +
  scale_y_discrete(expand = c(0,0)) +
  scale_fill_viridis(option = 3) +
  labs(x = "Hour",
       y = "Day of the week") +
  guides(fill = guide_colorbar("Number of tweets"))

Text Analysis

The R package Tidytext makes it very easy to parse text data and pull insights from it. It interfaces very easily with ggraph, which creates network graphs.

set.seed(1234)
df_bill <- read_csv("https://raw.githubusercontent.com/conorotompkins/pittsburgh_twitter/master/data/bill_peduto_tweets.tweets.csv")

By separating the Mayor’s tweets into bigrams (two-word chunks), we can identify which words are used together. This code cuts the tweets into bigrams and counts their occurences. I exclude retweets and quotes from this analysis.

count_twitter_bigrams <- function(dataset, custom_stopwords) {
  replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https|'s"
  
  dataset %>%
    filter(is_quote == FALSE, is_retweet == FALSE) %>% 
    mutate(text = str_replace_all(text, replace_reg, "")) %>%
    unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
    separate(bigram, c("word1", "word2"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word,
           !word1 %in% custom_stopwords,
           !word2 %in% custom_stopwords) %>%
    count(word1, word2, sort = TRUE)
}

bill_stop_words <- c("t.co", "https", "amp")

tweets_bill <- count_twitter_bigrams(df_bill, bill_stop_words)
tweets_bill %>% 
  rename(count = n) %>% 
  head() %>% 
  kable("html") %>% 
  kable_styling()
word1 word2 count
public safety 12
affordable housing 10
amazon hq2 8
climate change 6
common sense 6
u.s cities 6

This network graph shows which words the Mayor uses together frequently. “public” + “safety”, and “affordable” + “housing” are the main standouts. I think this plot pretty accurately captures the topics the Mayor tweets about.

visualize_bigrams <- function(bigrams, minimum, text_size = 3, title = NULL, subtitle = NULL, caption = NULL) {
  set.seed(2016)
  a <- grid::arrow(type = "closed", 
                   length = unit(.1, "inches"))
  
  bigrams %>%
    filter(n >= minimum) %>% 
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_node_point(color = "lightblue", size = 3) +
    geom_node_text(aes(label = name), size = text_size, vjust = 1, hjust = 1) +
    geom_edge_link(aes(edge_alpha = n, edge_width = n), show.legend = TRUE, arrow = a, end_cap = circle(.25, 'inches')) +
    scale_edge_width_continuous("Count", range = c(.5, 1.5)) +
    scale_edge_alpha_continuous("Count", range = c(.3, .7)) +
    labs(title = title,
         subtitle = subtitle,
         caption = caption) +
    theme_void(base_size = 18)
}

visualize_bigrams(tweets_bill, 
                  minimum = 4,
                  text_size = 5,
                  title = "@billpeduto tweets",
                  subtitle = "Bigram network",
                  caption = caption)

This code determines which words are correlated with each other:

word_correlations <- function(dataframe, minimum, custom_stopwords){
  replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"
  unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
  dataframe %>% 
  filter(is_quote == FALSE, is_retweet == FALSE) %>% 
  select(status_id, text) %>% 
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
  filter(!word %in% stop_words$word,
         !word %in% custom_stopwords,
         str_detect(word, "[a-z]")) %>% 
  mutate(word = str_replace(word, "'", ""),
         word = str_replace(word, "'", ""),
         word = SnowballC::wordStem(word)) %>% 
  group_by(word) %>% 
  filter(n() >= minimum) %>%
  pairwise_cor(word, section, sort = TRUE)
}

bill_stopwords <- c("t.co", "https", "amp")

bill_words <- word_correlations(df_bill, minimum = 10, custom_stopwords = bill_stopwords)

This plot shows which words are correlated highly with each other. Some words are “stemmed” (shortened and/or modified), to capture very similar words. For example, “business” and “businesses” could be stemmed to “busine”.

“amazon” and “hq2” are used together overwhelmingly. The “pwsa” + “water” + “decad” combination indicate the ongoing water quality problems, and the plans to reinvest in the infrastructure. This plot shows how some of the policy issues the Mayor focuses on are connected.

visualize_word_correlations <- function(dataframe, title, subtitle, caption){
  dataframe %>% 
  filter(correlation > .3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_label(aes(label = name), size = 5, repel = TRUE) +
  scale_edge_alpha_continuous(range = c(.1, .5)) +
  theme_void(base_size = 18) +
  labs(title = title,
       subtitle = subtitle,
       caption = caption)
}

visualize_word_correlations(bill_words, 
                            title = "@billpeduto tweets",
                            subtitle = "Word correlation network",
                            caption = "@conor_tompkins")