This is a short notebook that scrapes tweets related to the VOX-Pol Conference 2018 in Amsterdam. As this was once again a very inspiring VOX-Pol event, I thought it was time to explore its Twitter community further.


Load the necessary packages

# Install pacman once if it is not available on your machine:
# install.packages("pacman")
# p_load() installs any missing package and then attaches it. tidytext,
# lubridate and knitr are only called via `::` further down, but they are
# listed here so that they get installed on a fresh machine as well.
pacman::p_load(
  tidyverse, purrr, tidyr, rtweet, stringr, ggraph, igraph, tidygraph,
  forcats, tidytext, lubridate, knitr
)

Get Data

Call the Twitter API. If you want to scrape the data yourself, you have to register a free account to obtain your personal access token for Twitter. Check out rtweet on GitHub and follow its instructions for Twitter authentication.

# Read the stored API token from disk (presumably created while following the
# rtweet authentication instructions -- confirm).
twitter_token <- readRDS("twitter_token.rds")

# Search for tweets that carry either conference hashtag, retweets included.
rt <- search_tweets(
  "#VOXPolConf18 OR #VOXPolConf2018",
  n = 2000,
  include_rts = TRUE,
  retryonratelimit = TRUE,
  # hand over the token read above explicitly; it was otherwise never used
  token = twitter_token
)
# Store the result on disk; the commented-out load("rt.Rdata") further down
# reads it back.
save(rt, file = "rt.Rdata")

Let's first look at the data structure and column names. Twitter returns a huge amount of data.

# Compact overview of the data frame, comparable to str()
glimpse(rt)
## function (n, df, ncp)

The top ten retweeted tweets.

# load("rt.Rdata")
rt %>%
  select(screen_name, text, retweet_count) %>%
  ## keep original tweets only (retweets start with "RT")
  filter(!str_detect(text, "^RT")) %>%
  ## replace line breaks by spaces so every tweet stays on one table row
  mutate(text = str_replace_all(text, "\\n", " ")) %>%
  arrange(desc(retweet_count)) %>%
  ## rank explicitly by retweet_count instead of relying on the implicit
  ## "last column" default; ties at the cut-off are kept, so more than ten
  ## rows may be returned
  top_n(n = 10, wt = retweet_count) %>%
  knitr::kable(format = "markdown")
screen_name text retweet_count
MubarazAhmed ISIS jihadis remain keen on returning to mainstream social media platforms for recruitment purposes, don’t just want to be talking to each other on Telegram, says @AmarAmarasingam. #voxpolconf18 https://t.co/JwQLqakmF6 28
intelwire It’s happening!!! #voxpolconf18 https://t.co/XV7nWCG6vN 21
MubarazAhmed Fascinating findings presented by ⁦@AmarAmarasingam⁩ on languages used in Telegram communications by jihadi groups. Arabic remains integral to ISIS on Telegram, but there is also a surprisingly high level of activity in Persian. Cc: ⁦@KasraAarabi⁩ #voxpolconf18 https://t.co/7YVFUr2WBD 19
FabioFavusMaxim Very excited to have presented our research on the Alt-Right with @systatz at #voxpolconf2018. Received some great suggestions by @miriam_fs. to improve our analysis, which is definitely something we’ll implement. You can check out our slides here: https://t.co/uGV7es8VhF https://t.co/pz3113fPZN 19
VOX_Pol We look forward to seeing many of you in Amsterdam next week for #voxpolconf18. Most up-to-date version of the Conference Programme is at https://t.co/SLzRsN2y6E and also below. https://t.co/aw5IdEWcRD 17
ErinSaltman Present & future trends within violent extremism & terrorism; new tech, new tactics, old problems, old groups. Pleasure & privilege to share panel discussion with @intelwire @techvsterrorism @p_vanostaeyen moderated by @VOX_Pol @galwaygrrl. Big Qs at #voxpolconf18 !! https://t.co/fBnlrEe4c2 17
lizzypearson Really looking forward to @VOX_Pol Amsterdam conference where I’m talking UK Islamist offline reflections on online. Plus! seeing presentations by @Swansea_Law colleagues @CTProject_JW on online Jihadism in the US and @CTP_ALW on Britain First imagery in the UK #voxpolconf18 https://t.co/vGDsJgZM4I 15
AmarAmarasingam Day 2: @pieternanninga talks about the dramatic drop in ISIS video releases from 2015 to 2018. #VOXpolconf18 https://t.co/ipJYXzXIUI 14
MoignKhawaja .@AmarAmarasingam giving a very interesting presentation on how jihadists are using @telegram as a platform for various purposes including propaganda dissemination here at .⁦@VOX_Pol⁩ #VOXPolConf2018 day 1 session 2 chaired by ⁦@galwaygrrlhttps://t.co/NOLTPDUn1G 13
Drjohnhorgan Follow #VoxPolConf18 this week to learn about new research on terrorism, extremism and everything in between 13
MiloComerford Important corrective on online extremism from @MubarazAhmed’s research at #VOXPolConf18 - large proportion of traffic to extremist websites comes from searches, not social media. @VOX_Pol https://t.co/JVGIXouaa4 13


What was the best time to tweet?

rt %>%
  mutate(
    ## parse date format: keep only the YYYY-MM-DD part of the timestamp
    cdate = created_at %>%
      as.character() %>%
      str_extract("\\d{4}-\\d{2}-\\d{2}") %>%
      as.Date(),
    ## hour of the day in which the tweet was created
    hour = lubridate::hour(created_at)
  ) %>%
  ## select relevant time period
  filter(cdate >= as.Date("2018-08-19")) %>%
  ## count tweets per day and hour
  group_by(cdate, hour) %>%
  tally() %>%
  ungroup() %>%
  ggplot(aes(hour, n)) +
  geom_line() +
  ## split the visualization into one panel per day
  facet_wrap(~cdate, ncol = 2) +
  theme_minimal() +
  ggtitle("Number of Tweets by Day and Hour")

Retweet Network

## Directed graph of who mentions whom, built from the scraped tweets
rt_graph <- rt %>%
  ## select relevant variables
  dplyr::select(screen_name, mentions_screen_name) %>%
  ## unnest the list of mentions_screen_name; the column is named explicitly
  ## because newer tidyr versions warn when it is left out
  unnest(mentions_screen_name) %>%
  ## leave out pairs that involve the VOX_Pol account on either side
  filter(!(screen_name == "VOX_Pol" | mentions_screen_name == "VOX_Pol")) %>%
  ## count the number of cooccurrences, most frequent pair first
  count(screen_name, mentions_screen_name, sort = TRUE) %>%
  ## drop missing values
  drop_na() %>%
  ## filter those cooccurrences that appear at least 2 times
  filter(n > 1) %>%
  ## transforming the dataframe to a graph object
  as_tbl_graph() %>%
  ## calculating node centrality (in-degree)
  mutate(popularity = centrality_degree(mode = "in"))

rt_graph %>%
  ## create graph layout
  ggraph(layout = "kk") +
  ## define edge aesthetics: opacity, width and color all follow the count n
  geom_edge_fan(aes(alpha = n, edge_width = n, color = n)) +
  ## scale down link saturation
  scale_edge_alpha(range = c(.5, .9)) +
  ## define the edge color gradient
  scale_edge_color_gradient(low = "gray50", high = "#1874CD") +
  ## define node size param
  geom_node_point(aes(size = popularity), color = "gray30") +
  ## define node labels
  geom_node_text(aes(label = name), repel = TRUE, fontface = "bold") +
  ## equal width and height
  coord_fixed() +
  ## plain theme
  theme_void() +
  ## title
  ggtitle("#VOXPolConf18 Tweets and Retweets")

Most Frequent Hashtags

## Frequency table of the hashtags used in the scraped tweets
rt_hashtags <- rt %>%
  select(hashtags) %>%
  ## unnest the list of hashtags; the column is named explicitly because
  ## newer tidyr versions warn when it is left out
  unnest(hashtags) %>%
  ## drop tweets without any hashtag
  drop_na() %>%
  ## clean hashtags
  mutate(hashtags = stringr::str_to_lower(hashtags) %>%
           ## shorten "2018" to "18" so both spellings of a tag are merged
           str_replace_all("2018", "18") %>%
           ## add #symbol to vector
           paste0("#", .)) %>%
  ## count each hashtag and sort
  count(hashtags, sort = TRUE) %>%
  ## keep hashtags that occur more than twice
  filter(n > 2)

rt_hashtags %>%
  ## leave out the conference hashtag itself
  filter(hashtags != "#voxpolconf18") %>%
  ## order the bars by frequency
  mutate(hashtags = forcats::fct_reorder(hashtags, n)) %>%
  ggplot(aes(hashtags, n)) +
  ## geom_col() draws the precomputed counts as bar heights
  geom_col(alpha = .7) +
  ## horizontal bars keep the hashtag labels readable
  coord_flip() +
  theme_minimal() +
  ggtitle("Most Frequent Hashtags related to #voxpolconf18")

Most Frequent Bigram Network

## Graph of the most frequent word pairs (bigrams) in the original tweets
gg_bigram <- rt %>%
  select(text) %>%
  ## remove retweets
  filter(!stringr::str_detect(text, "^RT")) %>%
  ## remove urls
  mutate(text = stringr::str_remove_all(text, "https?[:]//[[:graph:]]+")) %>%
  ## number the tweets; seq_len() also works when no tweet is left
  mutate(id = seq_len(n())) %>%
  ## split text into words
  tidytext::unnest_tokens(word, text, token = "words") %>%
  ## remove text noise: drop the stray tokens "w" and "amp" (presumably what
  ## is left of "w/" and "&amp;"). Matching whole tokens avoids cutting the
  ## letter "w" out of ordinary words, which the former regex "w |amp " did
  ## (e.g. "new tech" became "netech").
  filter(!word %in% c("w", "amp")) %>%
  ## remove stop words
  anti_join(tidytext::stop_words, by = "word") %>%
  ## paste words to text by id
  group_by(id) %>%
  summarise(text = paste(word, collapse = " ")) %>%
  ungroup() %>%
  ## again split text into bigrams (word occurrences or collocations)
  tidytext::unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  ## count bigrams
  count(word1, word2, sort = TRUE) %>%
  ## drop incomplete pairs first so they cannot take one of the top slots
  drop_na() %>%
  ## select first 100
  slice(1:100) %>%
  ## create tidy graph object
  as_tbl_graph() %>%
  ## calculate node centrality (in-degree)
  mutate(Popularity = centrality_degree(mode = "in"))
## Draw the bigram graph: edges first, then the nodes and their labels
ggraph(gg_bigram) +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(aes(size = Popularity)) +
  geom_node_text(aes(label = name), repel = TRUE) +
  ## untitled legend for the edge opacity, limited to a narrow range
  scale_edge_alpha("", range = c(0.3, .6)) +
  ## plain theme
  theme_void() +
  ggtitle("Top Bigram Network from Tweets using hashtag #VOXPolConf18")

