3_Sample_Data_Collection.Rmd

---
title: "Third Task - Sample Data Collection"
author: "Steven Liu; Kangjie Zhang"
date: "May 31, 2020"
output:
  html_document:
    df_print: paged
    toc: yes
  md_document:
    df_print: paged
    toc: yes
    variant: markdown_github
  pdf_document:
    toc: yes
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
```

## 0 Preparation
```{r}
source("R/utils_kz.R")
library(igraph)
library(readr)
library(httr)
library(tidyverse)  
library(rtweet)
# tw <- search_tweets("(crisis AND management AND learning) OR
# (COVID AND management AND learning) OR
# (pandemic AND management AND learning) OR
# (course AND migration) OR
# (COVID AND migration) OR
# (pandemic AND migration) OR
# (COVID AND education) OR
# (pandemic AND education) OR
# (remote AND instruction) OR
# (emergency AND instruction) OR
# (COVID AND migrate AND online) OR
# (COVID AND pivot AND online) OR
# (COVID AND higher AND education AND online) OR
# (COVID AND higher AND education)", lang = "en", include_rts = T, n = 10000, retryonratelimit = TRUE)
# saveRDS(object = tw, file = "data/tw_31MAY2020.rds")
```

```{Rcpp}
#include <Rcpp.h>
#include <vector>
#include <string>
#include <unordered_map>

using namespace Rcpp;

// Build a lookup from each hashtag string to its 0-based position in
// hashtags_unique. If a string occurs more than once, the last position wins.
inline std::unordered_map<String, int> getHashtagsTable(StringVector &hashtags_unique) {
  const int n_tags = hashtags_unique.size();
  // Pre-size the bucket array to the number of tags.
  std::unordered_map<String, int> index_of(n_tags);
  int pos = 0;
  while (pos < n_tags) {
    index_of[hashtags_unique[pos]] = pos;
    ++pos;
  }
  return index_of;
}

// Count how often each pair of tags appears together in the same list element.
//
// hashtags_list   : list of character vectors, one per document (tweet).
// hashtags_unique : the tag vocabulary; row/column i of the result corresponds
//                   to hashtags_unique[i].
// Returns a square numeric matrix with one row/column per entry of
// hashtags_unique. Cell (a, b) is incremented once for every ordered pair of
// positions (j, k) in a document whose tags map to a and b, so the diagonal
// holds self-pairings and a tag repeated within a document is counted per
// repetition.
// [[Rcpp::export]]
NumericMatrix getCoOccurrenceMatrix(List &hashtags_list, StringVector &hashtags_unique) {
  const std::unordered_map<String, int> hashtags_table = getHashtagsTable(hashtags_unique);
  // Size by the vocabulary vector, not by the table: the table stores positions
  // in hashtags_unique, so this keeps every stored index inside the matrix even
  // if the vocabulary contains duplicates.
  const int n_tags = hashtags_unique.size();
  NumericMatrix adj(n_tags, n_tags);
  for (int i = 0; i < hashtags_list.size(); i++) {
    StringVector hashtags = hashtags_list[i];
    // Resolve each tag once. find() is used instead of operator[] because
    // operator[] would insert a missing tag with index 0 and silently credit
    // it to the first vocabulary entry; unknown tags are skipped instead.
    std::vector<int> positions;
    positions.reserve(hashtags.size());
    for (int j = 0; j < hashtags.size(); j++) {
      const String tag(hashtags[j]);
      std::unordered_map<String, int>::const_iterator hit = hashtags_table.find(tag);
      if (hit != hashtags_table.end()) {
        positions.push_back(hit->second);
      }
    }
    for (std::size_t j = 0; j < positions.size(); j++) {
      for (std::size_t k = 0; k < positions.size(); k++) {
        adj(positions[j], positions[k]) += 1;
      }
    }
  }
  return adj;
}
```


## 1 Variable Description
```{r}
# Load the saved tweet table and derive the matching user table, then list the
# column names of each as a one-column tibble.
tw <- readRDS("data/tw_31MAY2020.rds")
users <- users_data(tw)
tw_variables <- names(tw)
users_variables <- names(users)
# tibble() replaces the deprecated data_frame(); the printed result is the same.
tibble(tw_variables)
tibble(users_variables)
```

Sources: \
https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object (variable descriptions for the tweet data) \
https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/user-object (variable descriptions for the users data)

## 2 Basic Statistics
* Total number of tweets
```{r}
# number of rows in the tweet table `tw`
nrow(tw)
```
* Number of unique Tweets 
```{r}
# Number of distinct tweet texts after passing them through plain_tweets().
unique_tw <- length(unique(plain_tweets(tw$text)))
unique_tw
```
* Number of unique accounts 
```{r}
# Number of distinct user ids in the users table.
unique_accounts <- length(unique(users$user_id))
unique_accounts
```
* Number of retweets
```{r}
# frequency table of the is_retweet column across all rows of `tw`
table(tw$is_retweet)
```
* Number of quoted tweets
```{r}
# frequency table of the is_quote column across all rows of `tw`
table(tw$is_quote)
```

## 3 User Demographics
1. Gender information is not available from Twitter. Steven used the [COSMOS](http://socialdatalab.net/COSMOS) software to infer gender from users' first names.
2. The users' language-setting attribute is no longer supported (deprecated) by Twitter.

* Tweets/Accounts Rate (number of tweets per account)
```{r}
# ratio of distinct tweet texts (unique_tw) to distinct user ids (unique_accounts)
unique_tw/unique_accounts
```
* Devices/App used distribution
```{r}
# Tabulate the `source` column of the tweets and plot the share of the most
# frequent values as a pie chart, lumping the remainder into "Others".
top_n_sources <- 8L
source_tab <- table(tw$source)
# Named `source_freq` rather than `source` so base::source() is not shadowed;
# as.integer() stores plain counts instead of a table object in the column.
source_freq <- tibble(
  var = names(source_tab),
  freq = as.integer(source_tab)
) %>%
  arrange(desc(freq))
# head()/tail() stay in range when there are fewer than `top_n_sources` rows,
# whereas indexing with 1:8 would pad the result with NA rows.
source_top <- head(source_freq, top_n_sources)
source_rest <- tail(source_freq, -top_n_sources)
if (nrow(source_rest) > 0) {
  source_top <- bind_rows(
    source_top,
    tibble(var = "Others", freq = sum(source_rest$freq))
  )
}
source_top %>%
  ggplot(aes(x = "", y = freq, fill = var)) +
  geom_col(width = 1) +
  coord_polar(theta = "y", direction = 1) +
  labs(x = NULL, y = NULL, fill = "Utility") +
  labs(title = "Devices/App used distribution") +
  theme_minimal()
```

Geovisualization:
```{r}
## create lat/lng variables using all available tweet and profile geo-location data
tw <- lat_lng(tw)
## fetch the region names of the world map without drawing anything
countries <- maps::map("world", namesonly = TRUE, plot = FALSE)
## draw every region except Antarctica; a logical mask is used because
## `countries[-grep(...)]` would select nothing if no name matched, and the
## argument is spelled `regions` in full instead of relying on partial matching
maps::map("world", regions = countries[!grepl("Antarctica", countries)], lwd = .25)
## overlay one semi-transparent point per row of `tw` at its lng/lat
with(tw, points(lng, lat, pch = 20, cex = .75, col = rgb(0, .3, .7, .75)))
```


Note that the geolocation information is not available on all tweets. (see the available number below)
```{r}
# number of rows whose latitude is not missing
sum(!is.na(tw$lat))
```
## 4 Time span
```{r}
## time series of tweet counts, aggregated in three-hour bins
ts_labels <- ggplot2::labs(
  x = NULL, y = NULL,
  title = "Frequency of Twitter statuses from past 1-9 days",
  subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
  caption = "\nSource: Data collected from Twitter's REST API via rtweet"
)
## minimal theme with a bold plot title
ts_theme <- ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold"))
ts_plot(tw, "3 hours") + ts_theme + ts_labels
```

## 5 Word Frequency Summary
### Hashtags
#### Top Hashtags
```{r}
## hashtag frequencies from the helper related_hashtags_freq() (not defined in
## this file), sorted and converted to a data frame
ht.top <- tw %>%
  related_hashtags_freq(upper = TRUE) %>%  # TRUE, not T: T can be reassigned
  sort() %>%
  as.data.frame()
## list all hashtags ordered by frequency
top <- ht.top %>% arrange(desc(Freq))
top
## plot top 30 hashtags
ht.top %>%
  top_n(30, wt = Freq) %>%
  ggplot(aes(tags.all, Freq)) +
  geom_col() +
  labs(x = "", y = "Number of appearances") +
  coord_flip()
```

#### Word cloud for Hashtags
```{r}
# Rename the two columns of ht.top to what the word-cloud call below expects,
# then draw the cloud with the most frequent hashtags placed first.
names(ht.top) <- c("word", "n")
words.count <- ht.top %>% arrange(desc(n))
# FALSE rather than F: F is an ordinary variable that can be reassigned.
wordcloud(words = words.count$word, freq = words.count$n,
          random.order = FALSE, max.words = 200, rot.per = 0.15,
          colors = brewer.pal(8, "Dark2"))
```

#### Hashtag cooccurrences 

* Cooccurrences matrix
```{r}
# Upper-case every hashtag so that differently-cased spellings are merged.
tw$hashtags <- lapply(tw$hashtags, toupper)
# Vocabulary of distinct hashtags, computed once and reused for the matrix
# labels (named `hashtag_names` so base::names() is not shadowed).
hashtag_names <- unique(unlist(tw$hashtags))
adj <- getCoOccurrenceMatrix(tw$hashtags, hashtag_names)
dimnames(adj) <- list(hashtag_names, hashtag_names)
# head() returns at most 20 entries; `[1:20]` would pad with NA when fewer
# hashtags exist.
top_hashtags <- head(as.vector(top$tags.all), 20)
# Weighted undirected graph without self-loops, restricted to the top hashtags.
ht.net <- graph_from_adjacency_matrix(
  adj,
  mode = "undirected", weighted = TRUE, diag = FALSE
)
ht.net <- induced_subgraph(ht.net, top_hashtags)
# Qualified with igraph:: because tibble/dplyr also export as_data_frame();
# which one a bare call reaches depends on package attach order.
igraph::as_data_frame(ht.net, what = "edges") %>% arrange(desc(weight))
```

* Cooccurrences network
```{r fig1, out.width = '140%'}
# Store a circular layout on the graph and draw it with label-only vertices.
graph_attr(ht.net, "layout") <- layout_in_circle
# Edge thickness is the co-occurrence weight scaled down by 10.
edge_widths <- E(ht.net)$weight / 10
plot(
  ht.net,
  vertex.shape = "none",
  vertex.color = "gray50",
  vertex.label.dist = 0,
  vertex.label.color = "steel blue",
  vertex.label.font = 1.2,
  vertex.label.cex = .4,
  edge.color = "orange",
  edge.width = edge_widths
)
```

### Keywords
#### Top Keywords 

```{r}
## list all keywords ordered by frequency
## (clean_tweets() and plot_word_freq() are helpers not defined in this file)
tw.words <- clean_tweets(tw)
topwords <- count(tw.words, word, sort = TRUE)
topwords
## plot top 20 keywords
plot_word_freq(tw.words, 20)
```

#### Word cloud for Keywords
```{r}
# Keyword frequencies (most frequent first) and the corresponding word cloud.
# TRUE/FALSE are spelled out because T and F can be reassigned.
words.count <- count(tw.words, word, sort = TRUE)
wordcloud(words = words.count$word, freq = words.count$n,
          random.order = FALSE, max.words = 200, rot.per = 0.15,
          colors = brewer.pal(8, "Dark2"))
```

#### Keywords cooccurrences
* Cooccurrences matrix
```{r}
# Tokenise each tweet on its own so that co-occurrence is counted per tweet.
# seq_len() is safe for an empty table (1:nrow(tw) would iterate over 1, 0),
# and the `word` column is extracted explicitly instead of assigning a whole
# data frame into a list slot. The result is kept in `tweet_words` so that
# base::list() is not shadowed.
tweet_words <- lapply(seq_len(nrow(tw)), function(i) {
  toupper(clean_tweets(tw[i, ])[["word"]])
})

# Vocabulary of distinct keywords, computed once and reused for the labels.
word_names <- unique(unlist(tweet_words))
adj <- getCoOccurrenceMatrix(tweet_words, word_names)
dimnames(adj) <- list(word_names, word_names)
# head() returns at most 20 entries; `[1:20]` would pad with NA otherwise.
top_words <- head(toupper(topwords$word), 20)

# Weighted undirected graph without self-loops, restricted to the top keywords.
ht.net <- graph_from_adjacency_matrix(
  adj,
  mode = "undirected", weighted = TRUE, diag = FALSE
)
ht.net <- induced_subgraph(ht.net, top_words)
# Qualified with igraph:: because tibble/dplyr also export as_data_frame();
# which one a bare call reaches depends on package attach order.
igraph::as_data_frame(ht.net, what = "edges") %>% arrange(desc(weight))
```

* Cooccurrences network
```{r fig2, out.width = '140%'}
# Store a circular layout on the graph and draw it with label-only vertices.
graph_attr(ht.net, "layout") <- layout_in_circle
# Edge thickness is the co-occurrence weight scaled down by 6000.
edge_widths <- E(ht.net)$weight / 6000
plot(
  ht.net,
  vertex.shape = "none",
  vertex.color = "gray50",
  vertex.label.dist = 0,
  vertex.label.color = "steel blue",
  vertex.label.font = 1.2,
  vertex.label.cex = .4,
  edge.color = "orange",
  edge.width = edge_widths
)
```


## 6 Popular Tweets
### Most-liked tweets (>100 likes)
favorite_count: Indicates approximately how many times this Tweet has been liked by Twitter users. \
The chunk below lists the tweets that have more than 100 likes.
```{r}
# Tweets ordered by like count, split at a threshold into a "popular" group
# (listed and plotted) and the remainder (plotted only).
like_threshold <- 100
likes <- tw %>%
  select(text, favorite_count) %>%
  arrange(desc(favorite_count))
likes %>%
  filter(favorite_count > like_threshold)
# Histogram of number of likes above the threshold
likes %>%
  filter(favorite_count > like_threshold) %>%
  ggplot(aes(x = favorite_count)) +
  geom_histogram(colour = "black", fill = "white") +
  labs(title = "Histogram of number of likes (>100)", x = "Number of likes")

# Histogram of number of likes at or below the threshold. `<=` makes the two
# groups complementary; with `<` tweets with exactly 100 likes appeared in
# neither plot.
likes %>%
  filter(favorite_count <= like_threshold) %>%
  ggplot(aes(x = favorite_count)) +
  geom_histogram(colour = "black", fill = "white") +
  labs(title = "Histogram of number of likes (<=100)", x = "Number of likes")
```


### Top retweets (retweeted > 5000 times)
retweet_count: Number of times this Tweet has been retweeted.\
The chunk below lists the tweets that have been retweeted more than 5000 times.
```{r}
# Tweets ordered by retweet count, split at a threshold into a "popular" group
# (listed without duplicate rows, and plotted) and the remainder (plotted only).
retweet_threshold <- 5000
retweet <- tw %>%
  select(text, retweet_count) %>%
  arrange(desc(retweet_count))
retweet %>%
  filter(retweet_count > retweet_threshold) %>%
  unique()
# Histogram of number of retweets above the threshold
retweet %>%
  filter(retweet_count > retweet_threshold) %>%
  ggplot(aes(x = retweet_count)) +
  geom_histogram(colour = "black", fill = "white") +
  labs(title = "Histogram of number of retweets (>5000)", x = "Number of retweets")

# Histogram of number of retweets at or below the threshold. `<=` makes the
# two groups complementary; with `<` tweets retweeted exactly 5000 times
# appeared in neither plot.
retweet %>%
  filter(retweet_count <= retweet_threshold) %>%
  ggplot(aes(x = retweet_count)) +
  geom_histogram(colour = "black", fill = "white") +
  labs(title = "Histogram of number of retweets (<=5000)", x = "Number of retweets")
```