Skip to content


Browse files Browse the repository at this point in the history
  • Loading branch information
Kun-Wu Lyu authored and Kun-Wu Lyu committed Aug 14, 2024
0 parents commit b6c0286
Show file tree
Hide file tree
Showing 66 changed files with 2,222 additions and 0 deletions.
Binary file added .DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
343 changes: 343 additions & 0 deletions Event Data Analysis.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,343 @@
---
title: "Event Data Analysis"
author: "Kunwu Lyu"
date: "2024-06-26"
output: pdf_document
---

```{r setup, include=FALSE}
# Document-wide defaults applied to every knitr chunk.
knitr::opts_chunk$set(
  echo = TRUE,      # show chunk source in the output
  size = "small",
  collapse = TRUE,  # merge source and printed output into one block
  comment = NA,     # no prefix in front of printed output lines
  warning = FALSE,
  message = FALSE,
  error = TRUE      # keep knitting when a chunk errors; the error is printed
)

## Loading Packages

## Data Wrangling

# Folder holding the per-term CSV exports. The trailing slash matters: the
# file path is assembled with paste0().
data_dir <- "data/"

# Term codes, one academic year per row (Fall, Winter, Spring).
# Letter = season, digits = two-digit calendar year.
file_codes <- c(
  "F15", "W16", "S16",
  "F16", "W17", "S17",
  "F17", "W18", "S18",
  "F18", "W19", "S19",
  "F19", "W20", "S20",
  "F20", "W21", "S21",
  "F21", "W22", "S22",
  "F22", "W23", "S23",
  "F23", "W24", "S24"
)
# Read one term's event export and prepare it for stacking with the others.
#
# code: a term code such as "F15"; it selects the CSV under `data_dir` and is
#       stored in the new `term` column.
# Returns the file's rows with cleaned column names, every column converted
# to character, plus `term`.
read_and_clean_event_file <- function(code) {
  file_path <- paste0(data_dir, "2015-2024 Events Data - ", code, " - Event Data.csv")
  read_csv(file_path) %>%
    clean_names() %>%
    # Everything as character -- presumably so files whose columns were
    # guessed as different types can still be combined; confirm.
    mutate(across(everything(), as.character)) %>%
    mutate(term = code)
}
# Map term codes to their academic-year label.
#
# term: character vector of codes made of a season letter followed by a
#       two-digit year, e.g. "F15" or "W16".
# Returns labels such as "2015-2016". A fall term ("F") opens the academic
# year named after it; every other season belongs to the academic year that
# started in the previous calendar year.
term_to_year <- function(term) {
  year <- as.numeric(substr(term, 2, 3))
  season <- substr(term, 1, 1)
  start_year <- ifelse(season == "F", 2000 + year, 2000 + year - 1)
  paste0(start_year, "-", start_year + 1)
}
# Read every term's file into a list named by term code.
event_data_list <- map(set_names(file_codes), read_and_clean_event_file)
# Stack the per-term tables. Every table carries its own `term` value, so a
# natural full_join() across them can never match a row between files; it
# only acts as a slow, message-heavy row bind. bind_rows() does the same
# thing directly and fills columns missing from a file with NA.
combined_data <- bind_rows(event_data_list)
# Clean the combined events and derive the grouping columns used by the
# plots: department_type, event_type, year and term_category.
combined_data_filtered <- combined_data %>%
  # NOTE(review): the first condition of this filter was unreadable in the
  # source (`filter(!, what != "")`); restored as a missing-value check on
  # `what` -- confirm against the original notebook.
  filter(!is.na(what), what != "") %>%
  # Rehearsals are not counted as events.
  filter(!what %in% c("Choir & Jazz Rehearsal", "Jazz Rehearsal")) %>%
  mutate(date = ymd(date)) %>%
  # Support levels "N" and "Y" are folded into "L".
  mutate(support_level = if_else(support_level %in% c("N", "Y"), "L", support_level)) %>%
  # Normalise department and venue spellings.
  mutate(
    department = str_replace_all(department, "WCC", "ODOA"),
    department = str_replace_all(department, "MSUC", "MUSC"),
    department = str_replace_all(department, "French Dept|French", "FREN"),
    department = str_replace_all(department, "English", "ENGL"),
    department = str_replace_all(department, "Pres. Office", "PRES"),
    department = str_replace_all(department, "History", "HIST"),
    # Lookahead keeps an already-correct "THDA" from becoming "THDAA".
    department = str_replace_all(department, "THD(?!A)", "THDA"),
    department = str_replace_all(department, "Inclusion & Equity", "IEC"),
    venue = str_replace_all(venue, "Skinner Chapel", "Chapel"),
    # Use " & " as the single separator between co-hosting departments.
    department = str_replace_all(department, "/", " & "),
    department = str_replace_all(department, ",", " &"),
    venue = str_replace_all(venue, ",", " &"),
    department = str_replace_all(department, "\\s+", " ") # Remove extra spaces
  ) %>%
  select(-wk) %>%
  # Any department string containing "&" counts as a collaboration.
  mutate(department_type = case_when(
    department == "MUSC" ~ "MUSC",
    department == "ODOA" ~ "ODOA",
    department == "CSA" ~ "CSA",
    str_detect(department, "&") ~ "Collab",
    TRUE ~ "Others"
  )) %>%
  # Fix typos and unify event titles before classifying them.
  mutate(
    what = str_replace_all(what, "Jazz Ensemble Concert|Jazz Area Concert", "Jazz Concert"),
    what = str_replace_all(what, "Symphony Band Concert", "Symphony Concert"),
    what = str_replace_all(what, "Composition Recital", "Composition Showcase Recital"),
    what = str_replace_all(what, "Harpichord", "Harpsichord"),
    what = str_replace_all(what, "Emsemble", "Ensemble"),
    what = str_replace_all(what, "Juest Cellin'", "Just Cellin'"),
    what = str_replace_all(what, "Facutly|FACULTY|Mazariello", "Faculty")
  ) %>%
  # First matching rule wins, so the order of these rules is significant.
  # Titles matching no rule fall back to "Guest".
  mutate(event_type = case_when(
    str_detect(what, "GUEST|ODOA|Concert Series|SPCO") ~ "Guest",
    str_detect(what, "Faculty") ~ "Faculty Recital",
    str_detect(what, "Student|Senior|Junior|Piano Recital: |Johnson|Verma Jameson") ~ "Student Recital",
    str_detect(what, "Studio Recital|Organ & Harpsichord|Composition Showcase Recital|Chamber Recital|Chamber Music Recital|Chamber Music|Organ Recital|Strings Recital|Violin & Viola|Violin/Viola|Drum Ensemble|Drum Recital|Voice Showcase Recital|Chinese Music Recital|Piano Studios Recital|Jazz Chamber|Piano Recital|Comps Fest|Recorder Recital|Music Ensemble") ~ "Studio Recital",
    str_detect(what, "Orchestra Concert|Jazz Concert|Symphony Concert|Symphony Band|Choir Concert|Orchestra and Choir|Chinese & Global|Chinese Global Concert|Chinese and Global|Chinese Music Concert|Chinese Music Ensemble|Chinese Ensemble|Music Comps|Jazz Vocal Concert") ~ "Ensemble Concert",
    str_detect(what, "CSA|Just Cellin|Lunar New Year|ACA|A Cappella|Accidentals|Exit 69|Date Knight|Knights|Knightingales|International Festival") ~ "Student Activity",
    str_detect(what, "Masterclass|Lecture|Symposium") ~ "Masterclass",
    str_detect(what, "Trustees|Trustee's|Presidents|Conference|President's|Presentation") ~ "Presentation",
    str_detect(what, "Clinic|Music Fest|Music Department Showcase|Melinda Russell|Launch|Event|Opening") ~ "Special Events",
    TRUE ~ "Guest"
  )) %>%
  mutate(
    year = term_to_year(term),
    # Season is derived while `term` is still a plain string.
    term_category = case_when(
      str_detect(term, "^F") ~ "Fall",
      str_detect(term, "^W") ~ "Winter",
      str_detect(term, "^S") ~ "Spring"
    ),
    # file_codes lists the terms in chronological order.
    term = factor(term, levels = file_codes, ordered = TRUE),
    # NOTE(review): level order here is Spring/Winter/Fall while every later
    # step re-levels to Fall/Winter/Spring -- confirm this is intended.
    term_category = factor(term_category, levels = c("Spring", "Winter", "Fall"), ordered = TRUE)
  ) %>%
  arrange(year, term)
# Number of events per term, with the academic-year total repeated on each
# of that year's rows.
event_summary <- combined_data_filtered %>%
  group_by(year, term) %>%
  summarize(term_total = n(), .groups = "drop") %>%
  group_by(year) %>%
  mutate(year_total = sum(term_total)) %>%
  # NOTE(review): the source pipeline ended in a dangling `%>%`; closed with
  # ungroup() so later steps do not inherit the grouping -- confirm.
  ungroup()

## EDA Plots

# Overall Event Summary
# Sort the summary chronologically and tag each term with its season, with
# seasons ordered Fall, Winter, Spring.
event_summary <- event_summary %>%
  mutate(
    term_category = case_when(
      str_detect(as.character(term), "^F") ~ "Fall",
      str_detect(as.character(term), "^W") ~ "Winter",
      str_detect(as.character(term), "^S") ~ "Spring"
    ),
    term_category = factor(term_category, levels = c("Fall", "Winter", "Spring"), ordered = TRUE),
    # file_codes lists the terms in chronological order.
    term = factor(term, levels = file_codes, ordered = TRUE)
  ) %>%
  arrange(year, term)
# Stacked bars: one bar per academic year, one segment per season, each
# segment labelled with its event count.
event_summary %>%
  ggplot(aes(x = year, y = term_total, fill = term_category)) +
  geom_col() +
  geom_text(
    aes(label = term_total),
    position = position_stack(vjust = 0.5),
    size = 3,
    colour = "black"
  ) +
  scale_fill_manual(
    values = c(Fall = "#FF9999", Winter = "#99CCFF", Spring = "#99FF99")
  ) +
  labs(x = "Year", y = "Total Events", fill = "Term") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Breakdown of Events by Support Level
# Count events per season and support level for one academic year.
#
# current_year: an academic-year label as stored in the `year` column.
# Returns one row per term_category with one count column per support level
# seen in that year (0 where a season has no events at that level), plus a
# `year` column identifying the table.
summarize_and_pivot <- function(current_year) {
  combined_data_filtered %>%
    filter(year == current_year) %>%
    group_by(term_category, support_level) %>%
    summarize(Support = n(), .groups = "drop") %>%
    pivot_wider(names_from = support_level, values_from = Support, values_fill = list(Support = 0)) %>%
    mutate(year = current_year)
}
# Academic years present in the cleaned data.
years <- unique(combined_data_filtered$year)
# One wide table per year, stacked into a single data frame. A support level
# that never occurs in a given year has no column in that year's table, so
# its cells are NA after stacking.
support_tables <- map_dfr(years, summarize_and_pivot)
# Long form for ggplot: typed factors plus each segment's share of its bar.
support_tables_melted <- support_tables %>%
  pivot_longer(cols = -c(term_category, year), names_to = "support_level", values_to = "Support") %>%
  # Events with a missing support level end up in a column literally named "NA".
  filter(support_level != "NA") %>%
  mutate(
    term_category = factor(term_category, levels = c("Fall", "Winter", "Spring"), ordered = TRUE),
    support_level = factor(support_level, levels = c("NA", "H", "M", "L"), ordered = TRUE)
  ) %>%
  group_by(year, term_category) %>%
  mutate(
    # na.rm: without it one absent level turns the whole bar's percentages NA.
    total_support = sum(Support, na.rm = TRUE),
    percentage = (Support / total_support) * 100
  ) %>%
  ungroup()
# One panel per academic year; bars are seasons, stacked by support level and
# labelled with count and share.
support_tables_melted %>%
  ggplot(aes(x = term_category, y = Support, fill = support_level)) +
  geom_col() +
  geom_text(
    aes(label = paste0(Support, " (", round(percentage, 1), "%)")),
    position = position_stack(vjust = 0.5),
    size = 2,
    colour = "black"
  ) +
  facet_wrap(~ year) +
  labs(x = "Term", y = "Support Count", fill = "Support Level") +
  scale_fill_manual(
    values = c(H = "#FF9999", M = "#99CCFF", L = "#99FF99", "NA" = "black")
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Breakdown of Events by Department/Source

# Count events per season and department type for one academic year.
#
# current_year: an academic-year label as stored in the `year` column.
# Returns one row per term_category with one count column per department
# type seen in that year (0 where a season has none), plus a `year` column
# identifying the table.
summarize_and_pivot_department <- function(current_year) {
  combined_data_filtered %>%
    filter(year == current_year) %>%
    group_by(term_category, department_type) %>%
    summarize(DepartmentCount = n(), .groups = "drop") %>%
    pivot_wider(names_from = department_type, values_from = DepartmentCount, values_fill = list(DepartmentCount = 0)) %>%
    mutate(year = current_year)
}
# Academic years present in the cleaned data.
years <- unique(combined_data_filtered$year)
# One wide table per year, stacked into a single data frame. A department
# type that never occurs in a given year is NA for that year after stacking.
department_tables <- map_dfr(years, summarize_and_pivot_department)
# Long form for ggplot: typed factors plus each segment's share of its bar.
department_tables_melted <- department_tables %>%
  pivot_longer(cols = -c(term_category, year), names_to = "department_type", values_to = "DepartmentCount") %>%
  mutate(
    term_category = factor(term_category, levels = c("Fall", "Winter", "Spring"), ordered = TRUE),
    department_type = factor(department_type, levels = c("MUSC", "ODOA", "CSA", "Collab", "Others"), ordered = TRUE)
  ) %>%
  # Names outside the five levels (including a column named "NA") became NA
  # in the factor conversion above; drop them.
  filter(!is.na(department_type)) %>%
  group_by(year, term_category) %>%
  mutate(
    total_count = sum(DepartmentCount, na.rm = TRUE),
    percentage = (DepartmentCount / total_count) * 100
  ) %>%
  ungroup()
# One panel per academic year; bars are seasons, stacked by department type
# and labelled with count and share (overlapping labels are skipped).
department_tables_melted %>%
  ggplot(aes(x = term_category, y = DepartmentCount, fill = department_type)) +
  geom_col() +
  geom_text(
    aes(label = paste0(DepartmentCount, " (", round(percentage, 1), "%)")),
    position = position_stack(vjust = 0.5),
    size = 2,
    colour = "black",
    check_overlap = TRUE
  ) +
  facet_wrap(~ year) +
  labs(x = "Term", y = "Department Count", fill = "Department Type") +
  scale_fill_manual(
    values = c(
      MUSC = "#FF9999",
      ODOA = "#99CCFF",
      CSA = "#99FF99",
      Collab = "#FFD700",
      Others = "#FFA500"
    )
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Breakdown of Music, Collab & ODOA Events by Type
# Count events per season and event type for one academic year.
#
# current_year: an academic-year label as stored in the `year` column.
# Returns one row per term_category with one count column per event type
# seen in that year (0 where a season has none), plus a `year` column
# identifying the table.
summarize_and_pivot_event <- function(current_year) {
  combined_data_filtered %>%
    filter(year == current_year) %>%
    group_by(term_category, event_type) %>%
    summarize(EventCount = n(), .groups = "drop") %>%
    pivot_wider(names_from = event_type, values_from = EventCount, values_fill = list(EventCount = 0)) %>%
    mutate(year = current_year)
}
# Academic years present in the cleaned data.
years <- unique(combined_data_filtered$year)
# One wide table per year, stacked into a single data frame. An event type
# that never occurs in a given year is NA for that year after stacking.
event_tables <- map_dfr(years, summarize_and_pivot_event)
# Long form for ggplot: typed factors plus each segment's share of its bar.
event_tables_melted <- event_tables %>%
  pivot_longer(cols = -c(term_category, year), names_to = "event_type", values_to = "EventCount") %>%
  mutate(
    term_category = factor(term_category, levels = c("Fall", "Winter", "Spring"), ordered = TRUE),
    event_type = factor(
      event_type,
      levels = c(
        "Ensemble Concert", "Student Activity", "Studio Recital",
        "Guest", "Faculty Recital", "Student Recital",
        "Special Events", "Presentation", "Masterclass"
      ),
      ordered = TRUE
    )
  ) %>%
  group_by(year, term_category) %>%
  mutate(
    total_count = sum(EventCount, na.rm = TRUE),
    percentage = (EventCount / total_count) * 100
  ) %>%
  ungroup()
# One panel per academic year; bars are seasons, stacked by event type and
# labelled with count and share (overlapping labels are skipped).
ggplot(event_tables_melted, aes(x = term_category, y = EventCount, fill = event_type)) +
  geom_col() +
  geom_text(
    aes(label = paste0(EventCount, " (", round(percentage, 1), "%)")),
    position = position_stack(vjust = 0.5),
    size = 2,
    color = "black",
    check_overlap = TRUE
  ) +
  facet_wrap(~ year) +
  labs(x = "Term", y = "Event Count", fill = "Event Type") +
  # event_type has nine levels; "Set2" only provides eight colours, which
  # leaves the ninth level without a fill. "Set3" provides twelve.
  scale_fill_brewer(palette = "Set3") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

13 changes: 13 additions & 0 deletions Event Data Analysis.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

0 comments on commit b6c0286

Please sign in to comment.