Skip to content

Commit

Permalink
Merge pull request #7 from nrosed/main
Browse files Browse the repository at this point in the history
Updated location prediction to openstreetmap
  • Loading branch information
nrosed authored Mar 8, 2021
2 parents 912b366 + 3fec287 commit 526069b
Show file tree
Hide file tree
Showing 65 changed files with 34,287 additions and 1,155 deletions.
122 changes: 32 additions & 90 deletions analysis_scripts/analysis_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@ library(jsonlite)
library(data.table)
library(dplyr)


proj_dir = here()
source(file.path(proj_dir, "/utils/scraper_processing_utils.R"))


#' Read in the quote information from processed coreNLP TSV output
#'
#' @param corenlp_file The processed coreNLP output TSV file.
#' Expected column names are full_name, gender, and quote
#' @return a dataframe of the quote information
read_corenlp_quote_files <- function(corenlp_file){

corenlp_df = data.frame(fread(corenlp_file, header=T, quote=""))
Expand All @@ -17,6 +27,11 @@ read_corenlp_quote_files <- function(corenlp_file){
return(corenlp_df)
}

#' Read in the benchmark quote information from processed coreNLP TSV output
#'
#' @param gold_file The hand-processed coreNLP output TSV file.
#' Expected column names are full_name, gender, quote
#' @return a dataframe of the benchmark quote information
read_benchmark_quote_file <- function(gold_file){

gold_df = data.frame(fread(gold_file, header=T, quote=FALSE))
Expand All @@ -28,14 +43,24 @@ read_benchmark_quote_file <- function(gold_file){

}


#' Read in the location information from processed coreNLP TSV output
#'
#' @param corenlp_file The processed coreNLP output TSV file.
#' Expected column names are address.country_code, country, un_region, and un_subregion
#' @return a dataframe of the location information
read_corenlp_location_files <- function(corenlp_file){

corenlp_df = data.frame(fread(corenlp_file, header=T))
country_df = get_country_info()
corenlp_df = merge(corenlp_df, country_df, all.x=T)
corenlp_df = unique(corenlp_df)

colnames(corenlp_df)[which(colnames(corenlp_df)=="address.country_code")] = "est_country_code"
colnames(corenlp_df)[which(colnames(corenlp_df)=="country")] = "est_country"
colnames(corenlp_df)[which(colnames(corenlp_df)=="un_region")] = "est_un_region"
colnames(corenlp_df)[which(colnames(corenlp_df)=="un_subregion")] = "est_un_subregion"

corenlp_df$est_country[which(is.na(corenlp_df$est_country_code))] = "NO_EST"
corenlp_df$est_country[which(is.na(corenlp_df$est_country))] = "NO_EST"
corenlp_df$est_un_region[which(is.na(corenlp_df$est_un_region))] = "NO_EST"
corenlp_df$est_un_subregion[which(is.na(corenlp_df$est_un_subregion))] = "NO_EST"
Expand All @@ -44,102 +69,19 @@ read_corenlp_location_files <- function(corenlp_file){
return(corenlp_df)
}

#' Read in the benchmark location information from processed coreNLP TSV output
#'
#' @param bm_loc_file The hand-processed coreNLP output TSV file.
#' Expected column names are address.country_code, country, un_region, and un_subregion
#' @return a dataframe of the benchmark location information
read_benchmark_location_file <- function(bm_loc_file){

gold_df = data.frame(fread(bm_loc_file, header=T))
colnames(gold_df)[which(colnames(gold_df)=="address.country_code")] = "true_country_code"
colnames(gold_df)[which(colnames(gold_df)=="country")] = "true_country"
colnames(gold_df)[which(colnames(gold_df)=="un_region")] = "true_un_region"
colnames(gold_df)[which(colnames(gold_df)=="un_subregion")] = "true_un_subregion"
return(gold_df)

}


make_comparison <- function(gold_df, res_df){

# join the df to make comparison
compare_df = merge(gold_df, res_df, by=c("file_id", "quote"), all.x=T)

#check if the predicted speaker is contained within the true speaker
speaker_idx = which(colnames(compare_df) == "est_speaker")
true_speaker_idx = which(colnames(compare_df) == "true_speaker")
speaker_match = apply(compare_df, 1,
function(x) grepl(x[speaker_idx],
x[true_speaker_idx],
fixed=T))

compare_df$is_speaker_correct = speaker_match



gender_idx = which(colnames(compare_df) == "est_gender")
true_gender_idx = which(colnames(compare_df) == "true_gender")
gender_match = apply(compare_df, 1,
function(x) x[gender_idx] == x[true_gender_idx])

compare_df$is_gender_correct = gender_match

return(compare_df)

}

evaluate_gender_speaker <- function(compare_df){

eval_df = subset(compare_df, !is.na(true_gender))
eval_df = subset(eval_df, true_gender!="NOT_CLEAR")

# overall accuracy
overall_gender_acc = sum(eval_df$is_gender_correct) / nrow(eval_df)

## get per DOC stats
# perdocument accuracy
per_doc_acc = eval_df %>%
group_by(file_id) %>%
summarize(acc = sum(is_gender_correct) / length(is_gender_correct))
per_doc_acc = data.frame(per_doc_acc)

}

evaluate_speaker_attrib <- function(compare_df, outfile){

eval_df = subset(compare_df, !is.na(true_speaker))
eval_df = subset(eval_df, true_speaker!="NOT_CLEAR")

# overall accuracy
overall_speaker_acc = sum(eval_df$is_speaker_correct) / nrow(eval_df)

## get per DOC stats
# perdocument accuracy
per_doc_acc = eval_df %>%
group_by(file_id) %>%
summarize(acc = sum(is_speaker_correct) / length(is_speaker_correct))
per_doc_acc = data.frame(per_doc_acc)

# document stats
num_quotes = data.frame(table(eval_df$file_id))
colnames(num_quotes) = c("file_id", "num_quotes")
per_doc_acc = merge(per_doc_acc, num_quotes)


}

basic_doc_stats <- function(gold_df, year_idx_file){


eval_df = subset(gold_df, !is.na(true_gender))
eval_df = subset(eval_df, true_gender!="NOT_CLEAR")


year_df = data.frame(fread(year_idx_file))
eval_df = merge(year_df, eval_df)

## get per year stats
# perdocument accuracy
per_year_gender = eval_df %>%
group_by(year) %>%
summarize(percent_male = sum(true_gender=="MALE") / length(true_gender))
per_year_gender = data.frame(per_year_gender)



}
Loading

0 comments on commit 526069b

Please sign in to comment.