index.qmd

---
title: "MBON Data Report(s)"
description: Data Reporting on MBON-mediated Datasets 
code-fold: true
---


```{R}
#| code-summary: setup
#| message: false
#| warning: false
# Bootstrap `librarian` itself if it is missing; it is then used to attach
# every other package this report needs.
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}

# shelf() attaches each listed package, installing it first when it is not
# available. Every package used anywhere in this document is listed here so
# a fresh machine can render the report without manual installs.
librarian::shelf(
  dplyr,
  ggplot2,
  here,
  networkD3,
  RColorBrewer,  # brewer.pal() palette for the word cloud
  tidyr,         # pivot_longer() in the publication-metrics chunk
  tm,
  wordcloud
)

```

```{R}
#| code-summary: read in data from csv
# Source the project's cleaning helper (path resolved from the project root
# by here), then call it to build the `data` object used by later chunks.
source(file = here::here("R/getCleanedData.R"))
data <- getCleanedData()
```


```{R}
#| code-summary: create word cloud from titles & descriptions
#| message: false
#| warning: false
# Collapse every dataset title and summary into one text blob.
# Missing values are blanked first: paste() would otherwise coerce them to
# the literal string "NA", and "na" would then appear as a word in the cloud.
title_text <- as.character(data[["Dataset.title"]])
summary_text <- as.character(data[["Dataset.summary"]])
title_text[is.na(title_text)] <- ""
summary_text[is.na(summary_text)] <- ""
combined_text <- paste(title_text, summary_text, collapse = " ")
corpus <- Corpus(VectorSource(combined_text))

# Clean the text data
# Replace separator characters with spaces so the words they join are kept
# apart once punctuation is stripped.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "[/@|]")

corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)

# Create a Term-Document Matrix and rank terms by total frequency.
# (Local names avoid shadowing base::matrix and stats::df.)
dtm <- TermDocumentMatrix(corpus)
term_matrix <- as.matrix(dtm)
word_freqs <- sort(rowSums(term_matrix), decreasing = TRUE)
word_df <- data.frame(word = names(word_freqs), freq = word_freqs)

# Generate the word cloud
# Word placement is randomised; fix the seed so repeated renders of the
# report produce the same figure.
set.seed(42)
wordcloud(words = word_df$word, freq = word_df$freq, min.freq = 1,
          max.words = 100, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
```

```{R}
#| code-summary: create a sankey diagram
# Load the constants used below (presumably `yes_values` and `no_values`,
# which are not defined anywhere else in this document). The path is
# resolved with here(), like the other sourced helper, so it does not depend
# on the working directory at render time.
source(here("R/yes_no_values.R"))  # load consts used below

# Recode one yes/no-style column for display:
#   * values matching `no_values`  -> "" (dropped from the diagram later)
#   * values matching `yes_values` -> `present_label`
#   * anything else is kept as-is
# Matching is case-insensitive.
recode_presence <- function(x, present_label) {
  key <- tolower(x)
  ifelse(
    key %in% no_values,
    "",
    ifelse(key %in% yes_values, present_label, x)
  )
}

data <- data %>%
  mutate(
    RA = ifelse(tolower(RA) %in% no_values, "", RA),
    erddap = recode_presence(erddap, "in ERDDAP"),
    obis = recode_presence(obis, "in OBIS"),
    ncei = recode_presence(ncei, "in NCEI")
  )

# One RA -> repository link per dataset and repository column, stacked in
# erddap, obis, ncei order. Rows where either end is "" (or NA) are dropped.
repo_links <- do.call(
  rbind,
  lapply(c("erddap", "obis", "ncei"), function(repo) {
    data.frame(
      source = data$RA,
      target = data[[repo]],
      value = 1
    )
  })
) %>%
  filter(source != "", target != "")

# One project -> RA link per dataset.
# NOTE(review): these rows are appended after the empty-value filter, so a
# dataset with a blank RA or project still yields a link to/from a
# blank-named node. Confirm whether that is intended.
project_links <- data.frame(
  source = data$mbon.project,
  target = data$RA,
  value = 1
)

links <- rbind(repo_links, project_links)

# Create nodes data frame: every distinct name seen at either end of a link
nodes <- data.frame(
  name = unique(c(as.character(links$source), as.character(links$target)))
)

# Reformat the links with IDs; sankeyNetwork() expects zero-based indices
# into `nodes`, hence the "- 1".
links$IDsource <- match(links$source, nodes$name) - 1
links$IDtarget <- match(links$target, nodes$name) - 1

# Make the Network
p <- networkD3::sankeyNetwork(Links = links, Nodes = nodes,
                   Source = "IDsource", Target = "IDtarget",
                   Value = "value", NodeID = "name",
                   sinksRight = FALSE)
p
```

```{R}
#| code-summary: show counts for standard publication metrics
library(ggplot2)
library(dplyr)
library(tidyr)

# Re-read the cleaned data so this chunk works from the helper's output.
source(file = here("R/getCleanedData.R"))
data <- getCleanedData()

# Columns whose filled-in entries are counted below.
columns_to_check <- c(
  "erddap_link",
  "ncei_link",
  "code_link",
  "obis_link",
  "doi",
  "web_link"
)

# For each column, count entries that are neither NA nor the empty string,
# then reshape to one row per column for plotting.
filled_counts <- summarise(
  data,
  across(
    all_of(columns_to_check),
    function(values) sum(!is.na(values) & values != "")
  )
)
summary_df <- pivot_longer(
  filled_counts,
  cols = everything(),
  names_to = "Column",
  values_to = "NonEmptyCount"
)

# Bar chart: one bar per column, bar height = number of filled-in entries.
ggplot(summary_df, aes(x = Column, y = NonEmptyCount)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "N Datasets Published by Repository",
    x = "Publication Type",
    y = "Number of Datasets"
  ) +
  theme_minimal()


```