diff --git a/R/data.R b/R/data.R index 6a3c1d2..9ee54a4 100644 --- a/R/data.R +++ b/R/data.R @@ -1,14 +1,14 @@ #' Search patterns used for SIMPOL.1 functional groups -#' +#' #' This dataframe documents how functional groups for the SIMPOL.1 and Meredith #' et al. method are defined using SMARTS strings or `ChemmineR` functions. -#' +#' #' @format #' \describe{ #' \item{method}{Either "simpol1" for functional groups only used with the SIMPOL.1 method, or "meredith" for additional groups used in the Meredith et al. method.} #' \item{functional_groups}{These correspond to matching column names in the results of [get_fx_groups()].} #' \item{smarts}{SMARTS strings used to capture groups, when applicable} -#' \item{fun}{The function used to capture the functional group. When `smarts` is not `NA`, this is always "ChemmineR::smartsSearchOB". Other groups are captured with other `ChemmineR` functions or as calculations using other functional groups.} +#' \item{fun}{The function used to capture the functional group. When `smarts` is not `NA`, this is always "[ChemmineR::smartsSearchOB]". Other groups are captured with other `ChemmineR` functions or as calculations using other functional groups.} #' \item{notes}{Notes including how any functional group counts are corrected when there is overlap. E.g. 
when one SMARTS pattern is a subset of another pattern, but the two groups are counted separately without overlap in the SIMPOL.1 method.} #' } "smarts_simpol1" \ No newline at end of file diff --git a/R/get_fx_groups.R b/R/get_fx_groups.R index 71191b4..822b4e1 100644 --- a/R/get_fx_groups.R +++ b/R/get_fx_groups.R @@ -100,48 +100,7 @@ Set `validate = FALSE` to silence this warning.") carbon_dbl_count <- tibble::add_row(carbon_dbl_count, n = 0) } - # *_pattern are SMARTS strings: https://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html - carbon_dbl_bonds_pattern <- "C=C" #non-aromatic carbon double bonds - CCCO_pattern <- "C(C=C[AR1])(=O)[AR1]" #C=C-C=O in a non-aromatic ring - # ether_alkyl_pattern <- "[OD2]([C!R1])[C!R1]" #currently unused--ether_alkly calculated as total - other ethers - ether_alicyclic_pattern <- "[OD2]([C!R0])[C!R0]" - ether_aromatic_pattern <- "O(c)[C,c]" #only one of the carbons has to be aromatic - nitro_pattern <- "[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]" - hydroxyl_aromatic_pattern <- "[OX2H]c" - nitrate_pattern <- "[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]" - - #TODO need patterns for amines that don't pick up amides - amine_primary_pattern <- "[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6X4]" - amine_secondary_pattern <- "[NX3H1!$(NC=[!#6])!$(NC#[!#6])]([#6X4])[#6X4]" - amine_tertiary_pattern <- "[NX3H0!$(NC=[!#6])!$(NC#[!#6])]([#6X4])([#6X4])[#6X4]" - amine_aromatic_pattern <- "[NX3;!$(NO)]c" - - amide_primary_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H2]" - amide_secondary_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]" - amide_tertiary_pattern <- - "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])]" - - # amide_total_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]" - - carbonylperoxynitrate_pattern <- "*C(=O)OO[N+1](=O)[O-1]" - peroxide_pattern <- 
"[OX2D2][OX2D2]" #this captures carbonylperoxynitrates too - hydroperoxide_pattern <- "[OX2][OX2H,OX1-]" #this captures peroxyacids too - carbonylperoxyacid_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][$([OX2H]),$([OX1-])]" - nitroester_pattern <- "C(=O)(OC)C~[NX3](-,=[OX1])-,=[OX1]" - # This captures OH groups on a ring that also has a nitro group (para, ortho, or meta). Need to correct aromatic hydroxyl count later. - nitrophenol_pattern <- - "[OX2H][$(c1ccccc1[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1cccc(c1)[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1ccc(cc1)[$([NX3](=O)=O),$([NX3+](=O)[O-])])]" - phosphoric_acid_pattern <- - "[$(P(=[OX1])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)]),$([P+]([OX1-])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)])]" - phosphoric_ester_pattern <- - "[$(P(=[OX1])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)]),$([P+]([OX1-])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)])]" - sulfate_pattern <- - "[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]" - #sulfonate groups; sulfonate ions, and conjugate acid, sulfonic acids - sulfonate_pattern <- - "[#16X4](=[OX1])(=[OX1])([#6])[*$([O-1]),*$([OH1]),*$([OX2H0])]" - thiol_pattern <- "[#16X2H]" - carbothioester_pattern <- "S([#6])[CX3](=O)[#6]" + smarts <- smarts_patterns_simpol1 fx_groups_df <- dplyr::tibble( @@ -157,43 +116,42 @@ Set `validate = FALSE` to silence this warning.") rings_aromatic = as.integer(rings$AROMATIC), rings_total = as.integer(rings$RINGS), rings_aliphatic = NA_integer_, #calculated below - carbon_dbl_bonds_aliphatic = ChemmineR::smartsSearchOB(compound_sdf, carbon_dbl_bonds_pattern), - CCCO_aliphatic_ring = ChemmineR::smartsSearchOB(compound_sdf, CCCO_pattern), # C=C-C=O in a non-aromatic ring + carbon_dbl_bonds_aliphatic 
= ChemmineR::smartsSearchOB(compound_sdf, smarts$carbon_dbl_bonds_aliphatic), + CCCO_aliphatic_ring = ChemmineR::smartsSearchOB(compound_sdf, smarts$CCCO_aliphatic_ring), hydroxyl_total = groups$ROH, - hydroxyl_aromatic = ChemmineR::smartsSearchOB(compound_sdf, hydroxyl_aromatic_pattern, uniqueMatches = FALSE), + hydroxyl_aromatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$hydroxyl_aromatic, uniqueMatches = FALSE), hydroxyl_aliphatic = NA_integer_, #calculated below aldehydes = groups$RCHO, ketones = groups$RCOR, carbox_acids = groups$RCOOH, ester = groups$RCOOR, ether_total = groups$ROR, - # ether_alkyl = ChemmineR::smartsSearchOB(compound_sdf, ether_alkyl_pattern), - ether_alkyl = NA_integer_, - ether_alicyclic = ChemmineR::smartsSearchOB(compound_sdf, ether_alicyclic_pattern), - ether_aromatic = ChemmineR::smartsSearchOB(compound_sdf, ether_aromatic_pattern), - nitrate = ChemmineR::smartsSearchOB(compound_sdf, nitrate_pattern), - nitro = ChemmineR::smartsSearchOB(compound_sdf, nitro_pattern), - amine_primary = ChemmineR::smartsSearchOB(compound_sdf, amine_primary_pattern), - amine_secondary = ChemmineR::smartsSearchOB(compound_sdf, amine_secondary_pattern), - amine_tertiary = ChemmineR::smartsSearchOB(compound_sdf, amine_tertiary_pattern), - amine_aromatic = ChemmineR::smartsSearchOB(compound_sdf, amine_aromatic_pattern), - amide_primary = ChemmineR::smartsSearchOB(compound_sdf, amide_primary_pattern), - amide_secondary = ChemmineR::smartsSearchOB(compound_sdf, amide_secondary_pattern), - amide_tertiary = ChemmineR::smartsSearchOB(compound_sdf, amide_tertiary_pattern), - carbonylperoxynitrate = ChemmineR::smartsSearchOB(compound_sdf, carbonylperoxynitrate_pattern), - peroxide = ChemmineR::smartsSearchOB(compound_sdf, peroxide_pattern), - hydroperoxide = ChemmineR::smartsSearchOB(compound_sdf, hydroperoxide_pattern), - carbonylperoxyacid = ChemmineR::smartsSearchOB(compound_sdf, carbonylperoxyacid_pattern), - nitrophenol = 
ChemmineR::smartsSearchOB(compound_sdf, nitrophenol_pattern), - nitroester = ChemmineR::smartsSearchOB(compound_sdf, nitroester_pattern), + ether_alkyl = NA_integer_, #calculated below + ether_alicyclic = ChemmineR::smartsSearchOB(compound_sdf, smarts$ether_alicyclic), + ether_aromatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$ether_aromatic), + nitrate = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitrate), + nitro = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitro), + amine_primary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_primary), + amine_secondary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_secondary), + amine_tertiary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_tertiary), + amine_aromatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_aromatic), + amide_primary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amide_primary), + amide_secondary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amide_secondary), + amide_tertiary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amide_tertiary), + carbonylperoxynitrate = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbonylperoxynitrate), + peroxide = ChemmineR::smartsSearchOB(compound_sdf, smarts$peroxide), + hydroperoxide = ChemmineR::smartsSearchOB(compound_sdf, smarts$hydroperoxide), + carbonylperoxyacid = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbonylperoxyacid), + nitrophenol = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitrophenol), + nitroester = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitroester), # Additional groups from Meredith et al. 
2023 - phosphoric_acids = ChemmineR::smartsSearchOB(compound_sdf, phosphoric_acid_pattern), - phosphoric_esters = ChemmineR::smartsSearchOB(compound_sdf, phosphoric_ester_pattern), - sulfates = ChemmineR::smartsSearchOB(compound_sdf, sulfate_pattern), - sulfonates = ChemmineR::smartsSearchOB(compound_sdf, sulfonate_pattern), - thiols = ChemmineR::smartsSearchOB(compound_sdf, thiol_pattern), - carbothioesters = ChemmineR::smartsSearchOB(compound_sdf, carbothioester_pattern), + phosphoric_acids = ChemmineR::smartsSearchOB(compound_sdf, smarts$phosphoric_acids), + phosphoric_esters = ChemmineR::smartsSearchOB(compound_sdf, smarts$phosphoric_esters), + sulfates = ChemmineR::smartsSearchOB(compound_sdf, smarts$sulfates), + sulfonates = ChemmineR::smartsSearchOB(compound_sdf, smarts$sulfonates), + thiols = ChemmineR::smartsSearchOB(compound_sdf, smarts$thiols), + carbothioesters = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbothioesters), oxygens = atoms[["O"]] %||% 0L, chlorines = atoms[["Cl"]] %||% 0L, nitrogens = atoms[["N"]] %||% 0L, diff --git a/R/sysdata.rda b/R/sysdata.rda new file mode 100644 index 0000000..bb23864 Binary files /dev/null and b/R/sysdata.rda differ diff --git a/data-raw/README.md b/data-raw/README.md new file mode 100644 index 0000000..236cb11 --- /dev/null +++ b/data-raw/README.md @@ -0,0 +1,4 @@ +In the future, if new methods are added, create a separate .csv file named `smarts_<method>.csv` (e.g. `smarts_newmethod.csv`). +To turn this into a user-facing dataset, edit `make_data.R` to add another `usethis::use_data()` and document it by adding a new entry to `R/data.R`. To also use this in internal data, it needs to be added as an argument to `usethis::use_data(..., internal = TRUE)` since only one sysdata.rda can exist for holding internal data. E.g. `usethis::use_data(smarts_patterns_simpol1, smarts_patterns_newmethod, internal = TRUE, overwrite = TRUE)` + +Be sure to run the code in `make_data.R` and to run `devtools::document()` to update data and documentation. 
\ No newline at end of file diff --git a/data-raw/make_data.R b/data-raw/make_data.R new file mode 100644 index 0000000..65dcab0 --- /dev/null +++ b/data-raw/make_data.R @@ -0,0 +1,14 @@ +## Builds the package datasets from data-raw/smarts_simpol1.csv +smarts_simpol1 <- readr::read_csv("data-raw/smarts_simpol1.csv") + +#create user-facing data.frame +usethis::use_data(smarts_simpol1, overwrite = TRUE) + +#create internal named list with just SMARTS strings +# (no pipe here, so the script runs without magrittr/dplyr being attached) +just_smarts_simpol1 <- dplyr::filter(smarts_simpol1, !is.na(smarts)) +smarts_patterns_simpol1 <- as.list(just_smarts_simpol1$smarts) +names(smarts_patterns_simpol1) <- just_smarts_simpol1$functional_group + +# written to R/sysdata.rda; list every internal object in this single call +usethis::use_data(smarts_patterns_simpol1, internal = TRUE, overwrite = TRUE) diff --git a/data-raw/smarts_simpol1.R b/data-raw/smarts_simpol1.R deleted file mode 100644 index 3b68961..0000000 --- a/data-raw/smarts_simpol1.R +++ /dev/null @@ -1,3 +0,0 @@ -## code to prepare `smarts` dataset goes here -smarts_simpol1 <- readr::read_csv("data-raw/smarts_simpol1.csv") -usethis::use_data(smarts_simpol1, overwrite = TRUE) diff --git a/data-raw/smarts_simpol1.csv b/data-raw/smarts_simpol1.csv index 09f6abb..def1ccc 100644 --- a/data-raw/smarts_simpol1.csv +++ b/data-raw/smarts_simpol1.csv @@ -4,7 +4,7 @@ simpol1,rings_aromatic,NA,ChemmineR::rings, simpol1,rings_total,NA,ChemmineR::rings, simpol1,rings_aliphatic,NA,rings_total - rings_aromatic, simpol1,carbon_dbl_bonds_aliphatic,C=C,ChemmineR::smartsSearchOB, -simpol1,CCCO_aliphatic_ring,C(C=C[AR1])(=O)[AR1],ChemmineR::smartsSearchOB, +simpol1,CCCO_aliphatic_ring,C(C=C[AR1])(=O)[AR1],ChemmineR::smartsSearchOB,Matches C=C-C=O in a non-aromatic ring simpol1,hydroxyl_total,NA,ChemmineR::groups, simpol1,hydroxyl_aromatic,[OX2H]c,ChemmineR::smartsSearchOB,"This pattern also captures nitrophenols, so the number of nitrophenols is subtracted" simpol1,hydroxyl_aliphatic,NA,hydroxyl_total - hydroxyl_aromatic, diff --git a/data/smarts_simpol1.rda b/data/smarts_simpol1.rda index 
45f46d1..c0ccf25 100644 Binary files a/data/smarts_simpol1.rda and b/data/smarts_simpol1.rda differ diff --git a/man/smarts_simpol1.Rd b/man/smarts_simpol1.Rd index 2ebfc7e..9b79d90 100644 --- a/man/smarts_simpol1.Rd +++ b/man/smarts_simpol1.Rd @@ -9,7 +9,7 @@ \item{method}{Either "simpol1" for functional groups only used with the SIMPOL.1 method, or "meredith" for additional groups used in the Meredith et al. method.} \item{functional_groups}{These correspond to matching column names in the results of \code{\link[=get_fx_groups]{get_fx_groups()}}.} \item{smarts}{SMARTS strings used to capture groups, when applicable} -\item{fun}{The function used to capture the functional group. When \code{smarts} is not \code{NA}, this is always "ChemmineR::smartsSearchOB". Other groups are captured with other \code{ChemmineR} functions or as calculations using other functional groups.} +\item{fun}{The function used to capture the functional group. When \code{smarts} is not \code{NA}, this is always "\link[ChemmineR:smartsSearchOB]{ChemmineR::smartsSearchOB}". Other groups are captured with other \code{ChemmineR} functions or as calculations using other functional groups.} \item{notes}{Notes including how any functional group counts are corrected when there is overlap. E.g. when one SMARTS pattern is a subset of another pattern, but the two groups are counted separately without overlap in the SIMPOL.1 method.} } }