Skip to content

Commit

Permalink
define SMARTS patterns in internal dataset rather than in get_fx_grou…
Browse files Browse the repository at this point in the history
…ps() code
  • Loading branch information
Aariq committed Aug 6, 2024
1 parent 63e8a6d commit 2d4e2d4
Show file tree
Hide file tree
Showing 9 changed files with 51 additions and 78 deletions.
6 changes: 3 additions & 3 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#' Search patterns used for SIMPOL.1 functional groups
#'
#'
#' This dataframe documents how functional groups for the SIMPOL.1 and Meredith
#' et al. methods are defined using SMARTS strings or `ChemmineR` functions.
#'
#'
#' @format
#' \describe{
#' \item{method}{Either "simpol1" for functional groups only used with the SIMPOL.1 method, or "meredith" for additional groups used in the Meredith et al. method.}
#' \item{functional_groups}{These correspond to matching column names in the results of [get_fx_groups()].}
#' \item{smarts}{SMARTS strings used to capture groups, when applicable}
#' \item{fun}{The function used to capture the functional group. When `smarts` is not `NA`, this is always "ChemmineR::smartsSearchOB". Other groups are captured with other `ChemmineR` functions or as calculations using other functional groups.}
#' \item{fun}{The function used to capture the functional group. When `smarts` is not `NA`, this is always "[ChemmineR::smartsSearchOB]". Other groups are captured with other `ChemmineR` functions or as calculations using other functional groups.}
#' \item{notes}{Notes including how any functional group counts are corrected when there is overlap. E.g. when one SMARTS pattern is a subset of another pattern, but the two groups are counted separately without overlap in the SIMPOL.1 method.}
#' }
"smarts_simpol1"
98 changes: 28 additions & 70 deletions R/get_fx_groups.R
Original file line number Diff line number Diff line change
Expand Up @@ -100,48 +100,7 @@ Set `validate = FALSE` to silence this warning.")
carbon_dbl_count <- tibble::add_row(carbon_dbl_count, n = 0)
}

# *_pattern are SMARTS strings: https://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html
carbon_dbl_bonds_pattern <- "C=C" #non-aromatic carbon double bonds
CCCO_pattern <- "C(C=C[AR1])(=O)[AR1]" #C=C-C=O in a non-aromatic ring
# ether_alkyl_pattern <- "[OD2]([C!R1])[C!R1]" #currently unused--ether_alkly calculated as total - other ethers
ether_alicyclic_pattern <- "[OD2]([C!R0])[C!R0]"
ether_aromatic_pattern <- "O(c)[C,c]" #only one of the carbons has to be aromatic
nitro_pattern <- "[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]"
hydroxyl_aromatic_pattern <- "[OX2H]c"
nitrate_pattern <- "[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]"

#TODO need patterns for amines that don't pick up amides
amine_primary_pattern <- "[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6X4]"
amine_secondary_pattern <- "[NX3H1!$(NC=[!#6])!$(NC#[!#6])]([#6X4])[#6X4]"
amine_tertiary_pattern <- "[NX3H0!$(NC=[!#6])!$(NC#[!#6])]([#6X4])([#6X4])[#6X4]"
amine_aromatic_pattern <- "[NX3;!$(NO)]c"

amide_primary_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H2]"
amide_secondary_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]"
amide_tertiary_pattern <-
"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])]"

# amide_total_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]"

carbonylperoxynitrate_pattern <- "*C(=O)OO[N+1](=O)[O-1]"
peroxide_pattern <- "[OX2D2][OX2D2]" #this captures carbonylperoxynitrates too
hydroperoxide_pattern <- "[OX2][OX2H,OX1-]" #this captures peroxyacids too
carbonylperoxyacid_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][$([OX2H]),$([OX1-])]"
nitroester_pattern <- "C(=O)(OC)C~[NX3](-,=[OX1])-,=[OX1]"
# This captures OH groups on a ring that also has a nitro group (para, ortho, or meta). Need to correct aromatic hydroxyl count later.
nitrophenol_pattern <-
"[OX2H][$(c1ccccc1[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1cccc(c1)[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1ccc(cc1)[$([NX3](=O)=O),$([NX3+](=O)[O-])])]"
phosphoric_acid_pattern <-
"[$(P(=[OX1])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)]),$([P+]([OX1-])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)])]"
phosphoric_ester_pattern <-
"[$(P(=[OX1])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)]),$([P+]([OX1-])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)])]"
sulfate_pattern <-
"[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]"
#sulfonate groups; sulfonate ions, and conjugate acid, sulfonic acids
sulfonate_pattern <-
"[#16X4](=[OX1])(=[OX1])([#6])[*$([O-1]),*$([OH1]),*$([OX2H0])]"
thiol_pattern <- "[#16X2H]"
carbothioester_pattern <- "S([#6])[CX3](=O)[#6]"
smarts <- smarts_patterns_simpol1

fx_groups_df <-
dplyr::tibble(
Expand All @@ -157,43 +116,42 @@ Set `validate = FALSE` to silence this warning.")
rings_aromatic = as.integer(rings$AROMATIC),
rings_total = as.integer(rings$RINGS),
rings_aliphatic = NA_integer_, #calculated below
carbon_dbl_bonds_aliphatic = ChemmineR::smartsSearchOB(compound_sdf, carbon_dbl_bonds_pattern),
CCCO_aliphatic_ring = ChemmineR::smartsSearchOB(compound_sdf, CCCO_pattern), # C=C-C=O in a non-aromatic ring
carbon_dbl_bonds_aliphatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbon_dbl_bonds_aliphatic),
CCCO_aliphatic_ring = ChemmineR::smartsSearchOB(compound_sdf, smarts$CCCO_aliphatic_ring),
hydroxyl_total = groups$ROH,
hydroxyl_aromatic = ChemmineR::smartsSearchOB(compound_sdf, hydroxyl_aromatic_pattern, uniqueMatches = FALSE),
hydroxyl_aromatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$hydroxyl_aromatic, uniqueMatches = FALSE),
hydroxyl_aliphatic = NA_integer_, #calculated below
aldehydes = groups$RCHO,
ketones = groups$RCOR,
carbox_acids = groups$RCOOH,
ester = groups$RCOOR,
ether_total = groups$ROR,
# ether_alkyl = ChemmineR::smartsSearchOB(compound_sdf, ether_alkyl_pattern),
ether_alkyl = NA_integer_,
ether_alicyclic = ChemmineR::smartsSearchOB(compound_sdf, ether_alicyclic_pattern),
ether_aromatic = ChemmineR::smartsSearchOB(compound_sdf, ether_aromatic_pattern),
nitrate = ChemmineR::smartsSearchOB(compound_sdf, nitrate_pattern),
nitro = ChemmineR::smartsSearchOB(compound_sdf, nitro_pattern),
amine_primary = ChemmineR::smartsSearchOB(compound_sdf, amine_primary_pattern),
amine_secondary = ChemmineR::smartsSearchOB(compound_sdf, amine_secondary_pattern),
amine_tertiary = ChemmineR::smartsSearchOB(compound_sdf, amine_tertiary_pattern),
amine_aromatic = ChemmineR::smartsSearchOB(compound_sdf, amine_aromatic_pattern),
amide_primary = ChemmineR::smartsSearchOB(compound_sdf, amide_primary_pattern),
amide_secondary = ChemmineR::smartsSearchOB(compound_sdf, amide_secondary_pattern),
amide_tertiary = ChemmineR::smartsSearchOB(compound_sdf, amide_tertiary_pattern),
carbonylperoxynitrate = ChemmineR::smartsSearchOB(compound_sdf, carbonylperoxynitrate_pattern),
peroxide = ChemmineR::smartsSearchOB(compound_sdf, peroxide_pattern),
hydroperoxide = ChemmineR::smartsSearchOB(compound_sdf, hydroperoxide_pattern),
carbonylperoxyacid = ChemmineR::smartsSearchOB(compound_sdf, carbonylperoxyacid_pattern),
nitrophenol = ChemmineR::smartsSearchOB(compound_sdf, nitrophenol_pattern),
nitroester = ChemmineR::smartsSearchOB(compound_sdf, nitroester_pattern),
ether_alkyl = NA_integer_, #calculated below
ether_alicyclic = ChemmineR::smartsSearchOB(compound_sdf, smarts$ether_alicyclic),
ether_aromatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$ether_aromatic),
nitrate = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitrate),
nitro = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitro),
amine_primary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_primary),
amine_secondary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_secondary),
amine_tertiary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_tertiary),
amine_aromatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_aromatic),
amide_primary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amide_primary),
amide_secondary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amide_secondary),
amide_tertiary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amide_tertiary),
carbonylperoxynitrate = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbonylperoxynitrate),
peroxide = ChemmineR::smartsSearchOB(compound_sdf, smarts$peroxide),
hydroperoxide = ChemmineR::smartsSearchOB(compound_sdf, smarts$hydroperoxide),
carbonylperoxyacid = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbonylperoxyacid),
nitrophenol = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitrophenol),
nitroester = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitroester),

# Additional groups from Meredith et al. 2023
phosphoric_acids = ChemmineR::smartsSearchOB(compound_sdf, phosphoric_acid_pattern),
phosphoric_esters = ChemmineR::smartsSearchOB(compound_sdf, phosphoric_ester_pattern),
sulfates = ChemmineR::smartsSearchOB(compound_sdf, sulfate_pattern),
sulfonates = ChemmineR::smartsSearchOB(compound_sdf, sulfonate_pattern),
thiols = ChemmineR::smartsSearchOB(compound_sdf, thiol_pattern),
carbothioesters = ChemmineR::smartsSearchOB(compound_sdf, carbothioester_pattern),
phosphoric_acids = ChemmineR::smartsSearchOB(compound_sdf, smarts$phosphoric_acids),
phosphoric_esters = ChemmineR::smartsSearchOB(compound_sdf, smarts$phosphoric_esters),
sulfates = ChemmineR::smartsSearchOB(compound_sdf, smarts$sulfates),
sulfonates = ChemmineR::smartsSearchOB(compound_sdf, smarts$sulfonates),
thiols = ChemmineR::smartsSearchOB(compound_sdf, smarts$thiols),
carbothioesters = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbothioesters),
oxygens = atoms[["O"]] %||% 0L,
chlorines = atoms[["Cl"]] %||% 0L,
nitrogens = atoms[["N"]] %||% 0L,
Expand Down
Binary file added R/sysdata.rda
Binary file not shown.
4 changes: 4 additions & 0 deletions data-raw/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
In the future, if new methods are added, create a separate .csv file for each one, named `smarts_<method>.csv`.
To turn this into a user-facing dataset, edit `make_data.R` to add another `usethis::use_data()` call and document the dataset by adding a new entry to `R/data.R`. To also make it available as internal data, add it as an additional argument to the existing `usethis::use_data(..., internal = TRUE)` call, since only one `sysdata.rda` can exist for holding internal data. E.g. `usethis::use_data(smarts_patterns_simpol1, smarts_patterns_newmethod, internal = TRUE, overwrite = TRUE)`.

Be sure to run the code in `make_data.R` and to run `devtools::document()` to update data and documentation.
14 changes: 14 additions & 0 deletions data-raw/make_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
## Code to prepare the `smarts_simpol1` datasets from the raw CSV.
##
## The relative paths below mean this script must be run from the package
## root. It saves two objects via `usethis::use_data()`:
##   * `smarts_simpol1`          - the user-facing data frame
##   * `smarts_patterns_simpol1` - an internal named list of SMARTS strings
smarts_simpol1 <- readr::read_csv("data-raw/smarts_simpol1.csv")

# Fail loudly if the CSV header ever changes. `$` on a tibble returns NULL
# (with only a warning) for a missing column, which would otherwise produce
# an unnamed internal list without stopping the script.
stopifnot(
  all(c("functional_group", "smarts") %in% names(smarts_simpol1))
)

# create user-facing data.frame
usethis::use_data(smarts_simpol1, overwrite = TRUE)

# create internal named list with just SMARTS strings
# (rows with no SMARTS pattern are dropped). `dplyr::filter()` is called
# directly rather than through `%>%`, because this script does not attach
# magrittr or dplyr.
just_smarts_simpol1 <- dplyr::filter(smarts_simpol1, !is.na(smarts))
smarts_patterns_simpol1 <- stats::setNames(
  as.list(just_smarts_simpol1$smarts),
  just_smarts_simpol1$functional_group
)

usethis::use_data(smarts_patterns_simpol1, internal = TRUE, overwrite = TRUE)
3 changes: 0 additions & 3 deletions data-raw/smarts_simpol1.R

This file was deleted.

2 changes: 1 addition & 1 deletion data-raw/smarts_simpol1.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ simpol1,rings_aromatic,NA,ChemmineR::rings,
simpol1,rings_total,NA,ChemmineR::rings,
simpol1,rings_aliphatic,NA,rings_total - rings_aromatic,
simpol1,carbon_dbl_bonds_aliphatic,C=C,ChemmineR::smartsSearchOB,
simpol1,CCCO_aliphatic_ring,C(C=C[AR1])(=O)[AR1],ChemmineR::smartsSearchOB,
simpol1,CCCO_aliphatic_ring,C(C=C[AR1])(=O)[AR1],ChemmineR::smartsSearchOB,Matches C=C-C=O in a non-aromatic ring
simpol1,hydroxyl_total,NA,ChemmineR::groups,
simpol1,hydroxyl_aromatic,[OX2H]c,ChemmineR::smartsSearchOB,"This pattern also captures nitrophenols, so the number of nitrophenols is subtracted"
simpol1,hydroxyl_aliphatic,NA,hydroxyl_total - hydroxyl_aromatic,
Expand Down
Binary file modified data/smarts_simpol1.rda
Binary file not shown.
2 changes: 1 addition & 1 deletion man/smarts_simpol1.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 2d4e2d4

Please sign in to comment.