diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..985d669 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^modelc\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore index bd8ce74..c435ac7 100644 --- a/.gitignore +++ b/.gitignore @@ -84,4 +84,5 @@ flycheck_*.el .dir-locals.el # network security -/network-security.data \ No newline at end of file +/network-security.data +.Rproj.user diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..7eadc61 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,14 @@ +Package: modelc +Title: A generalized linear model object to SQL compiler +Version: 0.0.0.9000 +Authors@R: + person(given = "Spark", + family = "Fish", + email = "info@sparkfish.com", + role = c("aut", "cre")) +Description: modelc is an R model object to SQL compiler. It generates SQL select statements from linear and generalized linear models. Its interface currently consists of a single function, construct_select, which takes a single input, namely an lm or glm model object. It supports GLM family distributions using log or identity link functions. +License: Proprietary +Encoding: UTF-8 +LazyData: true +Suggests: + testthat (>= 2.1.0) diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..884a631 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: fake comment so roxygen2 overwrites silently. 
+exportPattern("^[^\\.]") diff --git a/modelc.r b/R/modelc.R similarity index 77% rename from modelc.r rename to R/modelc.R index 2bcfb51..f2f959c 100644 --- a/modelc.r +++ b/R/modelc.R @@ -1,13 +1,13 @@ extract_parameters <- function(model) { - return(names(coef(model))) + return(names(coef(model))) } -extract_parameter_coefficient <- function(model, parameter) { +extract_parameter_coefficient <- function(model, parameter) { coefficient_value <- coef(model)[[parameter]] - if (!is.na(coefficient_value)) { - return(coefficient_value) + if (!is.na(coefficient_value)) { + return(coefficient_value) } else { - return(0) + return(0) } } @@ -27,11 +27,11 @@ is_intercept <- function(parameter) { return(parameter == '(Intercept)') } -build_intercept <- function(model, parameter, first=FALSE) { +build_intercept <- function(model, parameter, first=FALSE) { coefficient <- extract_parameter_coefficient(model, parameter) if (!first) { return(paste("+", coefficient)) - } else { + } else { return(coefficient) } } @@ -52,7 +52,7 @@ is_interaction <- function(parameter) { is_factor <- function(parameter, model) { factorlist <- names(model$xlevels) - for (factor in factorlist) { + for (factor in factorlist) { if (grepl(factor, parameter, fixed=T)) { return(TRUE) } @@ -62,7 +62,7 @@ is_factor <- function(parameter, model) { get_factor_name <- function(parameter, model) { factorlist <- names(model$xlevels) - for (factor in factorlist) { + for (factor in factorlist) { if (grepl(factor, parameter, fixed=T)) { return(factor) } @@ -76,79 +76,80 @@ extract_level <- function(parameter, factor) { return(substring(parameter, level_start, level_end)) } -has_parameter <- function(model, parameter) { +has_parameter <- function(model, parameter) { parameter %in% names(coef(model)); } -build_interaction_term <- function(model, interaction_term, first=FALSE) { - +build_interaction_term <- function(model, interaction_term, first=FALSE) { + split_interaction <- strsplit(interaction_term, ":")[[1]] 
coefficient <- extract_parameter_coefficient(model, interaction_term) - + sql <- paste(coefficient, "*", sep="") - if (first) { - sql <- paste("+", sql) + if (!first) { + sql <- paste("+", sql) } - + i = 0 + for (interaction_variable in split_interaction) { if (is_factor(interaction_variable, model)) { factor <- get_factor_name(interaction_variable, model) level <- extract_level(interaction_variable, factor) - sql = paste(sql, "(CASE WHEN", factor, "=", level, "THEN", level, "ELSE", 0, "END)") + sql = paste(sql, "(CASE WHEN", factor, "=", level, "THEN", 1, "ELSE", 0, "END)") } else { sql = paste(sql, interaction_variable, sep="") } - - if (i == 0) { - sql = paste(sql, "*", sep="") + + if (i == 0) { + sql = paste(sql, "*", sep="") } - + i = i + 1 - + } return (sql) } build_factor_case_statements <- function(model, first=F) { SQL_START_FIRST <- "(CASE" - SQL_START <- "+ (CASE" - factors <- model$xlevels + SQL_START <- "+ (CASE" + factors <- model$xlevels factor_variables <- names(factors) cases <- "" for (factor in factor_variables) { if (!first) { sql = SQL_START - } else { - sql <- SQL_START_FIRST + } else { + sql <- SQL_START_FIRST } for (level in factors[[factor]]) { formula_term <- paste(factor, level, sep="") if (has_parameter(model, formula_term)) { - coefficient <- extract_parameter_coefficient(model, formula_term) - sql = paste(sql, "WHEN", factor, "=", level, "THEN", coefficient, "*", factor) + coefficient <- extract_parameter_coefficient(model, formula_term) + sql = paste(sql, "WHEN", factor, "=", level, "THEN", coefficient) } } - + if (!(sql %in% c(SQL_START_FIRST, SQL_START))) { - cases <- paste(cases, sql, "END)") + cases <- paste(cases, sql, "END)") } } return(cases) } -apply_linkinverse <- function(model, sql) { +apply_linkinverse <- function(model, sql) { if (is.null(model$family)) { return(sql) - } - - if (model$family$link == "identity") { - return(sql) } - - else if(model$family$link == "log") { + + if (model$family$link == "identity") { + 
return(sql) + } + + else if(model$family$link == "log") { sql <- paste("EXP(", sql, ")", sep="") return(sql) } @@ -161,32 +162,39 @@ apply_linkinverse <- function(model, sql) { construct_select <- function(model) { parameters <- extract_parameters(model) select <- "" - count <- 0 + count <- 0 for (parameter in parameters) { + if (is_intercept(parameter)) { + cat(paste(parameter, "is an intercept\n")) build_term <- build_intercept } - else if (is_factor(parameter, model)) { - next; - } else if (is_interaction(parameter)) { + cat(paste(parameter, "is an interaction\n")) build_term <- build_interaction_term } + else if (is_factor(parameter, model)) { + cat(paste(parameter, "is a factor\n")) + next; + } else { build_term <- build_additive_term } select = paste(select, build_term(model, parameter, first=count==0)) count = count + 1 } - + select <- paste( select, - build_factor_case_statements(model, first=count==0) + " ", + build_factor_case_statements(model, first=count==0), + sep="" ) - + select_with_linkinverse <- apply_linkinverse(model, select) - + select <- paste("SELECT", select_with_linkinverse) - + select <- gsub(" ", " ", trimws(select)) + return(select) } diff --git a/README.md b/README.md index 53edf08..e4dbf06 100644 --- a/README.md +++ b/README.md @@ -97,3 +97,18 @@ SELECT ``` Note that your R session should be configured with `options(scipen=999)` to disable rendering numbers with scientific notation, otherwise `construct_select` may output invalid SQL. + +# Installing + +Ensure you have [R devtools](https://cran.r-project.org/web/packages/devtools/readme/README.html) installed globally. + +```R +setwd("..") # This line assumes your working directory is the `modelc/` root. Otherwise set the working directory to the folder *containing* modelc/ +devtools::install("modelc") +``` + +Alternatively, you can use `devtools::install_github` by passing a GitHub auth token.
+ +```R +devtools::install_github("team-sparkfish/modelc", auth_token = my_secret_auth_token) +``` diff --git a/modelc.Rproj b/modelc.Rproj new file mode 100644 index 0000000..cba1b6b --- /dev/null +++ b/modelc.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace