diff --git a/R/append_values.R b/R/append_values.R index 0d03dda..a855622 100644 --- a/R/append_values.R +++ b/R/append_values.R @@ -1,27 +1,46 @@ -#' Appends all values with a specified type as a new column +#' Appends all JSON values with a specified type as a new column #' -#' The append_values_X functions let you take any remaining JSON and add it as -#' a column X (for X in "string", "number", "logical") insofar as it is of the -#' JSON type specified. +#' The \code{append_values} functions let you take any scalar JSON values +#' of a given type ("string", "number", "logical") and add them as a new +#' column named \code{column.name}. This is particularly useful after using +#' \code{\link{gather_keys}} to stack many objects. #' -#' Any values that do not conform to the type specified will be NA in the resulting -#' column. This includes other scalar types (e.g., numbers or logicals if you are -#' using append_values_string) and *also* any rows where the JSON is still an -#' object or an array. +#' Any values that cannot be converted to the specified type will be \code{NA} in +#' the resulting column. This includes other scalar types (e.g., numbers or +#' logicals if you are using \code{append_values_string}) and *also* any rows +#' where the JSON is NULL or an object or array. +#' +#' Note that the \code{append_values} functions do not alter the JSON +#' attribute of the \code{tbl_json} object in any way. #' #' @name append_values -#' @param .x a json string or tbl_json object -#' @param column.name the column.name to append the values into the data.frame -#' under -#' @param force parameter that determines if the variable type should be computed or not -#' if force is FALSE, then the function may take more memory -#' @param recursive logical indicating whether to extract a single value from a -#' nested object. Only used when force = TRUE. If force = FALSE, and -#' recursive=TRUE, throws an error. +#' @seealso \code{\link{gather_keys}} to gather all object keys first, +#' \code{\link{spread_all}} to spread values into new columns +#' @param .x a json string or \code{\link{tbl_json}} object +#' @param column.name the name of the column to append values as +#' @param force should values be coerced to the appropriate type +#' when possible; otherwise, types are checked first (requires more +#' memory) +#' @param recursive logical indicating whether to recursively extract a single +#' value from a nested object. Only used when \code{force = TRUE}. If +#' \code{force = FALSE}, and \code{recursive = TRUE}, throws an error.
+#' @return a \code{\link{tbl_json}} object #' @examples +#' +#' # Stack names #' '{"first": "bob", "last": "jones"}' %>% -#' gather_keys() %>% -#' append_values_string() +#' gather_keys %>% +#' append_values_string +#' +#' # This is most useful when data is stored in keys and values +#' # For example, tags in recipes: +#' recipes <- c('{"name": "pie", "tags": {"apple": 10, "pie": 2, "flour": 5}}', +#' '{"name": "cookie", "tags": {"chocolate": 2, "cookie": 1}}') +#' recipes %>% +#' spread_values(name = jstring("name")) %>% +#' enter_object("tags") %>% +#' gather_keys("tag") %>% +#' append_values_number("count") NULL #' Creates the append_values_* functions diff --git a/R/enter_object.R b/R/enter_object.R index 971f8f7..7cf592e 100644 --- a/R/enter_object.R +++ b/R/enter_object.R @@ -1,26 +1,58 @@ -#' Dive into a specific object "key" +#' Enter into a specific object and discard all other JSON data #' -#' JSON can contain nested objects, such as {"key1": {"key2": [1, 2, 3]}}. The -#' function enter_object() can be used to access the array nested under "key1" -#' and "key2". After using enter_object(), all further tidyjson calls happen -#' inside the referenced object (all other JSON data outside the object -#' is discarded). If the object doesn't exist for a given row / index, then that -#' data.frame row will be discarded. +#' When manipulating a JSON object, \code{enter_object} lets you navigate to +#' a specific value of the object by referencing its key. JSON can contain +#' nested objects, and you can pass in more than one character string into +#' \code{enter_object} to navigate through multiple nested objects in sequence. #' -#' This is useful when you want to limit your data to just information found in -#' a specific key. Use the ... to specific a sequence of keys that you want to -#' enter into. Keep in mind that any rows with JSON that do not contain the key -#' will be discarded by this function. +#' After using \code{enter_object}, all further tidyjson calls happen inside the +#' referenced object (all other JSON data outside the object is discarded). +#' If the object doesn't exist for a given row / index, then that row will be +#' discarded. +#' +#' In pipelines, \code{enter_object} is often preceded by \code{gather_keys} and +#' followed by \code{gather_array} if the key contains an array, or +#' \code{spread_all} if the key contains an object. +#' +#' @seealso \code{\link{gather_keys}} to access keys that could be entered +#' into, \code{\link{gather_array}} to gather an array in an object and +#' \code{\link{spread_all}} to spread values in an object. #' @param .x a json string or tbl_json object -#' @param ... path to filter +#' @param ...
a sequence of character strings designating the object key or +#' sequences of keys you wish to enter +#' @return a \code{\link{tbl_json}} object #' @export #' @examples -#' c('{"name": "bob", "children": ["sally", "george"]}', '{"name": "anne"}') %>% -#' spread_values(parent.name = jstring("name")) %>% -#' enter_object("children") %>% +#' +#' # Let's start with a simple example of parents and children +#' json <- c('{"parent": "bob", "children": ["sally", "george"]}', +#' '{"parent": "fred", "children": ["billy"]}', +#' '{"parent": "anne"}') +#' +#' # We can see the keys and types in each +#' json %>% gather_keys %>% json_types +#' +#' # Let's capture the parent first and then enter into the children object +#' json %>% spread_all %>% enter_object("children") +#' +#' # Notice that "anne" was discarded, as she has no children +#' +#' # We can now use gather_array to stack the array +#' json %>% spread_all %>% enter_object("children") %>% +#' gather_array("child.num") +#' +#' # And append_values_string to add the children names +#' json %>% spread_all %>% enter_object("children") %>% +#' gather_array("child.num") %>% +#' append_values_string("child") +#' +#' # A more realistic example with companies data +#' library(dplyr) +#' companies %>% +#' enter_object("acquisitions") %>% #' gather_array %>% -#' append_values_string("children") +#' spread_all %>% +#' glimpse enter_object <- function(.x, ...) { if (!is.tbl_json(.x)) .x <- as.tbl_json(.x) diff --git a/R/gather.R b/R/gather.R index f96593e..906b606 100644 --- a/R/gather.R +++ b/R/gather.R @@ -52,51 +52,123 @@ gather_factory <- function(default.column.name, default.column.empty, } -#' Stack a JSON {"key": value} object +#' Gather a JSON object into key-value pairs #' -#' Given a JSON key value structure, like {"key1": 1, "key2": 2}, the -#' gather_keys() function duplicates the rows of the tbl_json data.frame for -#' every key, adds a new column (default name "key") to capture the key names, -#' and then dives into the JSON values to enable further manipulation with -#' downstream tidyjson functions. +#' \code{gather_keys} collapses a JSON object into key-value pairs, creating +#' a new column \code{'key'} to store the object key names, and storing the +#' values in the \code{'JSON'} attribute for further tidyjson manipulation. +#' All other columns are duplicated as necessary. This allows you to access the +#' keys of the objects just like \code{\link{gather_array}} lets you access the +#' values of an array. #' -#' This allows you to *enter into* the keys of the objects just like \code{gather_array} -#' let you enter elements of the array. +#' \code{gather_keys} is often followed by \code{\link{enter_object}} to enter +#' into a value that is an object, by \code{\link{append_values}} to append all +#' scalar values as a new column or \code{\link{json_types}} to determine the +#' types of the keys.
#' -#' @param .x a json string or tbl_json object whose JSON attribute should always be an object +#' @seealso \code{\link{gather_array}} to gather a JSON array, +#' \code{\link{enter_object}} to enter into an object, +#' \code{\link[tidyr]{gather}} to gather key-value pairs in a data +#' frame +#' @param .x a JSON string or \code{tbl_json} object whose JSON attribute should +#' always be an object #' @param column.name the name to give to the column of key names created -#' @return a tbl_json with a new column (column.name) that captures the keys -#' and JSON attribute of the associated value data +#' @return a \code{\link{tbl_json}} object #' @export #' @examples -#' '{"name": "bob", "age": 32}' %>% gather_keys %>% json_types +#' +#' # Let's start with a very simple example +#' json <- '{"name": "bob", "age": 32, "gender": "male"}' +#' +#' # Check that this is an object +#' json %>% json_types +#' +#' # Gather keys and check types +#' json %>% gather_keys %>% json_types +#' +#' # Sometimes data is stored in key names +#' json <- '{"2014": 32, "2015": 56, "2016": 14}' +#' +#' # Then we can use the column.name argument to change the name of the keys +#' json %>% gather_keys("year") +#' +#' # We can also use append_values_number to capture the values, since they are +#' # all of the same type +#' json %>% gather_keys("year") %>% append_values_number("count") +#' +#' # This can even work with a more complex, nested example +#' json <- '{"2015": {"1": 10, "3": 1, "11": 5}, "2016": {"2": 3, "5": 15}}' +#' json %>% gather_keys("year") %>% gather_keys("month") %>% +#' append_values_number("count") +#' +#' # Most JSON starts out as an object (or an array of objects), and gather_keys +#' # can be used to inspect the top level (or 2nd level) keys and their structure +#' library(dplyr) +#' worldbank %>% gather_keys %>% json_types %>% count(key, type) gather_keys <- gather_factory("key", character(0), names, "object") -#' Stack a JSON array -#' -#' Given a JSON array, such as [1, 2, 3], gather_array will "stack" the array in -#' the tbl_json data.frame, by replicating each row of the data.frame by the -#' length of the corresponding JSON array. A new column (by default called -#' "array.index") will be added to keep track of the referenced position in the -#' array for each row of the resuling data.frame. -#' -#' JSON can contain arrays of data, which can be simple vectors (fixed or varying -#' length integer, character or logical vectors). But they also often contain -#' lists of other objects (like a list of purchases for a user). The function -#' gather_array() takes JSON arrays and duplicates the rows in the data.frame to -#' correspond to the indices of the array, and puts the elements of -#' the array into the JSON attribute. This is equivalent to "stacking" the array -#' in the data.frame, and lets you continue to manipulate the remaining JSON -#' in the elements of the array. For simple arrays, use append_values_* to -#' capture all of the values of the array. For more complex arrays (where the -#' values are themselves objects or arrays), continue using other tidyjson -#' functions to structure the data as needed. 
-#' -#' @param .x a json string or tbl_json object whose JSON attribute should always be an array +#' Gather a JSON array into index-value pairs +#' +#' \code{gather_array} collapses a JSON array into index-value pairs, creating +#' a new column \code{'array.index'} to store the index of the array, and +#' storing values in the \code{'JSON'} attribute for further tidyjson +#' manipulation. All other columns are duplicated as necessary. This allows you +#' to access the values of the array just like \code{\link{gather_keys}} lets +#' you access the values of an object. +#' +#' JSON arrays can be simple vectors (fixed or varying length number, string +#' or logical vectors with or without null values). But they also often contain +#' lists of other objects (like a list of purchases for a user). Thus, the +#' best analogy in R for a JSON array is an unnamed list. +#' +#' \code{gather_array} is often preceded by \code{\link{enter_object}} when the +#' array is nested under a JSON object, and is often followed by +#' \code{\link{gather_keys}} or \code{\link{enter_object}} if the array values +#' are objects, or by \code{\link{append_values}} to append all scalar values +#' as a new column or \code{\link{json_types}} to determine the types of the +#' array elements (JSON does not guarantee they are the same type). +#' +#' @seealso \code{\link{gather_keys}} to gather a JSON object, +#' \code{\link{enter_object}} to enter into an object, +#' \code{\link[tidyr]{gather}} to gather key-value pairs in a data +#' frame +#' @param .x a json string or tbl_json object whose JSON attribute should always +#' be an array #' @param column.name the name to give to the array index column created -#' @return a tbl_json with a new column (column.name) that captures the array -#' index and JSON attribute extracted from the array +#' @return a \code{\link{tbl_json}} object #' @export #' @examples -#' '[1, "a", {"k": "v"}]' %>% gather_array %>% json_types +#' +#' # A simple character array example +#' json <- '["a", "b", "c"]' +#' +#' # Check that this is an array +#' json %>% json_types +#' +#' # Gather array and check types +#' json %>% gather_array %>% json_types +#' +#' # Extract string values +#' json %>% gather_array %>% append_values_string +#' +#' # A more complex mixed type example +#' json <- '["a", 1, true, null, {"key": "value"}]' +#' +#' # Then we can use json_types to check the type of each element +#' json %>% gather_array %>% json_types +#' +#' # A nested array +#' json <- '[["a", "b", "c"], ["a", "d"], ["b", "c"]]' +#' +#' # Extract both levels +#' json %>% gather_array("index.1") %>% gather_array("index.2") %>% +#' append_values_string +#' +#' # Some JSON begins as an array +#' commits %>% gather_array +#' +#' # We can use spread_all to capture all keys (where recursive = FALSE is used +#' # to limit the depth to just top level keys) +#' library(dplyr) +#' commits %>% gather_array %>% spread_all(recursive = FALSE) %>% glimpse gather_array <- gather_factory("array.index", integer(0), seq_along, "array") diff --git a/R/json_complexity.R b/R/json_complexity.R index 61836ba..e449af9 100644 --- a/R/json_complexity.R +++ b/R/json_complexity.R @@ -1,19 +1,32 @@ -#' Add a column that contains the complexity (recursively unlisted length) of the JSON data +#' Compute the complexity (recursively unlisted length) of JSON data #' #' When investigating complex JSON data it can be helpful to identify the -#' complexity of deeply nested documents.
The json_complexity() function adds a -#' column (default name "complexity") that contains the 'complexity' of the JSON -#' associated with each row. Essentially, every on-null scalar value is found in the -#' object by recursively stripping away all objects or arrays, and the complexity -#' is the count of these scalar values. Note that 'null' has complexity 0. +#' complexity of deeply nested documents. The \code{json_complexity} function +#' adds a column (default name \code{"complexity"}) that contains the +#' 'complexity' of the JSON associated with each row. Essentially, every non-null +#' scalar value is found in the object by recursively stripping away all objects +#' or arrays, and the complexity is the count of these scalar values. Note that +#' 'null' has complexity 0, as do empty objects and arrays. #' +#' @seealso \code{\link{json_lengths}} to compute the length of each value #' @param .x a json string or tbl_json object #' @param column.name the name to specify for the length column -#' @return a tbl_json object with column.name column that tells the length +#' @return a \code{\link{tbl_json}} object #' @export #' @examples -#' c('[1, 2, [3, 4]]', '{"k1": 1, "k2": [2, [3, 4]]}', '1', {}) %>% -#' json_lengths %>% json_complexity +#' +#' # A simple example +#' json <- c('[1, 2, [3, 4]]', '{"k1": 1, "k2": [2, [3, 4]]}', '1', 'null') +#' +#' # Complexity is larger than length for nested objects +#' json %>% json_lengths %>% json_complexity +#' +#' # Worldbank has complexity ranging from 8 to 17 +#' library(magrittr) +#' worldbank %>% json_complexity %$% table(complexity) +#' +#' # Commits are much more regular +#' commits %>% gather_array %>% json_complexity %$% table(complexity) json_complexity <- function(.x, column.name = "complexity") { if (!is.tbl_json(.x)) .x <- as.tbl_json(.x) diff --git a/R/json_lengths.R b/R/json_lengths.R index 3e038c9..0ff08ac 100644 --- a/R/json_lengths.R +++ b/R/json_lengths.R @@ -1,18 +1,33 @@ -#' Add a column that contains the length of the JSON data +#' Compute the length of JSON data #' #' When investigating JSON data it can be helpful to identify the lengths of the -#' JSON objects or arrays, especialy when they are 'ragged' across documents. The -#' json_lengths() function adds a column (default name "length") that contains -#' the 'length' of the JSON associated with each row. For objects, this will -#' be equal to the number of keys. For arrays, this will be equal to the length -#' of the array. All scalar values will be of length 1. +#' JSON objects or arrays, especially when they are 'ragged' across documents. +#' The \code{json_lengths} function adds a column (default name \code{"length"}) +#' that contains the 'length' of the JSON associated with each row. For objects, +#' this will be equal to the number of keys. For arrays, this will be equal to +#' the length of the array. All scalar values will be of length 1, and null +#' will have length 0.
#' +#' @seealso \code{\link{json_complexity}} to compute the recursive length of +#' each value #' @param .x a json string or tbl_json object #' @param column.name the name to specify for the length column -#' @return a tbl_json object with column.name column that tells the length +#' @return a \code{\link{tbl_json}} object #' @export #' @examples -#' c('[1, 2, 3]', '{"k1": 1, "k2": 2}', '1', {}) %>% json_lengths +#' +#' # A simple example +#' json <- c('[1, 2, 3]', '{"k1": 1, "k2": 2}', '1', 'null') +#' +#' # Compute the length of each document +#' json %>% json_lengths +#' +#' # Worldbank objects are either length 7 or 8 +#' library(magrittr) +#' worldbank %>% json_lengths %$% table(length) +#' +#' # All commits are length 8 +#' commits %>% gather_array %>% json_lengths %$% table(length) json_lengths <- function(.x, column.name = "length") { if (!is.tbl_json(.x)) .x <- as.tbl_json(.x) diff --git a/R/json_schema.R b/R/json_schema.R index c82bfa2..e8f50b5 100644 --- a/R/json_schema.R +++ b/R/json_schema.R @@ -15,8 +15,8 @@ #' } #' #' For more complex JSON objects, ties are broken by taking the most -#' complex example (using \code{json_complexity}), and then by type -#' (using \code{json_types}). +#' complex example (using \code{\link{json_complexity}}), and then by type +#' (using \code{\link{json_types}}). #' #' This means that if a key has varying schema across documents, the #' most complex schema will be chosen as being representative. Similarly, @@ -24,33 +24,44 @@ #' chosen, and if arrays vary in schema across documents, the most #' complex is chosen. #' -#' @param .x a json string or tbl_json object +#' Note that \code{json_schema} can be slow for large JSON document collections, so +#' you may want to sample your JSON collection first.
+#' +#' @seealso \code{\link{json_structure}} to recursively structure all +#' documents into a single data frame, +#' \code{\link{plot_json_graph}} to plot JSON (including results +#' of \code{json_schema}) as a graph +#' @param .x a json string or \code{\link{tbl_json}} object #' @param type whether to capture scalar nodes using the string that defines #' their type (e.g., "logical") or as a representative value #' (e.g., "true"), useful in conjunction with plot_json_graph #' @return a character string JSON document that represents the schema of #' the collection -#' #' @export #' @examples #' #' # A simple string -#' '"string"' %>% json_schema +#' '"string"' %>% json_schema %>% writeLines #' #' # A simple object -#' '{"key": "value"}' %>% json_schema +#' '{"key": "value"}' %>% json_schema %>% writeLines +#' +#' # A more complex JSON array +#' json <- '[{"a": 1}, [1, 2], "a", 1, true, null]' +#' +#' # Using type = 'string' (default) +#' json %>% json_schema %>% writeLines #' -#' # A complex array is represented by the most complex example -#' '[{"a": 1}, [1, 2], "a", 1, true, null]' %>% json_schema +#' # Using type = 'value' to show a representative value +#' json %>% json_schema(type = "value") %>% writeLines #' -#' # Companies example -#' companies[1] %>% json_schema %>% plot_json_graph +#' # Plotting the schema of a company example +#' companies[1] %>% json_schema(type = "value") %>% plot_json_graph #' -#' # Github issues -#' issues_array <- issues %>% gather_array # issues are one large array -#' # analyze first 5, and use type = "value" to ensure proper coloring of graph -#' issues_schema <- issues_array[1:5, ] %>% json_schema(type = "value") -#' issues_schema %>% plot_json_graph +#' # Schema of the first 10 github issues +#' library(dplyr) +#' issues %>% gather_array %>% slice(1:10) %>% +#' json_schema(type = "value") %>% plot_json_graph json_schema <- function(.x, type = c("string", "value")) { type <- match.arg(type) diff --git a/R/json_structure.R b/R/json_structure.R index 2a39b44..c7cce89 100644 --- a/R/json_structure.R +++ b/R/json_structure.R @@ -1,42 +1,50 @@ #' Recursively structures arbitrary JSON data into a single data.frame #' -#' Returns a \code{tbl_json} object where each row corresponds to a leaf in -#' the JSON structure. The first row corresponds to the json document as +#' Returns a \code{\link{tbl_json}} object where each row corresponds to a leaf +#' in the JSON structure. The first row corresponds to the JSON document as #' a whole. If the document is a scalar value (JSON string, number, logical #' or null), then there will only be 1 row. If instead it is an object or #' an array, then subsequent rows will recursively correspond to the elements #' (and their children) of the object or array. #' -#' @param .x a json string or tbl_json object -#' @return a tbl_json object with the following columns: +#' The columns in the \code{\link{tbl_json}} returned are defined as #' -#' \code{document.id} 1L if \code{x} is a single JSON string, otherwise the -#' index of \code{x}. +#' \itemize{ +#' \item \code{document.id} 1L if \code{.x} is a single JSON string, otherwise +#' the index of \code{.x}. #' -#' \code{parent.id} the string identifier of the parent node for this child. +#' \item \code{parent.id} the string identifier of the parent node for this +#' child. #' -#' \code{level} what level of the hierarchy this child resides at, starting -#' at \code{0L} for the root and incrementing for each level of nested -#' array or object.
+#' \item \code{level} what level of the hierarchy this child resides at, +#' starting at \code{0L} for the root and incrementing for each level +#' of nested array or object. #' -#' \code{index} what index of the parent object / array this child resides -#' at (from \code{gather_array} for arrays). +#' \item \code{index} what index of the parent object / array this child +#' resides at (from \code{gather_array} for arrays). #' -#' \code{child.id} a unique ID for this leaf in this document, represented -#' as . where is the ID for the parent and -#' is this index. +#' \item \code{child.id} a unique ID for this leaf in this document, +#' represented as <parent.id>.<index>, where <parent.id> is the ID for the +#' parent and <index> is this index. #' -#' \code{seq} the sequence of keys / indices that led to this child -#' (parents that are arrays are excluded) as a list, where character strings -#' denote objects and integers denote array positions +#' \item \code{seq} the sequence of keys / indices that led to this child +#' (parents that are arrays are excluded) as a list, where character +#' strings denote objects and integers denote array positions #' -#' \code{key} if this is the value of an object, what was the key that it -#' is listed under (from \code{gather_keys}). +#' \item \code{key} if this is the value of an object, what was the key that +#' it is listed under (from \code{\link{gather_keys}}). #' -#' \code{type} the type of this object (from \code{json_types}). +#' \item \code{type} the type of this object (from \code{\link{json_types}}). #' -#' \code{length} the length of this object (from \code{json_lengths}). +#' \item \code{length} the length of this object (from +#' \code{\link{json_lengths}}). +#' } #' +#' @seealso \code{\link{json_schema}} to create a schema for a JSON document or +#' collection, \code{\link{plot_json_graph}} to plot the structure +#' of a JSON object as a graph +#' @param .x a json string or tbl_json object +#' @return a \code{\link{tbl_json}} object #' @export #' @examples #' @@ -48,6 +56,10 @@ #' #' # A complex array #' '[{"a": 1}, [1, 2], "a", 1, true, null]' %>% json_structure +#' +#' # A sample of structure rows from a company +#' library(dplyr) +#' companies[1] %>% json_structure %>% sample_n(5) json_structure <- function(.x) { if (!is.tbl_json(.x)) .x <- as.tbl_json(.x) diff --git a/R/json_types.R b/R/json_types.R index 050950e..2ecf45b 100644 --- a/R/json_types.R +++ b/R/json_types.R @@ -1,20 +1,27 @@ -#' Add a column that tells the 'type' of the data in the root of the JSON +#' Add a column that tells the 'type' of the JSON data #' -#' The function json_types() inspects the JSON associated with -#' each row of the tbl_json data.frame, and adds a new column ("type" by -#' default) that identifies the type according to the -#' JSON standard at http://json.org/. +#' The function \code{json_types} inspects the JSON associated with +#' each row of the \code{\link{tbl_json}} object, and adds a new column +#' (\code{"type"} by default) that identifies the type according to the +#' JSON standard at \url{http://json.org/}. #' -#' This is particularly useful for inspecting your JSON data types, and can added -#' after gather_array() (or gather_keys()) to inspect the types of the elements -#' (or values) in arrays (or objects). +#' This is particularly useful for inspecting your JSON data types, and +#' often follows after \code{\link{gather_array}}, \code{\link{gather_keys}} +#' or \code{\link{enter_object}} to inspect the types of the elements of +#' JSON objects or arrays.
#' #' @param .x a json string or tbl_json object #' @param column.name the name to specify for the type column -#' @return a tbl_json object with column.name column that tells the type +#' @return a \code{\link{tbl_json}} object #' @export #' @examples +#' +#' # A simple example #' c('{"a": 1}', '[1, 2]', '"a"', '1', 'true', 'null') %>% json_types +#' +#' # Type distribution in the first 10 companies +#' library(dplyr) +#' companies[1:10] %>% gather_keys %>% json_types %>% count(type) json_types <- function(.x, column.name = "type") { if (!is.tbl_json(.x)) .x <- as.tbl_json(.x) diff --git a/R/plot_json_graph.R b/R/plot_json_graph.R index b5c4e2e..8cf1807 100644 --- a/R/plot_json_graph.R +++ b/R/plot_json_graph.R @@ -1,7 +1,7 @@ -#' Plots an igraph visualization of a single json document +#' Plots an \code{\link[igraph]{igraph}} visualization of a JSON document #' -#' This function first calls json_structure, and then uses that data to create -#' an igraph object, and then plots that object. +#' This function first calls \code{\link{json_structure}}, and then uses that +#' data to create an igraph object, and then plots that object. #' #' Each dot on the plot corresponds to a node in the JSON document, which #' could include an object or an array (which will have children nodes) or @@ -9,32 +9,34 @@ #' graph connects parent nodes to child nodes, and the vertices are colored #' based on json_types. #' -#' If show.labels is TRUE, then the names for object values are plotted on -#' the value node. +#' If \code{show.labels} is \code{TRUE}, then the names for object values are +#' plotted on the value node. #' -#' If you have a very large document (json_complexity larger than a few -#' hundred), you should consider setting show.labels to FALSE, and reducing -#' the vertex.size and edge.width. Documents that are even more complex may -#' need to be broken into smaller chunks to be visualized effectively. +#' If you have a very large document (\code{\link{json_complexity}} larger than +#' a few hundred), you should consider setting \code{show.labels} to +#' \code{FALSE}, and reducing the \code{vertex.size} and \code{edge.width} +#' parameters. Documents that are even more complex may need to be broken into +#' smaller chunks to be visualized effectively. #' #' Note that the legend is plotted automatically, but may not be scaled -#' correctly. Set legend to FALSE and manually create your own legend if -#' you wish to reposition it. +#' correctly. Set \code{legend} to \code{FALSE} and manually create your own +#' legend if you wish to reposition it. #' #' Also note that this function sets the plot margins to zero in order to -#' maximize the size of the graph on the page. the par() is reset afterwards. +#' maximize the size of the graph on the page. The \code{par} is reset +#' afterwards. #' -#' @param .x a json string or tbl_json object +#' @param .x a JSON string or \code{\link{tbl_json}} object #' @param legend add a type color legend automatically #' @param vertex.size the size of the vertices (helpful to reduce this if the -#' json is very complex +#' JSON is very complex) #' @param edge.color the color for the edges #' @param edge.width the width of the edge lines, helpful to reduce this if -#' the json is very complex +#' the JSON is very complex #' @param show.labels should object names be shown #' @param plot should the plot be rendered? -#' @param ... further arguments to igraph::plot.igraph -#' @return the igraph object +#' @param ...
further arguments to \code{\link[igraph]{plot.igraph}} +#' @return an \code{\link[igraph]{igraph}} object #' @export #' @examples #' @@ -52,7 +54,6 @@ #' #' # a very complex real example #' companies[1] %>% plot_json_graph(show.labels = FALSE, vertex.size = 4) -#' plot_json_graph <- function(.x, legend = TRUE, vertex.size = 6, edge.color = 'grey70', edge.width = .5, show.labels = TRUE, plot = TRUE, diff --git a/R/read_json.R b/R/read_json.R index 542846d..6db13b6 100644 --- a/R/read_json.R +++ b/R/read_json.R @@ -1,11 +1,13 @@ -#' Reads JSON from an input uri (file, url, ...) and returns a tbl_json +#' Reads JSON from an input uri (file, url, ...) and returns a +#' \code{\link{tbl_json}} object #' #' @param path to some json data #' @param format -#' If "json", process the data like one large JSON record. -#' If "jsonl", process the data one JSON record per line (json lines format) -#' If "infer", the format is the suffix of the given filepath. -#' @return tbl_json instance +#' If \code{"json"}, process the data like one large JSON record. +#' If \code{"jsonl"}, process the data one JSON record per line (json lines +#' format). +#' If \code{"infer"}, the format is the suffix of the given filepath. +#' @return a \code{\link{tbl_json}} object #' @export read_json <- function(path, format = c("json", "jsonl", "infer")) { diff --git a/R/spread_all.R b/R/spread_all.R index 36164e0..8feb181 100644 --- a/R/spread_all.R +++ b/R/spread_all.R @@ -1,14 +1,15 @@ -#' Spreads all object names into new columns +#' Spreads all scalar values of a JSON object into new columns #' -#' Like the spread function in tidyr but for JSON, this function spreads out -#' any JSON objects into new columns. If objects are nested, then the -#' recursive flag will expand those objects out with a compound colum name -#' based on the sequences of nested keys concatenated with the sep character. +#' Like the \code{\link[tidyr]{spread}} function in \code{tidyr} but for JSON, +#' this function spreads out all scalar values of a JSON object into new columns. +#' If objects are nested, then the recursive flag will expand scalar values of +#' nested objects out with a compound column name based on the sequences of +#' nested keys concatenated with the \code{sep} character. #' -#' Note that arrays are ignored by this function, use gather_array to stack -#' the array first, and then use spread_all if the array contains objects or -#' use one of the append_vaues_string, append_values_number or -#' append_values_logical to to capture the array values if they are scalars. +#' Note that arrays are ignored by this function; use \code{\link{gather_array}} +#' to gather the array first, and then use \code{spread_all} if the array +#' contains objects or use one of the \code{\link{append_values}} functions to +#' capture the array values if they are scalars. #' #' Note that scalar JSON values (e.g., a JSON string like '1') are also #' ignored, as they have no keys to create column names with. @@ -16,10 +17,17 @@ #' The order of columns is determined by the order they are encountered in the #' JSON document, with nested objects placed at the end. #' -#' @param .x a json string or tbl_json object +#' This function does not change the value of the JSON attribute of the +#' \code{\link{tbl_json}} object in any way.
+#' +#' @seealso \code{\link{spread_values}} to specify which specific values +#' to spread along with their types, +#' \code{\link[tidyr]{spread}} for spreading data frames +#' @param .x a json string or \code{\link{tbl_json}} object #' @param recursive whether or not to recursively spread nested objects #' @param sep character used to separate nested object keys when resursive -#' is TRUE +#' is \code{TRUE} +#' @return a \code{\link{tbl_json}} object #' @export #' @examples #' @@ -30,7 +38,7 @@ #' json %>% spread_all #' #' # A more complex example -#' worldbank %>% spread_all %>% head +#' worldbank %>% spread_all spread_all <- function(.x, recursive = TRUE, sep = ".") { if (!is.tbl_json(.x)) .x <- as.tbl_json(.x) diff --git a/R/spread_values.R b/R/spread_values.R index 431aa23..44f1931 100644 --- a/R/spread_values.R +++ b/R/spread_values.R @@ -1,25 +1,61 @@ -#' Create new columns with JSON values +#' Spreads specific scalar values of a JSON object into new columns #' -#' The spread_values() function lets you dive into (potentially nested) JSON -#' objects and extract specific values. spread_values() takes jstring(), -#' jnumber() or jlogical() named function calls as arguments in order to specify -#' the type of the data that should be captured at each desired key location. -#' These values can be of varying types at varying depths. +#' The \code{spread_values} function lets you extract specific values +#' from (potentially nested) JSON objects. \code{spread_values} takes +#' \code{\link{jstring}}, \code{\link{jnumber}} or \code{\link{jlogical}} named +#' function calls as arguments in order to specify the type of the data that +#' should be captured at each desired key location. These values can be of +#' varying types at varying depths. #' -#' Note that jstring, jnumber and jlogical will fail if they encounter the -#' incorrect type in any document +#' Note that \code{\link{jstring}}, \code{\link{jnumber}} and +#' \code{\link{jlogical}} will fail if they encounter the incorrect type in any +#' document. #' -#' @param .x a json string or tbl_json object -#' @param ... column=value list where 'column' will be the column name created -#' and 'value' must be a call to jstring(), jnumber() or jlogical() specifying -#' the path to get the value (and the type implicit in the function name) +#' The advantage of \code{spread_values} over \code{\link{spread_all}} is that +#' you are guaranteed to get a consistent data frame structure (columns and +#' types) out of any \code{spread_values} call. \code{\link{spread_all}} +#' requires less typing, but because it infers the columns and their types from +#' the JSON, it is less suitable when programming. +#' +#' @seealso \code{\link{spread_all}} for spreading all values, +#' \code{\link[tidyr]{spread}} for spreading data frames, +#' \code{\link{jstring}}, \code{\link{jnumber}}, +#' \code{\link{jlogical}} for accessing specific keys +#' @param .x a json string or \code{\link{tbl_json}} object +#' @param ...
\code{column = value} pairs where \code{column} will be the +#' column name created and \code{value} must be a call to +#' \code{\link{jstring}}, \code{\link{jnumber}} or +#' \code{\link{jlogical}} specifying the path to get the value (and +#' the type implicit in the function name) +#' @return a \code{\link{tbl_json}} object #' @export #' @examples -#' '{"name": {"first": "bob", "last": "jones"}, "age": 32}' %>% +#' +#' # A simple example +#' json <- '{"name": {"first": "Bob", "last": "Jones"}, "age": 32}' +#' +#' # Using spread_values +#' json %>% #' spread_values( #' first.name = jstring("name", "first"), -#' age = jnumber("age") +#' last.name = jstring("name", "last"), +#' age = jnumber("age") #' ) +#' +#' # Another document, this time with a middle name (and no age) +#' json2 <- '{"name": {"first": "Ann", "middle": "A", "last": "Smith"}, "age": 23}' +#' +#' # spread_values still gives the same column structure +#' c(json, json2) %>% +#' spread_values( +#' first.name = jstring("name", "first"), +#' last.name = jstring("name", "last"), +#' age = jnumber("age") +#' ) +#' +#' # whereas spread_all adds a new column +#' json %>% spread_all +#' c(json, json2) %>% spread_all spread_values <- function(.x, ...) { if (!is.tbl_json(.x)) .x <- as.tbl_json(.x) @@ -65,10 +101,12 @@ jfactory <- function(map.function) { } #' Navigates nested objects to get at keys of a specific type, to be used as -#' arguments to spread_values +#' arguments to \code{\link{spread_values}} #' #' Note that these functions fail if they encounter the incorrect type. #' +#' @seealso \code{\link{spread_values}} for using these functions to spread +#' the values of a JSON object into new columns #' @name jfunctions #' @param ... the path to follow #' @param recursive logical indicating whether second level and beyond objects diff --git a/R/tbl_json.R b/R/tbl_json.R index 5ec1946..2850d8f 100644 --- a/R/tbl_json.R +++ b/R/tbl_json.R @@ -3,21 +3,67 @@ #' @name tbl_json NULL -#' tbl_json constructor +#' \code{tbl_json} constructor #' -#' Note that json.list must have the same length as nrow(df), and if json.list -#' has any NULL elements, the corresponding rows will be removed from df. Also -#' note that "..JSON" is a reserved column name used internally for filtering -#' tbl_json objects, and so is not allowed in the data.frame names. +#' Constructs a \code{tbl_json} object, for further downstream manipulation +#' by other tidyjson functions. Methods exist to convert JSON stored in +#' character strings without any other associated data, as a separate +#' character string and associated data frame, or as a single data frame +#' with a specified character string JSON column. +#' +#' Most tidyjson functions accept a \code{tbl_json} object as the first +#' argument, and return a \code{tbl_json} object unless otherwise specified. +#' tidyjson functions will attempt to convert an object that isn't a +#' \code{tbl_json} object first, and so explicit construction of \code{tbl_json} +#' objects is rarely needed. +#' +#' \code{tbl_json} objects consist of a data frame along with its associated +#' JSON, where each row of the data frame corresponds to a single JSON +#' document. The JSON is stored in a \code{"JSON"} attribute. +#' +#' Note that \code{json.list} must have the same length as \code{nrow(df)}, and +#' if \code{json.list} has any \code{NULL} elements, the corresponding rows will +#' be removed from \code{df}.
Also note that \code{"..JSON"} is a reserved +#' column name used internally for filtering tbl_json objects, and so is not +#' allowed in the names of \code{df}. +#' +#' @seealso \code{read_json} for reading json from files #' @param df data.frame -#' @param json.list list of json lists parsed with fromJSON -#' @param drop.null.json drop NULL json entries from data.frame and json -#' @param .x an object to convert into a tbl_json object -#' @param json.column the name of the JSON column of data in x, if x is a data.frame +#' @param json.list list of json lists parsed with +#' \code{\link[jsonlite]{fromJSON}} +#' @param drop.null.json drop \code{NULL} json entries from \code{df} and +#' \code{json.list} +#' @param .x an object to convert into a \code{tbl_json} object +#' @param json.column the name of the json column of data in \code{.x}, if +#' \code{.x} is a data frame #' @param ... other arguments +#' @return a \code{\link{tbl_json}} object #' @rdname tbl_json #' @export +#' @examples +#' +#' # Construct a tbl_json object using a charater string of JSON +#' json <- '{"animal": "cat", "count": 2}' +#' json %>% as.tbl_json +#' +#' # access the "JSON" argument +#' json %>% as.tbl_json %>% attr("JSON") +#' +#' # Construct a tbl_json object using multiple documents +#' json <- c('{"animal": "cat", "count": 2}', '{"animal": "parrot", "count": 1}') +#' json %>% as.tbl_json +#' +#' # Construct a tbl_json object from a data.frame with a JSON colum +#' library(tibble) +#' farms <- tribble( +#' ~farm, ~animals, +#' 1L, '[{"animal": "pig", "count": 50}, {"animal": "cow", "count": 10}]', +#' 2L, '[{"animal": "chicken", "count": 20}]' +#' ) +#' farms %>% as.tbl_json(json.column = "animals") +#' # tidy the farms +#' farms %>% as.tbl_json(json.column = "animals") %>% +#' gather_array %>% spread_all tbl_json <- function(df, json.list, drop.null.json = FALSE) { assert_that(is.data.frame(df)) @@ -91,6 +137,7 @@ is.tbl_json <- function(.x) inherits(.x, "tbl_json") #' @param i row elements to extract #' @param j column elements to extract #' @param drop whether or not to simplify results +#' @return a \code{\link{tbl_json}} object #' @export `[.tbl_json` <- function(.x, i, j, drop = if (missing(i)) TRUE else length(cols) == 1) { diff --git a/man/append_values.Rd b/man/append_values.Rd index 01b7f94..a1b4111 100644 --- a/man/append_values.Rd +++ b/man/append_values.Rd @@ -5,7 +5,7 @@ \alias{append_values_logical} \alias{append_values_number} \alias{append_values_string} -\title{Appends all values with a specified type as a new column} +\title{Appends all JSON values with a specified type as a new column} \usage{ append_values_string(.x, column.name = type, force = TRUE, recursive = FALSE) @@ -17,32 +17,55 @@ append_values_logical(.x, column.name = type, force = TRUE, recursive = FALSE) } \arguments{ -\item{.x}{a json string or tbl_json object} +\item{.x}{a json string or \code{\link{tbl_json}} object} -\item{column.name}{the column.name to append the values into the data.frame -under} +\item{column.name}{the name of the column to append values as} -\item{force}{parameter that determines if the variable type should be computed or not -if force is FALSE, then the function may take more memory} +\item{force}{should values be coerced to the appropriate type +when possible, otherwise, types are checked first (requires more +memory)} -\item{recursive}{logical indicating whether to extract a single value from a -nested object. Only used when force = TRUE. 
If force = FALSE, and -recursive=TRUE, throws an error.} +\item{recursive}{logical indicating whether to recursively extract a single +value from a nested object. Only used when \code{force = TRUE}. If +\code{force = FALSE}, and \code{recursive = TRUE}, throws an error.} +} +\value{ +a \code{\link{tbl_json}} object } \description{ -The append_values_X functions let you take any remaining JSON and add it as -a column X (for X in "string", "number", "logical") insofar as it is of the -JSON type specified. +The \code{append_values} functions let you take any scalar JSON values +of a given type ("string", "number", "logical") and add them as a new +column named \code{column.name}. This is particularly useful after using +\code{\link{gather_keys}} to stack many objects. } \details{ -Any values that do not conform to the type specified will be NA in the resulting -column. This includes other scalar types (e.g., numbers or logicals if you are -using append_values_string) and *also* any rows where the JSON is still an -object or an array. +Any values that cannot be converted to the specified type will be \code{NA} in +the resulting column. This includes other scalar types (e.g., numbers or +logicals if you are using \code{append_values_string}) and *also* any rows +where the JSON is NULL or an object or array. + +Note that the \code{append_values} functions do not alter the JSON +attribute of the \code{tbl_json} object in any way. } \examples{ + +# Stack names '{"first": "bob", "last": "jones"}' \%>\% - gather_keys() \%>\% - append_values_string() + gather_keys \%>\% + append_values_string + +# This is most useful when data is stored in keys and values +# For example, tags in recipes: +recipes <- c('{"name": "pie", "tags": {"apple": 10, "pie": 2, "flour": 5}}', + '{"name": "cookie", "tags": {"chocolate": 2, "cookie": 1}}') +recipes \%>\% + spread_values(name = jstring("name")) \%>\% + enter_object("tags") \%>\% + gather_keys("tag") \%>\% + append_values_number("count") +} +\seealso{ +\code{\link{gather_keys}} to gather all object keys first, + \code{\link{spread_all}} to spread values into new columns } diff --git a/man/enter_object.Rd b/man/enter_object.Rd index bd7d10e..ed604e6 100644 --- a/man/enter_object.Rd +++ b/man/enter_object.Rd @@ -2,34 +2,70 @@ % Please edit documentation in R/enter_object.R \name{enter_object} \alias{enter_object} -\title{Dive into a specific object "key"} +\title{Enter into a specific object and discard all other JSON data} \usage{ enter_object(.x, ...) } \arguments{ \item{.x}{a json string or tbl_json object} -\item{...}{path to filter} +\item{...}{a sequence of character strings designating the object key or +sequences of keys you wish to enter} +} +\value{ +a \code{\link{tbl_json}} object } \description{ -JSON can contain nested objects, such as {"key1": {"key2": [1, 2, 3]}}. The -function enter_object() can be used to access the array nested under "key1" -and "key2". After using enter_object(), all further tidyjson calls happen -inside the referenced object (all other JSON data outside the object -is discarded). If the object doesn't exist for a given row / index, then that -data.frame row will be discarded. +When manipulating a JSON object, \code{enter_object} lets you navigate to +a specific value of the object by referencing its key. JSON can contain +nested objects, and you can pass in more than one character string into +\code{enter_object} to navigate through multiple nested objects in sequence.
} \details{ -This is useful when you want to limit your data to just information found in -a specific key. Use the ... to specific a sequence of keys that you want to -enter into. Keep in mind that any rows with JSON that do not contain the key -will be discarded by this function. +After using \code{enter_object}, all further tidyjson calls happen inside the +referenced object (all other JSON data outside the object is discarded). +If the object doesn't exist for a given row / index, then that row will be +discarded. + +In pipelines, \code{enter_object} is often preceded by \code{gather_keys} and +followed by \code{gather_array} if the key contains an array, or +\code{spread_all} if the key contains an object. } \examples{ -c('{"name": "bob", "children": ["sally", "george"]}', '{"name": "anne"}') \%>\% - spread_values(parent.name = jstring("name")) \%>\% - enter_object("children") \%>\% + +# Let's start with a simple example of parents and children +json <- c('{"parent": "bob", "children": ["sally", "george"]}', + '{"parent": "fred", "children": ["billy"]}', + '{"parent": "anne"}') + +# We can see the keys and types in each +json \%>\% gather_keys \%>\% json_types + +# Let's capture the parent first and then enter into the children object +json \%>\% spread_all \%>\% enter_object("children") + +# Notice that "anne" was discarded, as she has no children + +# We can now use gather_array to stack the array +json \%>\% spread_all \%>\% enter_object("children") \%>\% + gather_array("child.num") + +# And append_values_string to add the children names +json \%>\% spread_all \%>\% enter_object("children") \%>\% + gather_array("child.num") \%>\% + append_values_string("child") + +# A more realistic example with companies data +library(dplyr) +companies \%>\% + enter_object("acquisitions") \%>\% gather_array \%>\% - append_values_string("children") + spread_all \%>\% + glimpse +} +\seealso{ +\code{\link{gather_keys}} to access keys that could be entered + into, \code{\link{gather_array}} to gather an array in an object and + \code{\link{spread_all}} to spread values in an object. } diff --git a/man/gather_array.Rd b/man/gather_array.Rd index 21e03b7..5d1d23a 100644 --- a/man/gather_array.Rd +++ b/man/gather_array.Rd @@ -2,40 +2,79 @@ % Please edit documentation in R/gather.R \name{gather_array} \alias{gather_array} -\title{Stack a JSON array} +\title{Gather a JSON array into index-value pairs} \usage{ gather_array(.x, column.name = default.column.name) } \arguments{ -\item{.x}{a json string or tbl_json object whose JSON attribute should always be an array} +\item{.x}{a json string or tbl_json object whose JSON attribute should always +be an array} \item{column.name}{the name to give to the array index column created} } \value{ -a tbl_json with a new column (column.name) that captures the array - index and JSON attribute extracted from the array +a \code{\link{tbl_json}} object } \description{ -Given a JSON array, such as [1, 2, 3], gather_array will "stack" the array in -the tbl_json data.frame, by replicating each row of the data.frame by the -length of the corresponding JSON array. A new column (by default called -"array.index") will be added to keep track of the referenced position in the -array for each row of the resuling data.frame. +\code{gather_array} collapses a JSON array into index-value pairs, creating +a new column \code{'array.index'} to store the index of the array, and +storing values in the \code{'JSON'} attribute for further tidyjson +manipulation.
All other columns are duplicated as necessary. This allows you +to access the values of the array just like \code{\link{gather_keys}} lets +you access the values of an object. } \details{ -JSON can contain arrays of data, which can be simple vectors (fixed or varying -length integer, character or logical vectors). But they also often contain -lists of other objects (like a list of purchases for a user). The function -gather_array() takes JSON arrays and duplicates the rows in the data.frame to -correspond to the indices of the array, and puts the elements of -the array into the JSON attribute. This is equivalent to "stacking" the array -in the data.frame, and lets you continue to manipulate the remaining JSON -in the elements of the array. For simple arrays, use append_values_* to -capture all of the values of the array. For more complex arrays (where the -values are themselves objects or arrays), continue using other tidyjson -functions to structure the data as needed. +JSON arrays can be simple vectors (fixed or varying length number, string +or logical vectors with or without null values). But they also often contain +lists of other objects (like a list of purchases for a user). Thus, the +best analogy in R for a JSON array is an unnamed list. + +\code{gather_array} is often preceded by \code{\link{enter_object}} when the +array is nested under a JSON object, and is often followed by +\code{\link{gather_keys}} or \code{\link{enter_object}} if the array values +are objects, or by \code{\link{append_values}} to append all scalar values +as a new column or \code{\link{json_types}} to determine the types of the +array elements (JSON does not guarantee they are the same type). } \examples{ -'[1, "a", {"k": "v"}]' \%>\% gather_array \%>\% json_types + +# A simple character array example +json <- '["a", "b", "c"]' + +# Check that this is an array +json \%>\% json_types + +# Gather array and check types +json \%>\% gather_array \%>\% json_types + +# Extract string values +json \%>\% gather_array \%>\% append_values_string + +# A more complex mixed type example +json <- '["a", 1, true, null, {"key": "value"}]' + +# Then we can use json_types to check the type of each element +json \%>\% gather_array \%>\% json_types + +# A nested array +json <- '[["a", "b", "c"], ["a", "d"], ["b", "c"]]' + +# Extract both levels +json \%>\% gather_array("index.1") \%>\% gather_array("index.2") \%>\% + append_values_string + +# Some JSON begins as an array +commits \%>\% gather_array + +# We can use spread_all to capture all keys (where recursive = FALSE is used +# to limit the depth to just top level keys) +library(dplyr) +commits \%>\% gather_array \%>\% spread_all(recursive = FALSE) \%>\% glimpse +} +\seealso{ +\code{\link{gather_keys}} to gather a JSON object, + \code{\link{enter_object}} to enter into an object, + \code{\link[tidyr]{gather}} to gather key-value pairs in a data + frame } diff --git a/man/gather_keys.Rd b/man/gather_keys.Rd index b82ec60..beb3df5 100644 --- a/man/gather_keys.Rd +++ b/man/gather_keys.Rd @@ -2,31 +2,68 @@ % Please edit documentation in R/gather.R \name{gather_keys} \alias{gather_keys} -\title{Stack a JSON {"key": value} object} +\title{Gather a JSON object into key-value pairs} \usage{ gather_keys(.x, column.name = default.column.name) } \arguments{ -\item{.x}{a json string or tbl_json object whose JSON attribute should always be an object} +\item{.x}{a JSON string or \code{tbl_json} object whose JSON attribute should +always be an object} \item{column.name}{the name to
give to the column of key names created} } \value{ -a tbl_json with a new column (column.name) that captures the keys - and JSON attribute of the associated value data +a \code{\link{tbl_json}} object } \description{ -Given a JSON key value structure, like {"key1": 1, "key2": 2}, the -gather_keys() function duplicates the rows of the tbl_json data.frame for -every key, adds a new column (default name "key") to capture the key names, -and then dives into the JSON values to enable further manipulation with -downstream tidyjson functions. +\code{gather_keys} collapses a JSON object into key-value pairs, creating +a new column \code{'key'} to store the object key names, and storing the +values in the \code{'JSON'} attribute for further tidyjson manipulation. +All other columns are duplicated as necessary. This allows you to access the +keys of the objects just like \code{\link{gather_array}} lets you access the +values of an array. } \details{ -This allows you to *enter into* the keys of the objects just like \code{gather_array} -let you enter elements of the array. +\code{gather_keys} is often followed by \code{\link{enter_object}} to enter +into a value that is an object, by \code{\link{append_values}} to append all +scalar values as a new column or \code{\link{json_types}} to determine the +types of the keys. } \examples{ -'{"name": "bob", "age": 32}' \%>\% gather_keys \%>\% json_types + +# Let's start with a very simple example +json <- '{"name": "bob", "age": 32, "gender": "male"}' + +# Check that this is an object +json \%>\% json_types + +# Gather keys and check types +json \%>\% gather_keys \%>\% json_types + +# Sometimes data is stored in key names +json <- '{"2014": 32, "2015": 56, "2016": 14}' + +# Then we can use the column.name argument to change the name of the keys +json \%>\% gather_keys("year") + +# We can also use append_values_number to capture the values, since they are +# all of the same type +json \%>\% gather_keys("year") \%>\% append_values_number("count") + +# This can even work with a more complex, nested example +json <- '{"2015": {"1": 10, "3": 1, "11": 5}, "2016": {"2": 3, "5": 15}}' +json \%>\% gather_keys("year") \%>\% gather_keys("month") \%>\% + append_values_number("count") + +# Most JSON starts out as an object (or an array of objects), and gather_keys +# can be used to inspect the top level (or 2nd level) keys and their structure +library(dplyr) +worldbank \%>\% gather_keys \%>\% json_types \%>\% count(key, type) +} +\seealso{ +\code{\link{gather_array}} to gather a JSON array, + \code{\link{enter_object}} to enter into an object, + \code{\link[tidyr]{gather}} to gather key-value pairs in a data + frame } diff --git a/man/jfunctions.Rd b/man/jfunctions.Rd index 626aa9f..77d3cc9 100644 --- a/man/jfunctions.Rd +++ b/man/jfunctions.Rd @@ -6,7 +6,7 @@ \alias{jnumber} \alias{jstring} \title{Navigates nested objects to get at keys of a specific type, to be used as -arguments to spread_values} +arguments to \code{\link{spread_values}}} \usage{ jstring(..., recursive = FALSE) @@ -27,4 +27,8 @@ a function that can operate on parsed JSON data \description{ Note that these functions fail if they encounter the incorrect type. 
}
+\seealso{
+\code{\link{spread_values}} for using these functions to spread
+  the values of a JSON object into new columns
+}
diff --git a/man/json_complexity.Rd b/man/json_complexity.Rd
index cb1cf1b..1adba4a 100644
--- a/man/json_complexity.Rd
+++ b/man/json_complexity.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/json_complexity.R
 \name{json_complexity}
 \alias{json_complexity}
-\title{Add a column that contains the complexity (recursively unlisted length) of the JSON data}
+\title{Compute the complexity (recursively unlisted length) of JSON data}
 \usage{
 json_complexity(.x, column.name = "complexity")
 }
@@ -12,18 +12,33 @@ json_complexity(.x, column.name = "complexity")
 \item{column.name}{the name to specify for the length column}
 }
 \value{
-a tbl_json object with column.name column that tells the length
+a \code{\link{tbl_json}} object
 }
 \description{
 When investigating complex JSON data it can be helpful to identify the
-complexity of deeply nested documents. The json_complexity() function adds a
-column (default name "complexity") that contains the 'complexity' of the JSON
-associated with each row. Essentially, every on-null scalar value is found in the
-object by recursively stripping away all objects or arrays, and the complexity
-is the count of these scalar values. Note that 'null' has complexity 0.
+complexity of deeply nested documents. The \code{json_complexity} function
+adds a column (default name \code{"complexity"}) that contains the
+'complexity' of the JSON associated with each row. Essentially, every non-null
+scalar value is found in the object by recursively stripping away all objects
+or arrays, and the complexity is the count of these scalar values. Note that
+'null' has complexity 0, as do empty objects and arrays.
 }
 \examples{
-c('[1, 2, [3, 4]]', '{"k1": 1, "k2": [2, [3, 4]]}', '1', {}) \%>\%
-  json_lengths \%>\% json_complexity
+
+# A simple example
+json <- c('[1, 2, [3, 4]]', '{"k1": 1, "k2": [2, [3, 4]]}', '1', 'null')
+
+# Complexity is larger than length for nested objects
+json \%>\% json_lengths \%>\% json_complexity
+
+# Worldbank has complexity ranging from 8 to 17
+library(magrittr)
+worldbank \%>\% json_complexity \%$\% table(complexity)
+
+# Commits are much more regular
+commits \%>\% gather_array \%>\% json_complexity \%$\% table(complexity)
+}
+\seealso{
+\code{\link{json_lengths}} to compute the length of each value
+}
diff --git a/man/json_lengths.Rd b/man/json_lengths.Rd
index 2ec7666..b38bc1e 100644
--- a/man/json_lengths.Rd
+++ b/man/json_lengths.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/json_lengths.R
 \name{json_lengths}
 \alias{json_lengths}
-\title{Add a column that contains the length of the JSON data}
+\title{Compute the length of JSON data}
 \usage{
 json_lengths(.x, column.name = "length")
 }
@@ -12,17 +12,34 @@ json_lengths(.x, column.name = "length")
 \item{column.name}{the name to specify for the length column}
 }
 \value{
-a tbl_json object with column.name column that tells the length
+a \code{\link{tbl_json}} object
 }
 \description{
 When investigating JSON data it can be helpful to identify the lengths of the
-JSON objects or arrays, especialy when they are 'ragged' across documents. The
-json_lengths() function adds a column (default name "length") that contains
-the 'length' of the JSON associated with each row. For objects, this will
-be equal to the number of keys. For arrays, this will be equal to the length
-of the array. All scalar values will be of length 1.
+JSON objects or arrays, especially when they are 'ragged' across documents.
+The \code{json_lengths} function adds a column (default name \code{"length"})
+that contains the 'length' of the JSON associated with each row. For objects,
+this will be equal to the number of keys. For arrays, this will be equal to
+the length of the array. All scalar values will be of length 1, and null
+will have length 0.
 }
 \examples{
-c('[1, 2, 3]', '{"k1": 1, "k2": 2}', '1', {}) \%>\% json_lengths
+
+# A simple example
+json <- c('[1, 2, 3]', '{"k1": 1, "k2": 2}', '1', 'null')
+
+# Compute the length of each document
+json \%>\% json_lengths
+
+# Worldbank objects are either length 7 or 8
+library(magrittr)
+worldbank \%>\% json_lengths \%$\% table(length)
+
+# All commits are length 8
+commits \%>\% gather_array \%>\% json_lengths \%$\% table(length)
+}
+\seealso{
+\code{\link{json_complexity}} to compute the recursive length of
+  each value
+}
diff --git a/man/json_schema.Rd b/man/json_schema.Rd
index 7e5188b..f409323 100644
--- a/man/json_schema.Rd
+++ b/man/json_schema.Rd
@@ -7,7 +7,7 @@
 json_schema(.x, type = c("string", "value"))
 }
 \arguments{
-\item{.x}{a json string or tbl_json object}
+\item{.x}{a json string or \code{\link{tbl_json}} object}

 \item{type}{whether to capture scalar nodes using the string that
 defines their type (e.g., "logical") or as a representative value
@@ -34,33 +34,47 @@ JSON into a simple form using the following rules:
 }

 For more complex JSON objects, ties are broken by taking the most
-complex example (using \code{json_complexity}), and then by type
-(using \code{json_types}).
+complex example (using \code{\link{json_complexity}}), and then by type
+(using \code{\link{json_types}}).

 This means that if a key has varying schema across documents, the most
 complex schema will be chosen as being representative. Similarly, if
 the elements of an array vary in schema, the most complex element is
 chosen, and if arrays vary in schema across documents, the most
 complex is chosen.
+
+Note that \code{json_schema} can be slow for large JSON document collections;
+you may want to sample your JSON collection first.
}
 \examples{

 # A simple string
-'"string"' \%>\% json_schema
+'"string"' \%>\% json_schema \%>\% writeLines

 # A simple object
-'{"key": "value"}' \%>\% json_schema
+'{"key": "value"}' \%>\% json_schema \%>\% writeLines
+
+# A more complex JSON array
+json <- '[{"a": 1}, [1, 2], "a", 1, true, null]'

-# A complex array is represented by the most complex example
-'[{"a": 1}, [1, 2], "a", 1, true, null]' \%>\% json_schema
+# Using type = 'string' (default)
+json \%>\% json_schema \%>\% writeLines

-# Companies example
-companies[1] \%>\% json_schema \%>\% plot_json_graph
+# Using type = 'value' to show a representative value
+json \%>\% json_schema(type = "value") \%>\% writeLines

-# Github issues
-issues_array <- issues \%>\% gather_array # issues are one large array
-# analyze first 5, and use type = "value" to ensure proper coloring of graph
-issues_schema <- issues_array[1:5, ] \%>\% json_schema(type = "value")
-issues_schema \%>\% plot_json_graph
+# Plotting the schema of a company example
+companies[1] \%>\% json_schema(type = "value") \%>\% plot_json_graph
+
+# Schema of the first 10 github issues
+library(dplyr)
+issues \%>\% gather_array \%>\% slice(1:10) \%>\%
+  json_schema(type = "value") \%>\% plot_json_graph
+}
+\seealso{
+\code{\link{json_structure}} to recursively structure all
+  documents into a single data frame,
+  \code{\link{plot_json_graph}} to plot JSON (including results
+  of \code{json_schema}) as a graph
 }
diff --git a/man/json_structure.Rd b/man/json_structure.Rd
index 9629e04..690bc1b 100644
--- a/man/json_structure.Rd
+++ b/man/json_structure.Rd
@@ -10,42 +10,49 @@ json_structure(.x)
 }
 \arguments{
 \item{.x}{a json string or tbl_json object}
 }
 \value{
-a tbl_json object with the following columns:
+a \code{\link{tbl_json}} object
+}
+\description{
+Returns a \code{\link{tbl_json}} object where each row corresponds to a leaf
+in the JSON structure. The first row corresponds to the JSON document as
+a whole. If the document is a scalar value (JSON string, number, logical
+or null), then there will only be 1 row. If instead it is an object or
+an array, then subsequent rows will recursively correspond to the elements
+(and their children) of the object or array.
+}
+\details{
+The columns in the \code{\link{tbl_json}} returned are defined as

-  \code{document.id} 1L if \code{x} is a single JSON string, otherwise the
-  index of \code{x}.
+\itemize{
+  \item \code{document.id} 1L if \code{.x} is a single JSON string, otherwise
+    the index of \code{.x}.

-  \code{parent.id} the string identifier of the parent node for this child.
+\item \code{parent.id} the string identifier of the parent node for this
+  child.

-  \code{level} what level of the hierarchy this child resides at, starting
-  at \code{0L} for the root and incrementing for each level of nested
-  array or object.
+\item \code{level} what level of the hierarchy this child resides at,
+  starting at \code{0L} for the root and incrementing for each level
+  of nested array or object.

-  \code{index} what index of the parent object / array this child resides
-  at (from \code{gather_array} for arrays).
+\item \code{index} what index of the parent object / array this child
+  resides at (from \code{gather_array} for arrays).

-  \code{child.id} a unique ID for this leaf in this document, represented
-  as . where is the ID for the parent and
-  is this index.
+\item \code{child.id} a unique ID for this leaf in this document,
+  represented as . where is the ID for the
+  parent and is this index.
-  \code{seq} the sequence of keys / indices that led to this child
-  (parents that are arrays are excluded) as a list, where character strings
-  denote objects and integers denote array positions
+\item \code{seq} the sequence of keys / indices that led to this child
+  (parents that are arrays are excluded) as a list, where character
+  strings denote objects and integers denote array positions

-  \code{key} if this is the value of an object, what was the key that it
-  is listed under (from \code{gather_keys}).
+\item \code{key} if this is the value of an object, what was the key that
+  it is listed under (from \code{\link{gather_keys}}).

-  \code{type} the type of this object (from \code{json_types}).
+\item \code{type} the type of this object (from \code{\link{json_types}}).

-  \code{length} the length of this object (from \code{json_lengths}).
+\item \code{length} the length of this object (from
+  \code{\link{json_lengths}}).
 }
-\description{
-Returns a \code{tbl_json} object where each row corresponds to a leaf in
-the JSON structure. The first row corresponds to the json document as
-a whole. If the document is a scalar value (JSON string, number, logical
-or null), then there will only be 1 row. If instead it is an object or
-an array, then subsequent rows will recursively correspond to the elements
-(and their children) of the object or array.
 }
 \examples{
@@ -57,5 +64,14 @@ an array, then subsequent rows will recursively correspond to the elements

 # A complex array
 '[{"a": 1}, [1, 2], "a", 1, true, null]' \%>\% json_structure
+
+# A sample of structure rows from a company
+library(dplyr)
+companies[1] \%>\% json_structure \%>\% sample_n(5)
+}
+\seealso{
+\code{\link{json_schema}} to create a schema for a JSON document or
+  collection, \code{\link{plot_json_graph}} to plot the structure
+  of a JSON object as a graph
 }
diff --git a/man/json_types.Rd b/man/json_types.Rd
index 5524eb5..f055fb0 100644
--- a/man/json_types.Rd
+++ b/man/json_types.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/json_types.R
 \name{json_types}
 \alias{json_types}
-\title{Add a column that tells the 'type' of the data in the root of the JSON}
+\title{Add a column that tells the 'type' of the JSON data}
 \usage{
 json_types(.x, column.name = "type")
 }
@@ -12,20 +12,27 @@ json_types(.x, column.name = "type")
 \item{column.name}{the name to specify for the type column}
 }
 \value{
-a tbl_json object with column.name column that tells the type
+a \code{\link{tbl_json}} object
 }
 \description{
-The function json_types() inspects the JSON associated with
-each row of the tbl_json data.frame, and adds a new column ("type" by
-default) that identifies the type according to the
-JSON standard at http://json.org/.
+The function \code{json_types} inspects the JSON associated with
+each row of the \code{\link{tbl_json}} object, and adds a new column
+(\code{"type"} by default) that identifies the type according to the
+JSON standard at \url{http://json.org/}.
 }
 \details{
-This is particularly useful for inspecting your JSON data types, and can added
-after gather_array() (or gather_keys()) to inspect the types of the elements
-(or values) in arrays (or objects).
+This is particularly useful for inspecting your JSON data types, and often
+follows after \code{\link{gather_array}}, \code{\link{gather_keys}}
+or \code{\link{enter_object}} to inspect the types of the elements of
+JSON objects or arrays.
} \examples{ + +# A simple example c('{"a": 1}', '[1, 2]', '"a"', '1', 'true', 'null') \%>\% json_types + +# Type distribution in the first 10 companies +library(dplyr) +companies[1:10] \%>\% gather_keys \%>\% json_types \%>\% count(type) } diff --git a/man/plot_json_graph.Rd b/man/plot_json_graph.Rd index eed584a..9369e01 100644 --- a/man/plot_json_graph.Rd +++ b/man/plot_json_graph.Rd @@ -2,36 +2,36 @@ % Please edit documentation in R/plot_json_graph.R \name{plot_json_graph} \alias{plot_json_graph} -\title{Plots an igraph visualization of a single json document} +\title{Plots an \code{\link[igraph]{igraph}} visualization of a JSON document} \usage{ plot_json_graph(.x, legend = TRUE, vertex.size = 6, edge.color = "grey70", edge.width = 0.5, show.labels = TRUE, plot = TRUE, ...) } \arguments{ -\item{.x}{a json string or tbl_json object} +\item{.x}{a JSON string or \code{\link{tbl_json}} object} \item{legend}{add a type color legend automatically} \item{vertex.size}{the size of the vertices (helpful to reduce this if the -json is very complex} +JSON is very complex} \item{edge.color}{the color for the edges} \item{edge.width}{the width of the edge lines, helpful to reduce this if -the json is very complex} +the JSON is very complex} \item{show.labels}{should object names be shown} \item{plot}{should the plot be rendered?} -\item{...}{further arguments to igraph::plot.igraph} +\item{...}{further arguments to \code{\link[igraph]{plot.igraph}}} } \value{ -the igraph object +an \code{\link[igraph]{igraph}} object } \description{ -This function first calls json_structure, and then uses that data to create -an igraph object, and then plots that object. +This function first calls \code{\link{json_structure}}, and then uses that +data to create an igraph object, and then plots that object. } \details{ Each dot on the plot corresponds to a node in the JSON document, which @@ -40,20 +40,22 @@ a string, number, logical or null value which will be terminal nodes. The graph connects parent nodes to child nodes, and the vertices are colored based on json_types. -If show.labels is TRUE, then the names for object values are plotted on -the value node. +If \code{show.labels} is \code{TRUE}, then the names for object values are +plotted on the value node. -If you have a very large document (json_complexity larger than a few -hundred), you should consider setting show.labels to FALSE, and reducing -the vertex.size and edge.width. Documents that are even more complex may -need to be broken into smaller chunks to be visualized effectively. +If you have a very large document (\code{\link{json_complexity}} larger than +a few hundred), you should consider setting \code{show.labels} to +\code{FALSE}, and reducing the \code{vertex.size} and \code{edge.width} +parameters. Documents that are even more complex may need to be broken into +smaller chunks to be visualized effectively. Note that the legend is plotted automatically, but may not be scaled -correctly. Set legend to FALSE and manually create your own legend if -you wish to reposition it. +correctly. Set \code{legend} to \code{FALSE} and manually create your own +legend if you wish to reposition it. Also note that this function sets the plot margins to zero in order to -maximize the size of the graph on the page. the par() is reset afterwards. +maximize the size of the graph on the page. the \code{par} is reset +afterwards. 
} \examples{ @@ -71,6 +73,5 @@ worldbank[1] \%>\% plot_json_graph # a very complex real example companies[1] \%>\% plot_json_graph(show.labels = FALSE, vertex.size = 4) - } diff --git a/man/read_json.Rd b/man/read_json.Rd index d74a8db..fa922d7 100644 --- a/man/read_json.Rd +++ b/man/read_json.Rd @@ -2,21 +2,24 @@ % Please edit documentation in R/read_json.R \name{read_json} \alias{read_json} -\title{Reads JSON from an input uri (file, url, ...) and returns a tbl_json} +\title{Reads JSON from an input uri (file, url, ...) and returns a +\code{\link{tbl_json}} object} \usage{ read_json(path, format = c("json", "jsonl", "infer")) } \arguments{ \item{path}{to some json data} -\item{format}{If "json", process the data like one large JSON record. -If "jsonl", process the data one JSON record per line (json lines format) -If "infer", the format is the suffix of the given filepath.} +\item{format}{If \code{"json"}, process the data like one large JSON record. +If \code{"jsonl"}, process the data one JSON record per line (json lines +format). +If \code{"infer"}, the format is the suffix of the given filepath.} } \value{ -tbl_json instance +a \code{\link{tbl_json}} object } \description{ -Reads JSON from an input uri (file, url, ...) and returns a tbl_json +Reads JSON from an input uri (file, url, ...) and returns a +\code{\link{tbl_json}} object } diff --git a/man/spread_all.Rd b/man/spread_all.Rd index f192470..be55bf3 100644 --- a/man/spread_all.Rd +++ b/man/spread_all.Rd @@ -2,35 +2,42 @@ % Please edit documentation in R/spread_all.R \name{spread_all} \alias{spread_all} -\title{Spreads all object names into new columns} +\title{Spreads all scalar values of a JSON object into new columns} \usage{ spread_all(.x, recursive = TRUE, sep = ".") } \arguments{ -\item{.x}{a json string or tbl_json object} +\item{.x}{a json string or \code{\link{tbl_json}} object} \item{recursive}{whether or not to recursively spread nested objects} \item{sep}{character used to separate nested object keys when resursive -is TRUE} +is \code{TRUE}} +} +\value{ +a \code{\link{tbl_json}} object } \description{ -Like the spread function in tidyr but for JSON, this function spreads out -any JSON objects into new columns. If objects are nested, then the -recursive flag will expand those objects out with a compound colum name -based on the sequences of nested keys concatenated with the sep character. +Like the \code{\link[tidyr]{spread}} function in \code{tidyr} but for JSON, +this function spreads out any JSON objects that are scalars into new columns. +If objects are nested, then the recursive flag will expand scalar values of +nested objects out with a compound column name based on the sequences of +nested keys concatenated with the \code{sep} character. } \details{ -Note that arrays are ignored by this function, use gather_array to stack -the array first, and then use spread_all if the array contains objects or -use one of the append_vaues_string, append_values_number or -append_values_logical to to capture the array values if they are scalars. +Note that arrays are ignored by this function, use \code{\link{gather_array}} +to gather the array first, and then use \code{spread_all} if the array +contains objects or use one of the \code{\link{append_values}} functions to +capture the array values if they are scalars. Note that scalar JSON values (e.g., a JSON string like '1') are also ignored, as they have no keys to create column names with. 
The order of columns is determined by the order they are encountered in the
JSON document, with nested objects placed at the end.
+
+This function does not change the value of the JSON attribute of the
+\code{\link{tbl_json}} object in any way.
 }
 \examples{
@@ -41,6 +48,11 @@ json <- c('{"a": "x", "b": 1, "c": true}',
 json \%>\% spread_all

 # A more complex example
-worldbank \%>\% spread_all \%>\% head
+worldbank \%>\% spread_all
+}
+\seealso{
+\code{\link{spread_values}} to specify which values
+  to spread along with their types,
+  \code{\link[tidyr]{spread}} for spreading data frames
 }
diff --git a/man/spread_values.Rd b/man/spread_values.Rd
index 4b5ef3b..90807eb 100644
--- a/man/spread_values.Rd
+++ b/man/spread_values.Rd
@@ -2,33 +2,73 @@
 % Please edit documentation in R/spread_values.R
 \name{spread_values}
 \alias{spread_values}
-\title{Create new columns with JSON values}
+\title{Spreads specific scalar values of a JSON object into new columns}
 \usage{
 spread_values(.x, ...)
 }
 \arguments{
-\item{.x}{a json string or tbl_json object}
+\item{.x}{a json string or \code{\link{tbl_json}} object}

-\item{...}{column=value list where 'column' will be the column name created
-and 'value' must be a call to jstring(), jnumber() or jlogical() specifying
-the path to get the value (and the type implicit in the function name)}
+\item{...}{\code{column = value} pairs where \code{column} will be the
+column name created and \code{value} must be a call to
+\code{\link{jstring}}, \code{\link{jnumber}} or
+\code{\link{jlogical}} specifying the path to get the value (and
+the type implicit in the function name)}
+}
+\value{
+a \code{\link{tbl_json}} object
 }
 \description{
-The spread_values() function lets you dive into (potentially nested) JSON
-objects and extract specific values. spread_values() takes jstring(),
-jnumber() or jlogical() named function calls as arguments in order to specify
-the type of the data that should be captured at each desired key location.
-These values can be of varying types at varying depths.
+The \code{spread_values} function lets you extract specific values
+from (potentially nested) JSON objects. \code{spread_values} takes
+\code{\link{jstring}}, \code{\link{jnumber}} or \code{\link{jlogical}} named
+function calls as arguments in order to specify the type of the data that
+should be captured at each desired key location. These values can be of
+varying types at varying depths.
 }
 \details{
-Note that jstring, jnumber and jlogical will fail if they encounter the
-incorrect type in any document
+Note that \code{\link{jstring}}, \code{\link{jnumber}} and
+\code{\link{jlogical}} will fail if they encounter the incorrect type in any
+document.
+
+The advantage of \code{spread_values} over \code{\link{spread_all}} is that
+you are guaranteed to get a consistent data frame structure (columns and
+types) out of any \code{spread_values} call. \code{\link{spread_all}}
+requires less typing, but because it infers the columns and their types from
+the JSON, it is less suitable when programming.
}
 \examples{
-'{"name": {"first": "bob", "last": "jones"}, "age": 32}' \%>\%
+
+# A simple example
+json <- '{"name": {"first": "Bob", "last": "Jones"}, "age": 32}'
+
+# Using spread_values
+json \%>\%
   spread_values(
     first.name = jstring("name", "first"),
-    age = jnumber("age")
+    last.name = jstring("name", "last"),
+    age = jnumber("age")
   )
+
+# Another document, this time with a middle name
+json2 <- '{"name": {"first": "Ann", "middle": "A", "last": "Smith"}, "age": 23}'
+
+# spread_values still gives the same column structure
+c(json, json2) \%>\%
+  spread_values(
+    first.name = jstring("name", "first"),
+    last.name = jstring("name", "last"),
+    age = jnumber("age")
+  )
+
+# whereas spread_all adds a new column
+json \%>\% spread_all
+c(json, json2) \%>\% spread_all
+}
+\seealso{
+\code{\link{spread_all}} for spreading all values,
+  \code{\link[tidyr]{spread}} for spreading data frames,
+  \code{\link{jstring}}, \code{\link{jnumber}},
+  \code{\link{jlogical}} for accessing specific keys
 }
diff --git a/man/sub-.tbl_json.Rd b/man/sub-.tbl_json.Rd
index df9ada6..78c0123 100644
--- a/man/sub-.tbl_json.Rd
+++ b/man/sub-.tbl_json.Rd
@@ -16,6 +16,9 @@
 \item{drop}{whether or not to simplify results}
 }
+\value{
+a \code{\link{tbl_json}} object
+}
 \description{
 Extends `[.data.frame` to work with tbl_json objects, so that row filtering
 of the underlying data.frame also filters the associated JSON.
diff --git a/man/tbl_json.Rd b/man/tbl_json.Rd
index 6000108..0746756 100644
--- a/man/tbl_json.Rd
+++ b/man/tbl_json.Rd
@@ -24,22 +24,74 @@ is.tbl_json(.x)
 \arguments{
 \item{df}{data.frame}

-\item{json.list}{list of json lists parsed with fromJSON}
+\item{json.list}{list of json lists parsed with
+\code{\link[jsonlite]{fromJSON}}}

-\item{drop.null.json}{drop NULL json entries from data.frame and json}
+\item{drop.null.json}{drop \code{NULL} json entries from \code{df} and
+\code{json.list}}

-\item{.x}{an object to convert into a tbl_json object}
+\item{.x}{an object to convert into a \code{tbl_json} object}

 \item{...}{other arguments}

-\item{json.column}{the name of the JSON column of data in x, if x is a data.frame}
+\item{json.column}{the name of the json column of data in \code{.x}, if
+\code{.x} is a data frame}
+}
+\value{
+a \code{\link{tbl_json}} object
 }
 \description{
 Combines structured JSON (as a data.frame) with remaining JSON

-Note that json.list must have the same length as nrow(df), and if json.list
-has any NULL elements, the corresponding rows will be removed from df. Also
-note that "..JSON" is a reserved column name used internally for filtering
-tbl_json objects, and so is not allowed in the data.frame names.
+Constructs a \code{tbl_json} object, for further downstream manipulation
+by other tidyjson functions. Methods exist to convert JSON stored in
+character strings without any other associated data, as a separate
+character string and associated data frame, or as a single data frame
+with a specified character string JSON column.
+}
+\details{
+Most tidyjson functions accept a \code{tbl_json} object as the first
+argument, and return a \code{tbl_json} object unless otherwise specified.
+tidyjson functions will attempt to convert an object that isn't a
+\code{tbl_json} object first, and so explicit construction of \code{tbl_json}
+objects is rarely needed.
+
+\code{tbl_json} objects consist of a data frame along with its associated
+JSON, where each row of the data frame corresponds to a single JSON
+document. The JSON is stored in a \code{"JSON"} attribute.
+
+Note that \code{json.list} must have the same length as \code{nrow(df)}, and
+if \code{json.list} has any \code{NULL} elements, the corresponding rows will
+be removed from \code{df}. Also note that \code{"..JSON"} is a reserved
+column name used internally for filtering tbl_json objects, and so is not
+allowed in the names of \code{df}.
+}
+\examples{
+
+# Construct a tbl_json object using a character string of JSON
+json <- '{"animal": "cat", "count": 2}'
+json \%>\% as.tbl_json
+
+# Access the "JSON" attribute
+json \%>\% as.tbl_json \%>\% attr("JSON")
+
+# Construct a tbl_json object using multiple documents
+json <- c('{"animal": "cat", "count": 2}', '{"animal": "parrot", "count": 1}')
+json \%>\% as.tbl_json
+
+# Construct a tbl_json object from a data.frame with a JSON column
+library(tibble)
+farms <- tribble(
+  ~farm, ~animals,
+  1L,    '[{"animal": "pig", "count": 50}, {"animal": "cow", "count": 10}]',
+  2L,    '[{"animal": "chicken", "count": 20}]'
+)
+farms \%>\% as.tbl_json(json.column = "animals")
+
+# Tidy the farms
+farms \%>\% as.tbl_json(json.column = "animals") \%>\%
+  gather_array \%>\% spread_all
+}
+\seealso{
+\code{\link{read_json}} for reading JSON from files
 }
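
Reviewer note (not part of the diff): a minimal end-to-end sketch of how the verbs documented in this changeset compose, assuming the development version of tidyjson described above (gather_keys, enter_object, gather_array, append_values_logical, spread_values with jstring/jnumber). The `purchases` document below is made up purely for illustration.

library(tidyjson)

# A made-up document with an object of tags and an array of purchases
purchases <- '{"user": "bob",
               "tags": {"vip": true, "beta": false},
               "purchases": [{"item": "apple", "price": 0.5},
                             {"item": "bread", "price": 2.0}]}'

# Keys stored as data: enter the "tags" object, gather its keys,
# and append the logical values as a new column
purchases %>%
  enter_object("tags") %>%
  gather_keys("tag") %>%
  append_values_logical("enabled")

# Guaranteed column structure: gather the purchases array and
# spread specific values with explicit types
purchases %>%
  enter_object("purchases") %>%
  gather_array("purchase.index") %>%
  spread_values(
    item  = jstring("item"),
    price = jnumber("price")
  )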