1.0.5-3 sent to CRAN

ropensci · May 20, 2024 · 7958e81
1 parent afc71d3
commit 7958e81
Show file tree

Hide file tree

Showing 58 changed files with 1,009 additions and 1,106 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -14,8 +14,6 @@ ignore/*
 ^codecov\.yml$
 ^dev$
 ^README\.html$
-^inst/examples/data\.qmd$
-^inst/examples/text\.qmd$
 ^LICENSE\.md$
 ^install\.ps1$
 ^article$

diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
@@ -1,3 +1,3 @@
 Version: 1.0.5-2
-Date: 2024-04-29 22:54:47 UTC
-SHA: 6bd494634d077544f5f7c628dafc19cb5a0fb987
+Date: 2024-05-14 20:20:09 UTC
+SHA: afc71d378ab49fa29ae1c6075d499f8249a80110
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,29 +1,35 @@
 Package: tabulapdf
 Type: Package
 Title: Extract Tables from PDF Documents
-Version: 1.0.5-2
-Authors@R: c(person("Thomas J.", "Leeper",
-                    role = "aut", 
-                    email = "[email protected]",
-                    comment = c(ORCID = "0000-0003-4097-6326")),
-             person("Mauricio", "Vargas Sepulveda",
-                    role = c("aut","cre"), 
-                    email = "[email protected]", 
-                    comment = c(ORCID = "0000-0003-1017-7574")),
-             person("Tom", "Paskhalis",
-                    role = "aut",
-                    email = "[email protected]",
-                    comment = c(ORCID = "0000-0001-9298-8850")),
-             person("Manuel", "Aristaran",
-                    role = "ctb"),
-             person("David", "Gohel",
-                    role = "ctb",
-                    comment = "rOpenSci reviewer"),
-             person("Lincoln", "Mullen",
-                    role = "ctb",
-                    comment = "rOpenSci reviewer"))
 Description: Bindings for the 'Tabula' <https://tabula.technology/> 'Java'
-    library, which can extract tables from PDF documents.
+    library, which can extract tables from PDF files. This tool can reduce time
+    and effort in data extraction processes in fields like investigative
+    journalism. It allows for automatic and manual table extraction, the latter
+    facilitated through a 'Shiny' interface, enabling manual area selection
+    with a computer mouse for data retrieval.
+Version: 1.0.5-2
+Authors@R: c(
+    person("Thomas J.", "Leeper",
+            role = "aut", 
+            email = "[email protected]",
+            comment = c(ORCID = "0000-0003-4097-6326")),
+    person("Mauricio", "Vargas Sepulveda",
+            role = c("aut","cre"), 
+            email = "[email protected]", 
+            comment = c(ORCID = "0000-0003-1017-7574")),
+    person("Tom", "Paskhalis",
+            role = "aut",
+            email = "[email protected]",
+            comment = c(ORCID = "0000-0001-9298-8850")),
+    person("Manuel", "Aristaran",
+            role = "ctb"),
+    person("David", "Gohel",
+            role = "ctb",
+            comment = "rOpenSci reviewer"),
+    person("Lincoln", "Mullen",
+            role = "ctb",
+            comment = "rOpenSci reviewer")
+    )
 License: Apache License (>= 2)
 URL: https://docs.ropensci.org/tabulapdf/ (website)
     https://github.com/ropensci/tabulapdf/

diff --git a/R/extract_metadata.R b/R/extract_metadata.R
@@ -9,42 +9,40 @@
 #' @return A list.
 #' @author Thomas J. Leeper <[email protected]>
 #' @examples
-#' \dontrun{
 #' # simple demo file
-#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
+#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
 #'
 #' extract_metadata(f)
-#' }
 #' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{extract_text}}, \code{\link{split_pdf}}
 #' @importFrom rJava J new
 #' @export
 extract_metadata <- function(file, password = NULL, copy = FALSE) {
-    pdfDocument <- load_doc(file, password = password, copy = copy)
-    on.exit(pdfDocument$close())
+  pdfDocument <- load_doc(file, password = password, copy = copy)
+  on.exit(pdfDocument$close())
 
-    info <- pdfDocument$getDocumentInformation()
+  info <- pdfDocument$getDocumentInformation()
 
-    info_creation_date <- info$getCreationDate()
-    info_modification_date <- info$getModificationDate()
+  info_creation_date <- info$getCreationDate()
+  info_modification_date <- info$getModificationDate()
 
-    if (!is.null(info_creation_date)) {
-        info_creation_date <- info_creation_date$getTime()$toString()
-    }
+  if (!is.null(info_creation_date)) {
+    info_creation_date <- info_creation_date$getTime()$toString()
+  }
 
-    if (!is.null(info_modification_date)) {
-        info_modification_date <- info_modification_date$getTime()$toString()
-    }
+  if (!is.null(info_modification_date)) {
+    info_modification_date <- info_modification_date$getTime()$toString()
+  }
 
-    list(
-        pages = pdfDocument$getNumberOfPages(),
-        title = info$getTitle(),
-        author = info$getAuthor(),
-        subject = info$getSubject(),
-        keywords = info$getKeywords(),
-        creator = info$getCreator(),
-        producer = info$getProducer(),
-        created = info_creation_date,
-        modified = info_modification_date,
-        trapped = info$getTrapped()
-    )
+  list(
+    pages = pdfDocument$getNumberOfPages(),
+    title = info$getTitle(),
+    author = info$getAuthor(),
+    subject = info$getSubject(),
+    keywords = info$getKeywords(),
+    creator = info$getCreator(),
+    producer = info$getProducer(),
+    created = info_creation_date,
+    modified = info_modification_date,
+    trapped = info$getTrapped()
+  )
 }
diff --git a/R/extract_tables.R b/R/extract_tables.R
@@ -34,25 +34,11 @@
 #' @references \href{https://tabula.technology/}{Tabula}
 #' @author Thomas J. Leeper <[email protected]>, Tom Paskhalis <[email protected]>
 #' @examples
-#' \dontrun{
 #' # simple demo file
-#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
-#'
-#' # extract all tables
-#' extract_tables(f)
+#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
 #'
 #' # extract tables from only second page
 #' extract_tables(f, pages = 2)
-#'
-#' # extract areas from a page
-#' ## full table
-#' extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462)))
-#' ## part of the table
-#' extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)))
-#'
-#' # return tibbles
-#' extract_tables(f, pages = 2, output = "tibble")
-#' }
 #' @seealso \code{\link{extract_areas}}, \code{\link{get_page_dims}}, \code{\link{make_thumbnails}}, \code{\link{split_pdf}}
 #' @importFrom utils download.file
 #' @importFrom readr read_delim

diff --git a/R/extract_text.R b/R/extract_text.R
@@ -12,9 +12,8 @@
 #' @return If \code{pages = NULL} (the default), a length 1 character vector, otherwise a vector of length \code{length(pages)}.
 #' @author Thomas J. Leeper <[email protected]>
 #' @examples
-#' \dontrun{
 #' # simple demo file
-#' f <- system.file("examples", "text.pdf", package = "tabulapdf")
+#' f <- system.file("examples", "fortytwo.pdf", package = "tabulapdf")
 #'
 #' # extract all text
 #' extract_text(f)
@@ -24,7 +23,6 @@
 #'
 #' # extract text from selected area only
 #' extract_text(f, area = list(c(209.4, 140.5, 304.2, 500.8)))
-#' }
 #' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{split_pdf}}
 #' @importFrom rJava J new
 #' @export

diff --git a/R/get_page_dims.R b/R/get_page_dims.R
@@ -13,42 +13,40 @@
 #' @references \href{https://tabula.technology/}{Tabula}
 #' @author Thomas J. Leeper <[email protected]>
 #' @examples
-#' \dontrun{
 #' # simple demo file
-#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
+#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
 #'
 #' get_n_pages(file = f)
 #' get_page_dims(f)
-#' }
 #' @importFrom tools file_path_sans_ext
 #' @importFrom rJava J new
 #' @seealso \code{\link{extract_tables}}, \code{\link{extract_text}}, \code{\link{make_thumbnails}}
 #' @export
 get_page_dims <- function(file, doc, pages = NULL, password = NULL, copy = FALSE) {
-    if (!missing(file)) {
-        doc <- load_doc(file, password = password, copy = copy)
-        on.exit(doc$close())
-    }
+  if (!missing(file)) {
+    doc <- load_doc(file, password = password, copy = copy)
+    on.exit(doc$close())
+  }
 
-    if (!is.null(pages)) {
-        pages <- as.integer(pages)
-    } else {
-        pages <- 1L:(get_n_pages(doc = doc))
-    }
+  if (!is.null(pages)) {
+    pages <- as.integer(pages)
+  } else {
+    pages <- 1L:(get_n_pages(doc = doc))
+  }
 
-    allpages <- doc$getDocumentCatalog()$getPages()
-    lapply(pages, function(x) {
-        thispage <- allpages$get(x - 1L)
-        c(thispage$getMediaBox()$getWidth(), thispage$getMediaBox()$getHeight())
-    })
+  allpages <- doc$getDocumentCatalog()$getPages()
+  lapply(pages, function(x) {
+    thispage <- allpages$get(x - 1L)
+    c(thispage$getMediaBox()$getWidth(), thispage$getMediaBox()$getHeight())
+  })
 }
 
 #' @rdname get_page_dims
 #' @export
 get_n_pages <- function(file, doc, password = NULL, copy = FALSE) {
-    if (!missing(file)) {
-        doc <- load_doc(file, password = password, copy = copy)
-        on.exit(doc$close())
-    }
-    doc$getNumberOfPages()
+  if (!missing(file)) {
+    doc <- load_doc(file, password = password, copy = copy)
+    on.exit(doc$close())
+  }
+  doc$getNumberOfPages()
 }