Skip to content

Commit

Permalink
1.0.5-3 sent to CRAN
Browse files Browse the repository at this point in the history
  • Loading branch information
pachadotdev committed May 20, 2024
1 parent afc71d3 commit 7958e81
Show file tree
Hide file tree
Showing 58 changed files with 1,009 additions and 1,106 deletions.
2 changes: 0 additions & 2 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ ignore/*
^codecov\.yml$
^dev$
^README\.html$
^inst/examples/data\.qmd$
^inst/examples/text\.qmd$
^LICENSE\.md$
^install\.ps1$
^article$
Expand Down
4 changes: 2 additions & 2 deletions CRAN-SUBMISSION
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Version: 1.0.5-2
Date: 2024-04-29 22:54:47 UTC
SHA: 6bd494634d077544f5f7c628dafc19cb5a0fb987
Date: 2024-05-14 20:20:09 UTC
SHA: afc71d378ab49fa29ae1c6075d499f8249a80110
50 changes: 28 additions & 22 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,29 +1,35 @@
Package: tabulapdf
Type: Package
Title: Extract Tables from PDF Documents
Version: 1.0.5-2
Authors@R: c(person("Thomas J.", "Leeper",
role = "aut",
email = "[email protected]",
comment = c(ORCID = "0000-0003-4097-6326")),
person("Mauricio", "Vargas Sepulveda",
role = c("aut","cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0003-1017-7574")),
person("Tom", "Paskhalis",
role = "aut",
email = "[email protected]",
comment = c(ORCID = "0000-0001-9298-8850")),
person("Manuel", "Aristaran",
role = "ctb"),
person("David", "Gohel",
role = "ctb",
comment = "rOpenSci reviewer"),
person("Lincoln", "Mullen",
role = "ctb",
comment = "rOpenSci reviewer"))
Description: Bindings for the 'Tabula' <https://tabula.technology/> 'Java'
library, which can extract tables from PDF documents.
library, which can extract tables from PDF files. This tool can reduce time
and effort in data extraction processes in fields like investigative
journalism. It allows for automatic and manual table extraction, the latter
facilitated through a 'Shiny' interface, enabling manual area selection
with a computer mouse for data retrieval.
Version: 1.0.5-2
Authors@R: c(
person("Thomas J.", "Leeper",
role = "aut",
email = "[email protected]",
comment = c(ORCID = "0000-0003-4097-6326")),
person("Mauricio", "Vargas Sepulveda",
role = c("aut","cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0003-1017-7574")),
person("Tom", "Paskhalis",
role = "aut",
email = "[email protected]",
comment = c(ORCID = "0000-0001-9298-8850")),
person("Manuel", "Aristaran",
role = "ctb"),
person("David", "Gohel",
role = "ctb",
comment = "rOpenSci reviewer"),
person("Lincoln", "Mullen",
role = "ctb",
comment = "rOpenSci reviewer")
)
License: Apache License (>= 2)
URL: https://docs.ropensci.org/tabulapdf/ (website)
https://github.com/ropensci/tabulapdf/
Expand Down
50 changes: 24 additions & 26 deletions R/extract_metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,42 +9,40 @@
#' @return A list.
#' @author Thomas J. Leeper <[email protected]>
#' @examples
#' \dontrun{
#' # simple demo file
#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
#'
#' extract_metadata(f)
#' }
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{extract_text}}, \code{\link{split_pdf}}
#' @importFrom rJava J new
#' @export
extract_metadata <- function(file, password = NULL, copy = FALSE) {
  # Open the document exactly once and guarantee the handle is closed on
  # exit. `add = TRUE` keeps any previously registered exit handlers.
  # `load_doc()` is a package helper defined outside this block.
  pdfDocument <- load_doc(file, password = password, copy = copy)
  on.exit(pdfDocument$close(), add = TRUE)

  info <- pdfDocument$getDocumentInformation()

  info_creation_date <- info$getCreationDate()
  info_modification_date <- info$getModificationDate()

  # Dates are converted to their string form only when present; a NULL
  # date is passed through unchanged. Each conversion must run once:
  # repeating it would call `$getTime()` on the already-converted value.
  if (!is.null(info_creation_date)) {
    info_creation_date <- info_creation_date$getTime()$toString()
  }

  if (!is.null(info_modification_date)) {
    info_modification_date <- info_modification_date$getTime()$toString()
  }

  # Named list of the document-information fields plus the page count.
  list(
    pages = pdfDocument$getNumberOfPages(),
    title = info$getTitle(),
    author = info$getAuthor(),
    subject = info$getSubject(),
    keywords = info$getKeywords(),
    creator = info$getCreator(),
    producer = info$getProducer(),
    created = info_creation_date,
    modified = info_modification_date,
    trapped = info$getTrapped()
  )
}
16 changes: 1 addition & 15 deletions R/extract_tables.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,25 +34,11 @@
#' @references \href{https://tabula.technology/}{Tabula}
#' @author Thomas J. Leeper <[email protected]>, Tom Paskhalis <[email protected]>
#' @examples
#' \dontrun{
#' # simple demo file
#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
#'
#' # extract all tables
#' extract_tables(f)
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
#'
#' # extract tables from only second page
#' extract_tables(f, pages = 2)
#'
#' # extract areas from a page
#' ## full table
#' extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462)))
#' ## part of the table
#' extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)))
#'
#' # return tibbles
#' extract_tables(f, pages = 2, output = "tibble")
#' }
#' @seealso \code{\link{extract_areas}}, \code{\link{get_page_dims}}, \code{\link{make_thumbnails}}, \code{\link{split_pdf}}
#' @importFrom utils download.file
#' @importFrom readr read_delim
Expand Down
4 changes: 1 addition & 3 deletions R/extract_text.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@
#' @return If \code{pages = NULL} (the default), a length 1 character vector, otherwise a vector of length \code{length(pages)}.
#' @author Thomas J. Leeper <[email protected]>
#' @examples
#' \dontrun{
#' # simple demo file
#' f <- system.file("examples", "text.pdf", package = "tabulapdf")
#' f <- system.file("examples", "fortytwo.pdf", package = "tabulapdf")
#'
#' # extract all text
#' extract_text(f)
Expand All @@ -24,7 +23,6 @@
#'
#' # extract text from selected area only
#' extract_text(f, area = list(c(209.4, 140.5, 304.2, 500.8)))
#' }
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{split_pdf}}
#' @importFrom rJava J new
#' @export
Expand Down
42 changes: 20 additions & 22 deletions R/get_page_dims.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,40 @@
#' @references \href{https://tabula.technology/}{Tabula}
#' @author Thomas J. Leeper <[email protected]>
#' @examples
#' \dontrun{
#' # simple demo file
#' f <- system.file("examples", "data.pdf", package = "tabulapdf")
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
#'
#' get_n_pages(file = f)
#' get_page_dims(f)
#' }
#' @importFrom tools file_path_sans_ext
#' @importFrom rJava J new
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_text}}, \code{\link{make_thumbnails}}
#' @export
get_page_dims <- function(file, doc, pages = NULL, password = NULL, copy = FALSE) {
  # When a file is supplied, open it once here and close it on exit.
  # A `doc` passed in by the caller is used as-is and is not closed here.
  if (!missing(file)) {
    doc <- load_doc(file, password = password, copy = copy)
    on.exit(doc$close(), add = TRUE)
  }

  # Default to every page. `seq_len()` gives an empty sequence for a
  # zero-page document, whereas `1L:0` would give c(1, 0).
  if (!is.null(pages)) {
    pages <- as.integer(pages)
  } else {
    pages <- seq_len(get_n_pages(doc = doc))
  }

  allpages <- doc$getDocumentCatalog()$getPages()

  # `pages` is 1-based on the R side; the page collection is indexed
  # from 0, hence `x - 1L`. Each element is c(width, height) of the
  # page's media box.
  lapply(pages, function(x) {
    thispage <- allpages$get(x - 1L)
    c(thispage$getMediaBox()$getWidth(), thispage$getMediaBox()$getHeight())
  })
}

#' @rdname get_page_dims
#' @export
get_n_pages <- function(file, doc, password = NULL, copy = FALSE) {
  # When a file is supplied, open it once here and close it on exit.
  # A `doc` passed in by the caller is used as-is and is not closed here.
  if (!missing(file)) {
    doc <- load_doc(file, password = password, copy = copy)
    on.exit(doc$close(), add = TRUE)
  }
  # Page count as reported by the document object.
  doc$getNumberOfPages()
}
Loading

0 comments on commit 7958e81

Please sign in to comment.