-
Notifications
You must be signed in to change notification settings - Fork 72
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
afc71d3
commit 7958e81
Showing
58 changed files
with
1,009 additions
and
1,106 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
Version: 1.0.5-2 | ||
Date: 2024-04-29 22:54:47 UTC | ||
SHA: 6bd494634d077544f5f7c628dafc19cb5a0fb987 | ||
Date: 2024-05-14 20:20:09 UTC | ||
SHA: afc71d378ab49fa29ae1c6075d499f8249a80110 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,35 @@ | ||
Package: tabulapdf | ||
Type: Package | ||
Title: Extract Tables from PDF Documents | ||
Version: 1.0.5-2 | ||
Authors@R: c(person("Thomas J.", "Leeper", | ||
role = "aut", | ||
email = "[email protected]", | ||
comment = c(ORCID = "0000-0003-4097-6326")), | ||
person("Mauricio", "Vargas Sepulveda", | ||
role = c("aut","cre"), | ||
email = "[email protected]", | ||
comment = c(ORCID = "0000-0003-1017-7574")), | ||
person("Tom", "Paskhalis", | ||
role = "aut", | ||
email = "[email protected]", | ||
comment = c(ORCID = "0000-0001-9298-8850")), | ||
person("Manuel", "Aristaran", | ||
role = "ctb"), | ||
person("David", "Gohel", | ||
role = "ctb", | ||
comment = "rOpenSci reviewer"), | ||
person("Lincoln", "Mullen", | ||
role = "ctb", | ||
comment = "rOpenSci reviewer")) | ||
Description: Bindings for the 'Tabula' <https://tabula.technology/> 'Java' | ||
library, which can extract tables from PDF documents. | ||
library, which can extract tables from PDF files. This tool can reduce time | ||
and effort in data extraction processes in fields like investigative | ||
journalism. It allows for automatic and manual table extraction, the latter | ||
facilitated through a 'Shiny' interface, enabling manual area selection | ||
with a computer mouse for data retrieval. | ||
Version: 1.0.5-2 | ||
Authors@R: c( | ||
person("Thomas J.", "Leeper", | ||
role = "aut", | ||
email = "[email protected]", | ||
comment = c(ORCID = "0000-0003-4097-6326")), | ||
person("Mauricio", "Vargas Sepulveda", | ||
role = c("aut","cre"), | ||
email = "[email protected]", | ||
comment = c(ORCID = "0000-0003-1017-7574")), | ||
person("Tom", "Paskhalis", | ||
role = "aut", | ||
email = "[email protected]", | ||
comment = c(ORCID = "0000-0001-9298-8850")), | ||
person("Manuel", "Aristaran", | ||
role = "ctb"), | ||
person("David", "Gohel", | ||
role = "ctb", | ||
comment = "rOpenSci reviewer"), | ||
person("Lincoln", "Mullen", | ||
role = "ctb", | ||
comment = "rOpenSci reviewer") | ||
) | ||
License: Apache License (>= 2) | ||
URL: https://docs.ropensci.org/tabulapdf/ (website) | ||
https://github.com/ropensci/tabulapdf/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,42 +9,40 @@ | |
#' @return A list. | ||
#' @author Thomas J. Leeper <[email protected]> | ||
#' @examples | ||
#' \dontrun{ | ||
#' # simple demo file | ||
#' f <- system.file("examples", "data.pdf", package = "tabulapdf") | ||
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf") | ||
#' | ||
#' extract_metadata(f) | ||
#' } | ||
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{extract_text}}, \code{\link{split_pdf}} | ||
#' @importFrom rJava J new | ||
#' @export | ||
extract_metadata <- function(file, password = NULL, copy = FALSE) { | ||
pdfDocument <- load_doc(file, password = password, copy = copy) | ||
on.exit(pdfDocument$close()) | ||
pdfDocument <- load_doc(file, password = password, copy = copy) | ||
on.exit(pdfDocument$close()) | ||
|
||
info <- pdfDocument$getDocumentInformation() | ||
info <- pdfDocument$getDocumentInformation() | ||
|
||
info_creation_date <- info$getCreationDate() | ||
info_modification_date <- info$getModificationDate() | ||
info_creation_date <- info$getCreationDate() | ||
info_modification_date <- info$getModificationDate() | ||
|
||
if (!is.null(info_creation_date)) { | ||
info_creation_date <- info_creation_date$getTime()$toString() | ||
} | ||
if (!is.null(info_creation_date)) { | ||
info_creation_date <- info_creation_date$getTime()$toString() | ||
} | ||
|
||
if (!is.null(info_modification_date)) { | ||
info_modification_date <- info_modification_date$getTime()$toString() | ||
} | ||
if (!is.null(info_modification_date)) { | ||
info_modification_date <- info_modification_date$getTime()$toString() | ||
} | ||
|
||
list( | ||
pages = pdfDocument$getNumberOfPages(), | ||
title = info$getTitle(), | ||
author = info$getAuthor(), | ||
subject = info$getSubject(), | ||
keywords = info$getKeywords(), | ||
creator = info$getCreator(), | ||
producer = info$getProducer(), | ||
created = info_creation_date, | ||
modified = info_modification_date, | ||
trapped = info$getTrapped() | ||
) | ||
list( | ||
pages = pdfDocument$getNumberOfPages(), | ||
title = info$getTitle(), | ||
author = info$getAuthor(), | ||
subject = info$getSubject(), | ||
keywords = info$getKeywords(), | ||
creator = info$getCreator(), | ||
producer = info$getProducer(), | ||
created = info_creation_date, | ||
modified = info_modification_date, | ||
trapped = info$getTrapped() | ||
) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,25 +34,11 @@ | |
#' @references \href{https://tabula.technology/}{Tabula} | ||
#' @author Thomas J. Leeper <[email protected]>, Tom Paskhalis <[email protected]> | ||
#' @examples | ||
#' \dontrun{ | ||
#' # simple demo file | ||
#' f <- system.file("examples", "data.pdf", package = "tabulapdf") | ||
#' | ||
#' # extract all tables | ||
#' extract_tables(f) | ||
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf") | ||
#' | ||
#' # extract tables from only second page | ||
#' extract_tables(f, pages = 2) | ||
#' | ||
#' # extract areas from a page | ||
#' ## full table | ||
#' extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462))) | ||
#' ## part of the table | ||
#' extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417))) | ||
#' | ||
#' # return tibbles | ||
#' extract_tables(f, pages = 2, output = "tibble") | ||
#' } | ||
#' @seealso \code{\link{extract_areas}}, \code{\link{get_page_dims}}, \code{\link{make_thumbnails}}, \code{\link{split_pdf}} | ||
#' @importFrom utils download.file | ||
#' @importFrom readr read_delim | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,9 +12,8 @@ | |
#' @return If \code{pages = NULL} (the default), a length 1 character vector, otherwise a vector of length \code{length(pages)}. | ||
#' @author Thomas J. Leeper <[email protected]> | ||
#' @examples | ||
#' \dontrun{ | ||
#' # simple demo file | ||
#' f <- system.file("examples", "text.pdf", package = "tabulapdf") | ||
#' f <- system.file("examples", "fortytwo.pdf", package = "tabulapdf") | ||
#' | ||
#' # extract all text | ||
#' extract_text(f) | ||
|
@@ -24,7 +23,6 @@ | |
#' | ||
#' # extract text from selected area only | ||
#' extract_text(f, area = list(c(209.4, 140.5, 304.2, 500.8))) | ||
#' } | ||
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_areas}}, \code{\link{split_pdf}} | ||
#' @importFrom rJava J new | ||
#' @export | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,42 +13,40 @@ | |
#' @references \href{https://tabula.technology/}{Tabula} | ||
#' @author Thomas J. Leeper <[email protected]> | ||
#' @examples | ||
#' \dontrun{ | ||
#' # simple demo file | ||
#' f <- system.file("examples", "data.pdf", package = "tabulapdf") | ||
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf") | ||
#' | ||
#' get_n_pages(file = f) | ||
#' get_page_dims(f) | ||
#' } | ||
#' @importFrom tools file_path_sans_ext | ||
#' @importFrom rJava J new | ||
#' @seealso \code{\link{extract_tables}}, \code{\link{extract_text}}, \code{\link{make_thumbnails}} | ||
#' @export | ||
get_page_dims <- function(file, doc, pages = NULL, password = NULL, copy = FALSE) { | ||
if (!missing(file)) { | ||
doc <- load_doc(file, password = password, copy = copy) | ||
on.exit(doc$close()) | ||
} | ||
if (!missing(file)) { | ||
doc <- load_doc(file, password = password, copy = copy) | ||
on.exit(doc$close()) | ||
} | ||
|
||
if (!is.null(pages)) { | ||
pages <- as.integer(pages) | ||
} else { | ||
pages <- 1L:(get_n_pages(doc = doc)) | ||
} | ||
if (!is.null(pages)) { | ||
pages <- as.integer(pages) | ||
} else { | ||
pages <- 1L:(get_n_pages(doc = doc)) | ||
} | ||
|
||
allpages <- doc$getDocumentCatalog()$getPages() | ||
lapply(pages, function(x) { | ||
thispage <- allpages$get(x - 1L) | ||
c(thispage$getMediaBox()$getWidth(), thispage$getMediaBox()$getHeight()) | ||
}) | ||
allpages <- doc$getDocumentCatalog()$getPages() | ||
lapply(pages, function(x) { | ||
thispage <- allpages$get(x - 1L) | ||
c(thispage$getMediaBox()$getWidth(), thispage$getMediaBox()$getHeight()) | ||
}) | ||
} | ||
|
||
#' @rdname get_page_dims | ||
#' @export | ||
get_n_pages <- function(file, doc, password = NULL, copy = FALSE) { | ||
if (!missing(file)) { | ||
doc <- load_doc(file, password = password, copy = copy) | ||
on.exit(doc$close()) | ||
} | ||
doc$getNumberOfPages() | ||
if (!missing(file)) { | ||
doc <- load_doc(file, password = password, copy = copy) | ||
on.exit(doc$close()) | ||
} | ||
doc$getNumberOfPages() | ||
} |
Oops, something went wrong.