diff --git a/.Rbuildignore b/.Rbuildignore
index 70584b9..c5eb406 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -12,3 +12,5 @@ ignore/*
 ^docs/
 ^\.github$
 ^codecov\.yml$
+^dev$
+^README\.html$
diff --git a/DESCRIPTION b/DESCRIPTION
index 824759b..a3f3ca1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: tabulapdf
 Type: Package
 Title: Extract tables from PDF documents
-Version: 1.0.5
+Version: 1.0.5-2
 Authors@R: c(person("Thomas J.", "Leeper",
     role = "aut",
     email = "thosjleeper@gmail.com",
@@ -30,6 +30,7 @@ URL: https://docs.ropensci.org/tabulapdf (website)
 BugReports: https://github.com/ropensci/tabulapdf/issues
 Imports:
     png,
+    readr,
     rJava,
     tools,
     utils
diff --git a/NAMESPACE b/NAMESPACE
index 79305b2..6252dee 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -22,6 +22,6 @@ importFrom(rJava,.jcall)
 importFrom(rJava,.jfloat)
 importFrom(rJava,J)
 importFrom(rJava,new)
+importFrom(readr,read_delim)
 importFrom(tools,file_path_sans_ext)
 importFrom(utils,download.file)
-importFrom(utils,read.delim)
diff --git a/NEWS.md b/NEWS.md
index 9974c88..0d0da71 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,9 @@
+# CHANGES TO tabulapdf 1.0.5-2
+
+* Uses readr for a much faster parsing of extracted tables.
+* The default output format is now a list of tibbles.
+* All tests pass.
+
 # CHANGES TO tabulapdf 1.0.5
 
 * Package renamed to `tabulapdf`
diff --git a/R/extract_tables.R b/R/extract_tables.R
index 6f54708..06a071d 100644
--- a/R/extract_tables.R
+++ b/R/extract_tables.R
@@ -23,8 +23,8 @@
 #' @param \dots These are additional arguments passed to the internal functions dispatched by \code{method}.
 #' @details This function mimics the behavior of the Tabula command line utility. It returns a list of R character matrices containing tables extracted from a file by default. This response behavior can be changed by using the following options.
 #' \itemize{
+#'   \item \code{output = "tibble"} attempts to coerce the structure returned by \code{method = "character"} into a list of tibbles and returns character strings where this fails.
 #'   \item \code{output = "character"} returns a list of single-element character vectors, where each vector is a tab-delimited, line-separate string of concatenated table cells.
-#'   \item \code{output = "data.frame"} attempts to coerce the structure returned by \code{method = "character"} into a list of data.frames and returns character strings where this fails.
 #'   \item \code{output = "csv"} writes the tables to comma-separated (CSV) files using Tabula's CSVWriter method in the same directory as the original PDF. \code{method = "tsv"} does the same but with tab-separated (TSV) files using Tabula's TSVWriter and \code{method = "json"} does the same using Tabula's JSONWriter method. Any of these three methods return the path to the directory containing the extract table files.
 #'   \item \code{output = "asis"} returns the Java object reference, which can be useful for debugging or for writing a custom parser.
 #' }
@@ -49,11 +49,12 @@
 #' ## part of the table
 #' extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)))
 #'
-#' # return data.frames
-#' extract_tables(f, pages = 2, output = "data.frame")
+#' # return tibbles
+#' extract_tables(f, pages = 2, output = "tibble")
 #' }
 #' @seealso \code{\link{extract_areas}}, \code{\link{get_page_dims}}, \code{\link{make_thumbnails}}, \code{\link{split_pdf}}
-#' @importFrom utils read.delim download.file
+#' @importFrom utils download.file
+#' @importFrom readr read_delim
 #' @importFrom tools file_path_sans_ext
 #' @importFrom rJava J new .jfloat .jcall
 #' @export
@@ -64,7 +65,7 @@ extract_tables <- function(file,
                            guess = TRUE,
                            method = c("decide", "lattice", "stream"),
                            output = c(
-                             "matrix", "data.frame", "character",
+                             "tibble", "matrix", "character",
                              "asis", "csv", "tsv", "json"
                            ),
                            outdir = NULL,
@@ -160,7 +161,7 @@ extract_tables <- function(file,
    "json" = write_jsons(tables, file = file, outdir = outdir, ...),
    "character" = list_characters(tables, encoding = encoding, ...),
    "matrix" = list_matrices(tables, encoding = encoding, ...),
-    "data.frame" = list_data_frames(tables, encoding = encoding, ...),
+    "tibble" = list_data_frames(tables, encoding = encoding, ...),
    "asis" = tables,
    tables
  )
diff --git a/R/output.R b/R/output.R
index 23c5140..47a68fb 100644
--- a/R/output.R
+++ b/R/output.R
@@ -89,17 +89,17 @@ list_matrices <- function(tables, encoding = NULL, ...) {
   out
 }
 
-list_characters <- function(tables, sep = "\t", encoding = NULL, ...) {
+list_characters <- function(tables, delim = "\t", encoding = NULL, ...) {
   m <- list_matrices(tables, encoding = encoding, ...)
   lapply(m, function(x) {
-    paste0(apply(x, 1, paste, collapse = sep), collapse = "\n")
+    paste0(apply(x, 1, paste, collapse = delim), collapse = "\n")
   })
 }
 
-list_data_frames <- function(tables, sep = "\t", stringsAsFactors = FALSE, encoding = NULL, ...) {
-  char <- list_characters(tables = tables, sep = sep, encoding = encoding)
+list_data_frames <- function(tables, delim = "\t", encoding = NULL, ...) {
+  char <- list_characters(tables = tables, delim = delim, encoding = encoding)
   lapply(char, function(x) {
-    o <- try(read.delim(text = x, stringsAsFactors = stringsAsFactors, ...))
+    o <- try(read_delim(file = x, delim = delim))
     if (inherits(o, "try-error")) {
       return(x)
     } else {
diff --git a/dev/errors.R b/dev/errors.R
index 127978d..ef23aab 100644
--- a/dev/errors.R
+++ b/dev/errors.R
@@ -2,6 +2,12 @@
 library(tabulapdf)
 
-out <- extract_tables("inst/examples/data.pdf", pages = 1, output = "data.frame")
+out <- extract_tables("inst/examples/data.pdf", pages = 1, output = "tibble")
+
+class(out)
+
+class(out[[1]])
+
+library(tibble)
 
 out
diff --git a/inst/examples/text.md b/inst/examples/text.md
deleted file mode 100644
index e174e81..0000000
--- a/inst/examples/text.md
+++ /dev/null
@@ -1,51 +0,0 @@
-\pagenumbering{gobble}
-
-To cite R in publications use:
-
-  R Core Team (2018). R: A language and environment for statistical computing. R
-  Foundation for Statistical Computing, Vienna, Austria. URL
-
-  https://www.R-project.org/.
-
-A BibTeX entry for LaTeX users is
-
-```
-  @Manual{,
-    title = {R: A Language and Environment for Statistical Computing},
-    author = {{R Core Team}},
-    organization = {R Foundation for Statistical Computing},
-    address = {Vienna, Austria},
-    year = {2018},
-    url = {https://www.R-project.org/},
-  }
-```
-
-We have invested a lot of time and effort in creating R, please cite it when
-using it for data analysis. See also ‘citation("pkgname")’ for citing R
-packages.
-
-\newpage
-
-To cite R in publications use:
-
-  R Core Team (2018). R: A language and environment for statistical computing. R
-  Foundation for Statistical Computing, Vienna, Austria. URL
-
-  https://www.R-project.org/.
-
-A BibTeX entry for LaTeX users is
-
-```
-  @Manual{,
-    title = {R: A Language and Environment for Statistical Computing},
-    author = {{R Core Team}},
-    organization = {R Foundation for Statistical Computing},
-    address = {Vienna, Austria},
-    year = {2018},
-    url = {https://www.R-project.org/},
-  }
-```
-
-We have invested a lot of time and effort in creating R, please cite it when
-using it for data analysis. See also ‘citation("pkgname")’ for citing R
-packages.
diff --git a/inst/examples/text.pdf b/inst/examples/text.pdf
index 58bd2e4..d0d492c 100644
Binary files a/inst/examples/text.pdf and b/inst/examples/text.pdf differ
diff --git a/inst/examples/text.qmd b/inst/examples/text.qmd
new file mode 100644
index 0000000..475165b
--- /dev/null
+++ b/inst/examples/text.qmd
@@ -0,0 +1,11 @@
+---
+format: pdf
+---
+
+\pagenumbering{gobble}
+
+42 is the number from which the meaning of life, the universe, and everything can be derived.
+
+\newpage
+
+42 is the number from which the meaning of life, the universe, and everything can be derived.
diff --git a/man/extract_tables.Rd b/man/extract_tables.Rd
index eeb38be..33fe714 100644
--- a/man/extract_tables.Rd
+++ b/man/extract_tables.Rd
@@ -11,7 +11,7 @@ extract_tables(
   columns = NULL,
   guess = TRUE,
   method = c("decide", "lattice", "stream"),
-  output = c("matrix", "data.frame", "character", "asis", "csv", "tsv", "json"),
+  output = c("tibble", "matrix", "character", "asis", "csv", "tsv", "json"),
   outdir = NULL,
   password = NULL,
   encoding = NULL,
@@ -62,8 +62,8 @@ Extract tables from a file
 \details{
 This function mimics the behavior of the Tabula command line utility. It returns a list of R character matrices containing tables extracted from a file by default. This response behavior can be changed by using the following options.
 \itemize{
+  \item \code{output = "tibble"} attempts to coerce the structure returned by \code{method = "character"} into a list of tibbles and returns character strings where this fails.
   \item \code{output = "character"} returns a list of single-element character vectors, where each vector is a tab-delimited, line-separate string of concatenated table cells.
-  \item \code{output = "data.frame"} attempts to coerce the structure returned by \code{method = "character"} into a list of data.frames and returns character strings where this fails.
   \item \code{output = "csv"} writes the tables to comma-separated (CSV) files using Tabula's CSVWriter method in the same directory as the original PDF. \code{method = "tsv"} does the same but with tab-separated (TSV) files using Tabula's TSVWriter and \code{method = "json"} does the same using Tabula's JSONWriter method. Any of these three methods return the path to the directory containing the extract table files.
   \item \code{output = "asis"} returns the Java object reference, which can be useful for debugging or for writing a custom parser.
 }
@@ -86,8 +86,8 @@ extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462)))
 ## part of the table
 extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)))
 
-# return data.frames
-extract_tables(f, pages = 2, output = "data.frame")
+# return tibbles
+extract_tables(f, pages = 2, output = "tibble")
 }
 }
 \references{
diff --git a/tests/testthat/test_extract_tables.R b/tests/testthat/test_extract_tables.R
index e4e093b..8d4c03d 100644
--- a/tests/testthat/test_extract_tables.R
+++ b/tests/testthat/test_extract_tables.R
@@ -5,7 +5,7 @@ sf <- system.file("examples", "data.pdf", package = "tabulapdf")
 test_that("It basically works", {
   tab1 <- extract_tables(sf)
   expect_true(is.list(tab1))
-  expect_true(is.matrix(tab1[[1]]))
+  expect_true(is.data.frame(tab1[[1]]))
 })
 
 test_that("Warning for ignored arguments", {
@@ -30,7 +30,7 @@ test_that("Import from remote file works", {
   tab2 <- extract_tables(f2)
   expect_true(is.list(tab2))
   expect_true(length(tab2) == 2)
-  expect_true(is.matrix(tab2[[1]]))
+  expect_true(is.data.frame(tab2[[1]]))
 })
 
 test_that("Import from remote non-Western file", {
@@ -38,18 +38,18 @@ test_that("Import from remote non-Western file", {
   tab3 <- extract_tables(f3)
   expect_true(is.list(tab3))
   expect_true(length(tab3) == 1)
-  expect_true(is.matrix(tab3[[1]]))
+  expect_true(is.data.frame(tab3[[1]]))
 })
 
 test_that("Test 'area' argument", {
   a4a <- list(c(122, 149, 536, 576))
-  tab4a <- extract_tables(sf, pages = 1, area = a4a, guess = FALSE, output = "data.frame")
+  tab4a <- extract_tables(sf, pages = 1, area = a4a, guess = FALSE, output = "tibble")
   expect_true(is.list(tab4a))
   expect_true(is.data.frame(tab4a[[1]]))
   expect_true(nrow(tab4a[[1]]) == 32)
   expect_true(ncol(tab4a[[1]]) == 12)
   a4b <- list(c(122, 149, 251, 464))
-  tab4b <- extract_tables(sf, pages = 1, area = a4b, guess = FALSE, output = "data.frame")
+  tab4b <- extract_tables(sf, pages = 1, area = a4b, guess = FALSE, output = "tibble")
   expect_true(is.list(tab4b))
   expect_true(is.data.frame(tab4b[[1]]))
   expect_true(nrow(tab4b[[1]]) == 9)
@@ -61,7 +61,7 @@ test_that("Test 'columns' argument", {
   expect_true(is.list(tab5))
   expect_true(length(tab5) == 1)
   expect_true(ncol(tab5[[1]]) == 2)
-  expect_true(nrow(tab5[[1]]) == 34)
+  expect_true(nrow(tab5[[1]]) == 33)
 })
 
 test_that("Extract from encrypted PDF", {
@@ -69,7 +69,7 @@ test_that("Extract from encrypted PDF", {
   expect_error(extract_tables(f6, password = "wrongpassword"))
   tab6 <- extract_tables(f6, password = "userpassword")
   expect_true(is.list(tab6))
-  expect_true(is.matrix(tab6[[1]]))
+  expect_true(is.data.frame(tab6[[1]]))
 })
 
 test_that("Test 'copy' argument", {
diff --git a/tests/testthat/test_extract_text.R b/tests/testthat/test_extract_text.R
index 91b672b..114e3da 100644
--- a/tests/testthat/test_extract_text.R
+++ b/tests/testthat/test_extract_text.R
@@ -4,53 +4,39 @@ sf <- system.file("examples", "text.pdf", package = "tabulapdf")
 test_that("Text can be extracted from the whole document", {
   txt <- extract_text(sf, encoding = "UTF-8")
-  cite <- paste(format(citation(), style = "citation"), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripcite <- gsub("[[:space:]+]", "", cite)
-  expect_identical(nchar(striptxt), 2L * nchar(stripcite))
+  expect_identical(txt, "42 is the number from which the meaning of life, the universe, and everything can be derived.\n42 is the number from which the meaning of life, the universe, and everything can be derived.\n")
 })
 
 test_that("'page' argument in extract_text works", {
   txt <- extract_text(sf, pages = 1, encoding = "UTF-8")
-  cite <- paste(format(citation(), style = "citation"), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripcite <- gsub("[[:space:]+]", "", cite)
-  expect_identical(nchar(striptxt), nchar(stripcite))
+  expect_identical(txt, "42 is the number from which the meaning of life, the universe, and everything can be derived.\n")
 })
 
 test_that("'area' argument in extract_text works", {
-  txt <- extract_text(sf, area = list(c(209.4, 140.5, 304.2, 500.8)), encoding = "UTF-8")
-  txt <- paste(txt, collapse = "")
-  bibtex <- paste(as.character(toBibtex(citation())), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripbib <- gsub("[[:space:]+]", "", bibtex)
-  expect_identical(nchar(striptxt), 2L * nchar(stripbib))
+  txt <- extract_text(sf, area = list(c(10, 15, 100, 550)), encoding = "UTF-8")
+  expect_identical(txt[1], "42 is the number from which the meaning of life, the universe, and everything can be derived.\n")
 })
 
 test_that("'area' and 'page' arguments in extract_text work together", {
-  txt <- extract_text(sf, pages = 1, area = list(c(209.4, 140.5, 304.2, 500.8)), encoding = "UTF-8")
-  bibtex <- paste(as.character(toBibtex(citation())), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripbib <- gsub("[[:space:]+]", "", bibtex)
-  expect_identical(nchar(striptxt), nchar(stripbib))
+  txt <- extract_text(sf, pages = 1, area = list(c(10, 15, 100, 550)), encoding = "UTF-8")
+  expect_identical(txt, "42 is the number from which the meaning of life, the universe, and everything can be derived.\n")
 })
 
 test_that("Multiple pages with different areas can be extracted", {
   txt <- extract_text(sf,
     pages = c(1, 2), area = list(
-      c(124, 131, 341.6, 504.3),
-      c(209.4, 140.5, 304.2, 500.8)
+      c(10, 15, 100, 550),
+      c(10, 15, 100, 500)
     ),
     encoding = "UTF-8"
   )
-  txt <- paste(txt, collapse = "")
-  cite <- paste(format(citation(), style = "citation"), collapse = "")
-  bibtex <- paste(as.character(toBibtex(citation())), collapse = "")
-  striptxt <- gsub("[[:space:]+]", "", txt)
-  stripcite <- gsub("[[:space:]+]", "", cite)
-  stripbib <- gsub("[[:space:]+]", "", bibtex)
-  bothpages <- paste0(stripcite, stripbib)
-  expect_identical(nchar(striptxt), nchar(bothpages))
+  expect_identical(
+    txt,
+    c(
+      "42 is the number from which the meaning of life, the universe, and everything can be derived.\n",
+      "42 is the number from which the meaning of life, the universe, and everything can be deriv\n"
+    )
+  )
 })
 
 test_that("Test 'copy' argument", {
diff --git a/tests/testthat/test_non-latin.R b/tests/testthat/test_non-latin.R
index f91c290..38382ee 100644
--- a/tests/testthat/test_non-latin.R
+++ b/tests/testthat/test_non-latin.R
@@ -2,18 +2,17 @@ context("Non-latin character tests")
 
 test_that("Read Spanish language PDF", {
   f1 <- "https://github.com/tabulapdf/tabula-java/raw/98957221950af4b90620b51a29e0bbe502eea9ad/src/test/resources/technology/tabula/argentina_diputados_voting_record.pdf"
-  expect_true(is.matrix(extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE)[[1]]))
-  t1a <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE, output = "data.frame", encoding = "latin1")
-  #expect_true(names(t1a[[1]])[2] == "Frente.CÃ.vico.por.Santiago", label = "latin1 encoding worked")
-  t1b <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE, output = "data.frame", encoding = "UTF-8")
-  #expect_true(names(t1b[[1]])[2] == "Frente.Cívico.por.Santiago", label = "UTF-8 encoding worked")
-
+  t1 <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE)
+  t1a <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE, output = "tibble", encoding = "latin1")
+  t1b <- extract_tables(f1, pages = 1, area = list(c(269.875, 12.75, 790.5, 561)), guess = FALSE, output = "tibble", encoding = "UTF-8")
+  expect_true(is.data.frame(t1[[1]]))
+  expect_true(is.data.frame(t1a[[1]]))
+  expect_true(is.data.frame(t1b[[1]]))
 })
 
 test_that("Read French language PDF w/correct encoding", {
   f2 <- "http://www.europarl.europa.eu/oeil/popups/printfichetechnical.pdf?id=673511&lang=fr"
   t2a <- extract_text(f2, page = 1, encoding = "latin1")
   t2b <- extract_text(f2, page = 1, encoding = "UTF-8")
-  #expect_true(nchar(strsplit(t2a, "\n")[[1]][1]) == 50, label = "latin1 encoding worked")
-  #expect_true(nchar(strsplit(t2b, "\n")[[1]][1]) == 47, label = "UTF-8 encoding worked")
+  expect_false(t2a == t2b)
 })
diff --git a/vignettes/tabulapdf.Rmd b/vignettes/tabulapdf.Rmd
index 1ff49ff..dc2d5a7 100644
--- a/vignettes/tabulapdf.Rmd
+++ b/vignettes/tabulapdf.Rmd
@@ -63,7 +63,7 @@ By default, `extract_tables()` returns a list of character matrices. This is bec
 
 ```{r}
 # attempt to coerce tables to data.frames
-extract_tables(f, pages = 2, output = "data.frame")
+extract_tables(f, pages = 2, output = "tibble")
 ```
 
 Tabula itself implements three "writer" methods that write extracted tables to disk as CSV, TSV, or JSON files. These can be specified by `output = "csv"`, `output = "tsv"`, and `output = "json"`, respectively. For CSV and TSV, one file is written to disk for each table and R session's temporary directory `tempdir()` is used by default (alternatively, the directory can be specified through `output` argument). For JSON, one file is written containing information about all tables. For these methods, `extract_tables()` returns a path to the directory containing the output files.
@@ -80,16 +80,16 @@ If none of the standard methods works well, you can specify `output = "asis"` to
 By default, tabulapdf uses Tabula's table detection algorithm to automatically identify tables within each page of a PDF. This automatic detection can be toggled off by setting `guess = FALSE` and specifying an "area" within each PDF page to extract the table from. Here is a comparison of the default settings, versus extracting from two alternative areas within a page:
 
 ```{r}
-str(extract_tables(f, pages = 2, guess = TRUE, output = "data.frame"))
-str(extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462)), guess = FALSE, output = "data.frame"))
-str(extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)), guess = FALSE, output = "data.frame"))
+str(extract_tables(f, pages = 2, guess = TRUE, output = "tibble"))
+str(extract_tables(f, pages = 2, area = list(c(126, 149, 212, 462)), guess = FALSE, output = "tibble"))
+str(extract_tables(f, pages = 2, area = list(c(126, 284, 174, 417)), guess = FALSE, output = "tibble"))
 ```
 
 The `area` argument should be a list either of length 1 (to use the same area for each specified page) or equal to the number of pages specified. This also means that you can extract multiple areas from one page, but specifying the page twice and indicating the two areas separately:
 
 ```{r}
 a2 <- list(c(126, 149, 212, 462), c(126, 284, 174, 417))
-str(extract_tables(f, pages = c(2, 2), area = a2, guess = FALSE, output = "data.frame"))
+str(extract_tables(f, pages = c(2, 2), area = a2, guess = FALSE, output = "tibble"))
 ```
 
 ## Interactive Table Extraction ##
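
As a usage sketch to accompany the patch: the hunks above make `output = "tibble"` the default of `extract_tables()` and swap the internal parser to readr, so a call with no `output` argument now yields a list of tibbles. The snippet below is illustrative only (not part of the diff) and assumes nothing beyond the `data.pdf` example file that already ships with the package.

```r
# Illustrative sketch, not part of the patch: exercising the new default output.
library(tabulapdf)

f <- system.file("examples", "data.pdf", package = "tabulapdf")

# With this patch, extract_tables() defaults to output = "tibble",
# so the result is a list whose elements are tibbles.
tabs <- extract_tables(f, pages = 1)
class(tabs)      # "list"
class(tabs[[1]]) # a tibble; tibbles also inherit from data.frame, which is
                 # what the updated tests check with is.data.frame()
```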
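The `R/output.R` hunk is the core of the speed-up: `list_data_frames()` now hands the tab-delimited string built by `list_characters()` to `readr::read_delim()` instead of `utils::read.delim()`. Below is a minimal, self-contained sketch of that parsing step in isolation; it is illustrative only, using a made-up two-column string rather than real Tabula output.

```r
# Illustrative sketch, not part of the patch: how read_delim() parses the kind
# of tab-delimited string that list_characters() produces.
library(readr)

x <- "name\tvalue\nA\t1\nB\t2\n"

# readr treats a string containing a newline as literal data, which is how the
# patched list_data_frames() calls it; wrapping the string in I() makes that
# intent explicit in newer readr versions.
read_delim(file = x, delim = "\t")
#> a 2-row, 2-column tibble with columns `name` (character) and `value` (double)
```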