From 85b1056d8f3dbeecedb2e34d7d52c0e25f541bae Mon Sep 17 00:00:00 2001 From: JBGruber Date: Sat, 17 Oct 2020 10:50:34 +0200 Subject: [PATCH] Prepared new release --- CRAN-RELEASE | 2 + DESCRIPTION | 4 +- NEWS.md | 4 + README.md | 9 +- Update package.R | 2 +- cran-comments.md | 14 +- inst/CITATION | 2 +- vignettes/demo.Rmd | 2 +- vignettes/demo.html | 431 ++++++++++++++++++++++++++------------------ 9 files changed, 282 insertions(+), 188 deletions(-) create mode 100644 CRAN-RELEASE diff --git a/CRAN-RELEASE b/CRAN-RELEASE new file mode 100644 index 0000000..e588d13 --- /dev/null +++ b/CRAN-RELEASE @@ -0,0 +1,2 @@ +This package was submitted to CRAN on 2020-10-17. +Once it is accepted, delete this file and tag the release (commit 4183eb2). diff --git a/DESCRIPTION b/DESCRIPTION index 32d3532..08497d8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: LexisNexisTools Title: Working with Files from 'LexisNexis' -Version: 0.3.1.9000 -Date: 2020-10-12 +Version: 0.3.2 +Date: 2020-10-17 Authors@R: person("Johannes", "Gruber", email = "j.gruber.1@research.gla.ac.uk", role = c("aut", "cre")) Description: My PhD supervisor once told me that everyone doing newspaper diff --git a/NEWS.md b/NEWS.md index e472ebf..7022b9d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# LexisNexisTools 0.3.2 + +* Fixed tests that caused problems on CRAN + # LexisNexisTools 0.3.1 * Added support for last remaining new format (zip). diff --git a/README.md b/README.md index ab07024..a2d87d9 100755 --- a/README.md +++ b/README.md @@ -155,8 +155,7 @@ be in most cases: well. In these cases, you should remove the whole article after inspecting it. (Use `View(LNToutput@articles$Article[LNToutput@meta$Graphic])` to view - these articles in a spreadsheet like -viewer.) + these articles in a spreadsheet like viewer.)

@@ -292,8 +291,7 @@ duplicates_df <- lnt_similarity(texts = LNToutput@articles$Article, ## ...processing date 2010-01-11: 5 duplicates found [3.05 secs]. ## Threshold = 0.97; 4 days processed; 5 duplicates found; in 3.05 secs -Now you can inspect the results using the function -`lnt_diff()`: +Now you can inspect the results using the function `lnt_diff()`: ``` r lnt_diff(duplicates_df, min = 0, max = Inf) @@ -406,8 +404,7 @@ with ID 9, all other values are `NULL`, which means the keyword wasn’t found. If your focus shifts and you want to subset your data to only include articles which mention this keyword, you could append this information to the meta information in the LNToutput object and then -subset it to articles where the list entry is different from -`NULL`. +subset it to articles where the list entry is different from `NULL`. ``` r LNToutput@meta$stats <- lnt_lookup(LNToutput, pattern = "statistical computing") diff --git a/Update package.R b/Update package.R index de8c1fe..1670930 100755 --- a/Update package.R +++ b/Update package.R @@ -73,7 +73,7 @@ devtools::check_win_oldrelease() devtools::check_win_release() ## check r_hub -rhub::check_for_cran() +rhub::check_for_cran(env_vars = c(R_COMPILE_AND_INSTALL_PACKAGES = "always")) ## release revdepcheck::revdep_check() diff --git a/cran-comments.md b/cran-comments.md index ff02786..003c55d 100755 --- a/cran-comments.md +++ b/cran-comments.md @@ -2,13 +2,21 @@ This submission is necessary as some tests fail due to the recent update of dplyr. 
## Test environments -* local Kubuntu 18.04, R version 4.0.0 -* local Windows 10, R version 4.0.0 +* local Kubuntu 18.04, R version 4.0.3 +* local Windows 10, R version 4.0.3 * win-builder.r-project.org, R-release, R-oldrelease, R-devel * Ubuntu Xenial 16.04 (on travis-ci), R: release, R: oldrel, R: devel +* rhub::check_for_cran(env_vars = c(R_COMPILE_AND_INSTALL_PACKAGES = "always")) ## R CMD check results -0 ERRORs | 0 WARNINGs | 0 NOTEs +0 ERRORs | 0 WARNINGs | 1 NOTE + +Only on rhub: +"Examples with CPU (user + system) or elapsed time > 5s +lnt_diff 6.28 0.8 7.14 + user system elapsed" + +Other test environments do not show this NOTE. ## Reverse dependency and other package conflicts diff --git a/inst/CITATION b/inst/CITATION index 5256f6c..1c8b5d1 100755 --- a/inst/CITATION +++ b/inst/CITATION @@ -3,5 +3,5 @@ bibentry(bibtype = "Manual", author = as.person("Johannes Gruber"), year = format(Sys.Date(), "%Y"), url = "https://github.com/JBGruber/LexisNexisTools", - note = "R package version 0.3.1" + note = "R package version 0.3.2" ) diff --git a/vignettes/demo.Rmd b/vignettes/demo.Rmd index 7feff42..7352a79 100644 --- a/vignettes/demo.Rmd +++ b/vignettes/demo.Rmd @@ -108,7 +108,7 @@ In these cases, you should remove the whole article after inspecting it. (Use `View(LNToutput@articles$Article[LNToutput@meta$Graphic])` to view these articles in a spreadsheet like viewer.)

- LN + LN

To use the function, you can again provide either file name(s), folder name(s) or nothing---to search the current working directory for relevant files---as `x` argument: diff --git a/vignettes/demo.html b/vignettes/demo.html index 3105b4f..c30bb5e 100644 --- a/vignettes/demo.html +++ b/vignettes/demo.html @@ -8,38 +8,123 @@ - + - + Basic Usage + + + @@ -303,7 +386,7 @@

Basic Usage

Johannes B. Gruber

-

2019-12-21

+

2020-10-15

@@ -311,9 +394,9 @@

2019-12-21

Demo

Load Package

- +
library("LexisNexisTools")

If you do not yet have files from ‘LexisNexis’ but want to test the package, you can use lnt_sample() to copy a sample file with mock data into your current working directory:

- +
lnt_sample()

Rename Files

@@ -322,19 +405,19 @@

Rename Files

- +
report <- lnt_rename()
- +
report <- lnt_rename(x = getwd(), report = TRUE)
- +
my_files <- list.files(pattern = ".txt", path = getwd(),
+                       full.names = TRUE, recursive = TRUE, ignore.case = TRUE)
+report <- lnt_rename(x = my_files, report = TRUE)
+
+report
@@ -347,7 +430,7 @@

Rename Files

- + @@ -365,10 +448,10 @@

Read in ‘LexisNexis’ Files to Get Meta, Articles and Paragraphs

  • length_keyword: This keyword, which is usually just “^LENGTH:” (or its equivalent in other languages) finds the information about the length of an article. However, since this is always the last line of the metadata, it is used to separate metadata and article text. There seems to be only one type of case where this information is missing: if the article consists only of a graphic (which ‘LexisNexis’ does not retrieve). The final output from lnt_read() has a column named Graphic, which indicates if this keyword was missing. The article text then contains all metadata as well. In these cases, you should remove the whole article after inspecting it. (Use View(LNToutput@articles$Article[LNToutput@meta$Graphic]) to view these articles in a spreadsheet-like viewer.)
  • -LN +LN

    To use the function, you can again provide either file name(s), folder name(s) or nothing—to search the current working directory for relevant files—as x argument:

    - +
    LNToutput <- lnt_read(x = getwd())
    ## Creating LNToutput from 1 file...
     ##  ...files loaded [0.0016 secs]
     ##  ...articles split [0.0089 secs]
    @@ -388,24 +471,24 @@ 

    Read in ‘LexisNexis’ Files to Get Meta, Articles and Paragraphs

    ## Elapsed time: 0.047 secs

    The returned object of class LNToutput is intended to be an intermediate container. As it stores articles and paragraphs in two separate data.frames, nested in an S4 object, the relevant text data is stored twice in almost the same format. This has the advantage, that there is no need to use special characters, such as “\n”. However, it makes the files rather big when you save them directly.

    The object can, however, be easily converted to regular data.frames using @ to select the data.frame you want:

    - +
    meta_df <- LNToutput@meta
    +articles_df <- LNToutput@articles
    +paragraphs_df <- LNToutput@paragraphs
    +
    +# Print meta to get an idea of the data
    +head(meta_df, n = 3)
    sample.TXTfile2c169e4c96d.TXTfiled1a653e0ffa.TXT not renamed (file exists) txt
    ----------++++++++++ @@ -424,7 +507,7 @@

    Read in ‘LexisNexis’ Files to Get Meta, Articles and Paragraphs

    - + @@ -436,7 +519,7 @@

    Read in ‘LexisNexis’ Files to Get Meta, Articles and Paragraphs

    - + @@ -448,7 +531,7 @@

    Read in ‘LexisNexis’ Files to Get Meta, Articles and Paragraphs

    - + @@ -461,36 +544,36 @@

    Read in ‘LexisNexis’ Files to Get Meta, Articles and Paragraphs

    1file2c169e4c96d.TXTfiled1a653e0ffa.TXT Guardian.com 2010-01-11 355 words
    2file2c169e4c96d.TXTfiled1a653e0ffa.TXT Guardian 2010-01-11 927 words
    3file2c169e4c96d.TXTfiled1a653e0ffa.TXT The Sun (England) 2010-01-11 677 words

    If you want to keep only one data.frame including metadata and text data you can easily do so:

    - +
    meta_articles_df <- lnt_convert(LNToutput, to = "data.frame")
    +
    +# Or keep the paragraphs
    +meta_paragraphs_df <- lnt_convert(LNToutput, to = "data.frame", what = "Paragraphs")

    Alternatively, you can convert LNToutput objects to formats common in other packages using the function lnt_convert:

    - +
    rDNA_docs <- lnt_convert(LNToutput, to = "rDNA")
    +
    +quanteda_corpus <- lnt_convert(LNToutput, to = "quanteda")
    +
    +tCorpus <- lnt_convert(LNToutput, to = "corpustools")
    +
    +tidy <- lnt_convert(LNToutput, to = "tidytext")
    +
    +Corpus <- lnt_convert(LNToutput, to = "tm")
    +
    +dbloc <- lnt_convert(LNToutput, to = "SQLite")

    See ?lnt_convert for details and comment in this issue if you want a format added to the convert function.

    Identify Highly Similar Articles

    In ‘LexisNexis’ itself, there is an option to group highly similar articles. However, experience shows that this feature does not always work perfectly. One common problem when working with ‘LexisNexis’ data is thus that many articles appear to be delivered twice or more times. While direct duplicates can be filtered out using, for example, LNToutput <- LNToutput[!duplicated(LNToutput@articles$Article), ] this does not work for articles with small differences. Hence when one comma or white space is different between two articles, they are treated as different.

    The function lnt_similarity() combines the fast similarity measure from quanteda with the much slower but more accurate relative Levenshtein distance to compare all articles published on the same day. Calculating the Levenshtein distance might be very slow though if you have many articles published each day in your data set. If you think the less accurate similarity measure might be sufficient in your case, simply turn this feature off with rel_dist = FALSE. The easiest way to use lnt_similarity() is to input an LNToutput object directly. However, it is also possible to provide texts, dates and IDs separately:

    - - +
    # Either provide a LNToutput
    +duplicates_df <- lnt_similarity(LNToutput = LNToutput,
    +                                threshold = 0.97)
    +
    # Or the important parts separately
    +duplicates_df <- lnt_similarity(texts = LNToutput@articles$Article,
    +                                dates = LNToutput@meta$Date,
    +                                IDs = LNToutput@articles$ID,
    +                                threshold = 0.97)
    ## Checking similarity for 10 articles over 4 dates...
     ##  ...quanteda dfm constructed for similarity comparison [0.063 secs].
     ##  ...processing date 2010-01-08: 0 duplicates found [0.064 secs].         
    @@ -499,57 +582,57 @@ 

    Identify Highly Similar Articles

    ## ...processing date 2010-01-11: 5 duplicates found [3.05 secs]. ## Threshold = 0.97; 4 days processed; 5 duplicates found; in 3.05 secs

    Now you can inspect the results using the function lnt_diff():

    - +
    lnt_diff(duplicates_df, min = 0, max = Inf)

    diff

    By default, 25 randomly selected articles are displayed one after another, ordered by least to most different within the min and max limits.

    After you have chosen a good cut-off value, you can subset the duplicates_df data.frame and remove the respective articles:

    - +
    duplicates_df <- duplicates_df[duplicates_df$rel_dist < 0.2]
    +LNToutput <- LNToutput[!LNToutput@meta$ID %in% duplicates_df$ID_duplicate, ]

    Note, that you can subset LNToutput objects almost like you would in a regular data.frame using the square brackets.

    - +
    LNToutput[1, ]
    +#> Object of class 'LNToutput':
    +#> 1 articles
    +#> 5 paragraphs
    +#> # A tibble: 1 x 10
    +#>      ID Source_File Newspaper Date       Length Section Author Edition Headline
    +#>   <int> <chr>       <chr>     <date>     <chr>  <chr>   <chr>  <chr>   <chr>   
    +#> 1     1 filed1a653… Guardian… 2010-01-11 355 w… <NA>    Andre… <NA>    Lorem i…
    +#> # … with 1 more variable: Graphic <lgl>
    +#> # A tibble: 1 x 2
    +#>      ID Article                                                                 
    +#>   <int> <chr>                                                                   
    +#> 1     1 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam lacinia …
    +#> # A tibble: 5 x 3
    +#>   Art_ID Par_ID Paragraph                                                       
    +#>    <int>  <int> <chr>                                                           
    +#> 1      1      1 Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam …
    +#> 2      1      2 Duis eleifend ipsum vehicula nunc luctus vestibulum. Donec non …
    +#> 3      1      3 Sed ut ex quis nisi interdum ornare quis quis velit. Ut element…
    +#> 4      1      4 Aliquam ut consectetur urna, et dignissim turpis. Ut mattis ele…
    +#> 5      1      5 Fusce sit amet aliquet lorem, id faucibus nisl. Nulla suscipit …

    In this case, writing [1, ] delivers an LNToutput object which includes only the first article and the metadata and paragraphs belonging to it.

    Now you can extract the remaining articles or convert them to a format you prefer.

    - +
    #' generate new dataframes without highly similar duplicates
    +meta_df <- LNToutput@meta
    +articles_df <- LNToutput@articles
    +paragraphs_df <- LNToutput@paragraphs
    +
    +# Print e.g., meta to see how the data changed
    +head(meta_df, n = 3)
    ----------++++++++++ @@ -568,7 +651,7 @@

    Identify Highly Similar Articles

    - + @@ -580,7 +663,7 @@

    Identify Highly Similar Articles

    - + @@ -592,7 +675,7 @@

    Identify Highly Similar Articles

    - + @@ -608,63 +691,63 @@

    Identify Highly Similar Articles

    Lookup Keywords

    While downloading from ‘LexisNexis’, you have already used keywords to filter relevant articles from a larger set. However, while working with the data, your focus might change or you might want to find the different versions of your keyword in the set. Both can be done using lnt_lookup:

    - +
    lnt_lookup(LNToutput, pattern = "statistical computing")
    +#> $`1`
    +#> NULL
    +#> 
    +#> $`2`
    +#> NULL
    +#> 
    +#> $`3`
    +#> NULL
    +#> 
    +#> $`7`
    +#> NULL
    +#> 
    +#> $`8`
    +#> NULL
    +#> 
    +#> $`9`
    +#> [1] "statistical computing" "statistical computing"
    +#> 
    +#> $`10`
    +#> NULL

    The output shows that the keyword pattern was only found in the article with ID 9, all other values are NULL, which means the keyword wasn’t found. If your focus shifts and you want to subset your data to only include articles which mention this keyword, you could append this information to the meta information in the LNToutput object and then subset it to articles where the list entry is different from NULL.

    - +
    LNToutput@meta$stats <- lnt_lookup(LNToutput, pattern = "statistical computing")
    +LNToutput <- LNToutput[!sapply(LNToutput@meta$stats, is.null), ]
    +LNToutput
    +#> Object of class 'LNToutput':
    +#> 1 articles
    +#> 7 paragraphs
    +#> # A tibble: 1 x 11
    +#>      ID Source_File Newspaper Date       Length Section Author Edition Headline
    +#>   <int> <chr>       <chr>     <date>     <chr>  <chr>   <chr>  <chr>   <chr>   
    +#> 1     9 filed1a653… Sunday M… 2010-01-10 446 w… NEWS; … Ross … 3 Star… R (prog…
    +#> # … with 2 more variables: Graphic <lgl>, stats <named list>
    +#> # A tibble: 1 x 2
    +#>      ID Article                                                                 
    +#>   <int> <chr>                                                                   
    +#> 1     9 R is a programming language and free software environment for statistic…
    +#> # A tibble: 7 x 3
    +#>   Art_ID Par_ID Paragraph                                                       
    +#>    <int>  <int> <chr>                                                           
    +#> 1      9     67 R is a programming language and free software environment for s…
    +#> 2      9     68 R is a GNU package. The source code for the R software environm…
    +#> 3      9     69 R is an implementation of the S programming language combined w…
    +#> 4      9     70 R was created by Ross Ihaka and Robert Gentleman at the Univers…
    +#> 5      9     71 R and its libraries implement a wide variety of statistical and…
    +#> 6      9     72 Another strength of R is static graphics, which can produce pub…
    +#> # … with 1 more row

    Another use of the function is to find out which versions of your keyword are in the set. You can do so by using regular expressions. The following looks for words starting with the ‘stat’, followed by more characters, up until the end of the word (the pattern internally always starts and ends at a word boundary).

    - +
    lnt_lookup(LNToutput, pattern = "stat.*?")
    +#> $`9`
    +#> [1] "statistical"   "statisticians" "statistical"   "statistical"  
    +#> [5] "statistical"   "statistical"   "static"

    You can use table() to count the different versions of patterns:

    - +
    table(unlist(lnt_lookup(LNToutput, pattern = "stat.+?\\b")))
    +#> 
    +#>        static   statistical statisticians 
    +#>             1             5             1
    1file2c169e4c96d.TXTfiled1a653e0ffa.TXT Guardian.com 2010-01-11 355 words
    2file2c169e4c96d.TXTfiled1a653e0ffa.TXT Guardian 2010-01-11 927 words
    3file2c169e4c96d.TXTfiled1a653e0ffa.TXT The Sun (England) 2010-01-11 677 words