Skip to content

Commit 3b88f81

Browse files
committed
Update function docs
1 parent 077ffaf commit 3b88f81

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+118
-167
lines changed

R/as_list.R

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
2-
3-
#' Method as.list() for class robotstxt_text
1+
#' Convert robotstxt_text to list
42
#'
53
#' @param x class robotstxt_text object to be transformed into list
64
#' @param ... further arguments (inherited from \code{base::as.list()})
@@ -17,4 +15,4 @@ as.list.robotstxt_text <-
1715
res$request <- attr(x, "request")
1816

1917
res
20-
}
18+
}

R/fix_url.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' fix_url
1+
#' Add http protocol if missing from URL
22
#'
33
#'
44
#' @param url a character string containing a single URL
@@ -10,4 +10,4 @@ fix_url <-
1010
url <- paste0("http://", url)
1111
}
1212
url
13-
}
13+
}

R/get_robotstxt.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' downloading robots.txt file
1+
#' Download a robots.txt file
22
#'
33
#' @param domain domain from which to download robots.txt file
44
#' @param warn warn about being unable to download domain/robots.txt because of

R/get_robotstxt_http_get.R

+3-4
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
1-
2-
#' storage for http request response objects
1+
#' Storage for http request response objects
32
#'
43
#' @rdname get_robotstxt_http_get
54
#'
65
#' @export
76
rt_last_http <- new.env()
87
rt_last_http$request <- list()
98

10-
#' get_robotstxt() worker function to execute HTTP request
9+
#' Execute HTTP request for get_robotstxt()
1110
#'
1211
#' @param ssl_verifypeer either 1 (default) or 0, if 0 it disables SSL peer verification, which
1312
#' might help with robots.txt file retrieval
14-
#' @param domain the domain to get tobots.txt. file for
13+
#' @param domain the domain to get robots.txt file for
1514
#' @param user_agent the user agent to use for HTTP request header
1615
#'
1716
#' @export

R/get_robotstxts.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
2-
#' function to get multiple robotstxt files
1+
#' Download multiple robotstxt files
32
#'
43
#' @inheritParams get_robotstxt
54
#' @param use_futures Should future::future_lapply be used for possible

R/guess_domain.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' function guessing domain from path
1+
#' Guess a domain from path
22
#' @param x path aka URL from which to infer domain
33
guess_domain <- function(x){
44

@@ -23,4 +23,4 @@ guess_domain <- function(x){
2323
return(domain)
2424
}
2525

26-
}
26+
}

R/http_domain_changed.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' http_domain_changed
1+
#' Check if HTTP domain changed
22
#'
33
#' @param response an httr response object, e.g. from a call to httr::GET()
44
#'

R/http_subdomain_changed.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
#' http_subdomain_changed
1+
#' Check if HTTP subdomain changed
22
#'
33
#' @param response an httr response object, e.g. from a call to httr::GET()
44
#'
5-
#' @return logical of length 1 indicating whether or not any domain change
5+
#' @return logical of length 1 indicating whether or not any subdomain change
66
#' happened during the HTTP request
77
#'
88
#'

R/http_was_redirected.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' http_was_redirected
1+
#' Check if HTTP redirect occurred
22
#'
33
#' @param response an httr response object, e.g. from a call to httr::GET()
44
#'

R/is_suspect_robotstxt.R

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
#' is_suspect_robotstxt
1+
#' Check if file is valid / parsable robots.txt file
22
#'
3-
#' function that checks if file is valid / parsable robots.txt file
3+
#' Function that checks if file is valid / parsable robots.txt file
44
#'
55
#' @param text content of a robots.txt file provides as character vector
66
#'
@@ -26,5 +26,3 @@ is_suspect_robotstxt <- function(text){
2626
# return default
2727
return(FALSE)
2828
}
29-
30-

R/is_valid_robotstxt.R

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
#' function that checks if file is valid / parsable robots.txt file
1+
#' Validate if a file is valid / parsable robots.txt file
22
#'
3-
#' @param text content of a robots.txt file provides as character vector
3+
#' @param text content of a robots.txt file provided as character vector
44
#' @param check_strickt_ascii whether or not to check if content does adhere to the specification of RFC to use plain text aka ASCII
55
#'
66
#' @export
@@ -39,5 +39,3 @@ is_valid_robotstxt <- function(text, check_strickt_ascii = FALSE){
3939

4040
)
4141
}
42-
43-

R/null_to_default.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' null_to_default
1+
#' Return default value if NULL
22
#'
33
#' @param x value to check and return
44
#' @param d value to return in case x is NULL

R/parse_robotstxt.R

+2-21
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
#' function parsing robots.txt
2-
#' @param txt content of the robots.txt file
1+
#' Parse a robots.txt file
2+
#' @param txt content of the robots.txt file
33
#' @return a named list with useragents, comments, permissions, sitemap
44
#' @export
55
parse_robotstxt <- function(txt){
6-
# return
76
res <-
87
list(
98
useragents = rt_get_useragent(txt),
@@ -21,21 +20,3 @@ parse_robotstxt <- function(txt){
2120
)
2221
return(res)
2322
}
24-
25-
26-
27-
28-
29-
30-
31-
32-
33-
34-
35-
36-
37-
38-
39-
40-
41-

R/parse_url.R

+1-5
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,9 @@
1-
2-
3-
4-
#' parse_url
1+
#' Parse a URL
52
#'
63
#' @param url url to parse into its components
74
#'
85
#' @return data.frame with columns protocol, domain, path
96
#'
10-
#'
117
#' @keywords internal
128
#'
139
#' @examples

R/paths_allowed.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
#' check if a bot has permissions to access page(s)
1+
#' Check if a bot has permissions to access page(s)
22
#'
33
#'
44
#' @param domain Domain for which paths should be checked. Defaults to "auto".
55
#' If set to "auto" function will try to guess the domain by parsing the paths
66
#' argument. Note however, that these are educated guesses which might utterly
77
#' fail. To be on the safe side, provide appropriate domains manually.
88
#' @param bot name of the bot, defaults to "*"
9-
#' @param paths paths for which to check bot's permission, defaults to "/". Please, note that path to a folder should end with a trailing slash ("/").
9+
#' @param paths paths for which to check bot's permission, defaults to "/". Please note that path to a folder should end with a trailing slash ("/").
1010
#' @param check_method at the moment only kept for backward compatibility reasons - do not use parameter anymore --> will let the function simply use the default
1111
#' @param robotstxt_list either NULL -- the default -- or a list of character
1212
#' vectors with one vector per path to check

R/paths_allowed_worker_spiderbar.R

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
2-
3-
#' paths_allowed_worker spiderbar flavor
1+
#' Check if a bot has permissions to access page(s), spiderbar flavor
42
#'
53
#' @inheritParams paths_allowed
64
#'

R/print_robotstxt.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' printing robotstxt
1+
#' Print robotstxt
22
#' @param x robotstxt instance to be printed
33
#' @param ... goes down the sink
44
#' @export
@@ -29,4 +29,3 @@ print.robotstxt <- function(x, ...){
2929
invisible(x)
3030
}
3131

32-

R/print_robotstxt_text.R

+3-4
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
1-
2-
#' printing robotstxt_text
1+
#' Print robotstxt's text
32
#' @param x character vector aka robotstxt$text to be printed
43
#' @param ... goes down the sink
54
#' @export
65
print.robotstxt_text <- function(x, ...){
76

8-
# rpint part of the robots.txt file
7+
# print part of the robots.txt file
98
cat("[robots.txt]\n--------------------------------------\n\n")
109
tmp <- unlist(strsplit(x, "\n"))
1110
cat(tmp[seq_len(min(length(tmp), 50))], sep ="\n")
@@ -29,4 +28,4 @@ print.robotstxt_text <- function(x, ...){
2928

3029
# return
3130
invisible(x)
32-
}
31+
}

R/remove_domain.R

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' function to remove domain from path
1+
#' Remove domain from path
22
#' @param x path aka URL from which to first infer domain and then remove it
33
remove_domain <- function(x){
44
unlist(lapply(
@@ -12,5 +12,3 @@ remove_domain <- function(x){
1212
}
1313
))
1414
}
15-
16-

R/request_handler_handler.R

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
#' request_handler_handler
1+
#' Handle robotstxt handlers
22
#'
33
#' Helper function to handle robotstxt handlers.
44
#'
55
#' @param request the request object returned by call to httr::GET()
66
#' @param handler the handler either a character string entailing various options or a function producing a specific list, see return.
7-
#' @param res a list a list with elements '[handler names], ...', 'rtxt', and 'cache'
7+
#' @param res a list with elements '[handler names], ...', 'rtxt', and 'cache'
88
#' @param info info to add to problems list
99
#' @param warn if FALSE warnings and messages are suppressed
1010
#'
@@ -72,4 +72,4 @@ request_handler_handler <-
7272

7373
# return
7474
res
75-
}
75+
}

R/robotstxt.R

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
#' Generate a representations of a robots.txt file
1+
#' Generate a representation of a robots.txt file
22
#'
33
#' The function generates a list that entails data resulting from parsing a robots.txt file
44
#' as well as a function called check that enables to ask the representation if bot (or
55
#' particular bots) are allowed to access a resource on the domain.
66
#'
77
#' @param domain Domain for which to generate a representation. If text equals to NULL,
88
#' the function will download the file from server - the default.
9+
#'
910
#' @param text If automatic download of the robots.txt is not preferred, the text can be
1011
#' supplied directly.
1112
#' @inheritParams get_robotstxt
@@ -20,7 +21,7 @@
2021
#' @field domain character vector holding domain name for which the robots.txt
2122
#' file is valid; will be set to NA if not supplied on initialization
2223
#'
23-
#' @field text character vector of text of robots.txt file; either supplied on
24+
#' @field text character vector of text of robots.txt file; either supplied on
2425
#' initialization or automatically downloaded from domain supplied on
2526
#' initialization
2627
#'

R/rt_cache.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
1-
#' get_robotstxt() cache
1+
#' Get the robotstxt cache
22
rt_cache <- new.env( parent = emptyenv() )
3-

R/rt_get_comments.R

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
2-
#' extracting comments from robots.txt
1+
#' Extract comments from robots.txt
32
#' @param txt content of the robots.txt file
43
#' @keywords internal
54
rt_get_comments <- function(txt){
@@ -8,4 +7,3 @@ rt_get_comments <- function(txt){
87
ccontent <- stringr::str_extract(txt[clines], "#.*")
98
data.frame(line=clines, comment=ccontent, stringsAsFactors = FALSE)
109
}
11-

R/rt_get_fields.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
2-
#' extracting permissions from robots.txt
1+
#' Extract permissions from robots.txt
32
#' @param txt content of the robots.txt file
43
#' @param regex regular expression specify field
54
#' @param invert invert selection made via regex?

R/rt_get_fields_worker.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
2-
#' extracting robotstxt fields
1+
#' Extract robotstxt fields
32
#' @param txt content of the robots.txt file
43
#' @param type name or names of the fields to be returned, defaults to all
54
#' fields

R/rt_get_useragent.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' extracting HTTP useragents from robots.txt
1+
#' Extract HTTP useragents from robots.txt
22
#' @param txt content of the robots.txt file
33
#' @keywords internal
44
# rt_get_useragent <- function(txt){
@@ -19,4 +19,4 @@ rt_get_useragent <- function(txt){
1919
pattern = stringr::regex("U.*:| |\n", ignore_case = TRUE),
2020
replacement = ""
2121
)
22-
}
22+
}

R/rt_request_handler.R

+4-10
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
1-
2-
#' rt_request_handler
1+
#' Handle robotstxt object retrieved from HTTP request
32
#'
43
#' A helper function for get_robotstxt() that will extract the robots.txt file
5-
#' from the HTTP request result object. furthermore it will inform
6-
#' get_robotstxt() if the request should be cached and which problems occured.
4+
#' from the HTTP request result object. It will inform get_robotstxt() if the
5+
#' request should be cached and which problems occurred.
76
#'
87
#'
98
#'
109
#' @param request result of an HTTP request (e.g. httr::GET())
1110
#'
12-
#'
1311
#' @param on_server_error request state handler for any 5xx status
1412
#'
1513
#' @param on_client_error request state handler for any 4xx HTTP status that is
@@ -31,8 +29,8 @@
3129
#' @param on_suspect_content request state handler for content that seems to be
3230
#' something else than a robots.txt file (usually a JSON, XML or HTML)
3331
#'
34-
#'
3532
#' @param warn suppress warnings
33+
#'
3634
#' @param encoding The text encoding to assume if no encoding is provided in the
3735
#' headers of the response
3836
#'
@@ -99,8 +97,6 @@ rt_request_handler <-
9997
}
10098

10199

102-
103-
104100
## server error
105101
server_error <-
106102
request$status_code >= 500
@@ -207,8 +203,6 @@ rt_request_handler <-
207203
}
208204

209205

210-
211-
212206
## file type mismatch
213207
file_type_mismatch <-
214208
!(grepl("text/plain", null_to_default(request$headers$`content-type`, "")))

R/sanitize_path.R

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' making paths uniform
1+
#' Make paths uniform
22
#' @param path path to be sanitized
33
#' @return sanitized path
44
#' @keywords internal
@@ -7,4 +7,3 @@ sanitize_path <- function(path){
77
path <- ifelse( !grepl("^/", path), paste0("/", path), path)
88
return(path)
99
}
10-

0 commit comments

Comments
 (0)