From 75cc2fb1c300640a44948cf801a5e0a2dee3aec9 Mon Sep 17 00:00:00 2001 From: Krzysztof Dyba <35004826+kadyb@users.noreply.github.com> Date: Wed, 16 Aug 2023 21:49:06 +0200 Subject: [PATCH] upload files --- Tutorial.Rmd | 220 ++++++ Tutorial.html | 1769 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1989 insertions(+) create mode 100644 Tutorial.Rmd create mode 100644 Tutorial.html diff --git a/Tutorial.Rmd b/Tutorial.Rmd new file mode 100644 index 0000000..624b37a --- /dev/null +++ b/Tutorial.Rmd @@ -0,0 +1,220 @@ +--- +title: "Parallel raster processing in **stars**" +author: "Krzysztof Dyba" +output: + html_document: + toc: yes + toc_float: true + css: style.css +--- + +Prediction on large datasets can be time-consuming, but with enough computing +power, this task can be parallelized easily. Some algorithms provide native +multithreading like `predict()` function in `{ranger}` package (but this is +not standard). In this situation, we have to parallelize the calculations +ourselves. There are several approaches to do this, but for this example we will +divide a large out of memory raster into smaller blocks and make predictions +using multithreading. + +## Data loading + +```{r message=FALSE} +library("stars") +set.seed(1) # define a random seed +``` + +The first step is to list the rasters representing the blue (B2), green (B3), +red (B4), and near infrared (B5) bands to be loaded using the `list.files()` +function. We need to define two additional arguments, i.e. `pattern = "\\.TIF$"` +and `full.names = TRUE`. + +```{r} +files = list.files("Landsat", pattern = "\\.TIF$", full.names = TRUE) +files +``` + +Since this data does not fit in memory, we have to load it as a proxy (actually, +we will only load the metadata describing these rasters). To do this, we use the +`read_stars()` function with the arguments `proxy = TRUE` and `along = 3`. This +second argument will cause the bands (layers) to be combined into a +three-dimensional matrix: $[longitude, \ latitude, \ spectral \ band]$. + +```{r} +rasters = read_stars(files, proxy = TRUE, along = 3) +``` + +We can replace the default filenames with band names, and rename the third +dimension as follows: + +```{r} +bands = c("Blue", "Green", "Red", "NIR") +# rename bands and dimension +rasters = st_set_dimensions(rasters, 3, values = bands, names = "band") +names(rasters) = "Landsat" # rename the object +``` + +## Sampling + +For modeling, we need to use a smaller sample rather than the entire dataset. +Drawing training points consists of several steps, which are shown below: + +```{r} +bbox = st_as_sfc(st_bbox(rasters)) # define a bounding box +smp = st_sample(bbox, size = 20000) # sample points from the extent of the polygon +smp = st_extract(rasters, smp) # extract the pixel values for those points +smp = st_drop_geometry(st_as_sf(smp)) # convert to a data frame (remove geometry) +smp = na.omit(smp) # remove missing values (NA) +head(smp) +``` + +## Modelling + +The aim of our analysis is to perform unsupervised classification (clustering) +of raster pixels based on spectral bands. In other words, we want to group similar +cells together to make homogeneous clusters. A popular method is the Gaussian +mixture models available in the **mclust** package. Clustering can be done using +the `Mclust()` function and requires the target number of clusters to be defined +in advance (e.g. `G = 6`). + +```{r message=FALSE} +library("mclust") +mdl = Mclust(smp, G = 6) # train the model +``` + +## Prediction + +As stated earlier, the entire raster is too large to be loaded into memory. +We need to split it into smaller blocks. For this, we can use the `st_tile()` +function, which requires the total number of rows and columns, and the number +of rows and columns of a small block (for example, it could be 2048 x 2048, +but usually we should find the optimal size). + +```{r} +tiles = st_tile(nrow(rasters), ncol(rasters), 2048, 2048) +head(tiles) +``` + +Finally, the input raster will be divided into `r nrow(tiles)` smaller blocks. +In the following sections, we will compare the performance of single-threaded +and multi-threaded processing. + +Now let's discuss what steps we need to take to make a prediction: + +1. We have `r nrow(tiles)` blocks, so we need to make predictions in a loop. +2. We have to load the rasters, but this time into memory (`proxy = FALSE`) and +only the block (`RasterIO = tiles[iterator, ]`). +3. We use the `predict()` function for clustering. Note we need to define the +`drop_dimensions = TRUE` argument to remove the coordinates from the data frame. +4. Finally, we have to save the clustering results to disk (it can be a temporary +directory). Be sure to specify the missing values (`NA_value = 0`) and the block +size as the input (`chunk_size = dim(tile)`). + +**Note!** The `read_stars()` function opens the connection to the file and closes +it each time, which causes overhead. With a large number of blocks, this can +significantly affect performance. In this case, it is better to use a low-level +API, e.g. **[gdalraster](https://github.com/USDAForestService/gdalraster)**. + +### Single thread + +```{r} +start_time = Sys.time() + +for (i in seq_len(nrow(tiles))) { + tile = read_stars(files, proxy = FALSE, RasterIO = tiles[i, ]) + names(tile) = bands # rename bands + + pr = predict(tile, mdl, drop_dimensions = TRUE) + pr = pr[1] # select only clusters from output + save_path = file.path(tempdir(), paste0("tile_", i, ".tif")) + write_stars(pr, save_path, NA_value = 0, options = "COMPRESS=NONE", + type = "Byte", chunk_size = dim(tile)) +} + +end_time_1 = difftime(Sys.time(), start_time, units = "secs") +end_time_1 = round(end_time_1) +``` + +The prediction took `r as.integer(end_time_1)` seconds. + +### Multiple threads + +Multithreaded prediction is a bit more complicated. This requires the use of +two additional packages, i.e. **foreach** and **doParallel**. Now we need to +setup a parallel backend by defining the number of available threads (or by +detecting them automatically using `detectCores()`) in `makeCluster()`, and then +registering the cluster using the `registerDoParallel()` function. This can be +accomplished, for instance, with: + +```{r message=FALSE} +library("foreach") +library("doParallel") +threads = 3 # specify number of threads +cl = makeCluster(threads) +registerDoParallel(cl) +``` + +Once we have the computing cluster prepared, we can write a loop that will be +executed in parallel. Instead of the standard `for()` loop, we will use `foreach()` +with the `%dopar%` operator. In `foreach()` we need to define an iterator and +packages that will be exported to each worker. Then we use the `%dopar%` operator +and write the core of the function (it is identical as in the single-threaded +example). + +```{r} +packages = c("stars", "mclust") + +start_time = Sys.time() + +tifs = foreach(i = seq_len(nrow(tiles)), .packages = packages) %dopar% { + tile = read_stars(files, proxy = FALSE, RasterIO = tiles[i, ], NA_value = 0) + names(tile) = bands + + pr = predict(tile, mdl, drop_dimensions = TRUE) + pr = pr[1] + save_path = file.path(tempdir(), paste0("tile_", i, ".tif")) + write_stars(pr, save_path, NA_value = 0, options = "COMPRESS=NONE", + type = "Byte", chunk_size = dim(tile)) + return(save_path) +} + +end_time_2 = difftime(Sys.time(), start_time, units = "secs") +end_time_2 = round(end_time_2) +``` + +The prediction took `r as.integer(end_time_2)` seconds. By using three threads +instead of one, we have cut the operation time by half! + +## Post-processing + +We have our raster blocks saved in a temporary directory. The final step is to +make a mosaic (that is, to combine the blocks into a single raster). I recommend +using GDAL tools, as they provide the best performance when there are a large +number of blocks. We can use: + +1. `buildvrt` to create a virtual mosaic. +2. `translate` to save as a geotiff. + +We can call these tools using the `gdal_utils()` function from the **sf** package. + +```{r} +vrt = tempfile(fileext = ".vrt") +gdal_utils(util = "buildvrt", unlist(tifs), destination = vrt) +gdal_utils(util = "translate", vrt, destination = "predict.tif") +``` + +Once the parallel computation is complete, we can close the computing cluster +using `stopCluster()` function (this will delete the temporary files). + +```{r} +stopCluster(cl) +``` + +So let's see what our final map looks like. + +```{r message=FALSE} +clustering = read_stars("predict.tif") +colors = c("#29a329", "#cbcbcb", "#ffffff", "#086209", "#fdd327", "#064d06") +names = c("Low vegetation", "Bare soil", "Cloud", "Forest 1", "Cropland", "Forest 2") +clustering[[1]] = factor(clustering[[1]], labels = names) +plot(clustering, main = NULL, col = colors) +``` diff --git a/Tutorial.html b/Tutorial.html new file mode 100644 index 0000000..215f6a2 --- /dev/null +++ b/Tutorial.html @@ -0,0 +1,1769 @@ + + + + +
+ + + + + + + + + +library("stars")
+set.seed(1) # define a random seed
+Prediction on large datasets can be time-consuming, but with enough
+computing power, this task can be parallelized easily. Some algorithms
+provide native multithreading like predict()
function in
+{ranger}
package (but this is not standard). In this
+situation, we have to parallelize the calculations ourselves. There are
+several approaches to do this, but for this example we will divide a
+large out of memory raster into smaller blocks and make predictions
+using multithreading.
The first step is to list the rasters representing the blue (B2),
+green (B3), red (B4), and near infrared (B5) bands to be loaded using
+the list.files()
function. We need to define two additional
+arguments, i.e. pattern = "\\.TIF$"
and
+full.names = TRUE
.
files = list.files("Landsat", pattern = "\\.TIF$", full.names = TRUE)
+files
+## [1] "Landsat/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B2.TIF"
+## [2] "Landsat/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B3.TIF"
+## [3] "Landsat/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B4.TIF"
+## [4] "Landsat/LC08_L2SP_191023_20230605_20230613_02_T1_SR_B5.TIF"
+Since this data does not fit in memory, we have to load it as a proxy
+(actually, we will only load the metadata describing these rasters). To
+do this, we use the read_stars()
function with the
+arguments proxy = TRUE
and along = 3
. This
+second argument will cause the bands (layers) to be combined into a
+three-dimensional matrix: $ [longitude,  latitude,  spectral  band]
+$.
rasters = read_stars(files, proxy = TRUE, along = 3)
+We can replace the default filenames with band names, and rename the +third dimension as follows:
+bands = c("Blue", "Green", "Red", "NIR")
+# rename bands and dimension
+rasters = st_set_dimensions(rasters, 3, values = bands, names = "band")
+names(rasters) = "Landsat" # rename the object
+For modeling, we need to use a smaller sample rather than the entire +dataset. Drawing training points consists of several steps, which are +shown below:
+bbox = st_as_sfc(st_bbox(rasters)) # define a bounding box
+smp = st_sample(bbox, size = 20000) # sample points from the extent of the polygon
+smp = st_extract(rasters, smp) # extract the pixel values for those points
+smp = st_drop_geometry(st_as_sf(smp)) # convert to a data frame (remove geometry)
+smp = na.omit(smp) # remove missing values (NA)
+The aim of our analysis is to perform unsupervised classification
+(clustering) of raster pixels based on spectral bands. In other words,
+we want to group similar cells together to make homogeneous clusters. A
+popular method is the Gaussian mixture models available in the
+mclust package. Clustering can be done using the
+Mclust()
function and requires the target number of
+clusters to be defined in advance (e.g. G = 6
).
library("mclust")
+mdl = Mclust(smp, G = 6) # train the model
+As stated earlier, the entire raster is too large to be loaded into
+memory. We need to split it into smaller blocks. For this, we can use
+the st_tile()
function, which requires the total number of
+rows and columns, and the number of rows and columns of a small block
+(for example, it could be 2048 x 2048, but usually we should find the
+optimal size).
tiles = st_tile(nrow(rasters), ncol(rasters), 2048, 2048)
+head(tiles)
+## nXOff nYOff nXSize nYSize
+## [1,] 1 1 2048 2048
+## [2,] 1 2049 2048 2048
+## [3,] 1 4097 2048 2048
+## [4,] 1 6145 2048 1807
+## [5,] 2049 1 2048 2048
+## [6,] 2049 2049 2048 2048
+Finally, the input raster will be divided into 16 smaller blocks. In +the following sections, we will compare the performance of +single-threaded and multi-threaded processing.
+Now let’s discuss what steps we need to take to make a +prediction:
+proxy = FALSE
) and only the block
+(RasterIO = tiles[iterator, ]
).predict()
function for clustering. Note we
+need to define the drop_dimensions = TRUE
argument to
+remove the coordinates from the data frame.NA_value = 0
) and the block size as the input
+(chunk_size = dim(tile)
).Note! The read_stars()
function opens
+the connection to the file and closes it each time, which causes
+overhead. With a large number of blocks, this can significantly affect
+performance. In this case, it is better to use a low-level API,
+e.g. gdalraster.
start_time = Sys.time()
+
+for (i in seq_len(nrow(tiles))) {
+ tile = read_stars(files, proxy = FALSE, RasterIO = tiles[i, ])
+ names(tile) = bands # rename bands
+
+ pr = predict(tile, mdl, drop_dimensions = TRUE)
+ pr = pr[1] # select only clusters from output
+ save_path = file.path(tempdir(), paste0("tile_", i, ".tif"))
+ write_stars(pr, save_path, NA_value = 0, options = "COMPRESS=NONE",
+ type = "Byte", chunk_size = dim(tile))
+}
+
+end_time_1 = difftime(Sys.time(), start_time, units = "secs")
+end_time_1 = round(end_time_1)
+The prediction took 459 seconds.
+Multithreaded prediction is a bit more complicated. This requires the
+use of two additional packages, i.e. foreach and
+doParallel. Now we need to setup a parallel backend by
+defining the number of available threads (or by detecting them
+automatically using detectCores()
) in
+makeCluster()
, and then registering the cluster using the
+registerDoParallel()
function. This can be accomplished,
+for instance, with:
library("foreach")
+library("doParallel")
+threads = 3 # specify number of threads
+cl = makeCluster(threads)
+registerDoParallel(cl)
+Once we have the computing cluster prepared, we can write a loop that
+will be executed in parallel. Instead of the standard for()
+loop, we will use foreach()
with the %dopar%
+operator. In foreach()
we need to define an iterator and
+packages that will be exported to each worker. Then we use the
+%dopar%
operator and write the core of the function (it is
+identical as in the single-threaded example).
packages = c("stars", "mclust")
+
+start_time = Sys.time()
+
+tifs = foreach(i = seq_len(nrow(tiles)), .packages = packages) %dopar% {
+ tile = read_stars(files, proxy = FALSE, RasterIO = tiles[i, ], NA_value = 0)
+ names(tile) = bands
+
+ pr = predict(tile, mdl, drop_dimensions = TRUE)
+ pr = pr[1]
+ save_path = file.path(tempdir(), paste0("tile_", i, ".tif"))
+ write_stars(pr, save_path, NA_value = 0, options = "COMPRESS=NONE",
+ type = "Byte", chunk_size = dim(tile))
+ save_path
+}
+
+end_time_2 = difftime(Sys.time(), start_time, units = "secs")
+end_time_2 = round(end_time_2)
+The prediction took 248 seconds. By using three threads instead of +one, we have cut the operation time by half!
+We have our raster blocks saved in a temporary directory. The final +step is to make a mosaic (that is, to combine the blocks into a single +raster). I recommend using GDAL tools, as they provide the best +performance when there are a large number of blocks. We can use:
+buildvrt
to create a virtual mosaic.translate
to save as a geotiff.We can call these tools using the gdal_utils()
function
+from the sf package.
vrt = tempfile(fileext = ".vrt")
+gdal_utils(util = "buildvrt", unlist(tifs), destination = vrt)
+gdal_utils(util = "translate", vrt, destination = "predict.tif")
+Once the parallel computation is complete, we can close the computing
+cluster using stopCluster()
function (this will delete the
+temporary files).
stopCluster(cl)
+So let’s see what our final map looks like.
+clustering = read_stars("predict.tif")
+colors = c("#29a329", "#cbcbcb", "#ffffff", "#086209", "#fdd327", "#064d06")
+names = c("Low vegetation", "Bare soil", "Cloud", "Forest 1", "Cropland", "Forest 2")
+clustering[[1]] = factor(clustering[[1]], labels = names)
+plot(clustering, main = NULL, col = colors)
+
+