From 8fc36915a487af0f92002198a21aa97c685d8ccc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Fri, 18 Nov 2022 12:33:31 -0500 Subject: [PATCH 1/7] Dev in a vignette, why not? --- docs/src/vignettes/09_data_preparation.jl | 54 +++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 docs/src/vignettes/09_data_preparation.jl diff --git a/docs/src/vignettes/09_data_preparation.jl b/docs/src/vignettes/09_data_preparation.jl new file mode 100644 index 000000000..8713ade8d --- /dev/null +++ b/docs/src/vignettes/09_data_preparation.jl @@ -0,0 +1,54 @@ +# # Preparing data for prediction + +using SpeciesDistributionToolkit +using CairoMakie + +# + +spatial_extent = (left = 5.0, bottom = 57.5, right = 10.0, top = 62.7) + +# + +rangifer = taxon("Rangifer tarandus tarandus"; strict = false) +query = [ + "occurrenceStatus" => "PRESENT", + "hasCoordinate" => true, + "decimalLatitude" => (spatial_extent.bottom, spatial_extent.top), + "decimalLongitude" => (spatial_extent.left, spatial_extent.right), + "limit" => 300, +] +presences = occurrences(rangifer, query...) +for i in 1:3 + occurrences!(presences) +end + +# + +dataprovider = RasterData(CHELSA1, BioClim) + +varnames = layerdescriptions(dataprovider) + +# + +layers = [ + 1.0SimpleSDMPredictor(dataprovider; spatial_extent..., layer = lname) for + lname in ["BIO1", "BIO12"] +] + +# + +mutable struct SimpleSDMStack{T <: SimpleSDMLayer} + names::Vector{String} + layers::Vector{Base.RefValue{T}} +end + +# + +stack = SimpleSDMStack(["BIO1", "BIO12"], Ref.(layers)) + +# + +import Tables +Tables.istable(::Type{T}) where {T <: SimpleSDMStack} = true +Tables.columnaccess(::Type{T}) where {T <: SimpleSDMStack} = true +Tables.columns(s::T) where {T <: SimpleSDMStack} = s.layers From 71d5477789abbb90fbb5bbabdac75117c7c1c0ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Fri, 18 Nov 2022 15:17:54 -0500 Subject: [PATCH 2/7] Table for stacks of layers --- Project.toml | 1 + docs/src/vignettes/09_data_preparation.jl | 56 +++++++++++++++++++---- src/SpeciesDistributionToolkit.jl | 4 ++ src/stack.jl | 53 +++++++++++++++++++++ 4 files changed, 104 insertions(+), 10 deletions(-) create mode 100644 src/stack.jl diff --git a/Project.toml b/Project.toml index 34728f991..18edbf076 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ GBIF = "ee291a33-5a6c-5552-a3c8-0f29a1181037" GDAL = "add2ef01-049f-52c4-9ee2-e494f65e021a" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" GeoMakie = "db073c08-6b98-4ee5-b6a4-5efafb3259c6" +MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" SimpleSDMDatasets = "2c7d61d0-5c73-410d-85b2-d2e7fbbdcefa" diff --git a/docs/src/vignettes/09_data_preparation.jl b/docs/src/vignettes/09_data_preparation.jl index 8713ade8d..11f658328 100644 --- a/docs/src/vignettes/09_data_preparation.jl +++ b/docs/src/vignettes/09_data_preparation.jl @@ -1,7 +1,6 @@ # # Preparing data for prediction using SpeciesDistributionToolkit -using CairoMakie # @@ -31,24 +30,61 @@ varnames = layerdescriptions(dataprovider) # layers = [ - 1.0SimpleSDMPredictor(dataprovider; spatial_extent..., layer = lname) for + convert( + SimpleSDMResponse, + 1.0SimpleSDMPredictor(dataprovider; spatial_extent..., layer = lname), + ) for lname in ["BIO1", "BIO12"] ] -# +# + +presenceonly = mask(layers[1], presences, Bool) +absenceonly = SpeciesDistributionToolkit.sample( + pseudoabsencemask(SurfaceRangeEnvelope, presenceonly), + 250, +) +replace!(presenceonly, false => nothing) +replace!(absenceonly, false => nothing) +for cell in absenceonly + presenceonly[cell.longitude, cell.latitude] = false +end -mutable struct SimpleSDMStack{T <: SimpleSDMLayer} - names::Vector{String} - layers::Vector{Base.RefValue{T}} +for i in eachindex(layers) + keys_to_void = setdiff(keys(layers[i]), keys(presenceonly)) + for k in keys_to_void + layers[i][k] = nothing + end end +layers + # -stack = SimpleSDMStack(["BIO1", "BIO12"], Ref.(layers)) +refs = Ref.([layers..., presenceonly]) + +datastack = SimpleSDMStack(["BIO1", "BIO12", "Presence"], refs) # import Tables -Tables.istable(::Type{T}) where {T <: SimpleSDMStack} = true -Tables.columnaccess(::Type{T}) where {T <: SimpleSDMStack} = true -Tables.columns(s::T) where {T <: SimpleSDMStack} = s.layers +Tables.istable(::Type{SimpleSDMStack}) = true +Tables.rowaccess(::Type{SimpleSDMStack}) = true +function Tables.schema(s::SimpleSDMStack) + tp = first(s) + @info keys(tp) + sc = Tables.Schema(keys(tp), typeof.(values(tp))) + @info sc + return sc +end + +using DataFrames +DataFrame(datastack) + +# + +using MLJ + +# + +t = table(datastack) diff --git a/src/SpeciesDistributionToolkit.jl b/src/SpeciesDistributionToolkit.jl index bb19be4a3..1ee277589 100644 --- a/src/SpeciesDistributionToolkit.jl +++ b/src/SpeciesDistributionToolkit.jl @@ -29,6 +29,10 @@ include("io/geotiff.jl") include("io/ascii.jl") include("io/read_write.jl") +# Stack for data export +include("stack.jl") +export SimpleSDMStack + # Tables interface import Tables include("tables.jl") diff --git a/src/stack.jl b/src/stack.jl new file mode 100644 index 000000000..fd7c4a814 --- /dev/null +++ b/src/stack.jl @@ -0,0 +1,53 @@ +""" + SimpleSDMStack + +Stores multiple _references_ to layers +""" +struct SimpleSDMStack + names::Vector{String} + layers::Vector{Base.RefValue} +end + +Base.length(s::T) where {T <: SimpleSDMStack} = length(first(s.layers).x) +Base.names(s::T) where {T <: SimpleSDMStack} = s.names + +SimpleSDMLayers.latitudes(s::T) where {T <: SimpleSDMStack} = latitudes(first(s.layers).x) +SimpleSDMLayers.longitudes(s::T) where {T <: SimpleSDMStack} = longitudes(first(s.layers).x) +SimpleSDMLayers.boundingbox(s::T) where {T <: SimpleSDMStack} = + boundingbox(first(s.layers).x) + +Base.IteratorSize(::T) where {T <: SimpleSDMStack} = Base.HasLength() +function Base.IteratorEltype(s::T) where {T <: SimpleSDMStack} + varnames = [:longitude, :latitude, Symbol.(names(s))...] + vartypes = [ + eltype(longitudes(s)), + eltype(latitudes(s)), + [SimpleSDMLayers._inner_type(l.x) for l in s.layers]..., + ] + return NamedTuple{tuple(varnames...), Tuple{vartypes...}} +end + +function Base.iterate(s::SimpleSDMStack) + position = findfirst(!isnothing, s.layers[1].x.grid) + lon = longitudes(s)[last(position.I)] + lat = latitudes(s)[first(position.I)] + vals = [l.x[lon, lat] for l in s.layers] + varnames = [:longitude, :latitude, Symbol.(names(s))...] + return (NamedTuple{tuple(varnames...)}(tuple(lon, lat, vals...)), position) +end + +function Base.iterate(s::SimpleSDMStack, state) + newstate = LinearIndices(s.layers[1].x.grid)[state] + 1 + newstate > prod(size(s.layers[1].x.grid)) && return nothing + position = findnext( + !isnothing, + s.layers[1].x.grid, + CartesianIndices(s.layers[1].x.grid)[newstate], + ) + isnothing(position) && return nothing + lon = longitudes(s)[last(position.I)] + lat = latitudes(s)[first(position.I)] + vals = [l.x[lon, lat] for l in s.layers] + varnames = [:longitude, :latitude, Symbol.(names(s))...] + return (NamedTuple{tuple(varnames...)}(tuple(lon, lat, vals...)), position) +end From 231c836064fcdda5134b400f6ba2ae46880d1bb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Fri, 18 Nov 2022 16:21:54 -0500 Subject: [PATCH 3/7] Giant mess that I will need to cleanup --- Project.toml | 2 + docs/src/vignettes/09_data_preparation.jl | 54 ++++++++++++++++++++--- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index 18edbf076..d1083d03b 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,8 @@ GDAL = "add2ef01-049f-52c4-9ee2-e494f65e021a" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" GeoMakie = "db073c08-6b98-4ee5-b6a4-5efafb3259c6" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" +MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692" +MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" SimpleSDMDatasets = "2c7d61d0-5c73-410d-85b2-d2e7fbbdcefa" diff --git a/docs/src/vignettes/09_data_preparation.jl b/docs/src/vignettes/09_data_preparation.jl index 11f658328..830e69acf 100644 --- a/docs/src/vignettes/09_data_preparation.jl +++ b/docs/src/vignettes/09_data_preparation.jl @@ -1,6 +1,7 @@ # # Preparing data for prediction using SpeciesDistributionToolkit +using CairoMakie # @@ -34,9 +35,13 @@ layers = [ SimpleSDMResponse, 1.0SimpleSDMPredictor(dataprovider; spatial_extent..., layer = lname), ) for - lname in ["BIO1", "BIO12"] + lname in keys(varnames) ] +# + +originallayers = deepcopy(layers) + # presenceonly = mask(layers[1], presences, Bool) @@ -63,7 +68,9 @@ layers refs = Ref.([layers..., presenceonly]) -datastack = SimpleSDMStack(["BIO1", "BIO12", "Presence"], refs) +datastack = SimpleSDMStack([values(varnames)..., "Presence"], refs) + +predictionstack = SimpleSDMStack([values(varnames)...], Ref.(originallayers)) # @@ -72,9 +79,7 @@ Tables.istable(::Type{SimpleSDMStack}) = true Tables.rowaccess(::Type{SimpleSDMStack}) = true function Tables.schema(s::SimpleSDMStack) tp = first(s) - @info keys(tp) sc = Tables.Schema(keys(tp), typeof.(values(tp))) - @info sc return sc end @@ -87,4 +92,43 @@ using MLJ # -t = table(datastack) +y, X = unpack(select(DataFrame(datastack), Not([:longitude, :latitude])), ==(:Presence)); +y = coerce(y, Continuous) + +# + +Standardizer = @load Standardizer pkg = MLJModels add = true verbosity = 0 +LM = @load LinearRegressor pkg = MLJLinearModels add = true verbosity = 0 +model = Standardizer() |> LM() + +# + +mach = machine(model, X, y) |> fit! + +# + +perf_measures = [mcc, f1score, accuracy, balanced_accuracy] +evaluate!( + mach; + resampling = CV(; nfolds = 3, shuffle = true, rng = Xoshiro(234)), + measure = perf_measures, +) + +# + +value = predict(mach, select(DataFrame(predictionstack), Not([:longitude, :latitude]))); + +# + +prediction = select(DataFrame(predictionstack), [:longitude, :latitude]); +prediction.value = value; + +# + +output = Tables.materializer(SimpleSDMResponse)(prediction) + +# + +heatmap(sprinkle(output)...; colormap = :viridis) +scatter!(longitudes(presences), latitudes(presences)) +current_figure() From 14690bfe579e964c3024dc80fe9ad0c2ddb66a92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Fri, 18 Nov 2022 22:38:25 -0500 Subject: [PATCH 4/7] Inner constructor for stacks --- docs/src/vignettes/09_data_preparation.jl | 8 ------- src/stack.jl | 28 +++++++++++++++++++++-- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/docs/src/vignettes/09_data_preparation.jl b/docs/src/vignettes/09_data_preparation.jl index 830e69acf..01753e932 100644 --- a/docs/src/vignettes/09_data_preparation.jl +++ b/docs/src/vignettes/09_data_preparation.jl @@ -74,14 +74,6 @@ predictionstack = SimpleSDMStack([values(varnames)...], Ref.(originallayers)) # -import Tables -Tables.istable(::Type{SimpleSDMStack}) = true -Tables.rowaccess(::Type{SimpleSDMStack}) = true -function Tables.schema(s::SimpleSDMStack) - tp = first(s) - sc = Tables.Schema(keys(tp), typeof.(values(tp))) - return sc -end using DataFrames DataFrame(datastack) diff --git a/src/stack.jl b/src/stack.jl index fd7c4a814..4b575ae35 100644 --- a/src/stack.jl +++ b/src/stack.jl @@ -1,16 +1,32 @@ """ SimpleSDMStack -Stores multiple _references_ to layers +Stores multiple _references_ to layers alongside with their names. This is mostly useful because it provides an interface that we can use as a Tables.jl provider. """ struct SimpleSDMStack names::Vector{String} layers::Vector{Base.RefValue} + function SimpleSDMStack(names::Vector{String}, layers::Vector{Base.RefValue}) + # As many names as layers + @assert length(names) == length(layers) + # Layers have the correct type + @assert all([typeof(layer.x) <: SimpleSDMLayer for layer in layers]) + # Layers are all compatible + @assert all([ + SimpleSDMLayers._layers_are_compatible(first(layers).x, layer.x) for + layer in layers + ]) + # Layers all have the same keys + @assert all([ + sort(keys(first(layers).x)) == sort(keys(layer.x)) for layer in layers + ]) + # Return if all pass + return new(names, layers) + end end Base.length(s::T) where {T <: SimpleSDMStack} = length(first(s.layers).x) Base.names(s::T) where {T <: SimpleSDMStack} = s.names - SimpleSDMLayers.latitudes(s::T) where {T <: SimpleSDMStack} = latitudes(first(s.layers).x) SimpleSDMLayers.longitudes(s::T) where {T <: SimpleSDMStack} = longitudes(first(s.layers).x) SimpleSDMLayers.boundingbox(s::T) where {T <: SimpleSDMStack} = @@ -51,3 +67,11 @@ function Base.iterate(s::SimpleSDMStack, state) varnames = [:longitude, :latitude, Symbol.(names(s))...] return (NamedTuple{tuple(varnames...)}(tuple(lon, lat, vals...)), position) end + +Tables.istable(::Type{SimpleSDMStack}) = true +Tables.rowaccess(::Type{SimpleSDMStack}) = true +function Tables.schema(s::SimpleSDMStack) + tp = first(s) + sc = Tables.Schema(keys(tp), typeof.(values(tp))) + return sc +end From 895d3a21d61d2b1753176c0a7da89c0671be4b33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Fri, 18 Nov 2022 22:42:59 -0500 Subject: [PATCH 5/7] Removed all the packages that are only required for the vignettes --- Project.toml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Project.toml b/Project.toml index d1083d03b..5079c2239 100644 --- a/Project.toml +++ b/Project.toml @@ -5,18 +5,11 @@ version = "0.0.1" [deps] ArchGDAL = "c9ce4bd3-c3d5-55b8-8973-c0e20141b8c3" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Fauxcurrences = "a2d61402-033a-4ca9-aef4-652d70cf7c9c" GBIF = "ee291a33-5a6c-5552-a3c8-0f29a1181037" GDAL = "add2ef01-049f-52c4-9ee2-e494f65e021a" -GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" -GeoMakie = "db073c08-6b98-4ee5-b6a4-5efafb3259c6" -MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" -MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692" -MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" -Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" SimpleSDMDatasets = "2c7d61d0-5c73-410d-85b2-d2e7fbbdcefa" SimpleSDMLayers = "2c645270-77db-11e9-22c3-0f302a89c64c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" From 5996b770bfe4cf9d86034d12cdc8dcdc2ad61bdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Fri, 18 Nov 2022 22:52:27 -0500 Subject: [PATCH 6/7] Update project for the docs --- docs/Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/Project.toml b/docs/Project.toml index cf335c8a0..2b72164af 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,8 +1,10 @@ [deps] CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" GeoMakie = "db073c08-6b98-4ee5-b6a4-5efafb3259c6" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" +MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" From 050c34f8cba7ec45b66da1475e8de377e75a5489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Poisot?= Date: Fri, 25 Nov 2022 15:44:44 -0500 Subject: [PATCH 7/7] update where Tables is --- src/SpeciesDistributionToolkit.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/SpeciesDistributionToolkit.jl b/src/SpeciesDistributionToolkit.jl index 1ee277589..334ec600a 100644 --- a/src/SpeciesDistributionToolkit.jl +++ b/src/SpeciesDistributionToolkit.jl @@ -9,6 +9,8 @@ const _distance_function = Distances.Haversine(6371.0) import StatsBase +import Tables + # We make ample use of re-export using Reexport @@ -34,7 +36,6 @@ include("stack.jl") export SimpleSDMStack # Tables interface -import Tables include("tables.jl") # Functions for pseudo-absence generation