Merge pull request #987 from JuliaAI/dev
For a 1.6 release
ablaom authored Jul 2, 2024
2 parents 5739a73 + 9d78c6c commit 0849be7
Showing 13 changed files with 174 additions and 107 deletions.
6 changes: 3 additions & 3 deletions Project.toml
@@ -1,7 +1,7 @@
name = "MLJBase"
uuid = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "1.5.0"
version = "1.6.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -47,7 +47,7 @@ DelimitedFiles = "1"
Distributions = "0.25.3"
InvertedIndices = "1"
LearnAPI = "0.1"
MLJModelInterface = "1.10"
MLJModelInterface = "1.11"
Missings = "0.4, 1"
OrderedCollections = "1.1"
Parameters = "0.12"
@@ -58,7 +58,7 @@ Reexport = "1.2"
ScientificTypes = "3"
StatisticalMeasures = "0.1.1"
StatisticalMeasuresBase = "0.1.1"
StatisticalTraits = "3.3"
StatisticalTraits = "3.4"
Statistics = "1"
StatsBase = "0.32, 0.33, 0.34"
Tables = "0.2, 1.0"
2 changes: 1 addition & 1 deletion src/composition/learning_networks/nodes.jl
@@ -277,7 +277,7 @@ function _formula(stream, X::Node, depth, indent)
if X.machine !== nothing
print(stream, crind(indent + length(operation_name) - anti))
printstyled(IOContext(stream, :color=>SHOW_COLOR[]),
# handle(X.machine),
#handle(X.machine),
X.machine,
bold=SHOW_COLOR[])
n_args == 0 || print(stream, ", ")
3 changes: 2 additions & 1 deletion src/composition/learning_networks/signatures.jl
@@ -307,7 +307,8 @@ See also [`MLJBase.Signature`](@ref).
"""
fitted_params_supplement(signature::Signature) = call_and_copy(fitted_params_nodes(signature))

""" report(signature; supplement=true)
"""
report(signature; supplement=true)
**Private method.**
20 changes: 17 additions & 3 deletions src/composition/models/pipelines.jl
@@ -182,7 +182,7 @@ or what `transform` returns if it is `Unsupervised`.
Names for the component fields are automatically generated unless
explicitly specified, as in
```
```julia
Pipeline(encoder=ContinuousEncoder(drop_last=false),
stand=Standardizer())
```
@@ -225,6 +225,15 @@ implements it (some clustering models). Similarly, calling `transform`
on a supervised pipeline calls `transform` on the supervised
component.
### Transformers that need a target in training
Some transformers that have type `Unsupervised` (so that the output of `transform` is
propagated in pipelines) may require a target variable for training. Examples are the
so-called target encoders, which transform categorical input features based on some
target observations. Provided they appear before any `Supervised` component in the
pipeline, such models are supported, as the sketch below illustrates. Of course a target
must be provided whenever training such a pipeline, whether or not it contains a
`Supervised` component.
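A minimal sketch of such a pipeline, assuming a hypothetical `TargetEncoder` (any `Unsupervised` model with `target_in_fit(model) == true` would do) and a hypothetical `DecisionTreeClassifier`:

```julia
using MLJBase

# `TargetEncoder` and `DecisionTreeClassifier` are stand-ins for real models:
pipe = Pipeline(
    encoder = TargetEncoder(),
    classifier = DecisionTreeClassifier(),
)

mach = machine(pipe, X, y)  # `X`, `y` assumed defined; `y` is passed to
fit!(mach)                  # the encoder as well as to the classifier
predict(mach, X)
```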
### Optional keyword arguments
- `prediction_type` -
@@ -444,9 +453,13 @@ function extend(front::Front{Pred}, ::Static, name, cache, args...)
Front(transform(mach, active(front)), front.transform, Pred())
end

function extend(front::Front{Trans}, component::Unsupervised, name, cache, args...)
function extend(front::Front{Trans}, component::Unsupervised, name, cache, ::Any, sources...)
a = active(front)
mach = machine(name, a; cache=cache)
if target_in_fit(component)
mach = machine(name, a, first(sources); cache=cache)
else
mach = machine(name, a; cache=cache)
end
Front(predict(mach, a), transform(mach, a), Trans())
end

@@ -598,6 +611,7 @@ function MMI.iteration_parameter(pipe::SupervisedPipeline)
end

MMI.target_scitype(p::SupervisedPipeline) = target_scitype(supervised_component(p))
MMI.target_in_fit(p::SomePipeline) = any(target_in_fit, components(p))

MMI.package_name(::Type{<:SomePipeline}) = "MLJBase"
MMI.load_path(::Type{<:SomePipeline}) = "MLJBase.Pipeline"
10 changes: 6 additions & 4 deletions src/data/data.jl
@@ -401,12 +401,18 @@ _isnan(x::Number) = isnan(x)

skipnan(x) = Iterators.filter(!_isnan, x)

isinvalid(x) = ismissing(x) || _isnan(x)

"""
skipinvalid(itr)
Return an iterator over the elements in `itr` skipping `missing` and
`NaN` values. Behaviour is similar to [`skipmissing`](@ref).
"""
skipinvalid(v) = v |> skipmissing |> skipnan

"""
skipinvalid(A, B)
For vectors `A` and `B` of the same length, return a tuple of vectors
@@ -417,10 +423,6 @@ always returns a vector. Does not remove `Missing` from the element
types if present in the original iterators.
"""
skipinvalid(v) = v |> skipmissing |> skipnan

isinvalid(x) = ismissing(x) || _isnan(x)

function skipinvalid(yhat, y)
mask = .!(isinvalid.(yhat) .| isinvalid.(y))
return yhat[mask], y[mask]
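For orientation, a sketch of both methods in action (qualified with the module name here, in case `skipinvalid` is not exported):

```julia
using MLJBase

y    = [1.0, missing, 3.0, NaN, 5.0]
yhat = [1.1, 2.0, missing, 4.0, 5.5]

# single-iterator form returns a lazy iterator:
collect(MLJBase.skipinvalid(y))   # [1.0, 3.0, 5.0]

# paired form drops positions invalid in *either* vector:
MLJBase.skipinvalid(yhat, y)      # ([1.1, 5.5], [1.0, 5.0])
```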
9 changes: 5 additions & 4 deletions src/data/datasets.jl
@@ -199,7 +199,7 @@ function load_smarket()
end

"""Load a well-known sunspot time series (table with one column).
[https://www.sws.bom.gov.au/Educational/2/3/6]](https://www.sws.bom.gov.au/Educational/2/3/6)
<https://www.sws.bom.gov.au/Educational/2/3/6>
"""
load_sunspots() = load_dataset("sunspots.csv", COERCE_SUNSPOTS)

@@ -250,9 +250,10 @@ macro load_crabs()
end
end

""" Load S&P Stock Market dataset, as used in (An Introduction to
Statistical Learning with applications in
R)[https://rdrr.io/cran/ISLR/man/Smarket.html](https://rdrr.io/cran/ISLR/man/Smarket.html),
"""
Load S&P Stock Market dataset, as used in
[An Introduction to Statistical Learning with applications in
R](https://rdrr.io/cran/ISLR/man/Smarket.html),
by James et al. (2013), Springer-Verlag, New York."""
macro load_smarket()
quote
22 changes: 13 additions & 9 deletions src/data/datasets_synthetic.jl
@@ -21,12 +21,12 @@ Internal function to finalize the `make_*` functions.
function finalize_Xy(X, y, shuffle, as_table, eltype, rng; clf::Bool=true)
# Shuffle the rows if required
if shuffle
X, y = shuffle_rows(X, y; rng=rng)
end
if eltype != Float64
X = convert.(eltype, X)
end
# return as matrix if as_table=false
X, y = shuffle_rows(X, y; rng=rng)
end
if eltype != Float64
X = convert.(eltype, X)
end
# return as matrix if as_table=false
as_table || return X, y
clf && return MLJBase.table(X), categorical(y)
if length(size(y)) > 1
@@ -172,7 +172,6 @@ membership to the smaller or larger circle, respectively.
* `noise=0`: standard deviation of the Gaussian noise added to the data,
* `factor=0.8`: ratio of the smaller radius over the larger one,
$(EXTRA_KW_MAKE*EXTRA_CLASSIFICATION)
### Example
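The example body is folded in this diff view. A usage sketch consistent with the keywords documented above:

```julia
using MLJBase

# 200 points on two noisy concentric circles; `y` marks circle membership:
X, y = make_circles(200; noise=0.05, factor=0.8)
```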
@@ -318,7 +317,12 @@ Make portion `s` of vector `θ` exactly 0.
"""
sparsify!(rng, θ, s) = (θ .*= (rand(rng, length(θ)) .< s))

"""Add outliers to portion s of vector."""
"""
outlify!(rng, y, s)
Add outliers to portion `s` of vector `y`.
"""
outlify!(rng, y, s) =
(n = length(y); y .+= 20 * randn(rng, n) .* (rand(rng, n) .< s))
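Both helpers are internal, but a quick sketch clarifies their in-place behavior (RNG seed illustrative):

```julia
using Random
import MLJBase

rng = MersenneTwister(123)

θ = randn(rng, 10)
MLJBase.sparsify!(rng, θ, 0.5)  # zero out entries of θ at random, in place

y = randn(rng, 100)
MLJBase.outlify!(rng, y, 0.1)   # add large-magnitude noise to roughly 10% of entries
```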

@@ -329,7 +333,7 @@ const SIGMOID_32 = log(Float32(1)/eps(Float32) - Float32(1))
sigmoid(x)
Return the sigmoid computed in a numerically stable way:
``σ(x) = 1/(1+exp(-x))``
``σ(x) = 1/(1+\\exp(-x))``
"""
function sigmoid(x::Float64)
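The function body is folded above. A plausible implementation of the stable computation, consistent with the `SIGMOID_32` constant shown (the `SIGMOID_64` analogue is assumed):

```julia
const SIGMOID_64 = log(1.0/eps(Float64) - 1.0)

# Beyond ±SIGMOID_64, exp(-x) would under/overflow the available precision,
# so the result is clamped to exactly 1 or 0:
function sigmoid(x::Float64)
    x > SIGMOID_64 && return one(x)
    x < -SIGMOID_64 && return zero(x)
    return one(x)/(one(x) + exp(-x))
end
```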
50 changes: 25 additions & 25 deletions src/hyperparam/one_dimensional_range_methods.jl
@@ -66,31 +66,31 @@ In the first case iteration is over all `values` stored in the range
iteration is over approximately `n` ordered values, generated as
follows:
(i) First, exactly `n` values are generated between `U` and `L`, with a
spacing determined by `r.scale` (uniform if `scale=:linear`) where `U`
and `L` are given by the following table:
| `r.lower` | `r.upper` | `L` | `U` |
|-------------|------------|---------------------|---------------------|
| finite | finite | `r.lower` | `r.upper` |
| `-Inf` | finite | `r.upper - 2r.unit` | `r.upper` |
| finite | `Inf` | `r.lower` | `r.lower + 2r.unit` |
| `-Inf` | `Inf` | `r.origin - r.unit` | `r.origin + r.unit` |
(ii) If a callable `f` is provided as `scale`, then a uniform spacing
is always applied in (i) but `f` is broadcast over the results. (Unlike
ordinary scales, this alters the effective range of values generated,
instead of just altering the spacing.)
(iii) If `r` is a discrete numeric range (`r isa NumericRange{<:Integer}`)
then the values are additionally rounded, with any duplicate values
removed. Otherwise all the values are used (and there are exactly `n`
of them).
(iv) Finally, if a random number generator `rng` is specified, then the values are
returned in random order (sampling without replacement), and otherwise
they are returned in numeric order, or in the order provided to the
range constructor, in the case of a `NominalRange`.
1. First, exactly `n` values are generated between `U` and `L`, with a
spacing determined by `r.scale` (uniform if `scale=:linear`) where `U`
and `L` are given by the following table:
| `r.lower` | `r.upper` | `L` | `U` |
|-------------|------------|---------------------|---------------------|
| finite | finite | `r.lower` | `r.upper` |
| `-Inf` | finite | `r.upper - 2r.unit` | `r.upper` |
| finite | `Inf` | `r.lower` | `r.lower + 2r.unit` |
| `-Inf` | `Inf` | `r.origin - r.unit` | `r.origin + r.unit` |
2. If a callable `f` is provided as `scale`, then a uniform spacing
is always applied in (1) but `f` is broadcast over the results. (Unlike
ordinary scales, this alters the effective range of values generated,
instead of just altering the spacing.)
3. If `r` is a discrete numeric range (`r isa NumericRange{<:Integer}`)
then the values are additionally rounded, with any duplicate values
removed. Otherwise all the values are used (and there are exactly `n`
of them).
4. Finally, if a random number generator `rng` is specified, then the values are
returned in random order (sampling without replacement), and otherwise
they are returned in numeric order, or in the order provided to the
range constructor, in the case of a `NominalRange`.
"""
iterator(rng::AbstractRNG, r::ParamRange, args...) =
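A usage sketch (the `range` call follows the public MLJ API; the exact values shown are indicative):

```julia
using MLJBase
using Random: MersenneTwister

# a discrete numeric range on a log scale:
r = range(Int, :n_trees, lower=10, upper=1000, scale=:log10)

iterator(r, 5)                      # log-spaced, rounded, deduplicated, e.g. [10, 32, 100, 316, 1000]
iterator(MersenneTwister(1), r, 5)  # the same values, in random order
```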
62 changes: 31 additions & 31 deletions src/machines.jl
@@ -529,7 +529,7 @@ err_missing_model(model) = ErrorException(
)

"""
last_model(mach::Machine)
last_model(mach::Machine)
Return the last model used to train the machine `mach`. This is a bona fide model, even if
`mach.model` is a symbol.
@@ -572,31 +572,31 @@ the true model given by `getproperty(composite, model)`. See also [`machine`](@r
For the action to be a no-operation, either `mach.frozen == true` or
none of the following apply:
- (i) `mach` has never been trained (`mach.state == 0`).
1. `mach` has never been trained (`mach.state == 0`).
- (ii) `force == true`.
2. `force == true`.
- (iii) The `state` of some other machine on which `mach` depends has
changed since the last time `mach` was trained (i.e., the last time
`mach.state` was incremented).
3. The `state` of some other machine on which `mach` depends has
   changed since the last time `mach` was trained (i.e., the last time
   `mach.state` was incremented).
- (iv) The specified `rows` have changed since the last retraining and
`mach.model` does not have `Static` type.
4. The specified `rows` have changed since the last retraining and
`mach.model` does not have `Static` type.
- (v) `mach.model` is a model and different from the last model used for training, but has
the same type.
5. `mach.model` is a model and different from the last model used for training, but has
the same type.
- (vi) `mach.model` is a model but has a type different from the last model used for
training.
6. `mach.model` is a model but has a type different from the last model used for
training.
- (vii) `mach.model` is a symbol and `(composite, mach.model)` is different from the last
model used for training, but has the same type.
7. `mach.model` is a symbol and `(composite, mach.model)` is different from the last
model used for training, but has the same type.
- (viii) `mach.model` is a symbol and `(composite, mach.model)` has a different type from
the last model used for training.
8. `mach.model` is a symbol and `(composite, mach.model)` has a different type from
the last model used for training.
In any of the cases (i) - (iv), (vi), or (viii), `mach` is trained ab initio. If (v) or
(vii) is true, then a training update is applied.
In any of the cases (1) - (4), (6), or (8), `mach` is trained ab initio.
If (5) or (7) is true, then a training update is applied.
To freeze or unfreeze `mach`, use `freeze!(mach)` or `thaw!(mach)`.
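A sketch of how these conditions play out (`model`, `X`, `y`, and the `lambda` hyperparameter are illustrative):

```julia
using MLJBase

mach = machine(model, X, y)   # `model`, `X`, `y` assumed defined
fit!(mach)          # trains ab initio: condition (1)
fit!(mach)          # no-op: no condition applies
model.lambda = 0.1  # mutate a hyperparameter (type unchanged)
fit!(mach)          # training *update*: condition (5)
freeze!(mach)
fit!(mach)          # no-op: mach is frozen
thaw!(mach)
```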
@@ -658,7 +658,7 @@ function fit_only!(
rows === nothing && (rows = (:))
rows_is_new = !isdefined(mach, :old_rows) || rows != mach.old_rows

condition_iv = rows_is_new && !(mach.model isa Static)
condition_4 = rows_is_new && !(mach.model isa Static)

upstream_has_changed = mach.old_upstream_state != upstream_state

@@ -672,16 +672,16 @@

# build or update cached `resampled_data` if necessary (`mach.data` is already defined
# above if needed here):
if cache_data && (!data_is_valid || condition_iv)
if cache_data && (!data_is_valid || condition_4)
mach.resampled_data = selectrows(model, rows, mach.data...)
end

# `fit`, `update`, or return untouched:
if mach.state == 0 || # condition (i)
force == true || # condition (ii)
upstream_has_changed || # condition (iii)
condition_iv || # condition (iv)
modeltype_changed # conditions (vi) or (vii)
if mach.state == 0 || # condition (1)
force == true || # condition (2)
upstream_has_changed || # condition (3)
condition_4 || # condition (4)
modeltype_changed # conditions (6) or (7)

isdefined(mach, :report) || (mach.report = LittleDict{Symbol,Any}())

@@ -709,7 +709,7 @@ function fit_only!(
rethrow()
end

elseif model != mach.old_model # condition (v)
elseif model != mach.old_model # condition (5)

# update the model:
fitlog(mach, :update, verbosity)
@@ -1044,9 +1044,10 @@ To serialise using a different format, see [`serializable`](@ref).
Machines are deserialized using the `machine` constructor as shown in
the example below.
> The implementation of `save` for machines changed in MLJ 0.18
> (MLJBase 0.20). You can only restore a machine saved using older
> versions of MLJ using an older version.
!!! note
The implementation of `save` for machines changed in MLJ 0.18
(MLJBase 0.20). You can only restore a machine saved using older
versions of MLJ using an older version.
### Example
@@ -1073,8 +1074,7 @@
general purpose serialization formats, can allow for arbitrary code
execution during loading. This means it is possible for someone
to use a JLS file that looks like a serialized MLJ machine as a
[Trojan
horse](https://en.wikipedia.org/wiki/Trojan_horse_(computing)).
[Trojan horse](https://en.wikipedia.org/wiki/Trojan_horse_(computing)).
See also [`serializable`](@ref), [`machine`](@ref).
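Since the docstring's example is largely folded above, here is a representative round trip using the documented API (file name illustrative):

```julia
using MLJBase

mach = machine(model, X, y)   # `model`, `X`, `y` assumed defined
fit!(mach)

MLJBase.save("machine.jls", mach)   # serialize to disk

mach2 = machine("machine.jls")      # deserialize; no training data attached
predict(mach2, X)                   # mach2 can predict, but cannot be
                                    # retrained without re-attaching data
```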