Skip to content

Commit 29b230f

Browse files
committed
Merge branch 'StatsBase2021' into nl/weightedstats
2 parents 850d3e6 + 1e5d2a8 commit 29b230f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+3211
-1106
lines changed

.github/workflows/CompatHelper.yml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
name: CompatHelper
2+
on:
3+
schedule:
4+
- cron: 0 0 * * *
5+
workflow_dispatch:
6+
jobs:
7+
CompatHelper:
8+
runs-on: ubuntu-latest
9+
steps:
10+
- name: "Install CompatHelper"
11+
run: |
12+
import Pkg
13+
name = "CompatHelper"
14+
uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
15+
version = "2"
16+
Pkg.add(; name, uuid, version)
17+
shell: julia --color=yes {0}
18+
- name: "Run CompatHelper"
19+
run: |
20+
import CompatHelper
21+
CompatHelper.main()
22+
shell: julia --color=yes {0}
23+
env:
24+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
25+
COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
26+
# COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}

.github/workflows/TagBot.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
name: TagBot
2+
on:
3+
schedule:
4+
- cron: 0 * * * *
5+
jobs:
6+
TagBot:
7+
runs-on: ubuntu-latest
8+
steps:
9+
- uses: JuliaRegistries/TagBot@v1
10+
with:
11+
token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/ci.yml

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,8 @@ jobs:
5252
runs-on: ubuntu-latest
5353
steps:
5454
- uses: actions/checkout@v2
55-
- uses: julia-actions/setup-julia@v1
56-
with:
57-
version: '1'
58-
- run: |
59-
julia --project=docs -e '
60-
using Pkg
61-
Pkg.develop(PackageSpec(path=pwd()))
62-
Pkg.instantiate()'
63-
- run: julia --project=docs docs/make.jl
55+
- uses: julia-actions/julia-buildpkg@latest
56+
- uses: julia-actions/julia-docdeploy@latest
6457
env:
6558
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
6659
DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}

Project.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
name = "Statistics"
2-
uuid = "20745b16-79ce-11e8-11f9-7d13ad32a3b2"
2+
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
33

44
[deps]
55
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
66
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
77
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
88

99
[extras]
10+
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
1011
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1112
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
1213

1314
[targets]
14-
test = ["Random", "Test"]
15+
test = ["Dates", "Random", "Test"]

README.md

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,16 @@
1-
## StatsBase.jl
1+
# Statistics.jl
22

3-
*StatsBase.jl* is a Julia package that provides basic support for statistics. Particularly, it implements a variety of statistics-related functions, such as scalar statistics, high-order moment computation, counting, ranking, covariances, sampling, and empirical density estimation.
3+
[![Build status](https://github.com/JuliaLang/Statistics.jl/workflows/CI/badge.svg)](https://github.com/JuliaLang/Statistics.jl/actions?query=workflow%3ACI+branch%3Amaster)
44

5-
- **Current Release**:
6-
[![StatsBase](http://pkg.julialang.org/badges/StatsBase_0.5.svg)](http://pkg.julialang.org/?pkg=StatsBase)
7-
[![StatsBase](http://pkg.julialang.org/badges/StatsBase_0.6.svg)](http://pkg.julialang.org/?pkg=StatsBase)
8-
- **Build & Testing Status:**
9-
[![Build Status](https://travis-ci.org/JuliaStats/StatsBase.jl.svg?branch=master)](https://travis-ci.org/JuliaStats/StatsBase.jl)
10-
[![Build status](https://ci.appveyor.com/api/projects/status/fsut3j3onulvws1w?svg=true)](https://ci.appveyor.com/project/nalimilan/statsbase-jl)
11-
[![Coverage Status](https://coveralls.io/repos/JuliaStats/StatsBase.jl/badge.svg?branch=master)](https://coveralls.io/r/JuliaStats/StatsBase.jl?branch=master)
12-
[![Coverage Status](http://codecov.io/github/JuliaStats/StatsBase.jl/coverage.svg?branch=master)](http://codecov.io/github/JuliaStats/StatsBase.jl?branch=master)
5+
Development repository for the Statistics standard library (stdlib) that ships with Julia.
136

14-
- **Documentation**: [![][docs-stable-img]][docs-stable-url] [![][docs-latest-img]][docs-latest-url]
7+
#### Using the development version of Statistics.jl
158

16-
[docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg
17-
[docs-latest-url]: http://JuliaStats.github.io/StatsBase.jl/latest/
9+
If you want to develop this package, do the following steps:
10+
- Clone the repo anywhere.
11+
- In line 2 of the `Project.toml` file (the line that begins with `uuid = ...`), modify the UUID, e.g. change the `107` to `207`.
12+
- Change the current directory to the Statistics repo you just cloned and start julia with `julia --project`.
13+
- `import Statistics` will now load the files in the cloned repo instead of the Statistics stdlib.
14+
- To test your changes, simply do `include("test/runtests.jl")`.
1815

19-
[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg
20-
[docs-stable-url]: http://JuliaStats.github.io/StatsBase.jl/stable/
16+
If you need to build Julia from source with a git checkout of Statistics, then instead use `make DEPS_GIT=Statistics` when building Julia. The `Statistics` repo is in `stdlib/Statistics`, and created initially with a detached `HEAD`. If you're doing this from a pre-existing Julia repository, you may need to `make clean` beforehand.

docs/src/empirical.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22

33
## Histograms
44

5-
The `Histogram` type represents data that has been tabulated into intervals
6-
(known as *bins*) along the real line, or in higher dimensions, over the real
7-
plane.
5+
```@docs
6+
Histogram
7+
```
88

99
Histograms can be fitted to data using the `fit` method.
1010

docs/src/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@ corrections where necessary.
1313
Pages = ["weights.md", "scalarstats.md", "cov.md", "robust.md", "ranking.md",
1414
"empirical.md"]
1515
Depth = 2
16-
```
16+
```

docs/src/scalarstats.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,4 +71,4 @@ modes
7171

7272
```@docs
7373
describe
74-
```
74+
```

docs/src/weights.md

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,91 @@ w = ProbabilityWeights([0.2, 0.1, 0.3])
6464
w = pweights([0.2, 0.1, 0.3])
6565
```
6666

67+
### `UnitWeights`
68+
69+
Unit weights are a special case in which all observations are given a weight equal to `1`. Using such weights is equivalent to computing unweighted statistics.
70+
71+
This type can notably be used when implementing an algorithm so that only a weighted variant has to be written. The unweighted variant is then obtained by passing a `UnitWeights` object. This is very efficient since no weights vector is actually allocated.
72+
73+
```julia
74+
w = uweights(3)
75+
w = uweights(Float64, 3)
76+
```
77+
6778
### `Weights`
6879

69-
The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights` and `ProbabilityWeights`.
80+
The `Weights` type describes a generic weights vector which does not support all operations possible for `FrequencyWeights`, `AnalyticWeights`, `ProbabilityWeights` and `UnitWeights`.
7081

7182
```julia
7283
w = Weights([1., 2., 3.])
7384
w = weights([1., 2., 3.])
7485
```
7586

87+
### Exponential weights: `eweights`
88+
89+
Exponential weights are a common form of temporal weights which assign exponentially decreasing
90+
weights to past observations.
91+
92+
If `t` is a vector of temporal indices then for each index `i` we compute the weight as:
93+
94+
``λ (1 - λ)^{1 - i}``
95+
96+
``λ`` is a smoothing factor or rate parameter such that ``0 < λ ≤ 1``.
97+
As this value approaches 0, the resulting weights will be almost equal,
98+
while values closer to 1 will put greater weight on the tail elements of the vector.
99+
100+
For example, the following call generates exponential weights for ten observations with ``λ = 0.3``.
101+
```julia-repl
102+
julia> eweights(1:10, 0.3)
103+
10-element Weights{Float64,Float64,Array{Float64,1}}:
104+
0.3
105+
0.42857142857142855
106+
0.6122448979591837
107+
0.8746355685131197
108+
1.249479383590171
109+
1.7849705479859588
110+
2.549957925694227
111+
3.642797036706039
112+
5.203995766722913
113+
7.434279666747019
114+
```
115+
116+
Simply passing the number of observations `n` is equivalent to passing in `1:n`.
117+
118+
```julia-repl
119+
julia> eweights(10, 0.3)
120+
10-element Weights{Float64,Float64,Array{Float64,1}}:
121+
0.3
122+
0.42857142857142855
123+
0.6122448979591837
124+
0.8746355685131197
125+
1.249479383590171
126+
1.7849705479859588
127+
2.549957925694227
128+
3.642797036706039
129+
5.203995766722913
130+
7.434279666747019
131+
```
132+
133+
Finally, you can construct exponential weights from an arbitrary subset of timestamps within a larger range.
134+
135+
```julia-repl
136+
julia> t
137+
2019-01-01T01:00:00:2 hours:2019-01-01T05:00:00
138+
139+
julia> r
140+
2019-01-01T01:00:00:1 hour:2019-01-02T01:00:00
141+
142+
julia> eweights(t, r, 0.3)
143+
3-element Weights{Float64,Float64,Array{Float64,1}}:
144+
0.3
145+
0.6122448979591837
146+
1.249479383590171
147+
```
148+
149+
NOTE: This is equivalent to `eweights(something.(indexin(t, r)), 0.3)`: for each value in `t`, `indexin` returns the index of that value in `r`.
150+
Since `indexin` returns `nothing` if there is no corresponding value from `t` in `r` we use `something` to eliminate that possibility.
151+
76152
## Methods
77153

78154
`AbstractWeights` implements the following methods:
@@ -90,9 +166,12 @@ AbstractWeights
90166
AnalyticWeights
91167
FrequencyWeights
92168
ProbabilityWeights
169+
UnitWeights
93170
Weights
94171
aweights
95172
fweights
96173
pweights
174+
eweights
175+
uweights
97176
weights
98-
```
177+
```

perf/sampling.jl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ using StatsBase
66

77
import StatsBase: direct_sample!, xmultinom_sample!
88
import StatsBase: knuths_sample!, fisher_yates_sample!, self_avoid_sample!
9-
import StatsBase: seqsample_a!, seqsample_c!
9+
import StatsBase: seqsample_a!, seqsample_c!, seqsample_d!
1010

1111
### generic sampling benchmarking
1212

@@ -42,6 +42,9 @@ tsample!(s::Seq_A, a, x) = seqsample_a!(a, x)
4242
mutable struct Seq_C <: NoRep end
4343
tsample!(s::Seq_C, a, x) = seqsample_c!(a, x)
4444

45+
# Tag type used to select the `seqsample_d!` sampler through `tsample!` dispatch.
mutable struct Seq_D <: NoRep end
46+
# Forward to `seqsample_d!(a, x)`; the tag argument `s` is only used for dispatch.
tsample!(s::Seq_D, a, x) = seqsample_d!(a, x)
47+
4548
mutable struct Sample_NoRep <: NoRep end
4649
tsample!(s::Sample_NoRep, a, x) = sample!(a, x; replace=false, ordered=false)
4750

@@ -87,6 +90,7 @@ const procs2 = Proc[ SampleProc{Knuths}(),
8790
SampleProc{Sample_NoRep}(),
8891
SampleProc{Seq_A}(),
8992
SampleProc{Seq_C}(),
93+
SampleProc{Seq_D}(),
9094
SampleProc{Sample_NoRep_Ord}() ]
9195

9296
const cfgs2 = (Int, Int)[]
@@ -110,4 +114,3 @@ println("Sampling Without Replacement")
110114
println("===================================")
111115
show(rtable2; unit=:mps, cfghead="(n, k)")
112116
println()
113-

src/Statistics.jl

Lines changed: 24 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ export std, stdm, var, varm, mean!, mean,
1818
# moments.jl
1919
skewness, kurtosis,
2020
# weights.jl
21-
AbstractWeights, Weights, AnalyticWeights, FrequencyWeights, ProbabilityWeights,
22-
weights, aweights, fweights, pweights,
21+
AbstractWeights, Weights, AnalyticWeights, FrequencyWeights, ProbabilityWeights, UnitWeights,
22+
weights, aweights, eweights, fweights, pweights, uweights,
2323
# scalarstats.jl
2424
geomean, harmmean, genmean, mode, modes, percentile, span, variation, sem, mad, mad!,
2525
iqr, genvar, totalvar, entropy, renyientropy, crossentropy, kldivergence, describe,
@@ -264,6 +264,16 @@ _mean(::typeof(identity), A::AbstractArray, dims::Colon, w::AbstractArray) =
264264
_mean(::typeof(identity), A::AbstractArray, dims, w::AbstractArray) =
265265
_mean!(Base.reducedim_init(t -> (t*zero(eltype(w)))/2, Base.add_sum, A, dims), A, w)
266266

267+
# Mean of `A` along `dims` under unit weights.
#
# Unit weights give every observation a weight of one, so after checking that
# `w` has as many entries as `A` has elements along `dims`, the work is handed
# to the unweighted `mean`. Throws `DimensionMismatch` when the sizes differ.
function _mean(::typeof(identity), A::AbstractArray, dims, w::UnitWeights)
    if size(A, dims) != length(w)
        throw(DimensionMismatch("Inconsistent array dimension."))
    end
    return mean(A, dims=dims)
end
271+
272+
# Mean over all elements of `A` (`dims = :`) under unit weights.
#
# With unit weights the weighted mean equals the plain mean, so once the number
# of weights has been checked against the number of elements of `A`, the
# unweighted `mean` is returned. Throws `DimensionMismatch` when they differ.
function _mean(::typeof(identity), A::AbstractArray, dims::Colon, w::UnitWeights)
    if length(A) != length(w)
        throw(DimensionMismatch("Inconsistent array dimension."))
    end
    return mean(A)
end
276+
267277
##### variances #####
268278

269279
# faster computation of real(conj(x)*y)
@@ -451,78 +461,6 @@ function _varm(A::AbstractArray{T}, m, corrected::Bool, dims::Colon,
451461
varcorrection(w, corrected) * s
452462
end
453463

454-
"""
455-
varcorrection(n::Integer, corrected=false)
456-
457-
Compute a bias correction factor for calculating `var`, `std` and `cov` with
458-
`n` observations. Returns ``\\frac{1}{n - 1}`` when `corrected=true`
459-
(i.e. [Bessel's correction](https://en.wikipedia.org/wiki/Bessel's_correction)),
460-
otherwise returns ``\\frac{1}{n}`` (i.e. no correction).
461-
"""
462-
@inline varcorrection(n::Integer, corrected::Bool=false) = 1 / (n - Int(corrected))
463-
464-
"""
465-
varcorrection(w::Weights, corrected=false)
466-
467-
Returns ``\\frac{1}{\\sum w}`` when `corrected=false` and throws an `ArgumentError`
468-
if `corrected=true`.
469-
"""
470-
@inline function varcorrection(w::Weights, corrected::Bool=false)
471-
corrected && throw(ArgumentError("Weights type does not support bias correction: " *
472-
"use FrequencyWeights, AnalyticWeights or ProbabilityWeights if applicable."))
473-
1 / w.sum
474-
end
475-
476-
"""
477-
varcorrection(w::AnalyticWeights, corrected=false)
478-
479-
* `corrected=true`: ``\\frac{1}{\\sum w - \\sum {w^2} / \\sum w}``
480-
* `corrected=false`: ``\\frac{1}{\\sum w}``
481-
"""
482-
@inline function varcorrection(w::AnalyticWeights, corrected::Bool=false)
483-
s = w.sum
484-
485-
if corrected
486-
sum_sn = sum(x -> (x / s) ^ 2, w)
487-
1 / (s * (1 - sum_sn))
488-
else
489-
1 / s
490-
end
491-
end
492-
493-
"""
494-
varcorrection(w::FrequencyWeights, corrected=false)
495-
496-
* `corrected=true`: ``\\frac{1}{\\sum{w} - 1}``
497-
* `corrected=false`: ``\\frac{1}{\\sum w}``
498-
"""
499-
@inline function varcorrection(w::FrequencyWeights, corrected::Bool=false)
500-
s = w.sum
501-
502-
if corrected
503-
1 / (s - 1)
504-
else
505-
1 / s
506-
end
507-
end
508-
509-
"""
510-
varcorrection(w::ProbabilityWeights, corrected=false)
511-
512-
* `corrected=true`: ``\\frac{n}{(n - 1) \\sum w}`` where ``n`` equals `count(!iszero, w)`
513-
* `corrected=false`: ``\\frac{1}{\\sum w}``
514-
"""
515-
@inline function varcorrection(w::ProbabilityWeights, corrected::Bool=false)
516-
s = w.sum
517-
518-
if corrected
519-
n = count(!iszero, w)
520-
n / (s * (n - 1))
521-
else
522-
1 / s
523-
end
524-
end
525-
526464
"""
527465
var(itr; corrected::Bool=true, [weights::AbstractWeights], mean=nothing[, dims])
528466
@@ -1425,6 +1363,18 @@ function _quantile(v::AbstractArray{V}, p, sorted::Bool, alpha::Real, beta::Real
14251363
return out
14261364
end
14271365

1366+
# Quantile(s) of `v` at probabilities `p` under unit weights.
#
# Unit weights give every observation the same weight, so the computation is
# forwarded to the unweighted `quantile` after checking that `w` has one entry
# per element of `v` (a `DimensionMismatch` is thrown otherwise).
#
# `sorted`, `alpha` and `beta` are passed along explicitly: calling
# `quantile(v, p)` alone would silently replace the caller's settings with the
# defaults of `quantile`.
function _quantile(v::AbstractArray, p, sorted::Bool,
                   alpha::Real, beta::Real, w::UnitWeights)
    length(v) != length(w) && throw(DimensionMismatch("Inconsistent array dimension."))
    return quantile(v, p, sorted=sorted, alpha=alpha, beta=beta)
end
1371+
1372+
# Quantile of `v` at a single probability `p` under unit weights.
#
# Same shortcut as the method accepting an arbitrary `p`: verify that `w` has
# one entry per element of `v` (throwing `DimensionMismatch` otherwise) and
# defer to the unweighted `quantile`.
# NOTE(review): this separate `p::Real` method presumably exists to avoid a
# dispatch ambiguity with the generic `p::Real`/`w::AbstractArray` method —
# confirm against the `UnitWeights` type hierarchy.
#
# `sorted`, `alpha` and `beta` are passed along explicitly: calling
# `quantile(v, p)` alone would silently replace the caller's settings with the
# defaults of `quantile`.
function _quantile(v::AbstractArray, p::Real, sorted::Bool,
                   alpha::Real, beta::Real, w::UnitWeights)
    length(v) != length(w) && throw(DimensionMismatch("Inconsistent array dimension."))
    return quantile(v, p, sorted=sorted, alpha=alpha, beta=beta)
end
1377+
14281378
_quantile(v::AbstractArray, p::Real, sorted::Bool, alpha::Real, beta::Real,
14291379
w::AbstractArray) =
14301380
_quantile(v, [p], sorted, alpha, beta, w)[1]

0 commit comments

Comments
 (0)