From a96674d16835bcafa9c2df9b3295e7ce8351daf0 Mon Sep 17 00:00:00 2001 From: Adrian Hill Date: Tue, 12 Mar 2024 18:49:09 +0100 Subject: [PATCH] Improve documentation (#34) * Improve documentation * Rename source files * More tables in README * Split docs into user and dev docs * Add Fallback call structure diagrams * Improve Mermaid diagrams * Fix typos * Fix API ref * No duplicates * Reorder stuff --------- Co-authored-by: Guillaume Dalle <22795598+gdalle@users.noreply.github.com> --- README.md | 18 ++-- docs/Project.toml | 1 + docs/make.jl | 5 +- docs/src/api.md | 24 +++--- docs/src/design.md | 57 ------------- docs/src/developer.md | 92 +++++++++++++++++++++ docs/src/getting_started.md | 51 ++++++++++++ src/DifferentiationInterface.jl | 8 +- src/{scalar_scalar.jl => derivative.jl} | 0 src/{array_scalar.jl => gradient.jl} | 0 src/{array_array.jl => jacobian.jl} | 4 +- src/{scalar_array.jl => multiderivative.jl} | 0 12 files changed, 174 insertions(+), 86 deletions(-) delete mode 100644 docs/src/design.md create mode 100644 docs/src/developer.md create mode 100644 docs/src/getting_started.md rename src/{scalar_scalar.jl => derivative.jl} (100%) rename src/{array_scalar.jl => gradient.jl} (100%) rename src/{array_array.jl => jacobian.jl} (99%) rename src/{scalar_array.jl => multiderivative.jl} (100%) diff --git a/README.md b/README.md index e49d159bf..72993d831 100644 --- a/README.md +++ b/README.md @@ -17,14 +17,16 @@ It supports in-place versions of every operator, and ensures type stability when We support some of the backends defined by [ADTypes.jl](https://github.com/SciML/ADTypes.jl): -- [ChainRulesCore.jl](https://github.com/JuliaDiff/ChainRulesCore.jl) with `AutoChainRules(ruleconfig)` -- [Diffractor.jl](https://github.com/JuliaDiff/Diffractor.jl) with `AutoDiffractor()` -- [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl) with `AutoEnzyme(Val(:forward))` or `AutoEnzyme(Val(:reverse))` -- [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl) with `AutoFiniteDiff()` -- [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) with `AutoForwardDiff()` -- [PolyesterForwardDiff.jl](https://github.com/JuliaDiff/PolyesterForwardDiff.jl) with `AutoPolyesterForwardDiff(; chunksize=C)` -- [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) with `AutoReverseDiff()` -- [Zygote.jl](https://github.com/FluxML/Zygote.jl) with `AutoZygote()` +| Backend | Type | +|:--------------------------------------------------------------------------------|:-----------------------------------------------------------| +| [ChainRulesCore.jl](https://github.com/JuliaDiff/ChainRulesCore.jl) | `AutoChainRules(ruleconfig)` | +| [Diffractor.jl](https://github.com/JuliaDiff/Diffractor.jl) | `AutoDiffractor()` | +| [Enzyme.jl](https://github.com/EnzymeAD/Enzyme.jl) | `AutoEnzyme(Val(:forward))` or `AutoEnzyme(Val(:reverse))` | +| [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl) | `AutoFiniteDiff()` | +| [ForwardDiff.jl](https://github.com/JuliaDiff/ForwardDiff.jl) | `AutoForwardDiff()` | +| [PolyesterForwardDiff.jl](https://github.com/JuliaDiff/PolyesterForwardDiff.jl) | `AutoPolyesterForwardDiff(; chunksize=C)` | +| [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) | `AutoReverseDiff()` | +| [Zygote.jl](https://github.com/FluxML/Zygote.jl) | `AutoZygote()` | ## Example diff --git a/docs/Project.toml b/docs/Project.toml index 43ae050ca..aae07b52d 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -5,6 +5,7 @@ DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5" 
DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" Diffractor = "9f5e2b26-1114-432f-b630-d3fe2085c51c" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +DocumenterMermaid = "a078cd44-4d9c-4618-b545-3ab9d77f9177" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" diff --git a/docs/make.jl b/docs/make.jl index d485c686a..92ebe4ca1 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,6 +2,7 @@ using Base: get_extension using DifferentiationInterface import DifferentiationInterface as DI using Documenter +using DocumenterMermaid using ADTypes using Diffractor: Diffractor @@ -64,7 +65,9 @@ makedocs(; canonical="https://gdalle.github.io/DifferentiationInterface.jl", edit_link="main", ), - pages=["Home" => "index.md", "design.md", "api.md", "backends.md"], + pages=[ + "Home" => "index.md", "getting_started.md", "api.md", "backends.md", "developer.md" + ], warnonly=:missing_docs, # missing docs for ADTypes.jl are normal ) diff --git a/docs/src/api.md b/docs/src/api.md index 0d74648c4..b33c57f4f 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -9,46 +9,42 @@ CollapsedDocStrings = true DifferentiationInterface ``` -## Utilities - -### Scalar to scalar +## Derivative ```@autodocs Modules = [DifferentiationInterface] -Pages = ["scalar_scalar.jl"] +Pages = ["src/derivative.jl"] ``` -### Scalar to array +## Multiderivative ```@autodocs Modules = [DifferentiationInterface] -Pages = ["scalar_array.jl"] +Pages = ["multiderivative.jl"] ``` -### Array to scalar +## Gradient ```@autodocs Modules = [DifferentiationInterface] -Pages = ["array_scalar.jl"] +Pages = ["gradient.jl"] ``` -### Array to array +## Jacobian ```@autodocs Modules = [DifferentiationInterface] -Pages = ["array_array.jl"] +Pages = ["jacobian.jl"] ``` -## Primitives - -### Pushforward +## Pushforward (JVP) ```@autodocs Modules = [DifferentiationInterface] Pages = ["pushforward.jl"] ``` -### Pullback +## Pullback (JVP) ```@autodocs Modules = [DifferentiationInterface] diff --git a/docs/src/design.md b/docs/src/design.md deleted file mode 100644 index f737cadb8..000000000 --- a/docs/src/design.md +++ /dev/null @@ -1,57 +0,0 @@ -# Design - -The operators defined in this package are split into two main parts: - -- the "utilities", which are sufficient for most users -- the "primitives", which are mostly relevant for experts or backend developers - -## Utilities - -Depending on the type of input and output, differentiation operators can have various names. -We choose the following terminology for the utilities we provide: - -| | **scalar output** | **array output** | -| ---------------- | ----------------- | ---------------- | -| **scalar input** | derivative | multiderivative | -| **array input** | gradient | jacobian | - -Most backends have custom implementations for all of these, which we reuse whenever possible. 
- -## Primitives - -Every utility can also be implemented from either of these two primitives: - -- the pushforward (in forward mode), computing a Jacobian-vector product -- the pullback (in reverse mode), computing a vector-Jacobian product - -## Variants - -Whenever it makes sense, four variants of the same operator are defined: - -| | **mutating** | **non-mutating** | -| --------------------- | ---------------------------------------- | ------------------------------ | -| **primal too** | `value_and_something!(storage, args...)` | `value_and_something(args...)` | -| **differential only** | `something!(storage, args...)` | `something(args...)` | - -Replace `something` with `derivative`, `multiderivative`, `gradient`, `jacobian`, `pushforward` or `pullback` to get the correct name. - -## Preparation - -In many cases, automatic differentiation can be accelerated if the function has been run at least once (e.g. to record a tape) and if some cache objects are provided. -This is a backend-specific procedure, but we expose a common syntax to achieve it. - -If you run `prepare_something(backend, f, x)`, it will create an object called `extras` containing the necessary information to speed up the `something` procedure and its variants. -You can them call `something(backend, f, x, extras)`, which should be faster than `something(backend, f, x)`. -This is especially worth it if you plan to call `something` several times in similar settings: same backend, same function, but different inputs. -You can think of it as a warm up. - -By default, all the preparation functions return `nothing`. -We do not make any guarantees on their implementation for each backend, or on the performance gains that can be expected. - -## Backend requirements - -The only requirement for a backend is to implement either [`value_and_pushforward!`](@ref) or [`value_and_pullback!`](@ref), from which the rest of the operators can be deduced. -We provide a standard series of fallbacks, but we leave it to each backend to redefine as many of the utilities as necessary to achieve optimal performance. - -Every backend we support corresponds to a package extension of DifferentiationInterface.jl (located in the `ext` subfolder). -Advanced users are welcome to code more backends and submit pull requests! diff --git a/docs/src/developer.md b/docs/src/developer.md new file mode 100644 index 000000000..7ff9dd50f --- /dev/null +++ b/docs/src/developer.md @@ -0,0 +1,92 @@ +# For AD developers + +## Backend requirements + +Every [operator](@ref operators) can be implemented from either of these two primitives: + +- the pushforward (in forward mode), computing a Jacobian-vector product +- the pullback (in reverse mode), computing a vector-Jacobian product + +The only requirement for a backend is therefore to implement either [`value_and_pushforward!`](@ref) or [`value_and_pullback!`](@ref), from which the rest of the operators can be deduced. +We provide a standard series of fallbacks, but we leave it to each backend to redefine as many of the utilities as necessary to achieve optimal performance. + +Every backend we support corresponds to a package extension of DifferentiationInterface.jl (located in the `ext` subfolder). +Advanced users are welcome to code more backends and submit pull requests! + +## Fallback call structure + +### Forward mode + +```mermaid +flowchart LR + subgraph Gradient + gradient --> value_and_gradient + value_and_gradient --> value_and_gradient! + gradient! --> value_and_gradient! 
+ end + + subgraph Jacobian + jacobian --> value_and_jacobian + value_and_jacobian --> value_and_jacobian! + jacobian! --> value_and_jacobian! + end + + subgraph Multiderivative + multiderivative --> value_and_multiderivative + value_and_multiderivative --> value_and_multiderivative! + multiderivative! --> value_and_multiderivative! + end + + subgraph Derivative + derivative --> value_and_derivative + end + + subgraph Pushforward + pushforward --> value_and_pushforward + value_and_pushforward --> value_and_pushforward! + pushforward! --> value_and_pushforward! + end + + value_and_jacobian! --> value_and_pushforward! + value_and_gradient! --> value_and_pushforward! + value_and_multiderivative! --> value_and_pushforward! + value_and_derivative --> value_and_pushforward +``` + +### Reverse mode + +```mermaid +flowchart LR + subgraph Gradient + gradient --> value_and_gradient + value_and_gradient --> value_and_gradient! + gradient! --> value_and_gradient! + end + + subgraph Jacobian + jacobian --> value_and_jacobian + value_and_jacobian --> value_and_jacobian! + jacobian! --> value_and_jacobian! + end + + subgraph Multiderivative + multiderivative --> value_and_multiderivative + value_and_multiderivative --> value_and_multiderivative! + multiderivative! --> value_and_multiderivative! + end + + subgraph Derivative + derivative --> value_and_derivative + end + + subgraph Pullback + pullback --> value_and_pullback + value_and_pullback --> value_and_pullback! + pullback! --> value_and_pullback! + end + + value_and_jacobian! --> value_and_pullback! + value_and_gradient! --> value_and_pullback! + value_and_multiderivative! --> value_and_pullback! + value_and_derivative --> value_and_pullback +``` diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md new file mode 100644 index 000000000..ff7f9afeb --- /dev/null +++ b/docs/src/getting_started.md @@ -0,0 +1,51 @@ +# Getting started + +## [Operators](@id operators) + +Depending on the type of input and output, differentiation operators can have various names. +We choose the following terminology for the ones we provide: + +| | **scalar output** | **array output** | +| ---------------- | ----------------- | ----------------- | +| **scalar input** | `derivative` | `multiderivative` | +| **array input** | `gradient` | `jacobian` | + +Most backends have custom implementations for all of these, which we reuse whenever possible. 
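
As an editorial illustration (not part of the patch hunks), here is a minimal sketch of what calling each operator from the table above looks like, assuming the `operator(backend, f, x)` argument order used elsewhere in these docs, that the operators are exported as documented, and taking ForwardDiff as one example backend from the README table:

```julia
using DifferentiationInterface
using ADTypes: AutoForwardDiff
using ForwardDiff: ForwardDiff  # load the backend package so its extension is active

backend = AutoForwardDiff()

derivative(backend, x -> x^2, 3.0)                 # scalar input, scalar output
multiderivative(backend, x -> [x, x^2], 3.0)       # scalar input, array output
gradient(backend, x -> sum(abs2, x), [1.0, 2.0])   # array input, scalar output
jacobian(backend, x -> x .^ 2, [1.0, 2.0, 3.0])    # array input, array output
```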
+ +### Variants + +Whenever it makes sense, four variants of the same operator are defined: + +| **Operator** | **non-mutating** | **mutating** | **non-mutating with primal** | **mutating with primal** | +| :---------------- | :------------------------ | :------------------------- | :---------------------------------- | :----------------------------------- | +| Derivative | [`derivative`](@ref) | N/A | [`value_and_derivative`](@ref) | N/A | +| Multiderivative | [`multiderivative`](@ref) | [`multiderivative!`](@ref) | [`value_and_multiderivative`](@ref) | [`value_and_multiderivative!`](@ref) | +| Gradient | [`gradient`](@ref) | [`gradient!`](@ref) | [`value_and_gradient`](@ref) | [`value_and_gradient!`](@ref) | +| Jacobian | [`jacobian`](@ref) | [`jacobian!`](@ref) | [`value_and_jacobian`](@ref) | [`value_and_jacobian!`](@ref) | +| Pushforward (JVP) | [`pushforward`](@ref) | [`pushforward!`](@ref) | [`value_and_pushforward`](@ref) | [`value_and_pushforward!`](@ref) | +| Pullback (VJP) | [`pullback`](@ref) | [`pullback!`](@ref) | [`value_and_pullback`](@ref) | [`value_and_pullback!`](@ref) | + +Note that scalar outputs can't be mutated, which is why `derivative` doesn't have mutating variants. + +## Preparation + +In many cases, automatic differentiation can be accelerated if the function has been run at least once (e.g. to record a tape) and if some cache objects are provided. +This is a backend-specific procedure, but we expose a common syntax to achieve it. + +| **Operator** | **preparation function** | +| :---------------- | :-------------------------------- | +| Derivative | [`prepare_derivative`](@ref) | +| Multiderivative | [`prepare_multiderivative`](@ref) | +| Gradient | [`prepare_gradient`](@ref) | +| Jacobian | [`prepare_jacobian`](@ref) | +| Pushforward (JVP) | [`prepare_pushforward`](@ref) | +| Pullback (VJP) | [`prepare_pullback`](@ref) | + +If you run `prepare_operator(backend, f, x)`, it will create an object called `extras` containing the necessary information to speed up `operator` and its variants. +This information is specific to `backend` and `f`, as well as the _type and size_ of the input `x`, but it should work with different _values_ of `x`. + +You can then call `operator(backend, f, similar_x, extras)`, which should be faster than `operator(backend, f, similar_x)`. +This is especially worth it if you plan to call `operator` several times in similar settings: you can think of it as a warm up. + +By default, all the preparation functions return `nothing`. +We do not make any guarantees on their implementation for each backend, or on the performance gains that can be expected. 
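
To make the preparation workflow concrete, here is an editorial sketch (not part of the patch). It assumes the calling conventions stated above: storage comes first in the mutating variant, and the `extras` object is appended as the last argument after preparation; the backend choice is again arbitrary.

```julia
using DifferentiationInterface
using ADTypes: AutoForwardDiff
using ForwardDiff: ForwardDiff  # load the backend package so its extension is active

f(x) = sum(abs2, x)
backend = AutoForwardDiff()

x = rand(10)
extras = prepare_gradient(backend, f, x)  # may simply return `nothing` for some backends

grad = similar(x)
for _ in 1:100
    x = rand(10)  # same type and size as before, only the values change
    value_and_gradient!(grad, backend, f, x, extras)  # reuses the preparation work
end
```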
diff --git a/src/DifferentiationInterface.jl b/src/DifferentiationInterface.jl index cce87e560..8457d2007 100644 --- a/src/DifferentiationInterface.jl +++ b/src/DifferentiationInterface.jl @@ -19,10 +19,10 @@ include("mode.jl") include("utils.jl") include("pushforward.jl") include("pullback.jl") -include("scalar_scalar.jl") -include("scalar_array.jl") -include("array_scalar.jl") -include("array_array.jl") +include("derivative.jl") +include("multiderivative.jl") +include("gradient.jl") +include("jacobian.jl") include("prepare.jl") export value_and_pushforward!, value_and_pushforward diff --git a/src/scalar_scalar.jl b/src/derivative.jl similarity index 100% rename from src/scalar_scalar.jl rename to src/derivative.jl diff --git a/src/array_scalar.jl b/src/gradient.jl similarity index 100% rename from src/array_scalar.jl rename to src/gradient.jl diff --git a/src/array_array.jl b/src/jacobian.jl similarity index 99% rename from src/array_array.jl rename to src/jacobian.jl index 62f65ea4a..eae106f22 100644 --- a/src/array_array.jl +++ b/src/jacobian.jl @@ -2,7 +2,7 @@ const JAC_NOTES = """ ## Notes Regardless of the shape of `x` and `y`, if `x` has length `n` and `y` has length `m`, then `jac` is expected to be a `m × n` matrix. -This function acts as if the input and output had been flattened with `vec`. +This function acts as if the input and output had been flattened with `vec`. """ """ @@ -57,7 +57,7 @@ end Compute the primal value `y = f(x)` and the Jacobian matrix `jac = ∂f(x)` of an array-to-array function. -$JAC_NOTES +$JAC_NOTES """ function value_and_jacobian(backend::AbstractADType, f, x::AbstractArray, args...) y = f(x) diff --git a/src/scalar_array.jl b/src/multiderivative.jl similarity index 100% rename from src/scalar_array.jl rename to src/multiderivative.jl
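
One editorial note on the `JAC_NOTES` convention touched in the `jacobian.jl` hunk above: regardless of the shapes of `x` and `y`, the Jacobian is a `length(y) × length(x)` matrix, as if both arrays had been flattened with `vec`. A short sketch of what that means in practice (backend and values chosen arbitrarily, not part of the patch):

```julia
using DifferentiationInterface
using ADTypes: AutoForwardDiff
using ForwardDiff: ForwardDiff  # load the backend package so its extension is active
using LinearAlgebra: Diagonal

f(x) = x .^ 2        # array-to-array, elementwise square
x = rand(2, 3)       # 2×3 input, so length(x) == 6

y, jac = value_and_jacobian(AutoForwardDiff(), f, x)

size(jac) == (length(y), length(x))  # (6, 6), even though x and y are 2×3 arrays
jac ≈ Diagonal(2 .* vec(x))          # d(xᵢ²)/dxᵢ = 2xᵢ along the vec ordering
```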