From 2da48d790c3fc1a469da31ddb50585ff2eea8e18 Mon Sep 17 00:00:00 2001
From: Peter Jun Park
Date: Thu, 18 Jul 2024 08:28:36 -0400
Subject: [PATCH] add spellcheck/linting from rocm-docs-core

Signed-off-by: Peter Jun Park

fix rst directives

satisfy spellcheck

fix more spelling
---
 .github/workflows/docs-linting.yml            |  16 +++
 .wordlist.txt                                 |  46 +++++++-
 docs/conceptual/pipeline-metrics.rst          |   2 +-
 docs/conceptual/vector-l1-cache.rst           |   6 +-
 docs/reference/faq.rst                        |   2 +-
 .../includes/infinity-fabric-transactions.rst | 104 +++++++++---------
 ...nstructions-per-cycle-and-utilizations.rst |  24 ++--
 docs/tutorial/includes/lds-examples.rst       |   2 +-
 .../includes/occupancy-limiters-example.rst   |   8 +-
 .../valu-arithmetic-instruction-mix.rst       |   5 +-
 .../vector-memory-operation-counting.rst      |   2 +-
 11 files changed, 137 insertions(+), 80 deletions(-)
 create mode 100644 .github/workflows/docs-linting.yml

diff --git a/.github/workflows/docs-linting.yml b/.github/workflows/docs-linting.yml
new file mode 100644
index 000000000..8563da255
--- /dev/null
+++ b/.github/workflows/docs-linting.yml
@@ -0,0 +1,16 @@
+name: Documentation
+
+on:
+  push:
+    branches:
+      - dev
+      - 'docs/*'
+  pull_request:
+    branches:
+      - dev
+      - 'docs/*'
+
+jobs:
+  call-workflow-passing-data:
+    name: Linting
+    uses: ROCm/rocm-docs-core/.github/workflows/linting.yml@develop
diff --git a/.wordlist.txt b/.wordlist.txt
index 7e513bfb0..7ff4b07e5 100644
--- a/.wordlist.txt
+++ b/.wordlist.txt
@@ -1,5 +1,41 @@
-Grafana
-MobaXterm
-Normalizations
-Omniperf's
-roofline
+AGPRs
+FLOPs
+GPUOpen
+GiB
+HIP's
+HIP’s
+IOPs
+IPC
+KiB
+LD
+Lmod
+Relatedly
+SEs
+SIG
+Transcendentals
+UID
+Utilizations
+VGPRs
+amd
+architected
+ast
+atomicAdd
+backpressure
+backpressuring
+broadcasted
+cdna
+conf
+gcn
+isa
+lookaside
+mantor
+modulefile
+modulefiles
+pdf
+sl
+substring
+typename
+untar
+utilizations
+vega
+vl
diff --git a/docs/conceptual/pipeline-metrics.rst b/docs/conceptual/pipeline-metrics.rst
index b3ac98b55..93ca144c6 100644
--- a/docs/conceptual/pipeline-metrics.rst
+++ b/docs/conceptual/pipeline-metrics.rst
@@ -478,7 +478,7 @@ VMEM instruction mix

 This section breaks down the types of vector memory (VMEM) instructions
 that were issued. Refer to the
-`Instruction Counts metrics section ` of address-processor frontend of
+`Instruction Counts metrics section ` of the address-processor front end of
 the vL1D cache for a description of these VMEM instructions.

 .. _mfma-instruction-mix:
diff --git a/docs/conceptual/vector-l1-cache.rst b/docs/conceptual/vector-l1-cache.rst
index b86dd0a0c..a004d3ebe 100644
--- a/docs/conceptual/vector-l1-cache.rst
+++ b/docs/conceptual/vector-l1-cache.rst
@@ -136,7 +136,7 @@ Busy / stall metrics

 When executing vector memory instructions, the compute unit must send an
 address (and in the case of writes/atomics, data) to the address
-processing unit. When the frontend cannot accept any more addresses, it
+processing unit. When the front end cannot accept any more addresses, it
 must backpressure the wave-issue logic for the VMEM pipe and prevent the
 issue of a vector memory instruction until a previously issued memory
 operation has been processed.
@@ -500,7 +500,7 @@ vL1D cache access metrics
 -------------------------

 The vL1D cache access metrics broadly indicate the type of requests
-incoming from the :ref:`cache frontend `, the number of requests that
+incoming from the :ref:`cache front end `, the number of requests that
 were serviced by the vL1D, and the number & type of outgoing requests to
 the :doc:`L2 cache `. In addition, this section includes the approximate
 latencies of accesses to the cache itself, along with
@@ -728,7 +728,7 @@ Omniperf reports the following vL1D data-return path metrics:
   :doc:`compute units ` on the accelerator, per
   :ref:`normalization unit `. This is expected to be the
   sum of global/generic and spill/stack stores counted by the
-  :ref:`vL1D cache-frontend `.
+  :ref:`vL1D cache front end `.

 - Instructions per normalization unit
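As context for the vL1D counters touched in this file, the three request
types the front end services map directly onto ordinary HIP code. The toy
kernel below is a minimal illustrative sketch (it is not part of the patched
files); it issues one vector read, one vector write, and one vector atomic:

.. code-block:: cpp

    #include <hip/hip_runtime.h>

    // One of each VMEM request type the vL1D front end accepts from the
    // compute unit: a read, a write, and an atomic.
    __global__ void vmem_mix(const int* in, int* out, int* counter, size_t N) {
      size_t idx = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
      if (idx < N) {
        int v = in[idx];        // vector read
        out[idx] = v + 1;       // vector write
        atomicAdd(counter, v);  // vector atomic
      }
    }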
diff --git a/docs/reference/faq.rst b/docs/reference/faq.rst
index e847c8a35..ef951a949 100644
--- a/docs/reference/faq.rst
+++ b/docs/reference/faq.rst
@@ -17,7 +17,7 @@ backend. You can do this using :ref:`database ` mode.

 Pass in the directory of your desired workload as follows.

-.. code:: shell
+.. code-block:: shell

    $ omniperf database --import -w -H -u -t

diff --git a/docs/tutorial/includes/infinity-fabric-transactions.rst b/docs/tutorial/includes/infinity-fabric-transactions.rst
index 2acd27dce..320f0d523 100644
--- a/docs/tutorial/includes/infinity-fabric-transactions.rst
+++ b/docs/tutorial/includes/infinity-fabric-transactions.rst
@@ -9,7 +9,7 @@ Infinity Fabric transactions

 The following code snippet launches a simple read-only kernel.

-.. code:: cpp
+.. code-block:: cpp

     // the main streaming kernel
     __global__ void kernel(int* x, size_t N, int zero) {
     }
   }

-This happens twice -- once as a warmup and once for analysis. Note that the
+This happens twice -- once as a warm-up and once for analysis. Note that the
 buffer ``x`` is initialized to all zeros via a call to ``hipMemcpy`` on the
 host before the kernel is ever launched. Therefore, the following conditional
 is identically false -- and thus we expect no writes.

-.. code:: cpp
+.. code-block:: cpp

    if (sum != 0) {
      ...
    }

@@ -62,7 +62,7 @@ Experiment #1: Coarse-grained, accelerator-local HBM reads

 In our first experiment, we consider the simplest possible case, a
 ``hipMalloc``\ ’d buffer that is local to our current accelerator:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf profile -n coarse_grained_local --no-roof -- ./fabric -t 1 -o 0

    Using:
@@ -129,7 +129,7 @@ In addition, we see a small amount of :ref:`uncached ` reads

 * Kernel arguments

-* Coordinate parameters (e.g., blockDim.z) that were not initialized by the
+* Coordinate parameters (e.g., ``blockDim.z``) that were not initialized by the
   hardware, etc. and may account for some of our ‘remote’ read requests
   (**17.5.4**), e.g., reading from CPU DRAM.
@@ -161,7 +161,7 @@ accelerator. Our code uses the ``hipExtMallocWithFlag`` API with the
 to set the environment variable ``HSA_FORCE_FINE_GRAIN_PCIE=1`` to enable
 this memory type.

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf profile -n fine_grained_local --no-roof -- ./fabric -t 0 -o 0

    Using:
@@ -241,7 +241,7 @@ finally resetting the device back to the default, e.g.,

 Although we have not changed our code significantly, we do see a
 substantial change in the L2-Fabric metrics:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf profile -n fine_grained_remote --no-roof -- ./fabric -t 0 -o 2

    Using:
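The allocation flow these fine-grained experiments describe can be sketched
in a few lines of HIP. This is an illustrative sketch only -- the device
index, buffer size, and omitted error handling are placeholders, not the
sample's actual code:

.. code-block:: cpp

    #include <hip/hip_runtime.h>

    // Allocate fine-grained device memory on a remote accelerator, then
    // return to the default device, as the experiment text describes.
    int* alloc_remote_finegrained(size_t N) {
      int* x = nullptr;
      hipSetDevice(2);  // temporarily select the remote accelerator
      hipExtMallocWithFlags(reinterpret_cast<void**>(&x), N * sizeof(int),
                            hipDeviceMallocFinegrained);
      hipSetDevice(0);  // reset back to the default device
      return x;
    }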
@@ -331,7 +331,7 @@ In this experiment, we move our :ref:`fine-grained `
 allocation to be owned by the CPU’s DRAM. We accomplish this by allocating
 host-pinned fine-grained memory using the ``hipHostMalloc`` API:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf profile -n fine_grained_host --no-roof -- ./fabric -t 0 -o 1

    Using:
@@ -384,8 +384,8 @@ fine-grained memory using the ``hipHostMalloc`` API:

 Here we see *almost* the same results as in the
 :ref:`previous experiment `; however, now that we are crossing
-a PCIe bus to the CPU, we see that the Infinity Fabric Read stalls (17.4.1)
-have shifted to be a PCIe stall (17.4.2). In addition, as (on this
+a PCIe bus to the CPU, we see that the Infinity Fabric Read stalls (**17.4.1**)
+have shifted to PCIe stalls (**17.4.2**). In addition, as (on this
 system) the PCIe bus has a lower peak bandwidth than the AMD Infinity
 Fabric connection between two accelerators, we once again observe an
 increase in the percentage of stalls on this interface.

    Had we performed this same experiment on an
    `MI250X system `_,
    these transactions would again have been marked as Infinity Fabric Read
-   stalls (17.4.1), as the CPU is connected to the accelerator via AMD Infinity
+   stalls (**17.4.1**), as the CPU is connected to the accelerator via AMD Infinity
    Fabric.

.. _infinity-fabric-ex5:

Experiment #5: Coarse-grained, CPU-DRAM reads
---------------------------------------------

In our next fabric experiment, we change our CPU memory allocation to be
coarse-grained. We accomplish this by passing the ``hipHostMalloc`` API the
``hipHostMallocNonCoherent`` flag, to mark the allocation as coarse-grained:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf profile -n coarse_grained_host --no-roof -- ./fabric -t 1 -o 1

    Using:
@@ -459,11 +459,11 @@ allocation as coarse-grained:
 │ 17.5.4  │ Remote Read     │ 671088645.00 │ 671088645.00 │ 671088645.00 │ Req per kernel │
 ╘═════════╧═════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛

-Here we see a similar result to our `previous
-experiment `__, with one key difference: our accesses are
-no longer marked as Uncached Read requests (17.2.3, 17.5.1), but instead
-are 64B read requests (17.5.2), as observed in our `Coarse-grained,
-accelerator-local HBM `__ experiment.
+Here we see a similar result to our
+:ref:`previous experiment `, with one key difference: our
+accesses are no longer marked as Uncached Read requests (**17.2.3, 17.5.1**), but instead
+are 64B read requests (**17.5.2**), as observed in our
+:ref:`Coarse-grained, accelerator-local HBM ` experiment.
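For reference, the two host allocations contrasted in experiments #4 and #5
differ only by a flag. A minimal sketch (buffer size and error handling are
illustrative assumptions, not the sample's code):

.. code-block:: cpp

    #include <hip/hip_runtime.h>

    // Host-pinned allocations: fine-grained (coherent) by default,
    // coarse-grained when marked non-coherent.
    void alloc_host_buffers(size_t N, int** fine, int** coarse) {
      hipHostMalloc(reinterpret_cast<void**>(fine), N * sizeof(int),
                    hipHostMallocDefault);
      hipHostMalloc(reinterpret_cast<void**>(coarse), N * sizeof(int),
                    hipHostMallocNonCoherent);
    }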
.. _infinity-fabric-ex6:

Experiment #6: Fine-grained, CPU-DRAM writes
--------------------------------------------

Thus far in our exploration of the L2-Fabric interface, we have
-primarily focused on read operations. However, in `our request flow
-diagram `__, we note that writes are counted separately. To
-obeserve this, we use the ‘-p’ flag to trigger write operations to
-fine-grained memory allocated on the host:
+primarily focused on read operations. However, in
+:ref:`our request flow diagram `, we note that writes are
+counted separately. To observe this, we use the ``-p`` flag to trigger write
+operations to fine-grained memory allocated on the host:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf profile -n fine_grained_host_write --no-roof -- ./fabric -t 0 -o 1 -p 1

    Using:
@@ -533,25 +533,29 @@ fine-grained memory allocated on the host:
 │ 17.5.10 │ Atomic                  │ 0.00         │ 0.00         │ 0.00         │ Req per kernel │
 ╘═════════╧═════════════════════════╧══════════════╧══════════════╧══════════════╧════════════════╛

-Here we notice a few changes in our request pattern: - As expected, the
-requests have changed from 64B Reads to 64B Write requests (17.5.7), -
-these requests are homed in on a “remote” destination (17.2.6, 17.5.9),
-as expected, and, - these are also counted as a single Uncached Write
-request (17.5.6).
+Here we notice a few changes in our request pattern:
+
+* As expected, the requests have changed from 64B Reads to 64B Write requests
+  (**17.5.7**),
+
+* these requests are homed in on a “remote” destination (**17.2.6, 17.5.9**), as
+  expected, and
+
+* these are also counted as a single Uncached Write request (**17.5.6**).

 In addition, there are rather significant changes in the bandwidth values
-reported: - the “L2-Fabric Write and Atomic” bandwidth metric (17.2.4)
-reports about 40GiB of data written across Infinity Fabric(tm) while, -
-the “Remote Write and Traffic” metric (17.2.5) indicates that nearly
+reported:
+
+* the “L2-Fabric Write and Atomic” bandwidth metric (**17.2.4**) reports about
+  40 GiB of data written across Infinity Fabric while,
+
+* the “Remote Write and Traffic” metric (**17.2.5**) indicates that nearly
 100% of these requests are being directed to a remote source. The precise
 meaning of these metrics is explored in the
 :ref:`subsequent experiment `.

-Finally, we note that we see no write stalls on the PCIe(r) bus
-(17.4.3). This is because writes over a PCIe(r) bus `are
-non-posted `__,
-i.e., they do not require acknowledgement.
+Finally, we note that we see no write stalls on the PCIe bus
+(**17.4.3**). This is because writes over a PCIe bus `are
+posted `_,
+that is, they do not require acknowledgement.
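As a mental model for the write stream measured here, the kernel's write
path resembles the following sketch (a hypothetical reconstruction -- the
shipped sample selects its write path via the ``-p`` flag shown above):

.. code-block:: cpp

    #include <hip/hip_runtime.h>

    // Grid-strided streaming writes; the L2 emits these as the 64B write
    // requests counted in the metrics above.
    __global__ void write_kernel(int* x, size_t N, int value) {
      const size_t offset = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
      const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
      for (size_t i = offset; i < N; i += stride) {
        x[i] = value;
      }
    }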
.. _infinity-fabric-ex7:

Experiment #7: Fine-grained, CPU-DRAM atomicAdd
-----------------------------------------------

Next, we change our experiment to instead target ``atomicAdd``
operations to the CPU’s DRAM.

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf profile -n fine_grained_host_add --no-roof -- ./fabric -t 0 -o 1 -p 2

    Using:
@@ -618,14 +622,18 @@ operations to the CPU’s DRAM.
 │ 17.5.10 │ Atomic                  │ 13421773.00 │ 13421773.00 │ 13421773.00 │ Req per kernel │
 ╘═════════╧═════════════════════════╧═════════════╧═════════════╧═════════════╧════════════════╛

 In this case, there is quite a lot to unpack:
+
+- For the first time, the 32B Write requests (**17.5.5**) are heavily used.
+
+- These correspond to Atomic requests (**17.2.7, 17.5.10**), and are counted as
+  Uncached Writes (**17.5.6**).
+
+- The L2-Fabric Write and Atomic bandwidth metric (**17.2.4**) shows about 0.4
+  GiB of traffic. For convenience, the sample reduces the default problem size
+  for this case due to the speed of atomics across a PCIe bus, and finally,
+
+- The traffic is directed to a remote device (**17.2.6, 17.5.9**).

 Let us consider what an “atomic” request means in this context. Recall
 that we are discussing memory traffic flowing from the L2 cache, the
 device-wide coherence point on an accelerator such as the MI250, to,
 e.g., the CPU’s DRAM. In this light, we see that these requests
 correspond to *system scope* atomics, and specifically in the case of
 the MI250, to fine-grained memory!

-.. raw:: html
-
-
+On an AMD MI2XX accelerator, for FP32 values this will generate a
+``global_store_dword`` instruction with the ``glc`` and ``slc`` bits set, as
+described in section 10.1 of the `CDNA2 ISA guide
+<https://developer.amd.com/wp-content/resources/CDNA2_Shader_ISA_4February2022.pdf>`_.
diff --git a/docs/tutorial/includes/instructions-per-cycle-and-utilizations.rst b/docs/tutorial/includes/instructions-per-cycle-and-utilizations.rst
index db25b64cf..bee4891b6 100644
--- a/docs/tutorial/includes/instructions-per-cycle-and-utilizations.rst
+++ b/docs/tutorial/includes/instructions-per-cycle-and-utilizations.rst
@@ -9,13 +9,13 @@ Omniperf.

 This example is compiled using ``c++17`` support:

-.. code:: shell
+.. code-block:: shell

    $ hipcc -O3 ipc.hip -o ipc -std=c++17

 and was run on an MI250 CDNA2 accelerator:

-.. code:: shell
+.. code-block:: shell

    $ omniperf profile -n ipc --no-roof -- ./ipc
@@ -30,7 +30,7 @@ Design note

 The kernels in this example all execute a specific assembly operation
 ``N`` times (1000, by default), for instance the ``vmov`` kernel:

-.. code:: cpp
+.. code-block:: cpp

    template <int N>
    __device__ void vmov_op() {
@@ -58,7 +58,7 @@ Now we can use our test to measure the achieved instructions-per-cycle
 of various types of instructions. We start with a simple
 :ref:`VALU ` operation, i.e., a ``v_mov_b32`` instruction, e.g.:

-.. code:: asm
+.. code-block:: asm

    v_mov_b32 v0, v1

 This instruction simply copies the contents from the source register
 (``v1``) to the destination register (``v0``). Investigating this kernel
 with Omniperf, we see:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf analyze -p workloads/ipc/mi200/ --dispatch 7 -b 11.2
    <...>
@@ -267,13 +267,13 @@ we choose a ``s_nop`` instruction, which according to the

 Here we choose to use a no-op of:

-.. code:: asm
+.. code-block:: asm

    s_nop 0x0

 to make our point. Running this kernel through Omniperf yields:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf analyze -p workloads/ipc/mi200/ --dispatch 9 -b 11.2
    <...>
@@ -331,7 +331,7 @@ logical question then is, ‘what *is* this metric counting in our

 The generated assembly looks something like:

-.. code:: asm
+.. code-block:: asm

    ;;#ASMSTART
    s_nop 0x0
@@ -366,7 +366,7 @@ Next, we explore a simple `SALU ` kernel in our on-going IPC
 and utilization example. For this case, we select a simple scalar move
 operation, e.g.:

-.. code:: asm
+.. code-block:: asm

    s_mov_b32 s0, s1

 which, in analogue to our :ref:`v_mov ` example, copies the
 contents of the source scalar register (``s1``) to the destination
 scalar register (``s0``). Running this kernel through Omniperf yields:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf analyze -p workloads/ipc/mi200/ --dispatch 10 -b 11.2
    <...>
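Before looking at divergence, it may help to restate the design-note pattern
as complete code. The following is a minimal reconstruction of the ``vmov``
kernel (hypothetical -- the shipped ``ipc.hip`` sample may differ in its
details), using the ``c++17`` mode the example is compiled with:

.. code-block:: cpp

    #include <hip/hip_runtime.h>

    // Issue the same VALU instruction N times via recursive template
    // instantiation; volatile inline asm keeps the compiler from
    // optimizing the instructions away.
    template <int N>
    __device__ void vmov_op() {
      if constexpr (N >= 1) {
        int dst;
        int src = 0;
        asm volatile("v_mov_b32 %0, %1" : "=v"(dst) : "v"(src));
        vmov_op<N - 1>();
      }
    }

    template <int N = 1000>
    __global__ void vmov() {
      vmov_op<N>();
    }

The divergence experiment below wraps this same operation in a conditional
that leaves only a single lane active.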
@@ -422,7 +422,7 @@ VALU Active Threads
 ^^^^^^^^^^^^^^^^^^^

 For our final IPC/Utilization example, we consider a slight modification
 of our `v_mov ` example:

-.. code:: cpp
+.. code-block:: cpp

    template <int N>
    __global__ void vmov_with_divergence() {
@@ -434,7 +434,7 @@ That is, we wrap our :ref:`VALU ` operation inside a
 conditional where only one lane in our wavefront is active. Running this
 kernel through Omniperf yields:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf analyze -p workloads/ipc/mi200/ --dispatch 11 -b 11.2
    <...>
diff --git a/docs/tutorial/includes/lds-examples.rst b/docs/tutorial/includes/lds-examples.rst
index 2244968a8..4f6caea1c 100644
--- a/docs/tutorial/includes/lds-examples.rst
+++ b/docs/tutorial/includes/lds-examples.rst
@@ -15,7 +15,7 @@ v5.6.0, and Omniperf v2.0.0.

    $ hipcc -O3 lds.hip -o lds

-Finally, we generate our omniperf profile as:
+Finally, we generate our Omniperf profile as:

 .. code-block:: shell-session
diff --git a/docs/tutorial/includes/occupancy-limiters-example.rst b/docs/tutorial/includes/occupancy-limiters-example.rst
index e86f68620..1be165a60 100644
--- a/docs/tutorial/includes/occupancy-limiters-example.rst
+++ b/docs/tutorial/includes/occupancy-limiters-example.rst
@@ -199,7 +199,7 @@ LDS limited

 To examine an LDS limited example, we must change our kernel slightly:

-.. code:: cpp
+.. code-block:: cpp

    constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double);
    __launch_bounds__(256)
@@ -305,7 +305,7 @@ SGPR limited

 Finally, we modify our kernel once more to make it limited by
 `SGPRs `__:

-.. code:: cpp
+.. code-block:: cpp

    constexpr int sgprlim = 1;
    __launch_bounds__(1024, 8)
@@ -331,7 +331,7 @@ use ``sgprlim``) of the array to reduce VGPR/Scratch usage.

 This results in the following assembly metadata for this kernel:

-.. code:: asm
+.. code-block:: asm

    .size _Z9sgprboundiPd, .Lfunc_end3-_Z9sgprboundiPd ; -- End function
@@ -349,7 +349,7 @@ Analyzing this workload yields:

-.. code:: shell-session
+.. code-block:: shell-session

    $ omniperf analyze -p workloads/occupancy/mi200/ -b 2.1.15 6.2 7.1.5 7.1.6 7.1.7 7.1.8 7.1.9 --dispatch 5
    <...>
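To make the LDS-limited case concrete, a self-contained kernel that follows
the same sizing as the snippet above might look like this (sketch: the two
constants come from the diff; the kernel body is an illustrative assumption):

.. code-block:: cpp

    #include <hip/hip_runtime.h>

    // Statically allocating all 64 KiB of LDS means at most one workgroup
    // can be resident per compute unit, capping achievable occupancy.
    constexpr size_t fully_allocate_lds = 64ul * 1024ul / sizeof(double);

    __launch_bounds__(256)
    __global__ void lds_bound(double* out) {
      __shared__ double lds[fully_allocate_lds];
      lds[threadIdx.x] = threadIdx.x;
      __syncthreads();
      out[blockIdx.x * blockDim.x + threadIdx.x] = lds[threadIdx.x];
    }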
diff --git a/docs/tutorial/includes/valu-arithmetic-instruction-mix.rst b/docs/tutorial/includes/valu-arithmetic-instruction-mix.rst
index 123deea3a..b3bc63b42 100644
--- a/docs/tutorial/includes/valu-arithmetic-instruction-mix.rst
+++ b/docs/tutorial/includes/valu-arithmetic-instruction-mix.rst
@@ -13,7 +13,6 @@ VALU arithmetic instruction mix

 However, the actual experiment results in this section were collected
 on an :ref:`MI2XX ` accelerator.

-
 .. _valu-experiment-design:

 Design note
@@ -22,12 +21,12 @@ Design note

 This code uses a number of inline assembly instructions to cleanly
 identify the types of instructions being issued, as well as to avoid
 optimization / dead-code elimination by the compiler. While inline
-assembly is inherently unportable, this example is expected to work on
+assembly is inherently not portable, this example is expected to work on
 all GCN GPUs and CDNA accelerators.

 We reproduce a sample of the kernel as follows:

-.. code:: cpp
+.. code-block:: cpp

    // fp32: add, mul, transcendental and fma
    float f1, f2;
diff --git a/docs/tutorial/includes/vector-memory-operation-counting.rst b/docs/tutorial/includes/vector-memory-operation-counting.rst
index 1ab456a7b..1a0861bfe 100644
--- a/docs/tutorial/includes/vector-memory-operation-counting.rst
+++ b/docs/tutorial/includes/vector-memory-operation-counting.rst
@@ -46,7 +46,7 @@ Design note
 ^^^^^^^^^^^

 This section explains some of the more peculiar lines of code in the
-example, for example, the use of compiler builtins and explicit address space
+example, such as the use of compiler built-ins and explicit address space
 casting, and so forth.

 .. code-block:: cpp