diff --git a/paper/header.tex b/paper/header.tex index 2b199dcb..95c8be4d 100644 --- a/paper/header.tex +++ b/paper/header.tex @@ -1,6 +1,6 @@ % **************GENERATED FILE, DO NOT EDIT************** -\title{Extending JumpProcess.jl for fast point process simulation with time-varying intensities} +\title{Extending JumpProcesses.jl for fast point process simulation with time-varying intensities} \author[1]{Guilherme Augusto Zagatti} \author[3]{Samuel A. Isaacson} @@ -17,7 +17,7 @@ \keywords{Julia, Simulation, Jump process, Point process} \hypersetup{ -pdftitle = {Extending JumpProcess.jl for fast point process simulation with time-varying intensities}, +pdftitle = {Extending JumpProcesses.jl for fast point process simulation with time-varying intensities}, pdfsubject = {JuliaCon 2023 Proceedings}, pdfauthor = {Guilherme Augusto Zagatti, Samuel A. Isaacson, Christopher Rackauckas, Vasily Ilin, See-Kiong Ng, Stéphane Bressan}, pdfkeywords = {Julia, Simulation, Jump process, Point process}, diff --git a/paper/paper.tex b/paper/paper.tex index d47d293d..f9a800d6 100644 --- a/paper/paper.tex +++ b/paper/paper.tex @@ -11,6 +11,10 @@ \algrenewcommand\algorithmicindent{1.1em} \usepackage{mathtools} +\usepackage[normalem]{ulem} +\usepackage{lineno} +\linenumbers + \newcommand{\etal}{\textit{et al}.} \newcommand{\ie}{\textit{i}.\textit{e}.} \newcommand{\eg}{\textit{e}.\textit{g}.} @@ -47,11 +51,11 @@ \else \def\comment@@start{#1}% \ifx\comment@@start\comment@@S - \textcolor{blue}{[\textbf{#1:} #2]}% + \textcolor{blue}{[\textbf{#1:}#2]}% \else\ifx\comment@@start\comment@@G - \textcolor{magenta}{[\textbf{#1:} #2]}% + \textcolor{magenta}{[\textbf{#1:}#2]}% \else - \textcolor{teal}{[#1 #2]}% + \textcolor{teal}{[#1#2]}% \fi \fi \fi @@ -73,13 +77,17 @@ \maketitle -\abstract{Point processes model the occurrence of a countable number of random points over some support. They can model diverse phenomena, such as chemical reactions, stock market transactions and social interactions. We show that the \texttt{JumpProcesses.jl} library, which was first developed for simulating jump processes via stochastic simulation algorithms (SSAs) --- including Doob's method, Gillespie's methods, and Kinetic Monte Carlo methods --- also provides performant methods or sampling temporal point processes (TPPs). Historically, jump processes have been developed in the context of dynamical systems to describe dynamics with discrete jumps. In contrast, the development of point processes has been more focused on describing the occurrence of random events. In this paper, we bridge the gap between the treatment of point and jump process simulation. The algorithms previously included in \texttt{JumpProcesses.jl} can be mapped to three general methods developed in statistics for simulating TPPs. Our comparative exercise reveals that the library lacked an efficient algorithm for simulating processes with variable intensity rates. We develop a new simulation algorithm \texttt{Coevolve}. This is the first thinning algorithm to step in sync with model time reducing the number of time proposal rejections and allowing for new possibilities such as simulating variable-rate jump processes coupled with differential equations via thinning. 
\texttt{JumpProcesses.jl} can now simulate any point process on the real line with a non-negative, left-continuous, history-adapted and locally bounded intensity rate efficiently, enabling the library to become one of the few readily available, fast and general-purpose options for simulating TPPs.}
+\abstract{Point processes model the occurrence of a countable number of random points over some support. They can model diverse phenomena, such as chemical reactions, stock market transactions and social interactions. We show that the \texttt{JumpProcesses.jl} library, which was first developed for simulating jump processes via stochastic simulation algorithms (SSAs) --- including Doob's method, Gillespie's methods, and Kinetic Monte Carlo methods --- also provides performant methods \comment{G: for} sampling temporal point processes (TPPs). Historically, jump processes have been developed in the context of dynamical systems to describe dynamics with discrete jumps. In contrast, the development of point processes has been more focused on describing the occurrence of random events. In this paper, we bridge the gap between the treatment of point and jump process simulation. The algorithms previously included in \texttt{JumpProcesses.jl} can be mapped to three general methods developed in statistics for simulating TPPs. Our comparative exercise reveals that the library lacked an efficient algorithm for simulating processes with variable intensity rates. We develop \comment{G:\sout{a new simulation algorithm \texttt{Coevolve}. This is}} the first thinning algorithm to step in sync with model time, reducing the number of time proposal rejections and allowing for new possibilities such as simulating variable-rate jump processes coupled with differential equations\comment{G:. We implement the new algorithm in} \texttt{JumpProcesses.jl}\comment{G:, which} can now simulate any point process on the real line with a non-negative, left-continuous, history-adapted and locally bounded intensity rate efficiently, enabling the library to become one of the few readily available, fast and general-purpose options for simulating TPPs.}

\section{Introduction}

-Methods for simulating the trajectory of temporal point processes (TPPs) can be split into exact and inexact methods. Exact methods generate statistically exact realizations of each point in the process chronologically~\footnote{Some exact methods might not be completely exact since they rely on root finding approximation methods. However, we follow convention and denote all such methods as exact methods.}. This exactness provides unbiased samples, but can suffer from reduced performance when simulating systems where numerous events can fire within a short period since every single point needs to be accounted for. Inexact methods trade accuracy for speed by simulating the total number of events in successive intervals. They are popular in biochemical applications, \eg \( \tau \)-leap methods~\cite{gillespie2001}, which often require the simulation of chemical reactions in systems with large molecular populations.
+Methods for simulating the trajectory of temporal point processes (TPPs) can be split into exact and inexact methods. Exact methods generate statistically exact realizations of each point in the process chronologically~\footnote{Some exact methods might not be completely exact since they rely on root finding approximation methods. However, we follow convention and denote all such methods as exact methods.}.
This exactness provides \comment{G:\sout{unbiased}correct} samples, but can suffer from reduced performance when simulating systems where numerous events can fire within a short period since every single point needs to be accounted for. Inexact methods trade accuracy for speed by simulating the total number of events in successive intervals. They are popular in biochemical applications, \eg \( \tau \)-leap methods~\cite{gillespie2001}, which often require the simulation of chemical reactions in systems with large molecular populations.

+Previously, the development of point process simulation libraries focused primarily on univariate processes with exotic intensities, or large systems with conditionally constant intensities, but not on both. As such, there was no widely used general-purpose software for efficiently simulating \comment{G:multivariate TPPs in large systems with time-dependent rates. To enable the efficient simulation of such systems, we contribute a new simulation algorithm for multivariate TPPs. Our new method is a type of thinning algorithm that thins in sync with time. This allows the coupling of large multivariate TPPs with other algorithms that step chronologically through time, such as differential equation solvers. Our new algorithm improves the COEVOLVE algorithm from~\cite{farajtabar2017}. COEVOLVE itself can be seen as an improvement over the next reaction method of~\cite{gibson2000}. We can trace the idea of synced thinning back to Section 7.5~\cite{daley2003} where it is discussed, but no algorithmic implementation of such an idea existed until now. COEVOLVE~\cite{farajtabar2017} did not handle jump processes that belong to systems of differential equations.}

+\comment{G:Our new algorithm is implemented as the \texttt{Coevolve} aggregator in \texttt{JumpProcesses.jl}, a core sub-library of the popular \texttt{DifferentialEquations.jl} library~\cite{rackauckas2017}. It is named after COEVOLVE~\cite{farajtabar2017} as our new algorithm supersedes the original one. The new aggregator} dramatically boosts the computational performance of the library in simulating processes with intensities that have an explicit dependence on time and/or other continuous variables, significantly expanding the type of models that can be efficiently simulated by it. Widely-used point processes with such intensities include compound inhomogeneous Poisson processes, Hawkes processes, stress-release processes and piecewise deterministic Markov processes (PDMPs).

-Previously, the development of point process simulation libraries focused primarily on univariate processes with exotic intensities, or large systems with conditionally constant intensities, but not on both. As such, there was no widely used general-purpose software for efficiently simulating compound point processes in large systems with time-dependent rates. To enable the efficient simulation of such processes, we contribute a new simulation algorithm together with its implementation as the \texttt{Coevolve} aggregator in \texttt{JumpProcesses.jl}, a core sub-library of the popular \texttt{DifferentialEquations.jl} library~\cite{rackauckas2017}. Our new method is a type of thinning algorithm that thins in sync with time. This allows the coupling of large multivariate TPPs with other algorithms that step chronologically through time such as differential equation solvers. Our new algorithm improves the COEVOLVE algorithm described in~\cite{farajtabar2017} from where the new \texttt{JumpProcesses.jl} aggregator borrows its name.
The addition of Coevolve dramatically boosts the computational performance of the library in simulating processes with intensities that have an explicit dependence on time and/or other continuous variables, significantly expanding the type of models that can be efficiently simulated. Widely-used point processes with such intensities include compound inhomogeneous Poisson process, Hawkes processes, stress-release processes and piecewise deterministic Markov processes (PDMPs). Since \texttt{JumpProcesses.jl} is a member of Julia's SciML organization, it also becomes easier, and more feasible, to incorporate compound point processes with explicit time-dependent rates into a wide variety of applications and higher-level analyses. Our new additions are available as of \texttt{JumpProcesses.jl} 9.7\footnote{All examples and benchmarks in this paper use version 9.9 of the library}. +Since \texttt{JumpProcesses.jl} is a member of Julia's SciML organization, it also becomes easier, and more feasible, to incorporate compound point processes with explicit time-dependent rates into a wide variety of applications and higher-level analyses. Our new additions are available as of \texttt{JumpProcesses.jl} 9.7\footnote{All examples and benchmarks in this paper use version 9.9 of the library}. This paper starts by bridging the gap between simulation methods developed in statistics and biochemistry, which led us to the development of \texttt{Coevolve}. We briefly introduce TPPs and simulation methods for the homogeneous Poisson process, which serve as building blocks for all other simulation methods. Then, we identify and discuss three types of exact simulation methods. In the second part of this paper, we describe the algorithms implemented in \texttt{JumpProcesses.jl} and how they relate to the literature. We highlight our contribution \texttt{Coevolve}, investigate the correctness of our implementation and provide performance benchmarks to demonstrate its value. The paper concludes by discussing potential improvements. @@ -89,13 +97,13 @@ \section{The temporal point process} \begin{equation}\label{eq:lambda} \lambda^\ast (t) \equiv \lambda(t \mid H_{t^-} ) = \frac{p^\ast(t)}{1 - \int_{t_n}^{t} p^\ast(u) \, du}, \end{equation} -and conditional mark distribution, \( f^*(k | t) \) --- see~Chapter 7~\cite{daley2003}. Here \( H_{t^-} = \{ (t_n, k_n) \mid 0 \leq t_n < t \} \) denotes the internal history of the process up to but not including \( t \), the superscript \( \ast \) denotes the conditioning of any function on \( H_{t^-} \), and \( p^\ast(t) \) is the density function corresponding to the probability of an event taking place at time \( t \) given \( H_{t^-} \). We can interpret the conditional intensity as the likelihood of observing a point in the next infinitesimal unit of time, given that no point has occurred since the last observed point in \( H_{t^-} \). Lastly, the mark distribution denotes the density function corresponding to the probability of observing mark \( k \) given the occurrence of an event at time \( t \) and internal history \( H_{t^-} \). +and conditional mark distribution, \( f^*(k | t) \) --- see~Chapter 7~\cite{daley2003}. \comment{G: A mark is any random attribute associated with a point}. Here \( H_{t^-} = \{ (t_n, k_n) \mid 0 \leq t_n < t \} \) denotes the history of the process up to but not including \( t \). 
\comment{G: In other words, the history is a sequence of tuples with the timestamp and mark of each event.} The superscript \( \ast \) denotes the conditioning of any function on \( H_{t^-} \), and \( p^\ast(t) \) is the density function corresponding to the probability of an event taking place at time \( t \) given \( H_{t^-} \). We can interpret the conditional intensity as the likelihood of observing a point in the next infinitesimal unit of time, given that no point has occurred since the last observed point in \( H_{t^-} \). Lastly, the mark distribution denotes the density function corresponding to the probability of observing mark \( k \) given the occurrence of an event at time \( t \) and history \( H_{t^-} \).

\section{The homogeneous process}

A homogeneous process can be simulated using properties of the Poisson process, which allow us to describe two equivalent sampling procedures. The first procedure consists of drawing successive inter-arrival times. The distance between any two points in a homogeneous process is distributed according to the exponential distribution --- see Theorem 7.2~\cite{last2017}. Given a homogeneous process with intensity $\lambda$, the distance \( \Delta t \) between two points is distributed according to $\Delta t \sim \exp(\lambda)$. Draws from the exponential distribution can be performed by drawing from a uniform distribution in the interval $[0, 1]$. If $V \sim U[0, 1]$, then \( T = - \ln(V) / \lambda \sim \exp(\lambda) \). (Note, however, in Julia the optimized Ziggurat-based method used in the \texttt{randexp} stdlib function is generally faster than this \textit{inverse} method for sampling a unit exponential random variable.) When a point process is homogeneous, the \textit{inverse} method of Subsection~\ref{subsec:sim-inverse} reduces to this approach. Thus, we defer the presentation of this algorithm to the next section.

-The second procedure uses the fact that Poisson processes can be represented as a mixed binomial process with a Poisson mixing distribution --- see Proposition 3.5~\cite{last2017}. In particular, the total number of points of a Poisson homogeneous process in \( [0, T) \) is distributed according to \( \mathcal{N} (T) \sim \operatorname{Poisson}( \lambda T ) \) and the location of each point within the region is independently distributed according to the uniform distribution \( t_n \sim U[0, T] \).
+The second procedure uses the fact that a Poisson process can be represented as a mixed binomial process with a Poisson mixing distribution --- see Proposition 3.5~\cite{last2017}. In particular, the total number of points of a homogeneous Poisson process in \( [0, T) \) is distributed according to \( \mathcal{N} (T) \sim \operatorname{Poisson}( \lambda T ) \) and the location of each point within the region is independently distributed according to the uniform distribution \comment{G:\sout{\( t_n \sim U[0, T] \)}\( U[0, T] \)}.

% Tau leaping methods in Subsection~\ref{subsec:sim-tau} use a similar procedure to focus only on sampling the total number of points in successive intervals.

@@ -126,9 +134,9 @@ \subsection{Inverse methods} \label{subsec:sim-inverse}
 \Lambda^\ast (t_n) \equiv \tilde{t}_n \equiv \int_0^{t_n} \lambda^\ast (u) du
 \end{equation}
 The transformed data \( \tilde{t}_n \) forms a homogeneous Poisson process with unit rate. If this is the case, then the transformed intervals are distributed according to the exponential distribution.
-\begin{equation}\label{eqn:inverse}
-    \Delta \tilde{t}_n \equiv \tilde{t}_n - \tilde{t}_{n-1} \sim \exp(1)
-\end{equation}
+\comment{G:\begin{equation}\label{eqn:inverse}
+    \Delta \tilde{t}_n \equiv \tilde{t}_n - \tilde{t}_{n-1} = \int_{t_{n-1}}^{t_n} \lambda^\ast(u) du \sim \exp(1)
+\end{equation}}
 The idea is to draw realizations from the unit-rate exponential distribution and solve Equation~\ref{eqn:inverse} for \( t_n \) to determine the next event/firing time. We illustrate this in Algorithm~\ref{algo:sim-inverse} where we adapt Algorithm 7.4~\cite{daley2003}.

 Whenever the conditional intensity is constant between two points, Equation~\ref{eqn:inverse} can be solved analytically. Let \( \lambda^\ast \, (t) = \lambda_{n-1} , \forall t_{n-1} \leq t < t_n \), then

@@ -145,13 +153,14 @@ \subsection{Inverse methods} \label{subsec:sim-inverse}
 % https://math.stackexchange.com/questions/1467784/inverse-of-a-functions-integral
 The main drawback of the \textit{inverse} method is that the root finding problem defined in Equation~\ref{eqn:inverse} often requires a numerical solution. To get around a similar obstacle in the context of PDMPs, Veltz~\cite{veltz2015} proposes a change of variables in time that recasts the root finding problem into an initial value problem. He denotes his method \textit{CHV}.

-PDMPs are composed of two parts: the jump process and the piecewise ODE that changes stochastically at jump times --- see Lemaire~\etal~\cite{lemaire2018} for a formal definition. Therefore, it is easy to employ \textit{CHV} in our case by setting the ODE part to zero throughout time. Adapting from Veltz~\cite{veltz2015}, we can determine the model jump time \( t_n \) after sampling \( \Delta \tilde{t}_n \sim \exp(1) \) by solving the following initial value problem until \( \Delta \tilde{t}_n \).
+PDMPs are composed of two parts: the jump process and the piecewise ODE that changes stochastically at jump times --- see Lemaire~\etal~\cite{lemaire2018} for a formal definition.
+
+\comment{G: It is easy to employ \textit{CHV} in our case by setting the ODE part to zero throughout time. By re-arranging Equation~\ref{eqn:compensator} and Equation~\ref{eqn:inverse}, we note that there is a one-to-one mapping between \( t \) and \( \tilde{t} \), which allows us to obtain \( t(\Delta \tilde{t}_n) = \Lambda^{\ast-1} (\tilde{t}_{n-1} + \Delta \tilde{t}_n) \), which describes the law of motion for a PDMP. Adapting from Veltz~\cite{veltz2015}, we can determine the model jump time \( t_n \) after sampling \( \Delta \tilde{t}_n \sim \exp(1) \) by solving the following initial value problem until \( \Delta \tilde{t}_n \), which we denote \textit{CHV simple}.
 \begin{equation} \label{eqn:chv-simple}
-    t (0) = t_{n-1} \text{ , } \frac{d t}{d \tilde{t} } = \frac{1}{\lambda^\ast (t)}
-\end{equation}
-Looking back at Equation~\ref{eqn:compensator}, we note that it is a one-to-one mapping between \( t \) and \( \tilde{t} \) which makes it completely natural to write \( t(\Delta \tilde{t}_n) \equiv \Lambda^{\ast-1} (\tilde{t}_{n-1} + \Delta \tilde{t}_n) \).
+    t (0) = t_{n-1} \text{ , } \frac{d t}{d \tilde{t} } \, (\Delta \tilde{t}) = \frac{1}{\lambda^\ast (t)}
+\end{equation}}

-Alternatively, when the intensity function is differentiable between jumps we can go even further by recasting the jump problem as a PDMP. Let \( \lambda^\ast_n \equiv \lambda^\ast(t_n) \), then the flow \( \varphi_{t-t_n}( \lambda^\ast_n ) \) maps the initial value of the conditional intensity at time \( t_n \) to its value at time \( t \).
In other words, the flow describes the deterministic evolution of the conditional intensity function over time. Next, denote \( \mathbf{1}( \cdot ) \) as the indicator function, then the conditional intensity function can be re-written as a jump process:
+Alternatively, when the intensity function is differentiable between jumps we can go even further\comment{G:\sout{ by recasting the jump problem as a PDMP}}. Let \( \lambda^\ast_n \equiv \lambda^\ast(t_n) \), then the flow \( \varphi_{t-t_n}( \lambda^\ast_n ) \) maps the initial value of the conditional intensity at time \( t_n \) to its value at time \( t \). In other words, the flow describes the deterministic evolution of the conditional intensity function over time. Next, denote \( \mathbf{1}( \cdot ) \) as the indicator function, then the conditional intensity function can be re-written as a jump process:
 \begin{equation} \label{eqn:conditional-jump}
 \lambda^\ast (t) = \sum_{n \geq 1} \varphi_{t - t_{n-1}} ( \lambda_{n-1} ) \mathbf{1}(t_{n-1} \leq t < t_n).
 \end{equation}
@@ -162,7 +171,7 @@ \subsection{Inverse methods} \label{subsec:sim-inverse}
 \end{equation}
 where \( g: \mathbb{R}^+ \to \mathbb{R} \) is the vector field of \( \lambda^\ast \) such that \( d \lambda^\ast / dt = g( \lambda^\ast ) \).

-Based on Equation~\ref{eq:lambda}, we find that the probability of observing an interval longer than \( s \) given internal history \( H_{t^-} \) is equivalent to:
+Based on Equation~\ref{eq:lambda}, we find that the probability of observing an interval longer than \( s \) given history \( H_{t^-} \) is equivalent to:
 \begin{equation} \label{eqn:transition-rate}
 \begin{split}
 &\Pr(t_n - t_{n-1} > s \mid H_{t^-}) = 1 - \int_{t_{n-1}}^{t_{n-1} + s} p^\ast(u) du = \\
@@ -172,19 +181,19 @@ \subsection{Inverse methods} \label{subsec:sim-inverse}
 \end{equation}
 Equations~\ref{eqn:conditional-jump} and~\ref{eqn:transition-rate} define a PDMP satisfying the conditions of Theorem 3.1~\cite{veltz2015}. In this case, we find \( t_n \) by solving the following initial value problem from \( 0 \) to \( \Delta \tilde{t}_n \sim \exp(1) \).
-\begin{equation} \label{eqn:chv-full}
+\comment{G:\begin{equation} \label{eqn:chv-full}
 \begin{cases}
 \begin{aligned}
-    & \lambda^\ast (t(0)) = \lambda^\ast (t_{n-1}) \text{ , } \frac{d \lambda^\ast}{d \tilde{t}} = \frac{g(\lambda^\ast(t))}{\lambda^\ast (t)} \\
-    & t (0) = t_{n-1} \text{ , } \frac{d t}{d \tilde{t} } = \frac{1}{\lambda^\ast (t)}.
+    & \varphi_0 (\lambda^\ast_{n-1}) = \lambda^\ast (t_{n-1}) \text{ , } \frac{d}{d \tilde{t}} \, \varphi_{\Delta \tilde{t}} \, (\lambda^\ast_{n-1}) = \frac{g(\lambda^\ast(t))}{\lambda^\ast (t)} \\
+    & t (0) = t_{n-1} \text{ , } \frac{d t}{d \tilde{t} } (\Delta \tilde{t}) = \frac{1}{\lambda^\ast (t)}.
 \end{aligned}
 \end{cases}
-\end{equation}
-This problem specifies how the conditional intensity and model time evolve with respect to the transformed time. The solution to Equation~\ref{eqn:inverse} is then given by \( ( t_n = t(\Delta \tilde{t}_n), \lambda^\ast(t(\Delta \tilde{t}_n)) = \lambda^\ast(t_n)) \).
+\end{equation}}
+This problem specifies how the conditional intensity and model time evolve with respect to the transformed time. The solution to Equation~\ref{eqn:inverse} is then given by \( ( t_n = t(\Delta \tilde{t}_n), \lambda^\ast(t(\Delta \tilde{t}_n)) = \lambda^\ast(t_n)) \).
\comment{G: We denote this problem \textit{CHV full}.}

-In Algorithm~\ref{algo:sim-inverse}, we can implement the CHV method by solving either Equation~\ref{eqn:chv-simple} or Equation~\ref{eqn:chv-full} instead of Equation~\ref{eqn:inverse}. We denote the first specification as \textit{CHV simple} and the second as \textit{CHV full}. Note that \textit{CHV full} requires that the conditional intensity be piecewise differentiable. The algorithmic complexity is then determined by the ODE solver and no root-finding is required. In Section~\ref{subsec:benchmark}, we will show that there are substantial differences in performance between them with the full specification being faster. %For more discussions on the links between point processes and Markov processes see Chapter 10~\cite{daley2007}.
+In Algorithm~\ref{algo:sim-inverse}, we can implement the CHV method by solving either Equation~\ref{eqn:chv-simple} or Equation~\ref{eqn:chv-full} instead of Equation~\ref{eqn:inverse}. Note that \textit{CHV full} requires that the conditional intensity be piecewise differentiable. The algorithmic complexity is then determined by the ODE solver and no root-finding is required. In Section~\ref{subsec:benchmark}, we will show that there are substantial differences in performance between them, with the full specification being faster. %For more discussions on the links between point processes and Markov processes see Chapter 10~\cite{daley2007}.

-Another concern with Algorithm~\ref{algo:sim-inverse} is updating and drawing from the conditional mark distribution in Line~\ref{line:inverse-mark-sample}, and updating the conditional intensity in Line~\ref{line:inverse-history-update}. Assume a process with \( K \) number of marks. A naive implementation of Line~\ref{line:inverse-history-update} scales with the number of marks as \( O(K) \) since \( \lambda^\ast \) is usually constructed as the sum of \( K \) independent processes, each of which requires updating the conditional intensity rate. Likewise, drawing from the mark distribution in Line~\ref{line:inverse-mark-sample} usually involves drawing from a categorical distribution whose naive implementations also scales with the number of marks as \( O(K) \).
+Another concern with Algorithm~\ref{algo:sim-inverse} is updating and drawing from the conditional mark distribution in Line~\ref{line:inverse-mark-sample}\comment{G:\sout{, and updating the conditional intensity in Line~\ref{line:inverse-history-update}}}. Assume a process with \( K \) marks.\comment{G:\sout{ A naive implementation of Line~\ref{line:inverse-history-update} scales with the number of marks as \( O(K) \) since \( \lambda^\ast \) is usually constructed as the sum of \( K \) independent processes, each of which requires updating the conditional intensity rate. Likewise,} Naively} drawing from the mark distribution in Line~\ref{line:inverse-mark-sample} usually involves drawing from a categorical distribution whose implementations scale with the number of marks as \( O(K) \).

 Finally, Algorithm~\ref{algo:sim-inverse} is not guaranteed to terminate in finite time since one might need to sample many points before \( t_n > T \). The sampling rate can be especially high when simulating the process in a large population with self-exciting encounters. In biochemistry, Salis and Kaznessis~\cite{salis2005} partition a large system of chemical reactions into two: fast and slow reactions.
While they approximate the fast reactions with a Gaussian process, the slow reactions are solved using a variation of the inverse method. They obtain an equivalent expression for the rate of slow reactions as in Equation~\ref{eqn:inverse}, which is integrated with the Euler method.

@@ -227,7 +236,7 @@ \subsection{Thinning methods} \label{subsec:sim-thinning}

 In Line~\ref{line:u-sample} of Algorithm~\ref{algo:sim-thinning}, since the candidate interval \( u \) is itself the random inter-event interval from a TPP with conditional intensity \( \bar{B}^\ast(\cdot) \), we are back to simulating a TPP via the inverse method. Therefore, the wrong choice of \( \bar{B}^\ast(\cdot) \) could in fact deteriorate the performance of the simulation. In many applications, the bound \( \bar{B}^\ast(\cdot) \) is constant over \( [0, L^\ast(t)] \) which simplifies the simulation since then \( u \sim \exp(\bar{B}^\ast (t)) \). Alternatively, Bierkens~\etal~\cite{bierkens2019} use a Taylor approximation of \( \lambda^\ast(t) \) to obtain an upper-bound which is a linear function of \( t \)~\footnote{Their implementation of the Zig-Zag process, a type of PDMP for Markov Chain Monte Carlo, is available as a Julia package at \url{https://github.com/mschauer/ZigZagBoomerang.jl}.}.

-When the conditional intensity is constant between jumps such that \( \lambda^\ast \, (t) = \lambda_{n-1} , \forall t_{n-1} \leq t < t_n \), let \( \bar{B}^\ast(t) = \ubar{B}^\ast(t) = \lambda_{n-1} \) and \( L^\ast(t) = \infty \). We have that for any \( u \sim \exp(1 \; / \; \bar{B}^\ast(t)) = \exp(\lambda_{n-1})\) and \( v \sim U[0, 1] \), \( u < L^\ast(t) = \infty \) and \( v < \lambda^\ast \, (t + u) \; / \; \bar{B}^\ast(t) = 1 \). Therefore, we advance the internal history for every iteration of Algorithm~\ref{algo:sim-thinning}. In this case, the bound \( \bar{B}^\ast(t) \) is as tight as possible, and this method becomes equivalent to the \textit{inverse} method of Subsection~\ref{subsec:sim-inverse}.
+When the conditional intensity is constant between jumps such that \( \lambda^\ast \, (t) = \lambda_{n-1} , \forall t_{n-1} \leq t < t_n \), let \( \bar{B}^\ast(t) = \ubar{B}^\ast(t) = \lambda_{n-1} \) and \( L^\ast(t) = \infty \). We have that for any \( u \sim \exp(\bar{B}^\ast(t)) = \exp(\lambda_{n-1})\) and \( v \sim U[0, 1] \), \( u < L^\ast(t) = \infty \) and \( v < \lambda^\ast \, (t + u) \; / \; \bar{B}^\ast(t) = 1 \). Therefore, we advance the history for every iteration of Algorithm~\ref{algo:sim-thinning}. In this case, the bound \( \bar{B}^\ast(t) \) is as tight as possible, and this method becomes equivalent to the \textit{inverse} method of Subsection~\ref{subsec:sim-inverse}.

 We can draw more connections between \textit{thinning} and \textit{inversion}. Lemaire~\etal~\cite{lemaire2018} propose a version of the \textit{thinning} algorithm for PDMPs which does not use a local interval for rejection --- equivalent to \( L^\ast(t) = \infty \). They propose an optimal upper-bound \( \bar{B}^\ast(t) \) as a piecewise constant function partitioned in such a way that it envelopes the intensity function as strictly as possible. The efficiency of their algorithm depends on the assumption that the stochastic process determined by \( \bar{B}^\ast(t) \) can be efficiently inverted. They show that under certain conditions the stochastic process determined by \( \bar{B}^\ast(t) \) converges in distribution to the target conditional intensity as the partitions of the optimal boundary converge to zero.
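To make the mechanics concrete, the following sketch implements the core of a \textit{thinning} search in Julia. It is our own illustration, not library code: \texttt{lambda}, \texttt{ub} and \texttt{win} are hypothetical stand-ins for the conditional intensity \( \lambda^\ast \), the bound \( \bar{B}^\ast \) and the validity interval \( L^\ast \), and the lower-bound short-circuit of Algorithm~\ref{algo:next-time-thinning} is omitted for brevity.

\begin{verbatim}
using Random

# One thinning search: `ub(t)` must satisfy
# lambda(s) <= ub(t) for all s in [t, t + win(t)).
function next_time_via_thinning(lambda, ub, win, t, T)
    while t < T
        B, L = ub(t), win(t)
        u = randexp() / B      # candidate interval, u ~ Exp(B)
        if u > L               # bound expired before the candidate:
            t += L             # skip to the end of the window
        elseif rand() <= lambda(t + u) / B
            return t + u       # candidate accepted
        else
            t += u             # candidate rejected, search onwards
        end
    end
    return T                   # no event before the horizon
end
\end{verbatim}

For instance, an inhomogeneous Poisson process with \( \lambda(t) = 1 + \sin(t) \) admits the constant bound \( 2 \) with an infinite window, so each candidate is accepted with probability \( (1 + \sin(t + u)) / 2 \).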
These results suggest that the efficiency of \textit{thinning} compared to \textit{inversion} most likely depends on the rejection rate obtained by the former and the number of steps required by the ODE solver for the latter. @@ -239,7 +248,7 @@ \subsection{Thinning methods} \label{subsec:sim-thinning} \State initialize the history \( H_{T^-} \leftarrow \{ \} \) \State set \( n \leftarrow 0, t \leftarrow 0 \) \While{true} - \State \( t \leftarrow \operatorname{TimeViaThinning}([t, T), H_{T^-}, \lambda^\ast) \) + \State \( t \leftarrow \operatorname{\textsc{TimeViaThinning}}([t, T), H_{T^-}, \lambda^\ast) \) \If{\(t \geq T \)} \State \textbf{break} \EndIf @@ -259,7 +268,7 @@ \subsection{Thinning methods} \label{subsec:sim-thinning} \begin{algorithmic}[1] \Procedure{TimeViaThinning}{\([t, T) \), \( \lambda^\ast \), \( H_{t} \),} \While{\( t < T \)} - \State update \( \lambda^\ast \) \label{line:lambda-update} + \State update \( \lambda^\ast \) \comment{G: with new \( H_{t} \)} \label{line:lambda-update} \State find \( \bar{B}^\ast (t) \), \( \ubar{B}^\ast (t) \) and \( L^\ast(t) \) which satisfy Eq.~\ref{eq:thinning-condition} \State draw candidate interval \( u \) such that \\ \hskip2.5em \( P(u > s) = \exp( - \int_0^s \bar{B}^\ast (t + s) ds ) \) \label{line:u-sample} \State draw acceptance threshold \( v \sim U[0, 1] \) @@ -267,9 +276,9 @@ \subsection{Thinning methods} \label{subsec:sim-thinning} \State \( t \leftarrow t + L^\ast(t) \) \State \textbf{next} \EndIf - \If{\( ( v \leq \ubar{B}^\ast(t + u) ) \) or \( ( v \leq \lambda^\ast \, (t + u) / \bar{B}^\ast(t + u) ) \)} \label{line:short-circuit} + \If{\comment{G: \( ( v \leq \ubar{B}^\ast(t + u) / \bar{B}^\ast(t + u)) \)} \\ \hskip2.5em or \( ( v \leq \lambda^\ast \, (t + u) / \bar{B}^\ast(t + u) ) \)} \label{line:short-circuit} \State \( t \leftarrow t + u \) - \State \Return \( t \) + \State \comment{G:\sout{\Return \( t \)}\textbf{break}} \EndIf \State \( t \leftarrow t + u \) \EndWhile @@ -288,7 +297,7 @@ \subsection{Queuing methods for multivariate processes} We prefer to call this class of methods \textit{queued thinning} methods since most efficiency gains come from maintaining a priority queue of the next event times. Up to this point we assumed that we were sampling from a global process with a mark distribution that could generate any mark \( k \) given an event at time \( t \). With queuing, it is possible to simulate point processes with a finite space of marks as \( M \) interdependent point processes --- see Definition 6.4.1~\cite{daley2003} of multivariate point processes --- doing away with the need to draw from the mark distribution at every event occurrence. Alternatively, it is possible to split the global process into \( M \) interdependent processes each of which has its own mark distribution. -Algorithm~\ref{algo:sim-queuing}, presents a method for sampling a superposed point process consisting of \( M \) processes by keeping the strike time of each process in a priority queue \( Q \). The priority queue is initially constructed in \( O(M) \) steps in Lines~\ref{line:queuing-init-begin} to~\ref{line:queuing-init-end} of Algorithm~\ref{algo:sim-queuing}. In contrast to \textit{thinning} methods, updates to the conditional intensity depend only on the size of the neighborhood of \( i \). That is, processes \( j \) whose conditional intensity depends on the history of \( i \). If the graph is sparse, then updates will be faster than with \textit{thinning}. 
+Algorithm~\ref{algo:sim-queuing} presents our \comment{G: new} method for sampling a superposed point process consisting of \( M \) processes by keeping the strike time of each process in a priority queue \( Q \). \comment{G: Thus, it is an example of \textit{queued thinning}.} The priority queue is initially constructed in \( O(M) \) steps in Lines~\ref{line:queuing-init-begin} to~\ref{line:queuing-init-end} of Algorithm~\ref{algo:sim-queuing}. In contrast to \textit{thinning} methods, updates to the conditional intensity depend only on the size of the neighborhood of \( i \). That is, on the processes \( j \) whose conditional intensity depends on the history of \( i \). If the graph is sparse, then updates will be faster than with \textit{thinning}.

 A source of inefficiency in some implementations of \textit{queued thinning} algorithms such as~\cite{farajtabar2017} is the fact that one goes through multiple rejection cycles at time \( t \) before accepting a time candidate \( t < t_i \) for process \( i \). This requires looking ahead into the future. In addition to that, if process \( j \), which \( i \) depends on, takes place before \( t_i \), then we need to repeat the whole thinning process to obtain a new time candidate for \( i \).

@@ -322,7 +331,7 @@ \subsection{Queuing methods for multivariate processes}
 \State initialize the history \( H_{T^-} \leftarrow \{ \} \)
 \State set \( n \leftarrow 0, t \leftarrow 0 \)
 \For{i=1,M} \label{line:queuing-init-begin}
-    \State \( (t_i, \bar{B}_i^\ast, \ubar{B}_i^\ast, a_i) \leftarrow \operatorname{QueueTime}(0, H_{T^-}, \lambda_{i}^\ast(\cdot)) \) \label{line:candidate-one}
+    \State \( (t_i, \bar{B}_i^\ast, \ubar{B}_i^\ast, a_i) \leftarrow \operatorname{\textsc{QueueTime}}(0, H_{T^-}, \lambda_{i}^\ast(\cdot)) \) \label{line:candidate-one}
 \State push \( (i, t_i, \bar{B}_i^\ast, \ubar{B}_i^\ast, a_i) \) to \( Q \)
 \EndFor \label{line:queuing-init-end}
 \While{\( t < T \)}
@@ -331,8 +340,8 @@ \subsection{Queuing methods for multivariate processes}
 \If{\(t \geq T \)}
 \State \textbf{break}
 \EndIf
-    \State draw \( v \sim U[0, \bar{B}_i^\ast] \)
-    \If{\( a_i \) and \( ( v > \ubar{B}_i^\ast ) \) and \( ( v > \lambda^\ast \, (t) ) \)}
+    \State draw \comment{G: \( v \sim U[0, 1] \)}
+    \If{\( a_i \) and \comment{G: \( ( v > \ubar{B}_i^\ast / \bar{B}_i^\ast ) \) and \( ( v > \lambda^\ast \, (t) / \bar{B}_i^\ast ) \)}}
 \State \( a_i \leftarrow \operatorname{false} \)
 \EndIf
 \If{ \( a_i \)}
@@ -366,25 +375,31 @@ \subsection{Queuing methods for multivariate processes}

 \section{Implementation} \label{sec:implementation}

-\texttt{JumpProcesses.jl} is a Julia library for simulating jump --- or point --- processes which is part of Julia's SciML organization.
In the Julia ecosystem, there are other libraries that can sample certain TPPs including \path{Hawkes.jl} \footnote{\url{https://github.com/em1234321/Hawkes.jl}}, \path{HawkesProcesses.jl}~\footnote{\url{https://github.com/dm13450/HawkesProcesses.jl}}, \path{NetworkHawkesProcesses.jl} \footnote{\url{https://github.com/cswaney/NetworkHawkesProcesses.jl}}, \path{PointProcessInference.jl} \cite{schauer2020} \footnote{\url{https://github.com/mschauer/PointProcessInference.jl}}, \path{GeoStats.jl} \cite{hoffimann2020} \footnote{\url{https://github.com/JuliaEarth/GeoStats.jl}}, \path{PiecewiseDeterministicMarkovProcesses.jl}~\cite{veltz2015}~\footnote{\url{https://github.com/rveltz/PiecewiseDeterministicMarkovProcesses.jl}}, and \path{PointProcesses.jl}~\cite{dalle2024} \footnote{\url{https://github.com/gdalle/PointProcesses.jl}}. Apart from \texttt{PiecewiseDeterministicMarkovProcesses.jl}, these other libraries can only sample the Poisson and/or the Hawkes processes. \texttt{PointProcesses.jl} also offers a formalized interface that other packages can implement to leverage its TPP modelling functionality. While \texttt{JumpProcesses.jl} can be used to directly simulate TPPs, in its documentation we also show how it can be wrapped to conform to this interface \footnote{\url{https://docs.sciml.ai/JumpProcesses/stable/applications/advanced_point_process}}.
+\texttt{JumpProcesses.jl} is a Julia library for simulating jump --- or point --- processes which is part of Julia's SciML organization. In the Julia ecosystem, there are other libraries that can sample certain TPPs including \path{Hawkes.jl} \footnote{\url{https://github.com/em1234321/Hawkes.jl}}, \path{HawkesProcesses.jl}~\footnote{\url{https://github.com/dm13450/HawkesProcesses.jl}}, \path{NetworkHawkesProcesses.jl} \footnote{\url{https://github.com/cswaney/NetworkHawkesProcesses.jl}}, \path{PointProcessInference.jl} \cite{schauer2020} \footnote{\url{https://github.com/mschauer/PointProcessInference.jl}}, \path{GeoStats.jl} \cite{hoffimann2020} \footnote{\url{https://github.com/JuliaEarth/GeoStats.jl}}, \path{PiecewiseDeterministicMarkovProcesses.jl}~\cite{veltz2015}~\footnote{\url{https://github.com/rveltz/PiecewiseDeterministicMarkovProcesses.jl}}, and \path{PointProcesses.jl}~\cite{dalle2024} \footnote{\url{https://github.com/gdalle/PointProcesses.jl}}. Apart from \texttt{PiecewiseDeterministicMarkovProcesses.jl}, these other libraries can only sample the Poisson and/or the Hawkes processes. \texttt{PointProcesses.jl} also offers a formalized interface that other packages can implement to leverage its TPP modelling functionality. While \texttt{JumpProcesses.jl} can be used to directly simulate TPPs, in its documentation we also show how it can be wrapped to conform to this interface \footnote{\url{https://docs.sciml.ai/JumpProcesses/stable/applications/advanced_point_process}}.\comment{G: Outside of Julia, there are many packages to simulate TPPs. A non-exhaustive list includes the Python libraries \path{Tick}~\cite{bacry2018}, \path{PoPPy}~\cite{xu2019a}, \path{hawkesbook}~\cite{laub2021}, and the R library \path{PtProcess}~\cite{harte2010}.}
+
+Our discussion in Section~\ref{sec:act} identified three exact methods for simulating point processes. In all cases, we identified two mathematical constructs required for simulation: the intensity rate and the mark distribution. In \texttt{JumpProcesses.jl}, these can be mapped to user defined functions \texttt{rate(u, p, t)} and \texttt{affect!(integrator)}.
The former takes the current state of the system, \texttt{u}, user provided parameters, \texttt{p}, and the current time, \texttt{t}, and returns the value of the intensity function at this time. The latter takes the solver \texttt{integrator} object, which stores all solution information, and updates it, including the state \texttt{integrator.u}, for whatever changes should occur when the jump it encodes fires at the time \texttt{integrator.t}. The library provides APIs for defining processes based on the nature of the intensity rate and the intended simulation algorithm. Processes simulated using exact sampling methods can choose between \texttt{ConstantRateJump} and \texttt{VariableRateJump}. While the former expects the rate between jumps to be constant, the latter allows for time-dependent rates. The library also provides the \texttt{MassActionJump} API to define large systems of point processes that can be expressed as mass action type reaction equations. Finally, \texttt{RegularJump} is intended for \comment{G:\sout{tau}\( \tau \)}-leaping methods. -Our discussion in Section~\ref{sec:act} identified three exact methods for simulating point processes. In all the cases, we identified two mathematical constructs required for simulation: the intensity rate and the mark distribution. In \texttt{JumpProcesses.jl}, these can be mapped to user defined functions \texttt{rate(u, p, t)} and \texttt{affect!(integrator)}. The former takes the current state of the system, \texttt{u}, user provided parameters, \texttt{p}, and the current time, \texttt{t}, and returns the value of the intensity function at this time. The latter takes the solver \texttt{integrator} object, which stores all solution information, and updates it, including the state \texttt{integrator.u}, for whatever changes should occur when the jump it encodes fires at the time \texttt{integrator.t}. The library provides APIs for defining processes based on the nature of the intensity rate and the intended simulation algorithm. Processes simulated using exact sampling methods can choose between \texttt{ConstantRateJump} and \texttt{VariableRateJump}. While the former expects the rate between jumps to be constant, the latter allows for time-dependent rates. The library also provides the \texttt{MassActionJump} API to define large systems of point processes that can be expressed as mass action type reaction equations. Finally, \texttt{RegularJump} is intended for tau-leaping methods. +\subsection{Inverse implementation} -The \textit{inverse} method as described around Equation~\ref{eqn:inverse} uses root finding to calculate the next jump time. Jumps to be simulated via the \textit{inverse} method must be initialized as a \texttt{VariableRateJump}. \texttt{JumpProcesses.jl} builds a continuous callback following the algorithm in~\cite{salis2005} and passes the problem to an \texttt{OrdinaryDiffEq.jl} integrator, which easily interoperates with \texttt{JumpProcesses.jl} (both libraries are part of the \textit{SciML} organization, and by design built to easily compose). \texttt{JumpProcesses.jl} does not yet support the CHV ODE based approach. +The \textit{inverse} method as described around Equation~\ref{eqn:inverse} uses root finding to calculate the next jump time. Jumps to be simulated via the \textit{inverse} method must be initialized as a \texttt{VariableRateJump}. 
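For concreteness, the snippet below declares such a jump and couples it to an ODE. It is a minimal sketch: the constructors mirror the library's documented usage, while the model itself --- an intensity that tracks a growing continuous variable and is halved at every event --- is purely illustrative.

\begin{verbatim}
using JumpProcesses, OrdinaryDiffEq

rate(u, p, t) = u[1]             # intensity follows the ODE state
affect!(integrator) = (integrator.u[1] /= 2)  # halved at each jump
jump = VariableRateJump(rate, affect!)

f!(du, u, p, t) = (du[1] = u[1])  # ODE part between jumps
prob = ODEProblem(f!, [0.2], (0.0, 10.0))
jprob = JumpProblem(prob, Direct(), jump)
sol = solve(jprob, Tsit5())
\end{verbatim}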
\texttt{JumpProcesses.jl} builds a continuous callback following the algorithm in~\cite{salis2005} and passes the problem to an \texttt{OrdinaryDiffEq.jl} integrator, which easily interoperates with \texttt{JumpProcesses.jl} (both libraries are part of the \textit{SciML} organization, and by design built to easily compose). \texttt{JumpProcesses.jl} does not yet support \comment{G:\sout{the} any of the} CHV approaches. -Alternatively, \textit{thinning} methods can be simulated via discrete steps. In \texttt{JumpProcesses.jl}, simulation approaches that take discrete steps are handled via discrete callbacks that are checked at the end of each time-step of some time evolution algorithm, \eg an ODE solver from \texttt{OrdinaryDiffEq.jl}, a stochastic differential equation solver from \texttt{StochasticDiffEq.jl}, or the pure-jump process \texttt{SSAStepper} provided by \texttt{JumpProcesses.jl}. In simple terms, discrete callbacks involve two functions. Condition functions are checked at each step of the main loop of a time-stepping method to see if the callback should be executed, and if it should, an associated affect function is called. In the context of the library, any method that uses thinning via a discrete callback is called an \textit{aggregator}. There are twelve different aggregators which we discuss below and are summarized in Table~\ref{tab:aggregators} in the \hyperref[sec:annex]{Annex}. +\subsection{Thinning implementation} + +Alternatively, \textit{thinning} methods can be simulated via discrete steps. \comment{G: In \texttt{JumpProcesses.jl}, simulation approaches that take discrete steps are handled via discrete callbacks that are checked at the end of each time-step of some time evolution algorithm, \eg an ODE solver from \texttt{OrdinaryDiffEq.jl}, a stochastic differential equation solver from \texttt{StochasticDiffEq.jl}, or the pure-jump process \texttt{SSAStepper} provided by \texttt{JumpProcesses.jl}. In simple terms, discrete callbacks involve two functions. Condition functions are checked at each step of the main loop of a time-stepping method to see if the callback should be executed, and if it should, an associated affect function is called.} In the context of the library, any method that uses thinning via a discrete callback is called an \textit{aggregator}. There are twelve different aggregators which we discuss below and are summarized in Table~\ref{tab:aggregators} in the \hyperref[sec:annex]{Annex}. \comment{G: At the moment, it is not necessarily the case that one method supersedes the other. There are cases in which a particular method might be faster than others.} We start with constant rate \textit{thinning} aggregators for marked TPPs. Algorithm~\ref{algo:sim-thinning} assumes that there is a single process. In reality, all the implementations first assume a finite multivariate point process with \( M \) interdependent sub-processes. This can be easily conciliated, as we do now, using Definition 6.4.1~\cite{daley2003} which states the equivalence of such process with a point process with a finite space of marks. As all the constant rate \textit{thinning} aggregators only support \texttt{ConstantRateJump}s and \texttt{MassActionJump}s, \ie the intensity between jumps is constant, Algorithm~\ref{algo:next-time-thinning} short-circuits to quickly return \( t \sim \exp(\bar{B}) = \exp(\lambda_n) \) as discussed in Subsection~\ref{subsec:sim-thinning}. Next, the mark distribution becomes the categorical distribution weighted by the intensity of each process. 
That is, given an event at time \( t_n \), we have that the probability of drawing process \( i \) out of \( M \) sub-processes is \( \lambda_i^\ast (t_n) / \lambda^\ast (t_n) \). Conditional on sub-process \( i \), the corresponding \texttt{affect!(integrator)} is invoked, that is, \( k_n \sim f_i^\ast (k \mid t_n) \). So all sub-processes could potentially be marked, but note users need to handle any additional sampling related to such marks within their provided \texttt{affect!} function. Where most implementations differ is on updating the mark distribution in Line~\ref{line:thinning-mark-sample} of Algorithm~\ref{algo:sim-thinning} and the conditional intensity rate in Line~\ref{line:lambda-update} of Algorithm~\ref{algo:next-time-thinning}.

-\texttt{Direct} and \texttt{DirectFW} follow the \textit{direct} method in~\cite{gillespie1976} which re-evaluates all intensities after every iteration scaling at \( O(K) \). It draws the next-time from the ground process whose rate is the sum of all sub-processes' rates. It selects the mark by executing a search in an array that stores the cumulative sum of rates.
+\texttt{Direct} \comment{G:\sout{and \texttt{DirectFW}}} follows the \textit{direct} method in~\cite{gillespie1976}, which re-evaluates all intensities after every iteration, scaling as \( O(K) \). It draws the next time from the ground process whose rate is the sum of all sub-processes' rates. It selects the mark by executing a search in an array that stores the cumulative sum of rates.

 \texttt{SortingDirect}, \texttt{RDirect}, and \texttt{DirectCR} are improvements over the \texttt{Direct} method. They only re-evaluate the intensities of the processes that are affected by the realized process based on a dependency graph. \texttt{SortingDirect} draws from the ground process, but keeps the intensity rate in a loosely sorted array following~\cite{mccollum2006}. \texttt{RDirect} is a rejection-based direct method which assigns the maximum rate of the system as the bound to all processes. The implementation slightly differs from Algorithm~\ref{algo:sim-thinning}. Since all sub-processes have the same rate, it draws the next time from a homogeneous Poisson process with the maximum rate, then randomly selects a candidate process and confirms the candidate only if its rate is above a random proportion of the maximum rate. \texttt{DirectCR} --- from~\cite{slepoy2008} --- is a composition-rejection method that groups sub-processes with similar rates using a priority table. Each group is assigned the sum of all the rates within it. We apply a routine equivalent to \texttt{Direct} to select the time in which the next group fires. Given a group, we then select which process fires.

 \texttt{RSSA} and \texttt{RSSACR} place processes in bounded brackets. \texttt{RSSA} --- from~\cite{thanh2014} --- follows Algorithm~\ref{algo:sim-thinning} very closely in the case where the bounds are constant between jumps. \texttt{RSSACR} --- from~\cite{thanh2017} --- groups sub-processes with similar rates like \texttt{DirectCR}, but then places each group within a bounded bracket. It then samples the next group to fire similarly to \texttt{RSSA}. Given the group, it selects a candidate to fire and performs a thinning routine to accept or reject.

-Finally, we have what we call the \textit{queued thinning} aggregators. Starting with aggregators that only support \texttt{ConstantRateJump}s we have, \texttt{FRM}, \texttt{FRMFW} and \texttt{NRM}.
\texttt{FRM} and \texttt{FRMFW} follow the \textit{first reaction} method in~\cite{gillespie1976}. To compute the next jump, both algorithms compute the time to the next event for each process and select the process with minimum time. This is equivalent to assuming a complete dependency graph in Algorithm~\ref{algo:sim-queuing}. For large systems, these methods are inefficient compared to \texttt{NRM} which is a \texttt{queued thinning} method sourced from~\cite{gibson2000}. \texttt{NRM} gains efficiency by using an indexed priority queue to store and determine next event times, and by using dependency graphs to only update intensities that would need to be recalculated after a given event.
+\subsection{Queued thinning implementation}
+
+Finally, we have what we call the \textit{queued thinning} aggregators. Starting with aggregators that only support \texttt{ConstantRateJump}s we have \texttt{FRM}\comment{G: \sout{, \texttt{FRMFW}}} and \texttt{NRM}. \texttt{FRM}\comment{G:\sout{ and \texttt{FRMFW}}} follows the \textit{first reaction} method in~\cite{gillespie1976}. To compute the next jump, it computes the time to the next event for each process and selects the process with minimum time. This is equivalent to assuming a complete dependency graph in Algorithm~\ref{algo:sim-queuing}. For large systems, this method is inefficient compared to \texttt{NRM}, which is a \comment{G:\sout{\texttt{queued thinning}}\textit{queued thinning}} method sourced from~\cite{gibson2000}. \texttt{NRM} gains efficiency by using an indexed priority queue to store and determine next event times, and by using dependency graphs to only update intensities that would need to be recalculated after a given event.

 Most of the algorithms implemented in \texttt{JumpProcesses.jl} come from the biochemistry literature. There has been less emphasis on implementing processes commonly studied in statistics such as self-exciting point processes characterized by time-varying and history-dependent intensity rates. Our latest aggregator, \texttt{Coevolve}, which is an implementation of Algorithm~\ref{algo:sim-queuing}, addresses this gap. This is the first aggregator that supports \texttt{VariableRateJump}s. Compared with the current \textit{inverse} method-based approach that relies on ODE integration, the new aggregator substantially improves the performance of simulations with time-dependent intensity rates and/or coupled with differential equations from \texttt{DifferentialEquations.jl}.

@@ -471,7 +486,7 @@ \subsection{Benchmarks} \label{subsec:benchmark}

 First, we assess the speed of the aggregators against jump processes whose rates are constant between jumps. There are four such benchmarks: a 1-dimensional continuous time random walk approximation of a diffusion model (Diffusion), the multi-state model from Appendix A.6~\cite{marchetti2017} (Multi-state), a simple negative feedback gene expression model (Gene I) and the negative feedback gene expression model from~\cite{gupta2018} (Gene II). We simulate a single trajectory for each aggregator to visually check that they produce similar trajectories for a given model. The Diffusion, Multi-state, Gene I and Gene II benchmarks are then simulated \( 50 \), \( 100 \), \( 2000 \) and \( 200 \) times, respectively. Check the source code for further implementation details.
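Before turning to the results, we illustrate how a time-varying rate is declared and simulated with \texttt{Coevolve}. The sketch below is not one of the benchmark models but a minimal, self-contained example of our own; the keyword arguments \texttt{urate} and \texttt{rateinterval} supply the rate bound \( \bar{B} \) and its window of validity \( L \) required by the aggregator.

\begin{verbatim}
using JumpProcesses

rate(u, p, t) = p[1] * exp(-t)   # decaying, time-dependent intensity
affect!(integrator) = (integrator.u[1] += 1)  # count events
urate(u, p, t) = p[1] * exp(-t)  # valid bound: the rate only decreases
rateinterval(u, p, t) = Inf      # ... so it holds over any window
jump = VariableRateJump(rate, affect!; urate, rateinterval)

dprob = DiscreteProblem([0], (0.0, 10.0), [5.0])
jprob = JumpProblem(dprob, Coevolve(), jump; dep_graph = [[1]])
sol = solve(jprob, SSAStepper())
\end{verbatim}

Because this rate is non-increasing, its current value bounds all future values and the validity interval can be taken as infinite; a model with a growing rate would instead supply a finite \texttt{rateinterval} over which its \texttt{urate} holds.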
However, \texttt{FRM}, \texttt{NRM}, \texttt{Coevolve} never dominate any benchmark. In common, they all belong to the family of queuing methods suggesting that there is a penalty when using such methods for jump processes whose rates are constant between jumps. We also note that the performance of \texttt{Coevolve} lag that of \texttt{NRM} despite the fact that \texttt{Coevolve} should take the same number of steps as \texttt{NRM} when no \texttt{VariableRateJump} is used. The reason behind this discrepancy is likely due to implementation differences, but left for future investigation.
+Benchmark results are listed in Table~\ref{tab:benchmark-biochemistry}. The table shows that no single aggregator dominates, suggesting they should be selected according to the task at hand. However, \texttt{FRM}, \texttt{NRM}, and \texttt{Coevolve} never dominate any benchmark. What they have in common is that they all belong to the family of queuing methods, suggesting that there is a penalty when using such methods for jump processes whose rates are constant between jumps.\comment{G: The fact that these are not the fastest methods for the constant rate benchmarks shows that improvements to thinning algorithms bring substantial performance gains which could potentially be explored in queued thinning algorithms.} We also note that the performance of \texttt{Coevolve} lags that of \texttt{NRM} despite the fact that \texttt{Coevolve} should take the same number of steps as \texttt{NRM} when no \texttt{VariableRateJump} is used. This discrepancy is likely due to implementation differences, but we leave its investigation to future work.

 \begin{table}
 \centering

@@ -602,7 +617,7 @@ \section{Conclusion}

 There are still a number of ways forward. First, given the performance of the \textit{CHV} algorithm in our benchmarks, we should consider adding it to \texttt{JumpProcesses.jl} as another aggregator so that it can benefit from tighter integration with the SciML organization and libraries. The saving behavior of \textit{CHV} might pose a challenge when bringing this algorithm to the library.

-Second, the new aggregator depends on the user providing bounds on the jump rates as well as the duration of their validity. In practice, it can be difficult to determine these bounds a priori, particularly for models with many ODE variables. Moreover, determining such bounds from an analytical solution or the underlying ODEs does not guarantee their holding for the numerically computed solution (which is obtained via an ODE discretization), and so modifications may be needed in practice. A possible improvement would be for \texttt{JumpProcesses.jl} to determine these bounds automatically taking into account the derivative of the rates. The approach of \texttt{ZigZagBoomerang.jl} that combines Taylor approximation of the conditional intensity with automatic differentiation could be explored. Deriving efficient bounds require not only knowledge of the problem and a good amount of analytical work, but also knowledge about the numerical integrator. At best, the algorithm can perform significantly slower if a suboptimal bound or interval is used, at worst it can return incorrect results if a bound is incorrect --- \ie~it can be violated inside the calculated interval of validity.
+Second, the new aggregator depends on the user providing bounds on the jump rates as well as the duration of their validity. In practice, it can be difficult to determine these bounds a priori, particularly for models with many ODE variables.
\begin{table} \centering @@ -602,7 +617,7 @@ \section{Conclusion} There are still a number of ways forward. First, given the performance of the \textit{CHV} algorithm in our benchmarks, we should consider adding it to \texttt{JumpProcesses.jl} as another aggregator so that it can benefit from tighter integration with the SciML organization and libraries. The saving behavior of \textit{CHV} might pose a challenge when bringing this algorithm to the library. -Second, the new aggregator depends on the user providing bounds on the jump rates as well as the duration of their validity. In practice, it can be difficult to determine these bounds a priori, particularly for models with many ODE variables. Moreover, determining such bounds from an analytical solution or the underlying ODEs does not guarantee their holding for the numerically computed solution (which is obtained via an ODE discretization), and so modifications may be needed in practice. A possible improvement would be for \texttt{JumpProcesses.jl} to determine these bounds automatically taking into account the derivative of the rates. The approach of \texttt{ZigZagBoomerang.jl} that combines Taylor approximation of the conditional intensity with automatic differentiation could be explored. Deriving efficient bounds require not only knowledge of the problem and a good amount of analytical work, but also knowledge about the numerical integrator. At best, the algorithm can perform significantly slower if a suboptimal bound or interval is used, at worst it can return incorrect results if a bound is incorrect --- \ie~it can be violated inside the calculated interval of validity. +Second, the new aggregator depends on the user providing bounds on the jump rates as well as the duration of their validity. In practice, it can be difficult to determine these bounds a priori, particularly for models with many ODE variables. Moreover, determining such bounds from an analytical solution or the underlying ODEs does not guarantee that they hold for the numerically computed solution (which is obtained via an ODE discretization), and so modifications may be needed in practice. A possible improvement would be for \texttt{JumpProcesses.jl} to determine these bounds automatically, taking into account the derivatives of the rates. The approach of \texttt{ZigZagBoomerang.jl}~\cite{bierkens2019}, which combines Taylor approximation of the conditional intensity with automatic differentiation, could be explored. Deriving efficient bounds requires not only knowledge of the problem and a good amount of analytical work, but also knowledge about the numerical integrator. At best, the algorithm can perform significantly slower if a suboptimal bound or interval is used; at worst, it can return incorrect results if a bound is incorrect --- \ie~if it is violated inside the calculated interval of validity.
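+To make the idea concrete, the sketch below shows the kind of first-order Taylor bound such automation might compute. The helper name is hypothetical and the bound is only a heuristic: it ignores higher-order terms and the integrator's discretization error, which is precisely where the difficulty lies.
+\begin{lstlisting}[language = Julia]
+using ForwardDiff
+
+# Hypothetical helper: heuristic upper bound for a scalar rate
+# s -> rate(u, p, s) over the window [t, t + width], built from a
+# first-order Taylor expansion inflated by a safety factor.
+function taylor_urate(rate, u, p, t; width = 0.1, safety = 1.2)
+    r  = rate(u, p, t)
+    dr = ForwardDiff.derivative(s -> rate(u, p, s), t)
+    bound = safety * (r + max(dr, 0.0) * width)
+    return bound, width   # candidate urate and rateinterval values
+end
+\end{lstlisting}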
Third, \texttt{JumpProcesses.jl} would benefit from further development in inexact methods. At the moment, support is limited to processes with constant rates between jumps, and the only available solver, \texttt{SimpleTauLeaping}, does not support marks. Inexact methods should allow for the simulation of longer periods of time when only an event count per time interval is required. Hawkes processes can be expressed as branching processes. There are simulation algorithms that already take advantage of this structure to leap through time~\cite{laub2021}. It would be important to adapt these algorithms to general, compound branching processes to cater for a larger number of settings. diff --git a/paper/paper.yml b/paper/paper.yml index fd92ae65..be417744 100644 --- a/paper/paper.yml +++ b/paper/paper.yml @@ -1,4 +1,4 @@ -title: "Extending JumpProcess.jl for fast point process simulation with time-varying intensities" +title: "Extending JumpProcesses.jl for fast point process simulation with time-varying intensities" keywords: - Julia - Simulation diff --git a/paper/references.bib b/paper/references.bib index ba93fe80..046a59da 100644 --- a/paper/references.bib +++ b/paper/references.bib @@ -1,3 +1,17 @@ +@misc{bacry2018, + title = {Tick: A {{Python}} Library for Statistical Learning, with a Particular Emphasis on Time-Dependent Modelling}, + shorttitle = {Tick}, + author = {Bacry, Emmanuel and Bompaire, Martin and Ga{\"i}ffas, St{\'e}phane and Poulsen, Soren}, + year = {2018}, + month = mar, + number = {arXiv:1707.03003}, + eprint = {1707.03003}, + primaryclass = {stat}, + publisher = {arXiv}, + doi = {10.48550/arXiv.1707.03003}, + archiveprefix = {arxiv} +} + @article{bierkens2019, title = {The {{Zig-Zag}} Process and Super-Efficient Sampling for {{Bayesian}} Analysis of Big Data}, author = {Bierkens, Joris and Fearnhead, Paul and Roberts, Gareth}, @@ -6,7 +20,7 @@ @article{bierkens2019 journal = {The Annals of Statistics}, volume = {47}, number = {3}, - publisher = {{Institute of Mathematical Statistics}}, + publisher = {Institute of Mathematical Statistics}, issn = {0090-5364, 2168-8966}, doi = {10.1214/18-AOS1715} } @@ -16,8 +30,8 @@ @book{bjork2021 shorttitle = {Point {{Processes}} and {{Jump Diffusions}}}, author = {Bj{\"o}rk, Tomas}, year = {2021}, - publisher = {{Cambridge University Press}}, - address = {{Cambridge}}, + publisher = {Cambridge University Press}, + address = {Cambridge}, doi = {10.1017/9781009002127}, isbn = {978-1-316-51867-0} } @@ -28,8 +42,8 @@ @incollection{bloznelis1994 author = {Bloznelis, M. and Paulauskas, V.}, editor = {{Hoffmann-J{\o}rgensen}, J{\o}rgen and Kuelbs, James and Marcus, Michael B.}, year = {1994}, - publisher = {{Birkh{\"a}user Boston}}, - address = {{Boston, MA}}, + publisher = {Birkh{\"a}user Boston}, + address = {Boston, MA}, doi = {10.1007/978-1-4612-0253-0_9}, isbn = {978-1-4612-6682-2} } @@ -41,8 +55,8 @@ @book{daley2003 year = {2003}, series = {Probability and {{Its Applications}}, {{An Introduction}} to the {{Theory}} of {{Point Processes}}}, edition = {2}, - publisher = {{Springer-Verlag}}, - address = {{New York}}, + publisher = {Springer-Verlag}, + address = {New York}, doi = {10.1007/b97277}, isbn = {978-0-387-95541-4} } @@ -54,8 +68,8 @@ @book{daley2007 year = {2007}, month = nov, edition = {2nd edition}, - publisher = {{Springer}}, - address = {{New York}}, + publisher = {Springer}, + address = {New York}, isbn = {978-0-387-21337-8} } @@ -107,8 +121,7 @@ @article{farajtabar2017 journal = {The Journal of Machine Learning Research}, volume = {18}, number = {1}, - issn = {1532-4435}, - doi = {10.5555/3122009.3122050} + issn = {1532-4435} } @incollection{feller1949, title = {On the {{Theory}} of {{Stochastic Processes}}, with {{Particular Reference}} to {{Applications}}}, author = {Feller, Willy}, year = {1949}, month = jan, volume = {1}, - publisher = {{University of California Press}}, + publisher = {University of California Press}, url = {https://projecteuclid.org/ebooks/berkeley-symposium-on-mathematical-statistics-and-probability/Proceedings-of-the-First-Berkeley-Symposium-on-Mathematical-Statistics-and/chapter/On-the-Theory-of-Stochastic-Processes-with-Particular-Reference-to/bsmsp/1166219215} } @@ -218,6 +231,18 @@ @article{gupta2018 doi = {10.3390/computation6010009} } +@article{harte2010, + title = {{{PtProcess}}: {{An R Package}} for {{Modelling Marked Point Processes Indexed}} by {{Time}}}, + shorttitle = {{{PtProcess}}}, + author = {Harte, David}, + year = {2010}, + journal = {Journal of Statistical Software}, + volume = {35}, + number = {8}, + issn = {1548-7660}, + doi = {10.18637/jss.v035.i08} +} + @misc{hoffimann2020, title = {{{JuliaEarth}}/{{GeoStats}}.Jl: V0.11.7}, shorttitle = {{{JuliaEarth}}/{{GeoStats}}.Jl}, @@ -246,14 +271,14 @@ @book{last2017 year = {2017}, month = oct, edition = {1st edition}, - publisher = {{Cambridge University Press}} + publisher = {Cambridge University Press} } @book{laub2021, title = {The {{Elements}} of {{Hawkes Processes}}}, author = {Laub, Patrick J. and Lee, Young and Taimre, Thomas}, year = {2021}, - publisher = {{Springer International Publishing}}, + publisher = {Springer International Publishing}, doi = {10.1007/978-3-030-84639-8}, isbn = {978-3-030-84638-1 978-3-030-84639-8} } @@ -298,7 +323,7 @@ @article{lewis1976 @article{lewis1979, title = {Simulation of Nonhomogeneous Poisson Processes by Thinning}, - author = {Lewis, P. a. W and Shedler, G. S.}, + author = {Lewis, P. A. W. and Shedler, G. S.}, year = {1979}, journal = {Naval Research Logistics Quarterly}, volume = {26}, @@ -312,8 +337,8 @@ @book{marchetti2017 author = {Marchetti, Luca and Priami, Corrado and Thanh, Vo Hong}, year = {2017}, series = {Texts in {{Theoretical Computer Science}}.
{{An EATCS Series}}}, - publisher = {{Springer International Publishing}}, - address = {{Cham}}, + publisher = {Springer International Publishing}, + address = {Cham}, doi = {10.1007/978-3-319-63113-4}, isbn = {978-3-319-63111-0 978-3-319-63113-4} } @@ -324,8 +349,8 @@ @incollection{masuda2013a author = {Masuda, Naoki and Takaguchi, Taro and Sato, Nobuo and Yano, Kazuo}, editor = {Holme, Petter and Saram{\"a}ki, Jari}, year = {2013}, - publisher = {{Springer Berlin Heidelberg}}, - address = {{Berlin, Heidelberg}}, + publisher = {Springer Berlin Heidelberg}, + address = {Berlin, Heidelberg}, doi = {10.1007/978-3-642-36461-7_12}, isbn = {978-3-642-36461-7} } @@ -336,7 +361,7 @@ @book{masuda2016 author = {Masuda, Naoki and Lambiotte, Renaud}, year = {2016}, month = sep, - publisher = {{World Scientific}}, + publisher = {World Scientific}, doi = {10.1142/q0033}, isbn = {978-1-78634-114-3} } @@ -405,12 +430,13 @@ @book{pervin2014 editor = {Boas, Ralph P.}, year = {2014}, month = dec, - publisher = {{Academic Press}}, + publisher = {Academic Press}, + doi = {10.1016/c2013-0-08097-8}, isbn = {978-1-4832-1172-5} } @article{rackauckas2017, - title = {{{DifferentialEquations}}.Jl {\textendash} {{A Performant}} and {{Feature-Rich Ecosystem}} for {{Solving Differential Equations}} in {{Julia}}}, + title = {{{DifferentialEquations}}.Jl -- {{A Performant}} and {{Feature-Rich Ecosystem}} for {{Solving Differential Equations}} in {{Julia}}}, author = {Rackauckas, Christopher and Nie, Qing}, year = {2017}, month = may, @@ -425,7 +451,7 @@ @misc{rodrigues2021 author = {Rodrigues, Yuri E. and Tigaret, Cezar M. and Marie, H{\'e}l{\`e}ne and O'Donnell, Cian and Veltz, Romain}, year = {2021}, month = mar, - publisher = {{bioRxiv}}, + publisher = {bioRxiv}, doi = {10.1101/2021.03.30.437703}, archiveprefix = {bioRxiv} } @@ -520,7 +546,7 @@ @misc{veltz2015 month = apr, number = {arXiv:1504.06873}, eprint = {1504.06873}, - publisher = {{arXiv}}, + publisher = {arXiv}, doi = {10.48550/arXiv.1504.06873}, archiveprefix = {arxiv} } @@ -536,3 +562,17 @@ @article{wu1983 issn = {0090-5364, 2168-8966}, doi = {10.1214/aos/1176346060} } + +@misc{xu2019a, + title = {{{PoPPy}}: {{A Point Process Toolbox Based}} on {{PyTorch}}}, + shorttitle = {{{PoPPy}}}, + author = {Xu, Hongteng}, + year = {2019}, + month = oct, + number = {arXiv:1810.10122}, + eprint = {1810.10122}, + primaryclass = {cs, stat}, + publisher = {arXiv}, + doi = {10.48550/arXiv.1810.10122}, + archiveprefix = {arxiv} +}