Pull timeout #2418

Draft · wants to merge 4 commits into base: main
1 change: 1 addition & 0 deletions .env.example
@@ -72,6 +72,7 @@
# period if you are using Kubernetes.
# RUN_GRACE_PERIOD_SECONDS=10
# WORKER_MAX_RUN_DURATION_SECONDS=300
# WORKER_MAX_PULL_TIMEOUT_SECONDS=10

# TODO: these aren't specified in Runtime, do they belong to the worker process?
# WORKER_MAX_RUN_MEMORY_MB=500
6 changes: 6 additions & 0 deletions lib/lightning/config.ex
@@ -40,6 +40,11 @@ defmodule Lightning.Config do
Application.get_env(:lightning, :run_grace_period_seconds)
end

@impl true
def max_pull_timeout_seconds do
Application.get_env(:lightning, :max_pull_timeout_seconds)
end

@impl true
def default_max_run_duration do
Application.get_env(:lightning, :max_run_duration_seconds)
@@ -164,6 +169,7 @@ defmodule Lightning.Config do
@callback email_sender_name() :: String.t()
@callback get_extension_mod(key :: atom()) :: any()
@callback grace_period() :: integer()
@callback max_pull_timeout_seconds() :: integer()
@callback instance_admin_email() :: String.t()
@callback kafka_duplicate_tracking_retention_seconds() :: integer()
@callback kafka_number_of_consumers() :: integer()
4 changes: 4 additions & 0 deletions lib/lightning/config/bootstrap.ex
@@ -225,6 +225,10 @@ defmodule Lightning.Config.Bootstrap do
src: env!("PLAUSIBLE_SRC", :string, nil),
data_domain: env!("PLAUSIBLE_DATA_DOMAIN", :string, nil)

config :lightning,
:max_pull_timeout_seconds,
env!("WORKER_MAX_PULL_TIMEOUT_SECONDS", :integer, 30)

config :lightning,
:run_grace_period_seconds,
env!("RUN_GRACE_PERIOD_SECONDS", :integer, 10)
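For orientation, here is a hedged sketch of how the new setting flows from the environment into the application config and back out through the `Lightning.Config` accessor. The values are illustrative; only the 30-second fallback comes from this diff.

```elixir
# Not part of this PR: an IEx-style illustration of the config path.
# With WORKER_MAX_PULL_TIMEOUT_SECONDS unset, Bootstrap falls back to 30,
# so the accessor added in lib/lightning/config.ex returns that default.
iex> Lightning.Config.max_pull_timeout_seconds()
30

# With WORKER_MAX_PULL_TIMEOUT_SECONDS=10 (the value shown in .env.example),
# the same call would return 10.
```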
26 changes: 25 additions & 1 deletion lib/lightning/janitor.ex
@@ -24,7 +24,12 @@ defmodule Lightning.Janitor do
by the Oban cron plugin.
"""
@impl Oban.Worker
- def perform(%Oban.Job{}), do: find_and_update_lost()
+ def perform(%Oban.Job{}), do: chores()

defp chores do
forfeit_expired_claims()
find_and_update_lost()
end

@doc """
The find_and_update_lost function determines the current time, finds all
@@ -44,4 +49,23 @@
|> Stream.run()
end)
end

@doc """
The forfeit_expired_claims function determines the current time, finds all
runs that were claimed before the earliest allowable claim time (the pull
timeout plus the grace period) but never started, and releases them back to
the "available" state so they can be claimed again.
"""
def forfeit_expired_claims do
stream =
Runs.Query.forfeited()
|> Repo.stream()

Repo.transaction(fn ->
stream
|> Stream.each(fn run ->
Runs.forfeit_claim(run)
end)
|> Stream.run()
end)
end
end
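The module doc above says the Janitor is invoked by the Oban cron plugin. As a hedged sketch (the actual schedule lives in Lightning's runtime config and may differ), wiring perform/1, and with it both chores, into the cron plugin would look roughly like this:

```elixir
# Assumed, illustrative schedule; not taken from this PR.
config :lightning, Oban,
  repo: Lightning.Repo,
  plugins: [
    # Run Lightning.Janitor.perform/1 every minute, which now calls
    # forfeit_expired_claims/0 before find_and_update_lost/0.
    {Oban.Plugins.Cron, crontab: [{"* * * * *", Lightning.Janitor}]}
  ]
```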
12 changes: 12 additions & 0 deletions lib/lightning/runs.ex
@@ -319,6 +319,18 @@
end)
end

@spec forfeit_claim(Lightning.Run.t()) ::
{:ok, any()} | {:error, any()}
def forfeit_claim(%Run{} = run) do
Logger.warning(fn ->
"Detected forfeit run: #{inspect(run)}"
end)

run
|> Ecto.Changeset.change(state: "available")
|> Repo.update()
end

defdelegate subscribe(run), to: Events

def get_project_id_for_run(run) do
19 changes: 19 additions & 0 deletions lib/lightning/runs/query.ex
@@ -8,6 +8,25 @@ defmodule Lightning.Runs.Query do

require Lightning.Run

@spec forfeited :: Ecto.Queryable.t()
def forfeited do
now = Lightning.current_time()

max_pull_timeout_seconds = Lightning.Config.max_pull_timeout_seconds()
grace_period_ms = Lightning.Config.grace_period() * 1000

fallback_oldest_claim =
now
|> DateTime.add(-max_pull_timeout_seconds, :second)
|> DateTime.add(-grace_period_ms, :millisecond)

from(r in Run,
where: r.claimed_at < ^fallback_oldest_claim,
where: is_nil(r.started_at),
where: r.state == "claimed"
)
end

@doc """
Return all runs that have been claimed by a worker before the earliest
acceptable start time (determined by the run options and grace period) but are
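To make the forfeited/0 cutoff concrete, here is a hedged worked example using the defaults introduced in this PR (the 30-second pull timeout fallback from Bootstrap and a 10-second grace period); the timestamps are made up.

```elixir
# Illustration only, not part of this PR.
now = ~U[2024-10-01 12:00:00Z]

fallback_oldest_claim =
  now
  |> DateTime.add(-30, :second)             # WORKER_MAX_PULL_TIMEOUT_SECONDS default
  |> DateTime.add(-10 * 1000, :millisecond) # RUN_GRACE_PERIOD_SECONDS as milliseconds
#=> ~U[2024-10-01 11:59:20Z]

# A run with state "claimed", started_at still nil, and claimed_at earlier
# than 11:59:20 matches the query, so the Janitor will reset it to "available".
```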
5 changes: 5 additions & 0 deletions lib/lightning_web/channels/worker_channel.ex
@@ -31,6 +31,11 @@ defmodule LightningWeb.WorkerChannel do
runs =
runs
|> Enum.map(fn run ->
# under heavy load this loop averages over 10 seconds, which causes all
# runs to be marked as lost
# dbg("there was a run and we set it to started")
# :timer.sleep(11_000)

opts = run_options(run)

token =
10 changes: 10 additions & 0 deletions test/lightning/janitor_test.exs
@@ -6,6 +6,16 @@ defmodule Lightning.JanitorTest do
alias Lightning.Repo
alias Lightning.Run

describe "forfeit_expired_claims/0" do
test "releases runs for reclaim if they have not been started after the pull timeout plus grace" do
dbg("eish, i don't love this. what if there was some other reason for them getting lost?")
dbg("like, what if they did start, and did the work, and all that, but we never heard back from them because of network issues?")
dbg("i wouldn't want to re-do the work.")
dbg("i wish there was some way to only mark them as claimed once we know that the worker actually got the run.")
Review comment (Collaborator):
@taylordowns2000 If I may chip in (uninvited):

As you mentioned yesterday, the janitor approach is a band-aid until a better fix is available. If I were applying the band-aid, I would not tie it to the timeout value - I would make it a large multiple of the timeout. E.g. if your timeout is 20 sec, maybe wait 10 min (after all, you wait 2 South African hours to determine if something is lost, right?). And notify Sentry whenever this happens, so that we have an opportunity to look at what is going on and tune things. Regarding the concern about potentially redoing work, you could make this user-controllable (i.e. if a Run is 'LostAfterClaimed', would you like us to automagically resubmit it, or would you rather do that manually?).

A longer-term fix (which I can only assume will mean a fair amount of work) would be to allow a run to be claimed by multiple worker processes but only started by one (whoever gets there first). This obviously creates inefficiency elsewhere. Personally, I would do the band-aid if it helps us satisfy the immediate requirements and then identify the issue - e.g. maybe it is as simple as scaling up web pods alongside worker pods?

dbg("can we check that our reply in the websocket channel was actually received by the ws-worker?")
end
end

describe "find_and_update_lost/0" do
@tag :capture_log
test "updates lost runs and their steps" do
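Regarding the collaborator's suggestion above to notify Sentry when a claim is forfeited: a hedged sketch of what that could look like in Runs.forfeit_claim/1, assuming the sentry Hex package is installed and configured (the message and extra fields are illustrative and not part of this PR).

```elixir
def forfeit_claim(%Run{} = run) do
  Logger.warning(fn ->
    "Detected forfeit run: #{inspect(run)}"
  end)

  # Surface each forfeited claim so the team can investigate and tune the
  # pull timeout / grace period instead of discovering this silently.
  Sentry.capture_message("Run claim forfeited before start",
    extra: %{run_id: run.id, claimed_at: run.claimed_at}
  )

  run
  |> Ecto.Changeset.change(state: "available")
  |> Repo.update()
end
```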