// Copyright 2016 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT
#include "object/job_dispatcher.h"
#include <inttypes.h>
#include <lib/counters.h>
#include <platform.h>
#include <zircon/errors.h>
#include <zircon/rights.h>
#include <zircon/syscalls/policy.h>
#include <zircon/types.h>
#include <fbl/alloc_checker.h>
#include <fbl/array.h>
#include <fbl/auto_lock.h>
#include <fbl/inline_array.h>
#include <kernel/mutex.h>
#include <ktl/algorithm.h>
#include <object/process_dispatcher.h>
KCOUNTER(dispatcher_job_create_count, "dispatcher.job.create")
KCOUNTER(dispatcher_job_destroy_count, "dispatcher.job.destroy")
// The starting max_height value of the root job.
static constexpr uint32_t kRootJobMaxHeight = 32;
static constexpr char kRootJobName[] = "root";
template <>
uint64_t JobDispatcher::ChildCountLocked<JobDispatcher>() const {
return jobs_.size();
}
template <>
uint64_t JobDispatcher::ChildCountLocked<ProcessDispatcher>() const {
return procs_.size();
}
// To come up with an order on our recursive locks we take advantage of the fact that max_height
// strictly decreases from parent to child. As we acquire locks from parent to child we can build
// an increasing counter by inverting the max_height. We add 1 to the counter so that the order
// value 0 stays reserved for the default order used when a lock is acquired without an explicit
// order.
uint32_t JobDispatcher::LockOrder() const { return kRootJobMaxHeight - max_height() + 1; }
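// For example (illustrative values only): the root job has max_height 32 and
// lock order 32 - 32 + 1 = 1; a child job has max_height 31 and order 2; a
// grandchild gets order 3. Orders strictly increase as we descend, so
// acquiring locks parent-before-child always presents increasing order values.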
// Calls the provided |zx_status_t func(fbl::RefPtr<DISPATCHER_TYPE>)|
// function on all live elements of |children|, which must be one of |jobs_|
// or |procs_|. Stops iterating early if |func| returns a value other than
// ZX_OK, returning that value from this method. |lock_| must be held when
// calling this method, and it will still be held while the callback is
// called.
//
// The returned |LiveRefsArray| needs to be destructed when |lock_| is not
// held anymore. The recommended pattern is:
//
// LiveRefsArray refs;
// {
// Guard<Mutex> guard{get_lock()};
// refs = ForEachChildInLocked(...);
// }
//
template <typename T, typename Fn>
JobDispatcher::LiveRefsArray JobDispatcher::ForEachChildInLocked(T& children, zx_status_t* result,
Fn func) {
  // Convert child raw pointers into RefPtrs. This is tricky and requires
  // special logic on the RefPtr class to handle a ref count that can be
  // zero.
  //
  // The main requirement is that |lock_| both controls child list lookup
  // and ensures that a child's destructor cannot make progress while the
  // list is being inspected. In other words, when inspecting the |children|
  // list we can be sure that a given child process or child job is either
  // - alive, with refcount > 0, or
  // - blocked partway through destruction, with refcount == 0.
const uint64_t count = ChildCountLocked<typename T::ValueType>();
if (!count) {
*result = ZX_OK;
return LiveRefsArray();
}
fbl::AllocChecker ac;
LiveRefsArray refs(new (&ac) fbl::RefPtr<Dispatcher>[count], count);
if (!ac.check()) {
*result = ZX_ERR_NO_MEMORY;
return LiveRefsArray();
}
size_t ix = 0;
for (auto& craw : children) {
auto cref = ::fbl::MakeRefPtrUpgradeFromRaw(&craw, get_lock());
if (!cref)
continue;
*result = func(cref);
// |cref| might be the last reference at this point. If so,
// when we drop it in the next iteration the object dtor
// would be called here with the |get_lock()| held. To avoid that
// we keep the reference alive in the |refs| array and pass
// the responsibility of releasing them outside the lock to
// the caller.
refs[ix++] = ktl::move(cref);
if (*result != ZX_OK)
break;
}
return refs;
}
fbl::RefPtr<JobDispatcher> JobDispatcher::CreateRootJob() {
fbl::AllocChecker ac;
auto job = fbl::AdoptRef(new (&ac) JobDispatcher(0u, nullptr, JobPolicy::CreateRootPolicy()));
if (!ac.check()) {
panic("root-job: failed to allocate\n");
}
job->set_name(kRootJobName, sizeof(kRootJobName));
return job;
}
zx_status_t JobDispatcher::Create(uint32_t flags, const fbl::RefPtr<JobDispatcher>& parent,
KernelHandle<JobDispatcher>* handle, zx_rights_t* rights) {
if (parent != nullptr && parent->max_height() == 0) {
// The parent job cannot have children.
return ZX_ERR_OUT_OF_RANGE;
}
fbl::AllocChecker ac;
KernelHandle new_handle(
fbl::AdoptRef(new (&ac) JobDispatcher(flags, parent, parent->GetPolicy())));
if (!ac.check())
return ZX_ERR_NO_MEMORY;
if (!parent->AddChildJob(new_handle.dispatcher())) {
return ZX_ERR_BAD_STATE;
}
*rights = default_rights();
*handle = ktl::move(new_handle);
return ZX_OK;
}
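// Illustrative kernel-side use of Create() (a sketch; |parent_job| is a
// hypothetical fbl::RefPtr<JobDispatcher> already in hand):
//
//   KernelHandle<JobDispatcher> handle;
//   zx_rights_t rights;
//   zx_status_t status = JobDispatcher::Create(0u, parent_job, &handle, &rights);
//   if (status == ZX_OK) {
//     // |handle| owns the new job; |rights| holds its default handle rights.
//   }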
JobDispatcher::JobDispatcher(uint32_t /*flags*/, fbl::RefPtr<JobDispatcher> parent,
JobPolicy policy)
: SoloDispatcher(ZX_JOB_NO_PROCESSES | ZX_JOB_NO_JOBS | ZX_JOB_NO_CHILDREN),
parent_(ktl::move(parent)),
max_height_(parent_ ? parent_->max_height() - 1 : kRootJobMaxHeight),
state_(State::READY),
return_code_(0),
kill_on_oom_(false),
policy_(policy),
exceptionate_(ZX_EXCEPTION_CHANNEL_TYPE_JOB),
debug_exceptionate_(ZX_EXCEPTION_CHANNEL_TYPE_JOB_DEBUGGER) {
kcounter_add(dispatcher_job_create_count, 1);
}
JobDispatcher::~JobDispatcher() {
kcounter_add(dispatcher_job_destroy_count, 1);
RemoveFromJobTreesUnlocked();
}
zx_koid_t JobDispatcher::get_related_koid() const { return parent_ ? parent_->get_koid() : 0u; }
bool JobDispatcher::AddChildProcess(const fbl::RefPtr<ProcessDispatcher>& process) {
canary_.Assert();
Guard<Mutex> guard{get_lock()};
if (state_ != State::READY)
return false;
procs_.push_back(process.get());
UpdateSignalsLocked();
return true;
}
bool JobDispatcher::AddChildJob(const fbl::RefPtr<JobDispatcher>& job) {
canary_.Assert();
Guard<Mutex> guard{get_lock()};
if (state_ != State::READY)
return false;
// Put the new job after our next-youngest child, or us if we have none.
//
// We try to make older jobs closer to the root (both hierarchically and
// temporally) show up earlier in enumeration.
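  // For example, if child jobs A and then B are added below, push_back keeps A
  // ahead of B, so enumeration visits the older job A first.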
JobDispatcher* neighbor = (jobs_.is_empty() ? this : &jobs_.back());
  // This can only be called once; the job must not already be part
  // of any job tree.
DEBUG_ASSERT(!fbl::InContainer<JobDispatcher::RawListTag>(*job));
DEBUG_ASSERT(neighbor != job.get());
jobs_.push_back(job.get());
UpdateSignalsLocked();
return true;
}
void JobDispatcher::RemoveChildProcess(ProcessDispatcher* process) {
canary_.Assert();
bool should_die = false;
{
Guard<Mutex> guard{get_lock()};
// The process dispatcher can call us in its destructor, Kill(),
// or RemoveThread().
if (!fbl::InContainer<ProcessDispatcher::RawJobListTag>(*process)) {
return;
}
procs_.erase(*process);
UpdateSignalsLocked();
should_die = IsReadyForDeadTransitionLocked();
// Aggregate runtime stats from exiting process.
aggregated_runtime_stats_.Add(process->GetAggregatedRuntime());
}
if (should_die)
FinishDeadTransitionUnlocked();
}
void JobDispatcher::RemoveChildJob(JobDispatcher* job) {
canary_.Assert();
bool should_die = false;
{
Guard<Mutex> guard{get_lock()};
if (!fbl::InContainer<JobDispatcher::RawListTag>(*job)) {
return;
}
jobs_.erase(*job);
UpdateSignalsLocked();
should_die = IsReadyForDeadTransitionLocked();
}
if (should_die)
FinishDeadTransitionUnlocked();
}
JobDispatcher::State JobDispatcher::GetState() const {
Guard<Mutex> guard{get_lock()};
return state_;
}
void JobDispatcher::RemoveFromJobTreesUnlocked() {
canary_.Assert();
if (parent_)
parent_->RemoveChildJob(this);
}
bool JobDispatcher::IsReadyForDeadTransitionLocked() {
canary_.Assert();
return state_ == State::KILLING && jobs_.is_empty() && procs_.is_empty();
}
void JobDispatcher::FinishDeadTransitionUnlocked() {
canary_.Assert();
// Make sure we're killing from the bottom of the tree up or else parent
// jobs could die before their children.
//
// In particular, this means we have to finish dying before leaving the job
// trees, since the last child leaving the tree can trigger its parent to
// finish dying.
DEBUG_ASSERT(!parent_ || (parent_->GetState() != State::DEAD));
{
Guard<Mutex> guard{get_lock()};
state_ = State::DEAD;
exceptionate_.Shutdown();
debug_exceptionate_.Shutdown();
UpdateStateLocked(0u, ZX_JOB_TERMINATED);
}
RemoveFromJobTreesUnlocked();
}
void JobDispatcher::UpdateSignalsLocked() {
// Clear all signals, and mark the appropriate ones active.
//
// The active signals take precedence over the clear signals.
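  // For example, a job holding one process and no child jobs asserts only
  // ZX_JOB_NO_JOBS; once its last process is removed, ZX_JOB_NO_PROCESSES and
  // ZX_JOB_NO_CHILDREN become active as well.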
zx_signals_t clear = (ZX_JOB_NO_JOBS | ZX_JOB_NO_PROCESSES | ZX_JOB_NO_CHILDREN);
  // Compute which of those signals should currently be active.
zx_signals_t set = 0u;
if (procs_.is_empty()) {
set |= ZX_JOB_NO_PROCESSES;
}
if (jobs_.is_empty()) {
set |= ZX_JOB_NO_JOBS;
}
if (jobs_.is_empty() && procs_.is_empty()) {
set |= ZX_JOB_NO_CHILDREN;
}
UpdateStateLocked(clear, set);
}
JobPolicy JobDispatcher::GetPolicy() const {
Guard<Mutex> guard{get_lock()};
return policy_;
}
bool JobDispatcher::KillJobWithKillOnOOM() {
// Get list of jobs with kill bit set.
OOMBitJobArray oom_jobs;
int count = 0;
CollectJobsWithOOMBit(&oom_jobs, &count);
if (count == 0) {
printf("OOM: no jobs with kill_on_oom found\n");
return false;
}
// Sort by max height.
ktl::stable_sort(oom_jobs.begin(), oom_jobs.begin() + count,
[](const fbl::RefPtr<JobDispatcher>& a, const fbl::RefPtr<JobDispatcher>& b) {
return a->max_height() < b->max_height();
});
// Kill lowest to highest until we find something to kill.
for (int i = count - 1; i >= 0; --i) {
auto& job = oom_jobs[i];
if (job->Kill(ZX_TASK_RETCODE_OOM_KILL)) {
char name[ZX_MAX_NAME_LEN];
job->get_name(name);
printf("OOM: killing %" PRIu64 " '%s'\n", job->get_koid(), name);
return true;
}
}
printf("OOM: no job found to kill\n");
return false;
}
void JobDispatcher::CollectJobsWithOOMBit(OOMBitJobArray* into, int* count) {
  // As CollectJobsWithOOMBit recurses into child jobs, we need to give a lock order to the guard.
  Guard<Mutex> guard{&lock_, LockOrder()};
  // We had to take the guard directly on lock_ above as the get_lock() virtual method erases the
  // Nestable type information. The AssertHeld here allows us to restore the clang capability
  // analysis.
  AssertHeld(*get_lock());
if (kill_on_oom_) {
if (*count >= static_cast<int>(into->size())) {
printf("OOM: skipping some jobs, exceeded max count\n");
return;
}
auto cref = ::fbl::MakeRefPtrUpgradeFromRaw(this, get_lock());
if (!cref)
return;
(*into)[*count] = ktl::move(cref);
*count += 1;
}
for (auto& job : jobs_) {
job.CollectJobsWithOOMBit(into, count);
}
}
bool JobDispatcher::Kill(int64_t return_code) {
canary_.Assert();
JobList jobs_to_kill;
ProcessList procs_to_kill;
LiveRefsArray jobs_refs;
LiveRefsArray proc_refs;
bool should_die = false;
{
Guard<Mutex> guard{get_lock()};
if (state_ != State::READY)
return false;
return_code_ = return_code;
state_ = State::KILLING;
zx_status_t result;
// Safely gather refs to the children.
jobs_refs = ForEachChildInLocked(jobs_, &result, [&](fbl::RefPtr<JobDispatcher> job) {
jobs_to_kill.push_front(ktl::move(job));
return ZX_OK;
});
proc_refs = ForEachChildInLocked(procs_, &result, [&](fbl::RefPtr<ProcessDispatcher> proc) {
procs_to_kill.push_front(ktl::move(proc));
return ZX_OK;
});
should_die = IsReadyForDeadTransitionLocked();
}
if (should_die)
FinishDeadTransitionUnlocked();
// Since we kill the child jobs first we have a depth-first massacre.
while (!jobs_to_kill.is_empty()) {
// TODO(cpu): This recursive call can overflow the stack.
jobs_to_kill.pop_front()->Kill(return_code);
}
while (!procs_to_kill.is_empty()) {
procs_to_kill.pop_front()->Kill(return_code);
}
return true;
}
bool JobDispatcher::CanSetPolicy() TA_REQ(get_lock()) {
// Can't set policy when there are active processes or jobs. This constraint ensures that a
// process's policy cannot change over its lifetime. Because a process's policy cannot change,
// the risk of TOCTOU bugs is reduced and we are free to apply policy at the ProcessDispatcher
// without having to walk up the tree to its containing job.
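  // For example, a zx_job_set_policy() call on a job that already hosts a
  // process fails (surfaced by the callers below as ZX_ERR_BAD_STATE) rather
  // than letting the policy change race with the live process.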
if (!procs_.is_empty() || !jobs_.is_empty()) {
return false;
}
return true;
}
zx_status_t JobDispatcher::SetBasicPolicy(uint32_t mode, const zx_policy_basic_v1_t* in_policy,
size_t policy_count) {
fbl::AllocChecker ac;
fbl::InlineArray<zx_policy_basic_v2_t, kPolicyBasicInlineCount> policy(&ac, policy_count);
if (!ac.check()) {
return ZX_ERR_NO_MEMORY;
}
for (size_t ix = 0; ix != policy.size(); ++ix) {
policy[ix].condition = in_policy[ix].condition;
policy[ix].action = in_policy[ix].policy;
policy[ix].flags = ZX_POL_OVERRIDE_DENY;
}
return SetBasicPolicy(mode, policy.get(), policy.size());
}
zx_status_t JobDispatcher::SetBasicPolicy(uint32_t mode, const zx_policy_basic_v2_t* in_policy,
size_t policy_count) {
Guard<Mutex> guard{get_lock()};
if (!CanSetPolicy()) {
return ZX_ERR_BAD_STATE;
}
return policy_.AddBasicPolicy(mode, in_policy, policy_count);
}
zx_status_t JobDispatcher::SetTimerSlackPolicy(const zx_policy_timer_slack& policy) {
Guard<Mutex> guard{get_lock()};
if (!CanSetPolicy()) {
return ZX_ERR_BAD_STATE;
}
// Is the policy valid?
if (policy.min_slack < 0) {
return ZX_ERR_INVALID_ARGS;
}
slack_mode new_mode;
switch (policy.default_mode) {
case ZX_TIMER_SLACK_CENTER:
new_mode = TIMER_SLACK_CENTER;
break;
case ZX_TIMER_SLACK_EARLY:
new_mode = TIMER_SLACK_EARLY;
break;
case ZX_TIMER_SLACK_LATE:
new_mode = TIMER_SLACK_LATE;
break;
    default:
      return ZX_ERR_INVALID_ARGS;
  }
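  // A job's slack may only loosen, never tighten: for example, if the
  // inherited slack amount is 1ms and the requested min_slack is 50us, the
  // ktl::max below keeps 1ms; requesting a min_slack of 2ms would raise the
  // amount to 2ms.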
const TimerSlack old_slack = policy_.GetTimerSlack();
const zx_duration_t new_amount = ktl::max(old_slack.amount(), policy.min_slack);
const TimerSlack new_slack(new_amount, new_mode);
policy_.SetTimerSlack(new_slack);
return ZX_OK;
}
bool JobDispatcher::EnumerateChildren(JobEnumerator* je, bool recurse) {
canary_.Assert();
LiveRefsArray jobs_refs;
LiveRefsArray proc_refs;
zx_status_t result = ZX_OK;
{
    // As EnumerateChildren may recurse into child jobs, we need to give a lock order to the
    // guard.
    Guard<Mutex> guard{&lock_, LockOrder()};
    // We had to take the guard directly on lock_ above as the get_lock() virtual method erases
    // the Nestable type information. The AssertHeld here allows us to restore the clang
    // capability analysis.
    AssertHeld(*get_lock());
proc_refs =
ForEachChildInLocked(procs_, &result, [&](const fbl::RefPtr<ProcessDispatcher>& proc) {
return je->OnProcess(proc.get()) ? ZX_OK : ZX_ERR_STOP;
});
if (result != ZX_OK) {
return false;
}
jobs_refs = ForEachChildInLocked(jobs_, &result, [&](const fbl::RefPtr<JobDispatcher>& job) {
if (!je->OnJob(job.get())) {
return ZX_ERR_STOP;
}
if (recurse) {
// TODO(kulakowski): This recursive call can overflow the stack.
return job->EnumerateChildren(je, /* recurse */ true) ? ZX_OK : ZX_ERR_STOP;
}
return ZX_OK;
});
}
return result == ZX_OK;
}
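// Illustrative (hypothetical) enumerator built on the JobEnumerator interface
// used above; it counts every process under a job and never stops early:
//
//   class ProcessCounter final : public JobEnumerator {
//    public:
//     size_t count = 0;
//
//    private:
//     bool OnJob(JobDispatcher* job) final { return true; }
//     bool OnProcess(ProcessDispatcher* proc) final {
//       ++count;
//       return true;
//     }
//   };
//
//   ProcessCounter counter;
//   root_job->EnumerateChildren(&counter, /* recurse */ true);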
fbl::RefPtr<ProcessDispatcher> JobDispatcher::LookupProcessById(zx_koid_t koid) {
canary_.Assert();
LiveRefsArray proc_refs;
fbl::RefPtr<ProcessDispatcher> found_proc;
{
Guard<Mutex> guard{get_lock()};
zx_status_t result;
proc_refs = ForEachChildInLocked(procs_, &result, [&](fbl::RefPtr<ProcessDispatcher> proc) {
if (proc->get_koid() == koid) {
found_proc = ktl::move(proc);
return ZX_ERR_STOP;
}
return ZX_OK;
});
}
return found_proc; // Null if not found.
}
fbl::RefPtr<JobDispatcher> JobDispatcher::LookupJobById(zx_koid_t koid) {
canary_.Assert();
LiveRefsArray jobs_refs;
fbl::RefPtr<JobDispatcher> found_job;
{
Guard<Mutex> guard{get_lock()};
zx_status_t result;
jobs_refs = ForEachChildInLocked(jobs_, &result, [&](fbl::RefPtr<JobDispatcher> job) {
if (job->get_koid() == koid) {
found_job = ktl::move(job);
return ZX_ERR_STOP;
}
return ZX_OK;
});
}
return found_job; // Null if not found.
}
void JobDispatcher::get_name(char out_name[ZX_MAX_NAME_LEN]) const {
canary_.Assert();
name_.get(ZX_MAX_NAME_LEN, out_name);
}
zx_status_t JobDispatcher::set_name(const char* name, size_t len) {
canary_.Assert();
return name_.set(name, len);
}
Exceptionate* JobDispatcher::exceptionate(Exceptionate::Type type) {
canary_.Assert();
return type == Exceptionate::Type::kDebug ? &debug_exceptionate_ : &exceptionate_;
}
void JobDispatcher::set_kill_on_oom(bool value) {
Guard<Mutex> guard{get_lock()};
kill_on_oom_ = value;
}
bool JobDispatcher::get_kill_on_oom() const {
Guard<Mutex> guard{get_lock()};
return kill_on_oom_;
}
void JobDispatcher::GetInfo(zx_info_job_t* info) const {
canary_.Assert();
Guard<Mutex> guard{get_lock()};
info->return_code = return_code_;
info->exited = (state_ == State::DEAD) || (state_ == State::KILLING);
info->kill_on_oom = kill_on_oom_;
info->debugger_attached = debug_exceptionate_.HasValidChannel();
}
zx_status_t JobDispatcher::AccumulateRuntimeTo(zx_info_task_runtime_t* info) const {
canary_.Assert();
Guard<Mutex> guard{get_lock()};
aggregated_runtime_stats_.AccumulateRuntimeTo(info);
// At this point, the process in question may be in its destructor waiting to acquire the lock and
// remove itself from this job, but its aggregated runtime is not yet part of this job's data.
//
// AccumulateRuntimeTo must be safe to be called even when the process is in its destructor.
for (const auto& proc : procs_) {
zx_status_t err = proc.AccumulateRuntimeTo(info);
if (err != ZX_OK) {
return err;
}
}
return ZX_OK;
}