From e2680d76e642f2f752c419c085f65ed93c60e2ce Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 20 Sep 2024 14:39:21 +0800 Subject: [PATCH 01/51] [task] support percpu run-queue and cpu affinity for axtask --- .github/workflows/test.yml | 2 +- Cargo.lock | 8 +- api/axfeat/Cargo.toml | 2 +- modules/axruntime/Cargo.toml | 2 +- modules/axtask/Cargo.toml | 16 +- modules/axtask/src/api.rs | 23 ++- modules/axtask/src/lib.rs | 1 + modules/axtask/src/run_queue.rs | 307 +++++++++++++++++++++++++------ modules/axtask/src/task.rs | 113 ++++++++++-- modules/axtask/src/task_ext.rs | 2 +- modules/axtask/src/timers.rs | 39 ++-- modules/axtask/src/wait_queue.rs | 164 +++++++++++------ 12 files changed, 524 insertions(+), 155 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d3c1de36f1..1eedadff69 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,7 @@ on: [push, pull_request] env: qemu-version: 8.2.0 rust-toolchain: nightly-2024-05-02 - arceos-apps: '68054e8' + arceos-apps: 'b25b7e2' jobs: unit-test: diff --git a/Cargo.lock b/Cargo.lock index eb9a02423d..8db9882c99 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -510,6 +510,7 @@ dependencies = [ "axconfig", "axhal", "axtask", + "bitmaps", "cfg-if", "crate_interface", "kernel_guard", @@ -1022,9 +1023,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memory_addr" -version = "0.3.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f769efcf10b9dfb4c913bebb409cda77b1a3f072b249bf5465e250bcb30eb49" +checksum = "2ca25419c2b34080d526d6836a53dcab129767ab6a9904587c87265ae6d41aa9" [[package]] name = "minimal-lexical" @@ -1314,8 +1315,9 @@ checksum = "e6e36312fb5ddc10d08ecdc65187402baba4ac34585cb9d1b78522ae2358d890" [[package]] name = "scheduler" version = "0.1.0" -source = "git+https://github.com/arceos-org/scheduler.git?tag=v0.1.0#c8d25d9aed146dca28dc8987afd229b52c20361a" +source = "git+https://github.com/arceos-org/scheduler.git?branch=num_tasks#415a620347722cb734fa440d9103065690a5853b" dependencies = [ + "kspin", "linked_list", ] diff --git a/api/axfeat/Cargo.toml b/api/axfeat/Cargo.toml index 6e246b36df..17121025f8 100644 --- a/api/axfeat/Cargo.toml +++ b/api/axfeat/Cargo.toml @@ -13,7 +13,7 @@ documentation = "https://arceos-org.github.io/arceos/axfeat/index.html" default = [] # Multicore -smp = ["axhal/smp", "axruntime/smp", "kspin/smp"] +smp = ["axhal/smp", "axruntime/smp", "axtask/smp", "kspin/smp"] # Floating point/SIMD fp_simd = ["axhal/fp_simd"] diff --git a/modules/axruntime/Cargo.toml b/modules/axruntime/Cargo.toml index b904fe6b89..c26237b718 100644 --- a/modules/axruntime/Cargo.toml +++ b/modules/axruntime/Cargo.toml @@ -12,7 +12,7 @@ documentation = "https://arceos-org.github.io/arceos/axruntime/index.html" [features] default = [] -smp = ["axhal/smp"] +smp = ["axhal/smp", "axtask?/smp"] irq = ["axhal/irq", "axtask?/irq", "percpu", "kernel_guard"] tls = ["axhal/tls", "axtask?/tls"] alloc = ["axalloc"] diff --git a/modules/axtask/Cargo.toml b/modules/axtask/Cargo.toml index 25fe7fb371..ec5a16ab3e 100644 --- a/modules/axtask/Cargo.toml +++ b/modules/axtask/Cargo.toml @@ -13,12 +13,20 @@ documentation = "https://arceos-org.github.io/arceos/axtask/index.html" default = [] multitask = [ - "dep:axconfig", "dep:percpu", "dep:kspin", "dep:lazyinit", "dep:memory_addr", - "dep:scheduler", "dep:timer_list", "kernel_guard", "dep:crate_interface", + "dep:axconfig", + "dep:percpu", 
+ "dep:kspin", + "dep:lazyinit", + "dep:memory_addr", + "dep:scheduler", + "dep:timer_list", + "kernel_guard", + "dep:crate_interface", ] irq = [] tls = ["axhal/tls"] preempt = ["irq", "percpu?/preempt", "kernel_guard/preempt"] +smp = ["kspin/smp"] sched_fifo = ["multitask"] sched_rr = ["multitask", "preempt"] @@ -29,6 +37,7 @@ test = ["percpu?/sp-naive"] [dependencies] cfg-if = "1.0" log = "0.4.21" +bitmaps = { version = "3.2.1", default-features = false } axhal = { workspace = true } axconfig = { workspace = true, optional = true } percpu = { version = "0.1", optional = true } @@ -38,7 +47,8 @@ memory_addr = { version = "0.3", optional = true } timer_list = { version = "0.1", optional = true } kernel_guard = { version = "0.1", optional = true } crate_interface = { version = "0.1", optional = true } -scheduler = { git = "https://github.com/arceos-org/scheduler.git", tag = "v0.1.0", optional = true } +# scheduler = { git = "https://github.com/arceos-org/scheduler.git", tag = "v0.1.0", optional = true } +scheduler = { git = "https://github.com/arceos-org/scheduler.git", branch = "num_tasks", optional = true } [dev-dependencies] rand = "0.8" diff --git a/modules/axtask/src/api.rs b/modules/axtask/src/api.rs index 156ba9d3b6..cd99702dc2 100644 --- a/modules/axtask/src/api.rs +++ b/modules/axtask/src/api.rs @@ -2,7 +2,7 @@ use alloc::{string::String, sync::Arc}; -pub(crate) use crate::run_queue::{AxRunQueue, RUN_QUEUE}; +pub(crate) use crate::run_queue::{current_run_queue, select_run_queue}; #[doc(cfg(feature = "multitask"))] pub use crate::task::{CurrentTask, TaskId, TaskInner}; @@ -77,6 +77,8 @@ pub fn init_scheduler() { /// Initializes the task scheduler for secondary CPUs. pub fn init_scheduler_secondary() { crate::run_queue::init_secondary(); + #[cfg(feature = "irq")] + crate::timers::init(); } /// Handles periodic timer ticks for the task manager. @@ -86,13 +88,18 @@ pub fn init_scheduler_secondary() { #[doc(cfg(feature = "irq"))] pub fn on_timer_tick() { crate::timers::check_events(); - RUN_QUEUE.lock().scheduler_timer_tick(); + current_run_queue().scheduler_timer_tick(); } /// Adds the given task to the run queue, returns the task reference. pub fn spawn_task(task: TaskInner) -> AxTaskRef { let task_ref = task.into_arc(); - RUN_QUEUE.lock().add_task(task_ref.clone()); + let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); + crate::select_run_queue( + #[cfg(feature = "smp")] + task_ref.clone(), + ) + .add_task(task_ref.clone()); task_ref } @@ -103,7 +110,7 @@ pub fn spawn_raw(f: F, name: String, stack_size: usize) -> AxTaskRef where F: FnOnce() + Send + 'static, { - spawn_task(TaskInner::new(f, name, stack_size)) + spawn_task(TaskInner::new(f, name, stack_size, None)) } /// Spawns a new task with the default parameters. @@ -129,13 +136,13 @@ where /// /// [CFS]: https://en.wikipedia.org/wiki/Completely_Fair_Scheduler pub fn set_priority(prio: isize) -> bool { - RUN_QUEUE.lock().set_current_priority(prio) + current_run_queue().set_current_priority(prio) } /// Current task gives up the CPU time voluntarily, and switches to another /// ready task. pub fn yield_now() { - RUN_QUEUE.lock().yield_current(); + current_run_queue().yield_current() } /// Current task is going to sleep for the given duration. @@ -150,14 +157,14 @@ pub fn sleep(dur: core::time::Duration) { /// If the feature `irq` is not enabled, it uses busy-wait instead. 
 pub fn sleep_until(deadline: axhal::time::TimeValue) {
     #[cfg(feature = "irq")]
-    RUN_QUEUE.lock().sleep_until(deadline);
+    current_run_queue().sleep_until(deadline);
     #[cfg(not(feature = "irq"))]
     axhal::time::busy_wait_until(deadline);
 }
 
 /// Exits the current task.
 pub fn exit(exit_code: i32) -> ! {
-    RUN_QUEUE.lock().exit_current(exit_code)
+    current_run_queue().exit_current(exit_code)
 }
 
 /// The idle task routine.
diff --git a/modules/axtask/src/lib.rs b/modules/axtask/src/lib.rs
index 64702634f5..489ac0a95a 100644
--- a/modules/axtask/src/lib.rs
+++ b/modules/axtask/src/lib.rs
@@ -42,6 +42,7 @@ cfg_if::cfg_if! {
         extern crate log;
         extern crate alloc;
 
+        #[macro_use]
         mod run_queue;
         mod task;
         mod task_ext;
diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs
index 1592b7637e..231a8adfd1 100644
--- a/modules/axtask/src/run_queue.rs
+++ b/modules/axtask/src/run_queue.rs
@@ -1,37 +1,190 @@
 use alloc::collections::VecDeque;
 use alloc::sync::Arc;
-use kspin::SpinNoIrq;
+use core::mem::MaybeUninit;
+
+use bitmaps::Bitmap;
 use lazyinit::LazyInit;
 use scheduler::BaseScheduler;
 
+use axhal::cpu::this_cpu_id;
+
 use crate::task::{CurrentTask, TaskState};
 use crate::{AxTaskRef, Scheduler, TaskInner, WaitQueue};
 
-// TODO: per-CPU
-pub(crate) static RUN_QUEUE: LazyInit<SpinNoIrq<AxRunQueue>> = LazyInit::new();
+macro_rules! percpu_static {
+    ($($name:ident: $ty:ty = $init:expr),* $(,)?) => {
+        $(
+            #[percpu::def_percpu]
+            static $name: $ty = $init;
+        )*
+    };
+}
 
-// TODO: per-CPU
-static EXITED_TASKS: SpinNoIrq<VecDeque<AxTaskRef>> = SpinNoIrq::new(VecDeque::new());
+percpu_static! {
+    RUN_QUEUE: LazyInit<AxRunQueue> = LazyInit::new(),
+    EXITED_TASKS: VecDeque<AxTaskRef> = VecDeque::new(),
+    WAIT_FOR_EXIT: WaitQueue = WaitQueue::new(),
+    IDLE_TASK: LazyInit<AxTaskRef> = LazyInit::new(),
+}
 
-static WAIT_FOR_EXIT: WaitQueue = WaitQueue::new();
+/// An array of references to run queues, one for each CPU, indexed by cpu_id.
+///
+/// This static variable holds references to the run queues for each CPU in the system.
+///
+/// # Safety
+///
+/// Access to this variable is marked as `unsafe` because it contains `MaybeUninit` references,
+/// which require careful handling to avoid undefined behavior. The array should be fully
+/// initialized before being accessed to ensure safe usage.
+static mut RUN_QUEUES: [MaybeUninit<&'static mut AxRunQueue>; axconfig::SMP] =
+    [ARRAY_REPEAT_VALUE; axconfig::SMP];
+const ARRAY_REPEAT_VALUE: MaybeUninit<&'static mut AxRunQueue> = MaybeUninit::uninit();
+/// Returns a reference to the current run queue.
+///
+/// ## Safety
+///
+/// This function returns a static reference to the current run queue, which
+/// is inherently unsafe. It assumes that the `RUN_QUEUE` has been properly
+/// initialized and is not accessed concurrently in a way that could cause
+/// data races or undefined behavior.
+///
+/// ## Returns
+///
+/// A static reference to the current run queue.
+#[inline]
+pub(crate) fn current_run_queue() -> &'static mut AxRunQueue {
+    unsafe { RUN_QUEUE.current_ref_mut_raw() }
+}
 
-#[percpu::def_percpu]
-static IDLE_TASK: LazyInit<AxTaskRef> = LazyInit::new();
+/// Selects the run queue index based on a CPU set bitmap, minimizing the number of tasks.
+///
+/// This function filters the available run queues based on the provided `cpu_set` and
+/// selects the one with the fewest tasks. The selected run queue's index (cpu_id) is returned.
+///
+/// ## Arguments
+///
+/// * `cpu_set` - A bitmap representing the CPUs that are eligible for task execution.
+///
+/// ## Returns
+///
+/// The index (cpu_id) of the selected run queue. 
+/// +/// ## Panics +/// +/// This function will panic if there is no available run queue that matches the CPU set. +/// +#[cfg(feature = "smp")] +#[inline] +fn select_run_queue_index(cpu_set: Bitmap<{ axconfig::SMP }>) -> usize { + unsafe { + RUN_QUEUES + .iter() + .filter(|rq| cpu_set.get(rq.assume_init_ref().cpu_id())) + .min_by_key(|rq| rq.assume_init_ref().num_tasks()) + .expect("No available run queue that matches the CPU set") + .assume_init_ref() + .cpu_id() + } +} +/// Retrieves a `'static` reference to the run queue corresponding to the given index. +/// +/// This function asserts that the provided index is within the range of available CPUs +/// and returns a reference to the corresponding run queue. +/// +/// ## Arguments +/// +/// * `index` - The index of the run queue to retrieve. +/// +/// ## Returns +/// +/// A reference to the `AxRunQueue` corresponding to the provided index. +/// +/// ## Panics +/// +/// This function will panic if the index is out of bounds. +/// +#[inline] +fn get_run_queue(index: usize) -> &'static mut AxRunQueue { + assert!(index < axconfig::SMP); + unsafe { RUN_QUEUES[index].assume_init_mut() } +} + +/// Selects the appropriate run queue for the provided task. +/// +/// * In a single-core system, this function always returns a reference to the global run queue. +/// * In a multi-core system, this function selects the run queue based on the task's CPU affinity and load balance. +/// +/// ## Arguments +/// +/// * `task` - A reference to the task for which a run queue is being selected. +/// +/// ## Returns +/// +/// A reference to the selected `AxRunQueue`. +/// +/// ## TODO +/// +/// 1. Implement better load balancing across CPUs for more efficient task distribution. +/// 2. Use a more generic load balancing algorithm that can be customized or replaced. +/// +#[inline] +pub(crate) fn select_run_queue(#[cfg(feature = "smp")] task: AxTaskRef) -> &'static mut AxRunQueue { + #[cfg(not(feature = "smp"))] + { + // When SMP is disabled, all tasks are scheduled on the same global run queue. + current_run_queue() + } + #[cfg(feature = "smp")] + { + // When SMP is enabled, select the run queue based on the task's CPU affinity and load balance. + let index = select_run_queue_index(task.cpu_set()); + get_run_queue(index) + } +} + +/// AxRunQueue represents a run queue for global system or a specific CPU. pub(crate) struct AxRunQueue { + /// The ID of the CPU this run queue is associated with. + cpu_id: usize, + /// The core scheduler of this run queue. scheduler: Scheduler, } impl AxRunQueue { - pub fn new() -> SpinNoIrq { - let gc_task = TaskInner::new(gc_entry, "gc".into(), axconfig::TASK_STACK_SIZE).into_arc(); + pub fn new(cpu_id: usize) -> Self { + let gc_task = TaskInner::new( + gc_entry, + "gc".into(), + axconfig::TASK_STACK_SIZE, + // gc task shoule be pinned to the current CPU. + Some(1 << cpu_id), + ) + .into_arc(); + let mut scheduler = Scheduler::new(); scheduler.add_task(gc_task); - SpinNoIrq::new(Self { scheduler }) + Self { cpu_id, scheduler } + } + + /// Returns the cpu id of current run queue, + /// which is also its index in `RUN_QUEUES`. + pub fn cpu_id(&self) -> usize { + self.cpu_id + } + + /// Returns the number of tasks in current run queue's scheduler, + /// which is used for load balance during scheduling. + #[cfg(feature = "smp")] + pub fn num_tasks(&self) -> usize { + self.scheduler.num_tasks() } +} +/// Core functions of run queue. 
+impl AxRunQueue { pub fn add_task(&mut self, task: AxTaskRef) { - debug!("task spawn: {}", task.id_name()); + debug!("Add {} on run_queue {}", task.id_name(), self.cpu_id); assert!(task.is_ready()); self.scheduler.add_task(task); } @@ -46,6 +199,7 @@ impl AxRunQueue { } pub fn yield_current(&mut self) { + let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); let curr = crate::current(); trace!("task yield: {}", curr.id_name()); assert!(curr.is_running()); @@ -59,14 +213,15 @@ impl AxRunQueue { #[cfg(feature = "preempt")] pub fn preempt_resched(&mut self) { + // There is no need to disable IRQ and preemption here, because + // they both have been disabled in `current_check_preempt_pending`. let curr = crate::current(); assert!(curr.is_running()); - // When we get the mutable reference of the run queue, we must - // have held the `SpinNoIrq` lock with both IRQs and preemption - // disabled. So we need to set `current_disable_count` to 1 in - // `can_preempt()` to obtain the preemption permission before - // locking the run queue. + // When we call `preempt_resched()`, both IRQs and preemption must + // have been disabled by `kernel_guard::NoPreemptIrqSave`. So we need + // to set `current_disable_count` to 1 in `can_preempt()` to obtain + // the preemption permission. let can_preempt = curr.can_preempt(1); debug!( @@ -82,55 +237,65 @@ impl AxRunQueue { } pub fn exit_current(&mut self, exit_code: i32) -> ! { + let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); + let curr = crate::current(); debug!("task exit: {}, exit_code={}", curr.id_name(), exit_code); - assert!(curr.is_running()); + assert!(curr.is_running(), "task is not running: {:?}", curr.state()); assert!(!curr.is_idle()); if curr.is_init() { - EXITED_TASKS.lock().clear(); + EXITED_TASKS.with_current(|exited_tasks| exited_tasks.clear()); axhal::misc::terminate(); } else { curr.set_state(TaskState::Exited); - curr.notify_exit(exit_code, self); - EXITED_TASKS.lock().push_back(curr.clone()); - WAIT_FOR_EXIT.notify_one_locked(false, self); + + // Notify the joiner task. + curr.notify_exit(exit_code); + + // Push current task to the `EXITED_TASKS` list, which will be consumed by the GC task. + EXITED_TASKS.with_current(|exited_tasks| exited_tasks.push_back(curr.clone())); + // Wake up the GC task to drop the exited tasks. + WAIT_FOR_EXIT.with_current(|wq| wq.notify_one(false)); + // Schedule to next task. self.resched(false); } + drop(_kernel_guard); unreachable!("task exited!"); } - pub fn block_current(&mut self, wait_queue_push: F) - where - F: FnOnce(AxTaskRef), - { + pub fn blocked_resched(&mut self) { let curr = crate::current(); - debug!("task block: {}", curr.id_name()); - assert!(curr.is_running()); - assert!(!curr.is_idle()); - - // we must not block current task with preemption disabled. - #[cfg(feature = "preempt")] - assert!(curr.can_preempt(1)); + assert!( + curr.is_blocking(), + "task is not blocking, {:?}", + curr.state() + ); - curr.set_state(TaskState::Blocked); - wait_queue_push(curr.clone()); + debug!("task block: {}", curr.id_name()); self.resched(false); } + /// Unblock one task by inserting it into the run queue. + /// If task state is `BLOCKING`, it will enter a loop until the task is in `BLOCKED` state. 
pub fn unblock_task(&mut self, task: AxTaskRef, resched: bool) { - debug!("task unblock: {}", task.id_name()); - if task.is_blocked() { + task.clone().unblock_locked(|| { + let cpu_id = self.cpu_id; + debug!("task unblock: {} on run_queue {}", task.id_name(), cpu_id); task.set_state(TaskState::Ready); - self.scheduler.add_task(task); // TODO: priority - if resched { + self.scheduler.add_task(task.clone()); // TODO: priority + + // Note: when the task is unblocked on another CPU's run queue, + // we just ingiore the `resched` flag. + if resched && cpu_id == this_cpu_id() { #[cfg(feature = "preempt")] crate::current().set_preempt_pending(true); } - } + }) } #[cfg(feature = "irq")] pub fn sleep_until(&mut self, deadline: axhal::time::TimeValue) { + let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); let curr = crate::current(); debug!("task sleep: {}, deadline={:?}", curr.id_name(), deadline); assert!(curr.is_running()); @@ -139,9 +304,10 @@ impl AxRunQueue { let now = axhal::time::wall_time(); if now < deadline { crate::timers::set_alarm_wakeup(deadline, curr.clone()); - curr.set_state(TaskState::Blocked); + curr.set_state(TaskState::Blocking); self.resched(false); } + drop(kernel_guard) } } @@ -156,19 +322,32 @@ impl AxRunQueue { self.scheduler.put_prev_task(prev.clone(), preempt); } } + + if prev.is_blocking() { + prev.set_state(TaskState::Blocked); + } + let next = self.scheduler.pick_next_task().unwrap_or_else(|| unsafe { // Safety: IRQs must be disabled at this time. IDLE_TASK.current_ref_raw().get_unchecked().clone() }); + assert!( + next.is_ready(), + "next {} is not ready: {:?}", + next.id_name(), + next.state() + ); self.switch_to(prev, next); } fn switch_to(&mut self, prev_task: CurrentTask, next_task: AxTaskRef) { - trace!( - "context switch: {} -> {}", - prev_task.id_name(), - next_task.id_name() - ); + if !prev_task.is_idle() || !next_task.is_idle() { + debug!( + "context switch: {} -> {}", + prev_task.id_name(), + next_task.id_name() + ); + } #[cfg(feature = "preempt")] next_task.set_preempt_pending(false); next_task.set_state(TaskState::Running); @@ -186,6 +365,7 @@ impl AxRunQueue { assert!(Arc::strong_count(&next_task) >= 1); CurrentTask::set_current(prev_task, next_task); + (*prev_ctx_ptr).switch_to(&*next_ctx_ptr); } } @@ -194,10 +374,10 @@ impl AxRunQueue { fn gc_entry() { loop { // Drop all exited tasks and recycle resources. - let n = EXITED_TASKS.lock().len(); + let n = EXITED_TASKS.with_current(|exited_tasks| exited_tasks.len()); for _ in 0..n { // Do not do the slow drops in the critical section. - let task = EXITED_TASKS.lock().pop_front(); + let task = EXITED_TASKS.with_current(|exited_tasks| exited_tasks.pop_front()); if let Some(task) = task { if Arc::strong_count(&task) == 1 { // If I'm the last holder of the task, drop it immediately. @@ -205,18 +385,25 @@ fn gc_entry() { } else { // Otherwise (e.g, `switch_to` is not compeleted, held by the // joiner, etc), push it back and wait for them to drop first. - EXITED_TASKS.lock().push_back(task); + EXITED_TASKS.with_current(|exited_tasks| exited_tasks.push_back(task)); } } } - WAIT_FOR_EXIT.wait(); + unsafe { WAIT_FOR_EXIT.current_ref_raw() }.wait(); } } pub(crate) fn init() { + let cpu_id = this_cpu_id(); + // Create the `idle` task (not current task). 
const IDLE_TASK_STACK_SIZE: usize = 4096; - let idle_task = TaskInner::new(|| crate::run_idle(), "idle".into(), IDLE_TASK_STACK_SIZE); + let idle_task = TaskInner::new( + || crate::run_idle(), + "idle".into(), + IDLE_TASK_STACK_SIZE, + Some(1 << cpu_id), + ); IDLE_TASK.with_current(|i| { i.init_once(idle_task.into_arc()); }); @@ -224,12 +411,20 @@ pub(crate) fn init() { // Put the subsequent execution into the `main` task. let main_task = TaskInner::new_init("main".into()).into_arc(); main_task.set_state(TaskState::Running); - unsafe { CurrentTask::init_current(main_task) }; + unsafe { CurrentTask::init_current(main_task) } - RUN_QUEUE.init_once(AxRunQueue::new()); + info!("Initialize RUN_QUEUES"); + RUN_QUEUE.with_current(|rq| { + rq.init_once(AxRunQueue::new(cpu_id)); + }); + unsafe { + RUN_QUEUES[cpu_id].write(RUN_QUEUE.current_ref_mut_raw()); + } } pub(crate) fn init_secondary() { + let cpu_id = this_cpu_id(); + // Put the subsequent execution into the `idle` task. let idle_task = TaskInner::new_init("idle".into()).into_arc(); idle_task.set_state(TaskState::Running); @@ -237,4 +432,10 @@ pub(crate) fn init_secondary() { i.init_once(idle_task.clone()); }); unsafe { CurrentTask::init_current(idle_task) } + RUN_QUEUE.with_current(|rq| { + rq.init_once(AxRunQueue::new(cpu_id)); + }); + unsafe { + RUN_QUEUES[cpu_id].write(RUN_QUEUE.current_ref_mut_raw()); + } } diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index a916551eb5..d1d8201ab7 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -1,19 +1,21 @@ use alloc::{boxed::Box, string::String, sync::Arc}; use core::ops::Deref; +#[cfg(any(feature = "preempt", feature = "irq"))] +use core::sync::atomic::AtomicUsize; use core::sync::atomic::{AtomicBool, AtomicI32, AtomicU64, AtomicU8, Ordering}; use core::{alloc::Layout, cell::UnsafeCell, fmt, ptr::NonNull}; -#[cfg(feature = "preempt")] -use core::sync::atomic::AtomicUsize; +use bitmaps::Bitmap; +use kspin::SpinRaw; +use memory_addr::{align_up_4k, VirtAddr}; +use axhal::arch::TaskContext; +use axhal::cpu::this_cpu_id; #[cfg(feature = "tls")] use axhal::tls::TlsArea; -use axhal::arch::TaskContext; -use memory_addr::{align_up_4k, VirtAddr}; - use crate::task_ext::AxTaskExt; -use crate::{AxRunQueue, AxTask, AxTaskRef, WaitQueue}; +use crate::{AxTask, AxTaskRef, WaitQueue}; /// A unique identifier for a thread. #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -23,10 +25,18 @@ pub struct TaskId(u64); #[repr(u8)] #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub(crate) enum TaskState { + /// Task is running on some CPU. Running = 1, + /// Task is ready to run on some scheduler's ready queue. Ready = 2, - Blocked = 3, - Exited = 4, + /// Task is just be blocked and inserted into the wait queue, + /// but still have **NOT finished** its scheduling process. + Blocking = 3, + /// Task is blocked (in the wait queue or timer list), + /// and it has finished its scheduling process, it can be wake up by `notify()` on any run queue safely. + Blocked = 4, + /// Task is exited and waiting for being dropped. + Exited = 5, } /// The inner task structure. @@ -39,10 +49,19 @@ pub struct TaskInner { entry: Option<*mut dyn FnOnce()>, state: AtomicU8, + /// CPU affinity mask. + cpu_set: Bitmap<{ axconfig::SMP }>, + in_wait_queue: AtomicBool, #[cfg(feature = "irq")] in_timer_list: AtomicBool, + /// Used to protect the task from being unblocked by timer and `notify()` at the same time. 
+ /// It is used in `unblock_task()`, which is called by wait queue's `notify()` and timer's callback. + /// Since preemption and irq are both disabled during `unblock_task()`, we can simply use a raw spin lock here. + #[cfg(feature = "irq")] + unblock_lock: SpinRaw<()>, + #[cfg(feature = "preempt")] need_resched: AtomicBool, #[cfg(feature = "preempt")] @@ -77,8 +96,9 @@ impl From for TaskState { match state { 1 => Self::Running, 2 => Self::Ready, - 3 => Self::Blocked, - 4 => Self::Exited, + 3 => Self::Blocking, + 4 => Self::Blocked, + 5 => Self::Exited, _ => unreachable!(), } } @@ -89,7 +109,16 @@ unsafe impl Sync for TaskInner {} impl TaskInner { /// Create a new task with the given entry function and stack size. - pub fn new(entry: F, name: String, stack_size: usize) -> Self + /// + /// When "smp" feature is enabled: + /// `cpu_set` represents a set of physical CPUs, which is implemented as a bit mask, + /// refering to `cpu_set_t` in Linux. + /// The task will be only scheduled on the specified CPUs if `cpu_set` is set as `Some(cpu_mask)`, + /// Otherwise, the task will be scheduled on all CPUs under specific load balancing policy. + /// Reference: + /// * https://man7.org/linux/man-pages/man2/sched_setaffinity.2.html + /// * https://man7.org/linux/man-pages/man3/CPU_SET.3.html + pub fn new(entry: F, name: String, stack_size: usize, cpu_set: Option) -> Self where F: FnOnce() + Send + 'static, { @@ -108,6 +137,21 @@ impl TaskInner { if t.name == "idle" { t.is_idle = true; } + t.cpu_set = match cpu_set { + Some(cpu_set) => { + let mut bit_map = Bitmap::new(); + let mut i = 0; + while i < axconfig::SMP { + if cpu_set & (1 << i) != 0 { + bit_map.set(i, true); + } + i += 1; + } + bit_map + } + // This task can be scheduled on all CPUs by default. + None => Bitmap::mask(axconfig::SMP), + }; t } @@ -171,9 +215,12 @@ impl TaskInner { is_init: false, entry: None, state: AtomicU8::new(TaskState::Ready as u8), + cpu_set: Bitmap::new(), in_wait_queue: AtomicBool::new(false), #[cfg(feature = "irq")] in_timer_list: AtomicBool::new(false), + #[cfg(feature = "irq")] + unblock_lock: SpinRaw::new(()), #[cfg(feature = "preempt")] need_resched: AtomicBool::new(false), #[cfg(feature = "preempt")] @@ -199,6 +246,7 @@ impl TaskInner { pub(crate) fn new_init(name: String) -> Self { let mut t = Self::new_common(TaskId::new(), name); t.is_init = true; + t.cpu_set.set(this_cpu_id(), true); if t.name == "idle" { t.is_idle = true; } @@ -234,6 +282,11 @@ impl TaskInner { matches!(self.state(), TaskState::Blocked) } + #[inline] + pub(crate) fn is_blocking(&self) -> bool { + matches!(self.state(), TaskState::Blocking) + } + #[inline] pub(crate) const fn is_init(&self) -> bool { self.is_init @@ -244,6 +297,11 @@ impl TaskInner { self.is_idle } + #[inline] + pub(crate) const fn cpu_set(&self) -> Bitmap<{ axconfig::SMP }> { + self.cpu_set + } + #[inline] pub(crate) fn in_wait_queue(&self) -> bool { self.in_wait_queue.load(Ordering::Acquire) @@ -266,6 +324,27 @@ impl TaskInner { self.in_timer_list.store(in_timer_list, Ordering::Release); } + pub(crate) fn unblock_locked(&self, mut run_queue_push: F) + where + F: FnMut(), + { + // When task's state is Blocking, it has not finished its scheduling process. + if self.is_blocking() { + while self.is_blocking() { + // Wait for the task to finish its scheduling process. + core::hint::spin_loop(); + } + assert!(self.is_blocked()) + } + + // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. 
+ #[cfg(feature = "irq")] + let _lock = self.unblock_lock.lock(); + if self.is_blocked() { + run_queue_push(); + } + } + #[inline] #[cfg(feature = "preempt")] pub(crate) fn set_preempt_pending(&self, pending: bool) { @@ -297,16 +376,17 @@ impl TaskInner { fn current_check_preempt_pending() { let curr = crate::current(); if curr.need_resched.load(Ordering::Acquire) && curr.can_preempt(0) { - let mut rq = crate::RUN_QUEUE.lock(); + let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); if curr.need_resched.load(Ordering::Acquire) { - rq.preempt_resched(); + crate::current_run_queue().preempt_resched() } } } - pub(crate) fn notify_exit(&self, exit_code: i32, rq: &mut AxRunQueue) { + /// Notify all tasks that join on this task. + pub(crate) fn notify_exit(&self, exit_code: i32) { self.exit_code.store(exit_code, Ordering::Release); - self.wait_for_exit.notify_all_locked(false, rq); + self.wait_for_exit.notify_all(false); } #[inline] @@ -429,8 +509,7 @@ impl Deref for CurrentTask { } extern "C" fn task_entry() -> ! { - // release the lock that was implicitly held across the reschedule - unsafe { crate::RUN_QUEUE.force_unlock() }; + // Enable irq (if feature "irq" is enabled) before running the task entry function. #[cfg(feature = "irq")] axhal::arch::enable_irqs(); let task = crate::current(); diff --git a/modules/axtask/src/task_ext.rs b/modules/axtask/src/task_ext.rs index bacd56d0a0..f64ddbdcc9 100644 --- a/modules/axtask/src/task_ext.rs +++ b/modules/axtask/src/task_ext.rs @@ -145,7 +145,7 @@ pub trait TaskExtMut { /// /// axtask::init_scheduler(); /// -/// let mut inner = TaskInner::new(|| {}, "".into(), 0x1000); +/// let mut inner = TaskInner::new(|| {}, "".into(), 0x1000, None); /// assert!(inner.init_task_ext(TaskExtImpl { proc_id: 233 }).is_some()); /// // cannot initialize twice /// assert!(inner.init_task_ext(TaskExtImpl { proc_id: 0xdead }).is_none()); diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index 1c4a8eed05..cb55dde24c 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -1,40 +1,47 @@ use alloc::sync::Arc; -use axhal::time::wall_time; -use kspin::SpinNoIrq; + use lazyinit::LazyInit; use timer_list::{TimeValue, TimerEvent, TimerList}; -use crate::{AxTaskRef, RUN_QUEUE}; +use axhal::time::wall_time; -// TODO: per-CPU -static TIMER_LIST: LazyInit>> = LazyInit::new(); +use crate::{select_run_queue, AxTaskRef}; + +percpu_static! 
{ + TIMER_LIST: LazyInit> = LazyInit::new(), +} struct TaskWakeupEvent(AxTaskRef); impl TimerEvent for TaskWakeupEvent { fn callback(self, _now: TimeValue) { - let mut rq = RUN_QUEUE.lock(); self.0.set_in_timer_list(false); - rq.unblock_task(self.0, true); + select_run_queue( + #[cfg(feature = "smp")] + self.0.clone(), + ) + .unblock_task(self.0, true) } } pub fn set_alarm_wakeup(deadline: TimeValue, task: AxTaskRef) { - let mut timers = TIMER_LIST.lock(); - task.set_in_timer_list(true); - timers.set(deadline, TaskWakeupEvent(task)); + TIMER_LIST.with_current(|timer_list| { + task.set_in_timer_list(true); + timer_list.set(deadline, TaskWakeupEvent(task)); + }) } pub fn cancel_alarm(task: &AxTaskRef) { - let mut timers = TIMER_LIST.lock(); - task.set_in_timer_list(false); - timers.cancel(|t| Arc::ptr_eq(&t.0, task)); + TIMER_LIST.with_current(|timer_list| { + task.set_in_timer_list(false); + timer_list.cancel(|t| Arc::ptr_eq(&t.0, task)); + }) } pub fn check_events() { loop { let now = wall_time(); - let event = TIMER_LIST.lock().expire_one(now); + let event = TIMER_LIST.with_current(|timer_list| timer_list.expire_one(now)); if let Some((_deadline, event)) = event { event.callback(now); } else { @@ -44,5 +51,7 @@ pub fn check_events() { } pub fn init() { - TIMER_LIST.init_once(SpinNoIrq::new(TimerList::new())); + TIMER_LIST.with_current(|timer_list| { + timer_list.init_once(TimerList::new()); + }); } diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index d13628ad6f..ba3fc31209 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -1,8 +1,8 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; -use kspin::SpinRaw; +use kspin::SpinNoIrq; -use crate::{AxRunQueue, AxTaskRef, CurrentTask, RUN_QUEUE}; +use crate::{current_run_queue, select_run_queue, task::TaskState, AxTaskRef, CurrentTask}; /// A queue to store sleeping tasks. /// @@ -27,21 +27,21 @@ use crate::{AxRunQueue, AxTaskRef, CurrentTask, RUN_QUEUE}; /// assert_eq!(VALUE.load(Ordering::Relaxed), 1); /// ``` pub struct WaitQueue { - queue: SpinRaw>, // we already disabled IRQs when lock the `RUN_QUEUE` + queue: SpinNoIrq>, } impl WaitQueue { /// Creates an empty wait queue. pub const fn new() -> Self { Self { - queue: SpinRaw::new(VecDeque::new()), + queue: SpinNoIrq::new(VecDeque::new()), } } /// Creates an empty wait queue with space for at least `capacity` elements. pub fn with_capacity(capacity: usize) -> Self { Self { - queue: SpinRaw::new(VecDeque::with_capacity(capacity)), + queue: SpinNoIrq::new(VecDeque::with_capacity(capacity)), } } @@ -50,9 +50,8 @@ impl WaitQueue { // the event from another queue. if curr.in_wait_queue() { // wake up by timer (timeout). - // `RUN_QUEUE` is not locked here, so disable IRQs. - let _guard = kernel_guard::IrqSave::new(); - self.queue.lock().retain(|t| !curr.ptr_eq(t)); + let mut wq_locked = self.queue.lock(); + wq_locked.retain(|t| !curr.ptr_eq(t)); curr.set_in_wait_queue(false); } #[cfg(feature = "irq")] @@ -62,14 +61,42 @@ impl WaitQueue { } } + fn push_to_wait_queue(&self) { + let mut wq = self.queue.lock(); + let curr = crate::current(); + assert!(curr.is_running()); + assert!(!curr.is_idle()); + // we must not block current task with preemption disabled. + // Current expected preempt count is 2. + // 1 for `NoPreemptIrqSave`, 1 for wait queue's `SpinNoIrq`. 
+ #[cfg(feature = "preempt")] + assert!(curr.can_preempt(2)); + + // We set task state as `Blocking` to clarify that the task is blocked + // but **still NOT** finished its scheduling process. + // + // When another task (generally on another run queue) try to unblock this task, + // * if this task's state is still `Blocking`: + // it needs to wait for this task's state to be changed to `Blocked`, which means it has finished its scheduling process. + // * if this task's state is `Blocked`: + // it means this task is blocked and finished its scheduling process, in another word, it has left current run queue, + // so this task can be scheduled on any run queue. + curr.set_state(TaskState::Blocking); + curr.set_in_wait_queue(true); + + debug!("{} push to wait queue", curr.id_name()); + + wq.push_back(curr.clone()); + } + /// Blocks the current task and put it into the wait queue, until other task /// notifies it. pub fn wait(&self) { - RUN_QUEUE.lock().block_current(|task| { - task.set_in_wait_queue(true); - self.queue.lock().push_back(task) - }); + let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); + self.push_to_wait_queue(); + current_run_queue().blocked_resched(); self.cancel_events(crate::current()); + drop(kernel_guard); } /// Blocks the current task and put it into the wait queue, until the given @@ -81,23 +108,40 @@ impl WaitQueue { where F: Fn() -> bool, { + let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); loop { - let mut rq = RUN_QUEUE.lock(); + let mut wq = self.queue.lock(); if condition() { break; } - rq.block_current(|task| { - task.set_in_wait_queue(true); - self.queue.lock().push_back(task); - }); + let curr = crate::current(); + assert!(curr.is_running()); + assert!(!curr.is_idle()); + + debug!("{} push to wait queue on wait_until", curr.id_name()); + + // we must not block current task with preemption disabled. + // Current expected preempt count is 2. + // 1 for `NoPreemptIrqSave`, 1 for wait queue's `SpinNoIrq`. + #[cfg(feature = "preempt")] + assert!(curr.can_preempt(2)); + wq.push_back(curr.clone()); + + curr.set_state(TaskState::Blocking); + curr.set_in_wait_queue(true); + drop(wq); + + current_run_queue().blocked_resched(); } self.cancel_events(crate::current()); + drop(kernel_guard); } /// Blocks the current task and put it into the wait queue, until other tasks /// notify it, or the given duration has elapsed. 
#[cfg(feature = "irq")] pub fn wait_timeout(&self, dur: core::time::Duration) -> bool { + let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); let curr = crate::current(); let deadline = axhal::time::wall_time() + dur; debug!( @@ -107,12 +151,12 @@ impl WaitQueue { ); crate::timers::set_alarm_wakeup(deadline, curr.clone()); - RUN_QUEUE.lock().block_current(|task| { - task.set_in_wait_queue(true); - self.queue.lock().push_back(task) - }); + self.push_to_wait_queue(); + current_run_queue().blocked_resched(); + let timeout = curr.in_wait_queue(); // still in the wait queue, must have timed out self.cancel_events(curr); + drop(kernel_guard); timeout } @@ -126,6 +170,7 @@ impl WaitQueue { where F: Fn() -> bool, { + let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); let curr = crate::current(); let deadline = axhal::time::wall_time() + dur; debug!( @@ -137,17 +182,29 @@ impl WaitQueue { let mut timeout = true; while axhal::time::wall_time() < deadline { - let mut rq = RUN_QUEUE.lock(); + let mut wq = self.queue.lock(); if condition() { timeout = false; break; } - rq.block_current(|task| { - task.set_in_wait_queue(true); - self.queue.lock().push_back(task); - }); + assert!(curr.is_running()); + assert!(!curr.is_idle()); + + // we must not block current task with preemption disabled. + // Current expected preempt count is 2. + // 1 for `NoPreemptIrqSave`, 1 for wait queue's `SpinNoIrq`. + #[cfg(feature = "preempt")] + assert!(curr.can_preempt(2)); + wq.push_back(curr.clone()); + + curr.set_state(TaskState::Blocking); + curr.set_in_wait_queue(true); + drop(wq); + + current_run_queue().blocked_resched() } self.cancel_events(curr); + drop(kernel_guard); timeout } @@ -156,9 +213,12 @@ impl WaitQueue { /// If `resched` is true, the current task will be preempted when the /// preemption is enabled. pub fn notify_one(&self, resched: bool) -> bool { - let mut rq = RUN_QUEUE.lock(); - if !self.queue.lock().is_empty() { - self.notify_one_locked(resched, &mut rq) + let mut wq = self.queue.lock(); + if let Some(task) = wq.pop_front() { + task.set_in_wait_queue(false); + unblock_one_task(task, resched); + drop(wq); + true } else { false } @@ -170,14 +230,14 @@ impl WaitQueue { /// preemption is enabled. pub fn notify_all(&self, resched: bool) { loop { - let mut rq = RUN_QUEUE.lock(); - if let Some(task) = self.queue.lock().pop_front() { + let mut wq = self.queue.lock(); + if let Some(task) = wq.pop_front() { task.set_in_wait_queue(false); - rq.unblock_task(task, resched); + unblock_one_task(task, resched); } else { break; } - drop(rq); // we must unlock `RUN_QUEUE` after unlocking `self.queue`. + drop(wq); } } @@ -186,31 +246,31 @@ impl WaitQueue { /// If `resched` is true, the current task will be preempted when the /// preemption is enabled. pub fn notify_task(&mut self, resched: bool, task: &AxTaskRef) -> bool { - let mut rq = RUN_QUEUE.lock(); let mut wq = self.queue.lock(); - if let Some(index) = wq.iter().position(|t| Arc::ptr_eq(t, task)) { - task.set_in_wait_queue(false); - rq.unblock_task(wq.remove(index).unwrap(), resched); - true - } else { - false - } - } - - pub(crate) fn notify_one_locked(&self, resched: bool, rq: &mut AxRunQueue) -> bool { - if let Some(task) = self.queue.lock().pop_front() { + let task_to_be_notify = { + if let Some(index) = wq.iter().position(|t| Arc::ptr_eq(t, task)) { + wq.remove(index) + } else { + None + } + }; + if let Some(task) = task_to_be_notify { + // Mark task as not in wait queue. 
task.set_in_wait_queue(false); - rq.unblock_task(task, resched); + unblock_one_task(task, resched); + drop(wq); true } else { false } } +} - pub(crate) fn notify_all_locked(&self, resched: bool, rq: &mut AxRunQueue) { - while let Some(task) = self.queue.lock().pop_front() { - task.set_in_wait_queue(false); - rq.unblock_task(task, resched); - } - } +pub(crate) fn unblock_one_task(task: AxTaskRef, resched: bool) { + // Select run queue by the CPU set of the task. + select_run_queue( + #[cfg(feature = "smp")] + task.clone(), + ) + .unblock_task(task, resched) } From b706e876297243f1c586a508c9fea4cdc9dfc6ac Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Sat, 21 Sep 2024 15:50:06 +0800 Subject: [PATCH 02/51] refactor: add preempt guard for AxRunQueue through AxRunQueueRef --- modules/axtask/src/api.rs | 14 ++++--- modules/axtask/src/run_queue.rs | 67 +++++++++++++++++++++----------- modules/axtask/src/task.rs | 2 +- modules/axtask/src/timers.rs | 3 +- modules/axtask/src/wait_queue.rs | 24 ++++++------ 5 files changed, 67 insertions(+), 43 deletions(-) diff --git a/modules/axtask/src/api.rs b/modules/axtask/src/api.rs index cd99702dc2..a36d1ccf0d 100644 --- a/modules/axtask/src/api.rs +++ b/modules/axtask/src/api.rs @@ -2,6 +2,8 @@ use alloc::{string::String, sync::Arc}; +use kernel_guard::{NoOp, NoPreemptIrqSave}; + pub(crate) use crate::run_queue::{current_run_queue, select_run_queue}; #[doc(cfg(feature = "multitask"))] @@ -88,14 +90,14 @@ pub fn init_scheduler_secondary() { #[doc(cfg(feature = "irq"))] pub fn on_timer_tick() { crate::timers::check_events(); - current_run_queue().scheduler_timer_tick(); + current_run_queue::().scheduler_timer_tick(); } /// Adds the given task to the run queue, returns the task reference. pub fn spawn_task(task: TaskInner) -> AxTaskRef { let task_ref = task.into_arc(); let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); - crate::select_run_queue( + crate::select_run_queue::( #[cfg(feature = "smp")] task_ref.clone(), ) @@ -136,13 +138,13 @@ where /// /// [CFS]: https://en.wikipedia.org/wiki/Completely_Fair_Scheduler pub fn set_priority(prio: isize) -> bool { - current_run_queue().set_current_priority(prio) + current_run_queue::().set_current_priority(prio) } /// Current task gives up the CPU time voluntarily, and switches to another /// ready task. pub fn yield_now() { - current_run_queue().yield_current() + current_run_queue::().yield_current() } /// Current task is going to sleep for the given duration. @@ -157,14 +159,14 @@ pub fn sleep(dur: core::time::Duration) { /// If the feature `irq` is not enabled, it uses busy-wait instead. pub fn sleep_until(deadline: axhal::time::TimeValue) { #[cfg(feature = "irq")] - current_run_queue().sleep_until(deadline); + current_run_queue::().sleep_until(deadline); #[cfg(not(feature = "irq"))] axhal::time::busy_wait_until(deadline); } /// Exits the current task. pub fn exit(exit_code: i32) -> ! { - current_run_queue().exit_current(exit_code) + current_run_queue::().exit_current(exit_code) } /// The idle task routine. diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 231a8adfd1..046b569c4f 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -3,6 +3,7 @@ use alloc::sync::Arc; use core::mem::MaybeUninit; use bitmaps::Bitmap; +use kernel_guard::BaseGuard; use lazyinit::LazyInit; use scheduler::BaseScheduler; @@ -39,6 +40,7 @@ percpu_static! 
{ static mut RUN_QUEUES: [MaybeUninit<&'static mut AxRunQueue>; axconfig::SMP] = [ARRAY_REPEAT_VALUE; axconfig::SMP]; const ARRAY_REPEAT_VALUE: MaybeUninit<&'static mut AxRunQueue> = MaybeUninit::uninit(); + /// Returns a reference to the current run queue. /// /// ## Safety @@ -51,9 +53,14 @@ const ARRAY_REPEAT_VALUE: MaybeUninit<&'static mut AxRunQueue> = MaybeUninit::un /// ## Returns /// /// A static reference to the current run queue. -#[inline] -pub(crate) fn current_run_queue() -> &'static mut AxRunQueue { - unsafe { RUN_QUEUE.current_ref_mut_raw() } +// #[inline(always)] +pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { + let irq_state = G::acquire(); + AxRunQueueRef { + inner: unsafe { RUN_QUEUE.current_ref_mut_raw() }, + state: irq_state, + _phantom: core::marker::PhantomData, + } } /// Selects the run queue index based on a CPU set bitmap, minimizing the number of tasks. @@ -129,7 +136,9 @@ fn get_run_queue(index: usize) -> &'static mut AxRunQueue { /// 2. Use a more generic load balancing algorithm that can be customized or replaced. /// #[inline] -pub(crate) fn select_run_queue(#[cfg(feature = "smp")] task: AxTaskRef) -> &'static mut AxRunQueue { +pub(crate) fn select_run_queue( + #[cfg(feature = "smp")] task: AxTaskRef, +) -> AxRunQueueRef<'static, G> { #[cfg(not(feature = "smp"))] { // When SMP is disabled, all tasks are scheduled on the same global run queue. @@ -137,9 +146,14 @@ pub(crate) fn select_run_queue(#[cfg(feature = "smp")] task: AxTaskRef) -> &'sta } #[cfg(feature = "smp")] { + let irq_state = G::acquire(); // When SMP is enabled, select the run queue based on the task's CPU affinity and load balance. let index = select_run_queue_index(task.cpu_set()); - get_run_queue(index) + AxRunQueueRef { + inner: get_run_queue(index), + state: irq_state, + _phantom: core::marker::PhantomData, + } } } @@ -151,6 +165,18 @@ pub(crate) struct AxRunQueue { scheduler: Scheduler, } +pub(crate) struct AxRunQueueRef<'a, G: BaseGuard> { + inner: &'a mut AxRunQueue, + state: G::State, + _phantom: core::marker::PhantomData, +} + +impl<'a, G: BaseGuard> Drop for AxRunQueueRef<'a, G> { + fn drop(&mut self) { + G::release(self.state); + } +} + impl AxRunQueue { pub fn new(cpu_id: usize) -> Self { let gc_task = TaskInner::new( @@ -182,17 +208,17 @@ impl AxRunQueue { } /// Core functions of run queue. 
-impl AxRunQueue { +impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { pub fn add_task(&mut self, task: AxTaskRef) { - debug!("Add {} on run_queue {}", task.id_name(), self.cpu_id); + debug!("Add {} on run_queue {}", task.id_name(), self.inner.cpu_id); assert!(task.is_ready()); - self.scheduler.add_task(task); + self.inner.scheduler.add_task(task); } #[cfg(feature = "irq")] pub fn scheduler_timer_tick(&mut self) { let curr = crate::current(); - if !curr.is_idle() && self.scheduler.task_tick(curr.as_task_ref()) { + if !curr.is_idle() && self.inner.scheduler.task_tick(curr.as_task_ref()) { #[cfg(feature = "preempt")] curr.set_preempt_pending(true); } @@ -203,11 +229,12 @@ impl AxRunQueue { let curr = crate::current(); trace!("task yield: {}", curr.id_name()); assert!(curr.is_running()); - self.resched(false); + self.inner.resched(false); } pub fn set_current_priority(&mut self, prio: isize) -> bool { - self.scheduler + self.inner + .scheduler .set_priority(crate::current().as_task_ref(), prio) } @@ -230,15 +257,13 @@ impl AxRunQueue { can_preempt ); if can_preempt { - self.resched(true); + self.inner.resched(true); } else { curr.set_preempt_pending(true); } } pub fn exit_current(&mut self, exit_code: i32) -> ! { - let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); - let curr = crate::current(); debug!("task exit: {}, exit_code={}", curr.id_name(), exit_code); assert!(curr.is_running(), "task is not running: {:?}", curr.state()); @@ -257,9 +282,8 @@ impl AxRunQueue { // Wake up the GC task to drop the exited tasks. WAIT_FOR_EXIT.with_current(|wq| wq.notify_one(false)); // Schedule to next task. - self.resched(false); + self.inner.resched(false); } - drop(_kernel_guard); unreachable!("task exited!"); } @@ -272,17 +296,17 @@ impl AxRunQueue { ); debug!("task block: {}", curr.id_name()); - self.resched(false); + self.inner.resched(false); } /// Unblock one task by inserting it into the run queue. /// If task state is `BLOCKING`, it will enter a loop until the task is in `BLOCKED` state. pub fn unblock_task(&mut self, task: AxTaskRef, resched: bool) { task.clone().unblock_locked(|| { - let cpu_id = self.cpu_id; + let cpu_id = self.inner.cpu_id; debug!("task unblock: {} on run_queue {}", task.id_name(), cpu_id); task.set_state(TaskState::Ready); - self.scheduler.add_task(task.clone()); // TODO: priority + self.inner.scheduler.add_task(task.clone()); // TODO: priority // Note: when the task is unblocked on another CPU's run queue, // we just ingiore the `resched` flag. 
@@ -295,7 +319,7 @@ impl AxRunQueue { #[cfg(feature = "irq")] pub fn sleep_until(&mut self, deadline: axhal::time::TimeValue) { - let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); + let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); let curr = crate::current(); debug!("task sleep: {}, deadline={:?}", curr.id_name(), deadline); assert!(curr.is_running()); @@ -305,9 +329,8 @@ impl AxRunQueue { if now < deadline { crate::timers::set_alarm_wakeup(deadline, curr.clone()); curr.set_state(TaskState::Blocking); - self.resched(false); + self.inner.resched(false); } - drop(kernel_guard) } } diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index d1d8201ab7..b1025c3314 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -378,7 +378,7 @@ impl TaskInner { if curr.need_resched.load(Ordering::Acquire) && curr.can_preempt(0) { let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); if curr.need_resched.load(Ordering::Acquire) { - crate::current_run_queue().preempt_resched() + crate::current_run_queue::().preempt_resched() } } } diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index cb55dde24c..954b07f330 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -1,5 +1,6 @@ use alloc::sync::Arc; +use kernel_guard::NoOp; use lazyinit::LazyInit; use timer_list::{TimeValue, TimerEvent, TimerList}; @@ -16,7 +17,7 @@ struct TaskWakeupEvent(AxTaskRef); impl TimerEvent for TaskWakeupEvent { fn callback(self, _now: TimeValue) { self.0.set_in_timer_list(false); - select_run_queue( + select_run_queue::( #[cfg(feature = "smp")] self.0.clone(), ) diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index ba3fc31209..dd021f2ec4 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -1,5 +1,7 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; + +use kernel_guard::{NoOp, NoPreemptIrqSave}; use kspin::SpinNoIrq; use crate::{current_run_queue, select_run_queue, task::TaskState, AxTaskRef, CurrentTask}; @@ -92,11 +94,10 @@ impl WaitQueue { /// Blocks the current task and put it into the wait queue, until other task /// notifies it. pub fn wait(&self) { - let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); + let _kernel_guard = NoPreemptIrqSave::new(); self.push_to_wait_queue(); - current_run_queue().blocked_resched(); + current_run_queue::().blocked_resched(); self.cancel_events(crate::current()); - drop(kernel_guard); } /// Blocks the current task and put it into the wait queue, until the given @@ -108,7 +109,7 @@ impl WaitQueue { where F: Fn() -> bool, { - let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); + let _kernel_guard = NoPreemptIrqSave::new(); loop { let mut wq = self.queue.lock(); if condition() { @@ -131,17 +132,16 @@ impl WaitQueue { curr.set_in_wait_queue(true); drop(wq); - current_run_queue().blocked_resched(); + current_run_queue::().blocked_resched(); } self.cancel_events(crate::current()); - drop(kernel_guard); } /// Blocks the current task and put it into the wait queue, until other tasks /// notify it, or the given duration has elapsed. 
#[cfg(feature = "irq")] pub fn wait_timeout(&self, dur: core::time::Duration) -> bool { - let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); + let _kernel_guard = NoPreemptIrqSave::new(); let curr = crate::current(); let deadline = axhal::time::wall_time() + dur; debug!( @@ -152,11 +152,10 @@ impl WaitQueue { crate::timers::set_alarm_wakeup(deadline, curr.clone()); self.push_to_wait_queue(); - current_run_queue().blocked_resched(); + current_run_queue::().blocked_resched(); let timeout = curr.in_wait_queue(); // still in the wait queue, must have timed out self.cancel_events(curr); - drop(kernel_guard); timeout } @@ -170,7 +169,7 @@ impl WaitQueue { where F: Fn() -> bool, { - let kernel_guard = kernel_guard::NoPreemptIrqSave::new(); + let _kernel_guard = NoPreemptIrqSave::new(); let curr = crate::current(); let deadline = axhal::time::wall_time() + dur; debug!( @@ -201,10 +200,9 @@ impl WaitQueue { curr.set_in_wait_queue(true); drop(wq); - current_run_queue().blocked_resched() + current_run_queue::().blocked_resched() } self.cancel_events(curr); - drop(kernel_guard); timeout } @@ -268,7 +266,7 @@ impl WaitQueue { pub(crate) fn unblock_one_task(task: AxTaskRef, resched: bool) { // Select run queue by the CPU set of the task. - select_run_queue( + select_run_queue::( #[cfg(feature = "smp")] task.clone(), ) From 65cc73297a9e194964eaca4100b50d5aba417984 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 23 Sep 2024 15:29:10 +0800 Subject: [PATCH 03/51] refactor: use kernel_guard hold by RQ in wait_queue.rs --- modules/axtask/src/wait_queue.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index dd021f2ec4..d167ce5a7d 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -109,7 +109,7 @@ impl WaitQueue { where F: Fn() -> bool, { - let _kernel_guard = NoPreemptIrqSave::new(); + let mut rq = current_run_queue::(); loop { let mut wq = self.queue.lock(); if condition() { @@ -132,7 +132,7 @@ impl WaitQueue { curr.set_in_wait_queue(true); drop(wq); - current_run_queue::().blocked_resched(); + rq.blocked_resched(); } self.cancel_events(crate::current()); } @@ -141,7 +141,7 @@ impl WaitQueue { /// notify it, or the given duration has elapsed. 
#[cfg(feature = "irq")] pub fn wait_timeout(&self, dur: core::time::Duration) -> bool { - let _kernel_guard = NoPreemptIrqSave::new(); + let mut rq = current_run_queue::(); let curr = crate::current(); let deadline = axhal::time::wall_time() + dur; debug!( @@ -152,7 +152,7 @@ impl WaitQueue { crate::timers::set_alarm_wakeup(deadline, curr.clone()); self.push_to_wait_queue(); - current_run_queue::().blocked_resched(); + rq.blocked_resched(); let timeout = curr.in_wait_queue(); // still in the wait queue, must have timed out self.cancel_events(curr); @@ -169,7 +169,7 @@ impl WaitQueue { where F: Fn() -> bool, { - let _kernel_guard = NoPreemptIrqSave::new(); + let mut rq = current_run_queue::(); let curr = crate::current(); let deadline = axhal::time::wall_time() + dur; debug!( @@ -200,7 +200,7 @@ impl WaitQueue { curr.set_in_wait_queue(true); drop(wq); - current_run_queue::().blocked_resched() + rq.blocked_resched() } self.cancel_events(curr); timeout From e5551cbb6bf1bea6b2359cf5ac27472f36c57c59 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 23 Sep 2024 19:39:04 +0800 Subject: [PATCH 04/51] feat: introduce cpumask --- Cargo.lock | 10 +++++++- modules/axtask/Cargo.toml | 3 ++- modules/axtask/src/api.rs | 2 +- modules/axtask/src/run_queue.rs | 32 +++++++++-------------- modules/axtask/src/task.rs | 45 ++++++++++----------------------- modules/axtask/src/task_ext.rs | 2 +- 6 files changed, 38 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8db9882c99..98747e20d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -510,8 +510,8 @@ dependencies = [ "axconfig", "axhal", "axtask", - "bitmaps", "cfg-if", + "cpumask", "crate_interface", "kernel_guard", "kspin", @@ -697,6 +697,14 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f8f80099a98041a3d1622845c271458a2d73e688351bf3cb999266764b81d48" +[[package]] +name = "cpumask" +version = "0.1.0" +source = "git+https://github.com/arceos-org/cpumask.git#b4eab5e7ae8543029bf248769fcccb137151acb3" +dependencies = [ + "bitmaps", +] + [[package]] name = "crate_interface" version = "0.1.3" diff --git a/modules/axtask/Cargo.toml b/modules/axtask/Cargo.toml index ec5a16ab3e..133b8aa3bd 100644 --- a/modules/axtask/Cargo.toml +++ b/modules/axtask/Cargo.toml @@ -22,6 +22,7 @@ multitask = [ "dep:timer_list", "kernel_guard", "dep:crate_interface", + "dep:cpumask", ] irq = [] tls = ["axhal/tls"] @@ -37,7 +38,6 @@ test = ["percpu?/sp-naive"] [dependencies] cfg-if = "1.0" log = "0.4.21" -bitmaps = { version = "3.2.1", default-features = false } axhal = { workspace = true } axconfig = { workspace = true, optional = true } percpu = { version = "0.1", optional = true } @@ -49,6 +49,7 @@ kernel_guard = { version = "0.1", optional = true } crate_interface = { version = "0.1", optional = true } # scheduler = { git = "https://github.com/arceos-org/scheduler.git", tag = "v0.1.0", optional = true } scheduler = { git = "https://github.com/arceos-org/scheduler.git", branch = "num_tasks", optional = true } +cpumask = { git = "https://github.com/arceos-org/cpumask.git", optional = true } [dev-dependencies] rand = "0.8" diff --git a/modules/axtask/src/api.rs b/modules/axtask/src/api.rs index a36d1ccf0d..b89369b552 100644 --- a/modules/axtask/src/api.rs +++ b/modules/axtask/src/api.rs @@ -112,7 +112,7 @@ pub fn spawn_raw(f: F, name: String, stack_size: usize) -> AxTaskRef where F: FnOnce() + Send + 'static, { - spawn_task(TaskInner::new(f, name, stack_size, None)) + spawn_task(TaskInner::new(f, 
name, stack_size)) } /// Spawns a new task with the default parameters. diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 046b569c4f..fcae8f99ef 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -2,7 +2,7 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; use core::mem::MaybeUninit; -use bitmaps::Bitmap; +use cpumask::CpuMask; use kernel_guard::BaseGuard; use lazyinit::LazyInit; use scheduler::BaseScheduler; @@ -65,12 +65,12 @@ pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { /// Selects the run queue index based on a CPU set bitmap, minimizing the number of tasks. /// -/// This function filters the available run queues based on the provided `cpu_set` and +/// This function filters the available run queues based on the provided `cpumask` and /// selects the one with the fewest tasks. The selected run queue's index (cpu_id) is returned. /// /// ## Arguments /// -/// * `cpu_set` - A bitmap representing the CPUs that are eligible for task execution. +/// * `cpumask` - A bitmap representing the CPUs that are eligible for task execution. /// /// ## Returns /// @@ -82,11 +82,11 @@ pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { /// #[cfg(feature = "smp")] #[inline] -fn select_run_queue_index(cpu_set: Bitmap<{ axconfig::SMP }>) -> usize { +fn select_run_queue_index(cpumask: CpuMask<{ axconfig::SMP }>) -> usize { unsafe { RUN_QUEUES .iter() - .filter(|rq| cpu_set.get(rq.assume_init_ref().cpu_id())) + .filter(|rq| cpumask.get(rq.assume_init_ref().cpu_id())) .min_by_key(|rq| rq.assume_init_ref().num_tasks()) .expect("No available run queue that matches the CPU set") .assume_init_ref() @@ -148,7 +148,7 @@ pub(crate) fn select_run_queue( { let irq_state = G::acquire(); // When SMP is enabled, select the run queue based on the task's CPU affinity and load balance. - let index = select_run_queue_index(task.cpu_set()); + let index = select_run_queue_index(task.cpumask()); AxRunQueueRef { inner: get_run_queue(index), state: irq_state, @@ -179,14 +179,9 @@ impl<'a, G: BaseGuard> Drop for AxRunQueueRef<'a, G> { impl AxRunQueue { pub fn new(cpu_id: usize) -> Self { - let gc_task = TaskInner::new( - gc_entry, - "gc".into(), - axconfig::TASK_STACK_SIZE, - // gc task shoule be pinned to the current CPU. - Some(1 << cpu_id), - ) - .into_arc(); + let gc_task = TaskInner::new(gc_entry, "gc".into(), axconfig::TASK_STACK_SIZE).into_arc(); + // gc task shoule be pinned to the current CPU. + gc_task.set_cpumask(CpuMask::from_usize(1 << cpu_id)); let mut scheduler = Scheduler::new(); scheduler.add_task(gc_task); @@ -421,12 +416,9 @@ pub(crate) fn init() { // Create the `idle` task (not current task). const IDLE_TASK_STACK_SIZE: usize = 4096; - let idle_task = TaskInner::new( - || crate::run_idle(), - "idle".into(), - IDLE_TASK_STACK_SIZE, - Some(1 << cpu_id), - ); + let idle_task = TaskInner::new(|| crate::run_idle(), "idle".into(), IDLE_TASK_STACK_SIZE); + // idle task should be pinned to the current CPU. 
+ idle_task.set_cpumask(CpuMask::from_usize(1 << cpu_id)); IDLE_TASK.with_current(|i| { i.init_once(idle_task.into_arc()); }); diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index b1025c3314..b6190adf86 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -5,8 +5,8 @@ use core::sync::atomic::AtomicUsize; use core::sync::atomic::{AtomicBool, AtomicI32, AtomicU64, AtomicU8, Ordering}; use core::{alloc::Layout, cell::UnsafeCell, fmt, ptr::NonNull}; -use bitmaps::Bitmap; -use kspin::SpinRaw; +use cpumask::CpuMask; +use kspin::{SpinNoIrq, SpinRaw}; use memory_addr::{align_up_4k, VirtAddr}; use axhal::arch::TaskContext; @@ -50,7 +50,7 @@ pub struct TaskInner { state: AtomicU8, /// CPU affinity mask. - cpu_set: Bitmap<{ axconfig::SMP }>, + cpumask: SpinNoIrq>, in_wait_queue: AtomicBool, #[cfg(feature = "irq")] @@ -109,16 +109,7 @@ unsafe impl Sync for TaskInner {} impl TaskInner { /// Create a new task with the given entry function and stack size. - /// - /// When "smp" feature is enabled: - /// `cpu_set` represents a set of physical CPUs, which is implemented as a bit mask, - /// refering to `cpu_set_t` in Linux. - /// The task will be only scheduled on the specified CPUs if `cpu_set` is set as `Some(cpu_mask)`, - /// Otherwise, the task will be scheduled on all CPUs under specific load balancing policy. - /// Reference: - /// * https://man7.org/linux/man-pages/man2/sched_setaffinity.2.html - /// * https://man7.org/linux/man-pages/man3/CPU_SET.3.html - pub fn new(entry: F, name: String, stack_size: usize, cpu_set: Option) -> Self + pub fn new(entry: F, name: String, stack_size: usize) -> Self where F: FnOnce() + Send + 'static, { @@ -137,21 +128,6 @@ impl TaskInner { if t.name == "idle" { t.is_idle = true; } - t.cpu_set = match cpu_set { - Some(cpu_set) => { - let mut bit_map = Bitmap::new(); - let mut i = 0; - while i < axconfig::SMP { - if cpu_set & (1 << i) != 0 { - bit_map.set(i, true); - } - i += 1; - } - bit_map - } - // This task can be scheduled on all CPUs by default. - None => Bitmap::mask(axconfig::SMP), - }; t } @@ -215,7 +191,8 @@ impl TaskInner { is_init: false, entry: None, state: AtomicU8::new(TaskState::Ready as u8), - cpu_set: Bitmap::new(), + // By default, the task is allowed to run on all CPUs. 
+ cpumask: SpinNoIrq::new(CpuMask::full()), in_wait_queue: AtomicBool::new(false), #[cfg(feature = "irq")] in_timer_list: AtomicBool::new(false), @@ -246,7 +223,7 @@ impl TaskInner { pub(crate) fn new_init(name: String) -> Self { let mut t = Self::new_common(TaskId::new(), name); t.is_init = true; - t.cpu_set.set(this_cpu_id(), true); + t.set_cpumask(CpuMask::from_usize(1 << this_cpu_id())); if t.name == "idle" { t.is_idle = true; } @@ -298,8 +275,12 @@ impl TaskInner { } #[inline] - pub(crate) const fn cpu_set(&self) -> Bitmap<{ axconfig::SMP }> { - self.cpu_set + pub(crate) fn cpumask(&self) -> CpuMask<{ axconfig::SMP }> { + *self.cpumask.lock() + } + + pub(crate) fn set_cpumask(&self, cpumask: CpuMask<{ axconfig::SMP }>) { + *self.cpumask.lock() = cpumask } #[inline] diff --git a/modules/axtask/src/task_ext.rs b/modules/axtask/src/task_ext.rs index f64ddbdcc9..bacd56d0a0 100644 --- a/modules/axtask/src/task_ext.rs +++ b/modules/axtask/src/task_ext.rs @@ -145,7 +145,7 @@ pub trait TaskExtMut { /// /// axtask::init_scheduler(); /// -/// let mut inner = TaskInner::new(|| {}, "".into(), 0x1000, None); +/// let mut inner = TaskInner::new(|| {}, "".into(), 0x1000); /// assert!(inner.init_task_ext(TaskExtImpl { proc_id: 233 }).is_some()); /// // cannot initialize twice /// assert!(inner.init_task_ext(TaskExtImpl { proc_id: 0xdead }).is_none()); From 6cab4ccf9be25fe77ccc5dc6d01d9196bda432be Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Wed, 25 Sep 2024 16:45:58 +0800 Subject: [PATCH 05/51] [feat] add on_cpu and prev_task_on_cpu_ptr field in AxTaskInner --- modules/axtask/src/run_queue.rs | 22 ++++---- modules/axtask/src/task.rs | 89 +++++++++++++++++++++++++------- modules/axtask/src/wait_queue.rs | 15 ++---- 3 files changed, 86 insertions(+), 40 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index fcae8f99ef..7507a51f31 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -284,11 +284,7 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { pub fn blocked_resched(&mut self) { let curr = crate::current(); - assert!( - curr.is_blocking(), - "task is not blocking, {:?}", - curr.state() - ); + assert!(curr.is_blocked(), "task is not blocked, {:?}", curr.state()); debug!("task block: {}", curr.id_name()); self.inner.resched(false); @@ -323,7 +319,7 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { let now = axhal::time::wall_time(); if now < deadline { crate::timers::set_alarm_wakeup(deadline, curr.clone()); - curr.set_state(TaskState::Blocking); + curr.set_state(TaskState::Blocked); self.inner.resched(false); } } @@ -341,10 +337,6 @@ impl AxRunQueue { } } - if prev.is_blocking() { - prev.set_state(TaskState::Blocked); - } - let next = self.scheduler.pick_next_task().unwrap_or_else(|| unsafe { // Safety: IRQs must be disabled at this time. IDLE_TASK.current_ref_raw().get_unchecked().clone() @@ -369,6 +361,9 @@ impl AxRunQueue { #[cfg(feature = "preempt")] next_task.set_preempt_pending(false); next_task.set_state(TaskState::Running); + // Claim the task as running, we do this before switching to it + // such that any running task will have this set. + next_task.set_on_cpu(true); if prev_task.ptr_eq(&next_task) { return; } @@ -377,6 +372,9 @@ impl AxRunQueue { let prev_ctx_ptr = prev_task.ctx_mut_ptr(); let next_ctx_ptr = next_task.ctx_mut_ptr(); + // Store the pointer of `on_cpu` flag of **prev_task** in **next_task**'s struct. 
+ next_task.set_prev_task_on_cpu_ptr(prev_task.on_cpu_mut_ptr()); + // The strong reference count of `prev_task` will be decremented by 1, // but won't be dropped until `gc_entry()` is called. assert!(Arc::strong_count(prev_task.as_task_ref()) > 1); @@ -385,6 +383,10 @@ impl AxRunQueue { CurrentTask::set_current(prev_task, next_task); (*prev_ctx_ptr).switch_to(&*next_ctx_ptr); + + // Current it's **next_task** running on this CPU, clear the `prev_task`'s `on_cpu` field + // to indicate that it has finished its scheduling process and no longer running on this CPU. + crate::current().clear_prev_task_on_cpu(); } } } diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index b6190adf86..69957fa412 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -2,7 +2,7 @@ use alloc::{boxed::Box, string::String, sync::Arc}; use core::ops::Deref; #[cfg(any(feature = "preempt", feature = "irq"))] use core::sync::atomic::AtomicUsize; -use core::sync::atomic::{AtomicBool, AtomicI32, AtomicU64, AtomicU8, Ordering}; +use core::sync::atomic::{AtomicBool, AtomicI32, AtomicPtr, AtomicU64, AtomicU8, Ordering}; use core::{alloc::Layout, cell::UnsafeCell, fmt, ptr::NonNull}; use cpumask::CpuMask; @@ -29,14 +29,11 @@ pub(crate) enum TaskState { Running = 1, /// Task is ready to run on some scheduler's ready queue. Ready = 2, - /// Task is just be blocked and inserted into the wait queue, - /// but still have **NOT finished** its scheduling process. - Blocking = 3, /// Task is blocked (in the wait queue or timer list), /// and it has finished its scheduling process, it can be wake up by `notify()` on any run queue safely. - Blocked = 4, + Blocked = 3, /// Task is exited and waiting for being dropped. - Exited = 5, + Exited = 4, } /// The inner task structure. @@ -56,6 +53,10 @@ pub struct TaskInner { #[cfg(feature = "irq")] in_timer_list: AtomicBool, + /// Used to indicate whether the task is running on a CPU. + on_cpu: AtomicBool, + prev_task_on_cpu_ptr: AtomicPtr, + /// Used to protect the task from being unblocked by timer and `notify()` at the same time. /// It is used in `unblock_task()`, which is called by wait queue's `notify()` and timer's callback. /// Since preemption and irq are both disabled during `unblock_task()`, we can simply use a raw spin lock here. @@ -96,9 +97,8 @@ impl From for TaskState { match state { 1 => Self::Running, 2 => Self::Ready, - 3 => Self::Blocking, - 4 => Self::Blocked, - 5 => Self::Exited, + 3 => Self::Blocked, + 4 => Self::Exited, _ => unreachable!(), } } @@ -196,6 +196,8 @@ impl TaskInner { in_wait_queue: AtomicBool::new(false), #[cfg(feature = "irq")] in_timer_list: AtomicBool::new(false), + on_cpu: AtomicBool::new(false), + prev_task_on_cpu_ptr: AtomicPtr::new(core::ptr::null_mut()), #[cfg(feature = "irq")] unblock_lock: SpinRaw::new(()), #[cfg(feature = "preempt")] @@ -259,11 +261,6 @@ impl TaskInner { matches!(self.state(), TaskState::Blocked) } - #[inline] - pub(crate) fn is_blocking(&self) -> bool { - matches!(self.state(), TaskState::Blocking) - } - #[inline] pub(crate) const fn is_init(&self) -> bool { self.is_init @@ -309,13 +306,21 @@ impl TaskInner { where F: FnMut(), { - // When task's state is Blocking, it has not finished its scheduling process. - if self.is_blocking() { - while self.is_blocking() { + debug!("{} unblocking", self.id_name()); + + // If the owning (remote) CPU is still in the middle of schedule() with + // this task as prev, wait until it's done referencing the task. 
+ // + // Pairs with the `clear_prev_task_on_cpu()`. + // + // This ensures that tasks getting woken will be fully ordered against + // their previous state and preserve Program Order. + if self.on_cpu() { + while self.on_cpu() { // Wait for the task to finish its scheduling process. core::hint::spin_loop(); } - assert!(self.is_blocked()) + assert!(!self.on_cpu()) } // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. @@ -381,6 +386,50 @@ impl TaskInner { self.ctx.get_mut() } + /// Returns the raw pointer to the `on_cpu` field. + #[inline] + pub(crate) const fn on_cpu_mut_ptr(&self) -> *mut bool { + self.on_cpu.as_ptr() + } + + /// Sets whether the task is running on a CPU. + pub fn set_on_cpu(&self, on_cpu: bool) { + self.on_cpu.store(on_cpu, Ordering::Release) + } + + /// Sets `prev_task_on_cpu_ptr` to the given pointer provided by previous task running on this CPU. + pub fn set_prev_task_on_cpu_ptr(&self, prev_task_on_cpu_ptr: *mut bool) { + self.prev_task_on_cpu_ptr + .store(prev_task_on_cpu_ptr, Ordering::Release) + } + + /// Clears the `on_cpu` field of previous task running on this CPU. + /// It is called by the current task before running. + /// The pointer is provided by previous task running on this CPU through `set_prev_task_on_cpu_ptr()`. + /// + /// ## Note + /// This must be the very last reference to @_prev_task from this CPU. + /// After `on_cpu` is cleared, the task can be moved to a different CPU. + /// We must ensure this doesn't happen until the switch is completely finished. + /// + /// ## Safety + /// The caller must ensure that the pointer is valid and points to a boolean value, which is + /// done by the previous task running on this CPU through `set_prev_task_on_cpu_ptr()`. + pub unsafe fn clear_prev_task_on_cpu(&self) { + AtomicBool::from_ptr(self.prev_task_on_cpu_ptr.load(Ordering::Acquire)) + .store(false, Ordering::Release); + } + + /// Returns whether the task is running on a CPU. + /// + /// It is used to protect the task from being moved to a different run queue + /// while it has not finished its scheduling process. + /// The `on_cpu` field is set to `true` when the task is preparing to run on a CPU, + /// and it is set to `false` when the task has finished its scheduling process in `finish_switch`. + pub fn on_cpu(&self) -> bool { + self.on_cpu.load(Ordering::Acquire) + } + /// Returns the top address of the kernel stack. #[inline] pub const fn kernel_stack_top(&self) -> Option { @@ -490,6 +539,10 @@ impl Deref for CurrentTask { } extern "C" fn task_entry() -> ! { + unsafe { + // Clear the prev task on CPU before running the task entry function. + crate::current().clear_prev_task_on_cpu(); + } // Enable irq (if feature "irq" is enabled) before running the task entry function. #[cfg(feature = "irq")] axhal::arch::enable_irqs(); diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index d167ce5a7d..b4f746564a 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -74,16 +74,7 @@ impl WaitQueue { #[cfg(feature = "preempt")] assert!(curr.can_preempt(2)); - // We set task state as `Blocking` to clarify that the task is blocked - // but **still NOT** finished its scheduling process. 
-        //
-        // When another task (generally on another run queue) try to unblock this task,
-        // * if this task's state is still `Blocking`:
-        //   it needs to wait for this task's state to be changed to `Blocked`, which means it has finished its scheduling process.
-        // * if this task's state is `Blocked`:
-        //   it means this task is blocked and finished its scheduling process, in another word, it has left current run queue,
-        //   so this task can be scheduled on any run queue.
-        curr.set_state(TaskState::Blocking);
+        curr.set_state(TaskState::Blocked);
         curr.set_in_wait_queue(true);
         debug!("{} push to wait queue", curr.id_name());
@@ -128,7 +119,7 @@ impl WaitQueue {
         assert!(curr.can_preempt(2));
         wq.push_back(curr.clone());
-        curr.set_state(TaskState::Blocking);
+        curr.set_state(TaskState::Blocked);
         curr.set_in_wait_queue(true);
         drop(wq);
@@ -196,7 +187,7 @@ impl WaitQueue {
         assert!(curr.can_preempt(2));
         wq.push_back(curr.clone());
-        curr.set_state(TaskState::Blocking);
+        curr.set_state(TaskState::Blocked);
         curr.set_in_wait_queue(true);
         drop(wq);

From aafcf9b85baaf1074f483148627f54c2bcfa7227 Mon Sep 17 00:00:00 2001
From: hky1999 <976929993@qq.com>
Date: Thu, 26 Sep 2024 14:54:50 +0800
Subject: [PATCH 06/51] fix: delete redundant kernel guard in WaitQueue wait()

---
 modules/axtask/src/wait_queue.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs
index b4f746564a..141d812db6 100644
--- a/modules/axtask/src/wait_queue.rs
+++ b/modules/axtask/src/wait_queue.rs
@@ -85,9 +85,9 @@ impl WaitQueue {
     /// Blocks the current task and put it into the wait queue, until other task
     /// notifies it.
     pub fn wait(&self) {
-        let _kernel_guard = NoPreemptIrqSave::new();
+        let mut rq = current_run_queue::<NoPreemptIrqSave>();
         self.push_to_wait_queue();
-        current_run_queue::<NoPreemptIrqSave>().blocked_resched();
+        rq.blocked_resched();
         self.cancel_events(crate::current());
     }

From 8e645fd5cd6cf529b4dc49e337d85af215ce9628 Mon Sep 17 00:00:00 2001
From: hky1999 <976929993@qq.com>
Date: Thu, 26 Sep 2024 14:55:14 +0800
Subject: [PATCH 07/51] Add percpu scheduler doc

---
 doc/percpu_scheduler.md | 389 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 389 insertions(+)
 create mode 100644 doc/percpu_scheduler.md

diff --git a/doc/percpu_scheduler.md b/doc/percpu_scheduler.md
new file mode 100644
index 0000000000..507b016e3d
--- /dev/null
+++ b/doc/percpu_scheduler.md
@@ -0,0 +1,389 @@
+# How to support a percpu scheduler in ArceOS
+
+## Background
+
+Originally, ArceOS used a single, coarse-grained global RunQueue, and scheduling operations like
+task yielding, waiting on a wait queue and notifying a blocked task were
+all performed under the protection of a global SpinNoIrq lock held by the RunQueue.
+
+To support percpu scheduling, we must refactor the run queue structure,
+as well as the locking mechanism in the current scheduling framework.
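+
+As a rough sketch of that goal (placeholder types only, not the actual ArceOS definitions;
+`def_percpu` comes from the `percpu` crate, while `SpinNoIrq` and `LazyInit` come from the
+`kspin` and `lazyinit` crates already used by axtask), the intended change in data layout
+looks like this:
+
+```Rust
+use kspin::SpinNoIrq;
+use lazyinit::LazyInit;
+
+struct AxRunQueue; // placeholder for the real run queue type
+
+// Before: a single global run queue; every scheduling operation
+// (yield, block, unblock, spawn) contends on this one SpinNoIrq lock.
+static GLOBAL_RUN_QUEUE: LazyInit<SpinNoIrq<AxRunQueue>> = LazyInit::new();
+
+// After: one run queue per CPU; the global lock goes away and locking is
+// pushed down into the per-CPU scheduler, wait-queue and timer-list structures.
+#[percpu::def_percpu]
+static RUN_QUEUE: LazyInit<AxRunQueue> = LazyInit::new();
+```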
+ +## AxRunQueue and Scheduler crate + +For the design of the scheduler interface, we can reference the design used in Linux: + +```C +// kernel/sched/sched.h + +struct sched_class { + +#ifdef CONFIG_UCLAMP_TASK + int uclamp_enabled; +#endif + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + + struct task_struct *(*pick_next_task)(struct rq *rq); + + void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + +#ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + + struct task_struct * (*pick_task)(struct rq *rq); + + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + + void (*task_woken)(struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); + + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); +#endif + + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + void (*task_fork)(struct task_struct *p); + void (*task_dead)(struct task_struct *p); + + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serialized by + * rq->lock. They are however serialized by p->pi_lock. + */ + void (*switched_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval)(struct rq *rq, + struct task_struct *task); + + void (*update_curr)(struct rq *rq); + +#ifdef CONFIG_FAIR_GROUP_SCHED + void (*task_change_group)(struct task_struct *p); +#endif + +#ifdef CONFIG_SCHED_CORE + int (*task_is_throttled)(struct task_struct *p, int cpu); +#endif +}; +``` + +Current [`scheduler`](https://github.com/arceos-org/scheduler) crate used by ArceOS +provides a more fundamental scheduling method interface, which only includes +basic task operations and does not account for multiple run queues: + +```Rust +/// The base scheduler trait that all schedulers should implement. +/// +/// All tasks in the scheduler are considered runnable. If a task is go to +/// sleep, it should be removed from the scheduler. +pub trait BaseScheduler { + /// Type of scheduled entities. Often a task struct. + type SchedItem; + + /// Initializes the scheduler. + fn init(&mut self); + + /// Adds a task to the scheduler. + fn add_task(&mut self, task: Self::SchedItem); + + /// Removes a task by its reference from the scheduler. Returns the owned + /// removed task with ownership if it exists. + /// + /// # Safety + /// + /// The caller should ensure that the task is in the scheduler, otherwise + /// the behavior is undefined. + fn remove_task(&mut self, task: &Self::SchedItem) -> Option; + + /// Picks the next task to run, it will be removed from the scheduler. + /// Returns [`None`] if there is not runnable task. + fn pick_next_task(&mut self) -> Option; + + /// Puts the previous task back to the scheduler. 
The previous task is + /// usually placed at the end of the ready queue, making it less likely + /// to be re-scheduled. + /// + /// `preempt` indicates whether the previous task is preempted by the next + /// task. In this case, the previous task may be placed at the front of the + /// ready queue. + fn put_prev_task(&mut self, prev: Self::SchedItem, preempt: bool); + + /// Advances the scheduler state at each timer tick. Returns `true` if + /// re-scheduling is required. + /// + /// `current` is the current running task. + fn task_tick(&mut self, current: &Self::SchedItem) -> bool; + + /// Set priority for a task. + fn set_priority(&mut self, task: &Self::SchedItem, prio: isize) -> bool; +} +``` + +The current scheduler design focuses solely on the task states within its own ready queue. +The scheduler is held by AxRunQueue as a global static variable +and serves as a globally unique scheduler for all cores. + +```Rust +// modules/axtask/src/run_queue.rs + +// TODO: per-CPU +pub(crate) static RUN_QUEUE: LazyInit> = LazyInit::new(); + +pub(crate) struct AxRunQueue { + scheduler: Scheduler, +} +``` + +Referencing Linux's design, methods such as `select_task_rq` and those related to +load balancing should be provided by the scheduler itself. +However, to simplify the design and minimize modifications to the scheduler crate, +we continue to use ArceOS's original design, managing the scheduler with AxRunQueue. +We will change `AxRunQueue` to be a per-CPU variable instead of a globally unique instance +(as well as `EXITED_TASKS`, `WAIT_FOR_EXIT`, and `TIMER_LIST`). + +By doing this, the originally global unique SpinNoIrq of AxRunQueue needs to be distributed across each core. +We will refactor the locking mechanism and refine the granularity of the locks. + +## cpumask crate + +We introduce [cpumask](https://github.com/arceos-org/cpumask) crate for CPU affinity attribute for a task. + +## Lock, Irq and Preemption + +### AxRunQueue +The lock for AxRunQueue no longer exists. + +For the run queue, we have refined the locks to the ready queue within the scheduler, +meaning that only operations that require interaction with the ready queue, +such as picking the next task and pushing tasks, need to be locked. + +The current run queue for a core can be obtained through the `current_run_queue` method. +This process needs to be performed under the protection of `kernel_guard` to ensure the safety of preemption and interrupt states. + +```Rust +/// Returns a reference to the current run queue. +/// +/// ## Safety +/// +/// This function returns a static reference to the current run queue, which +/// is inherently unsafe. It assumes that the `RUN_QUEUE` has been properly +/// initialized and is not accessed concurrently in a way that could cause +/// data races or undefined behavior. +/// +/// ## Returns +/// +/// A static reference to the current run queue. +// #[inline(always)] +pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { + let irq_state = G::acquire(); + AxRunQueueRef { + inner: unsafe { RUN_QUEUE.current_ref_mut_raw() }, + state: irq_state, + _phantom: core::marker::PhantomData, + } +} +``` + +### WaitQueue +Operations on the wait queue are no longer protected by the large lock of AxRunQueue. + +We need to protect the wait queue using `SpinNoIrq` and distinguish it from operations related to the run queue: +* When waiting for a task, first insert it into the wait queue, then call the relevant methods of the run queue for task switching. 
+* When waking up a task, first remove it from the wait queue, then call the `select_run_queue` method to choose an appropriate run queue for insertion.
+
+### TimerList
+
+The TimerList is also designed to be per-CPU, used for recording and triggering timer events at specific clock times.
+This allows us to eliminate the lock for TimerList itself.
+
+TimerList may be used in `wait_timeout_until`, where a task can simultaneously wait for either a notification or a timer event.
+Therefore, a task may be placed in both TimerList and WaitQueue.
+
+To prevent a task from being awakened by both methods simultaneously, we need to apply an `unblock_lock` to the task, ensuring that the unblock operation for a task can **succeed only once**.
+
+```Rust
+    pub(crate) fn unblock_locked<F>(&self, mut run_queue_push: F)
+    where
+        F: FnMut(),
+    {
+        debug!("{} unblocking", self.id_name());
+
+        // If the owning (remote) CPU is still in the middle of schedule() with
+        // this task as prev, wait until it's done referencing the task.
+        //
+        // Pairs with the `clear_prev_task_on_cpu()`.
+        //
+        // This ensures that tasks getting woken will be fully ordered against
+        // their previous state and preserve Program Order.
+        if self.on_cpu() {
+            while self.on_cpu() {
+                // Wait for the task to finish its scheduling process.
+                core::hint::spin_loop();
+            }
+            assert!(!self.on_cpu())
+        }
+
+        // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time.
+        #[cfg(feature = "irq")]
+        let _lock = self.unblock_lock.lock();
+        if self.is_blocked() {
+            run_queue_push();
+        }
+    }
+```
+
+## The `on_cpu` flag
+
+When we reduce the lock granularity of the run queue and distinguish it from the wait queue locks,
+we need to address the following race:
+when a task calls a `wait_xxx` method to wait on a specific wait queue,
+it may be woken up through that wait queue by another CPU, and start running there,
+before it has actually been scheduled away from its current CPU.
+The general flow may be as follows:
+
+```
+CPU 0                | CPU 1
+wq.lock()
+push A to wq
+wq.unlock()
+                       wq.lock()
+                       pop A from wq
+                       wq.unlock()
+...                    ...
+...                    save prev_ctx
+-------------------------------------------
+save prev_ctx(A)       restore next_ctx(A)
+-------------------------------------------
+restore next_ctx
+```
+
+We have to use some strategy to ensure read-after-write consistency here.
+
+* Shenango and Skyloft introduce a `stack_busy` flag in the task struct to indicate whether the task has finished switching stacks;
+it is set to true for a task when yielding is about to happen, and cleared after its context has been saved to the stack.
+ + ```ASM + .align 16 + .globl __context_switch + .type __context_switch, @function + __context_switch: + SAVE_CALLEE + SAVE_FXSTATE + + mov [rdi], rsp + + /* clear the stack busy flag */ + mov byte ptr [rdx], 0 + + mov rsp, rsi + + RESTORE_FXSTATE + RESTORE_CALLEE + #ifdef SKYLOFT_UINTR + /* enable preemption */ + stui + #endif + ret + ``` + + During scheduling process, when it tries to yield to a task with `stack_busy` true, it need to enter a spin loop: + + ```C + /* task must be scheduled atomically */ + if (unlikely(atomic_load_acq(&next->stack_busy))) { + /* wait until the scheduler finishes switching stacks */ + while (atomic_load_acq(&next->stack_busy)) cpu_relax(); + } + ``` + +* Linux use a `on_cpu` flag + + ```C + * p->on_cpu <- { 0, 1 }: + * + * is set by prepare_task() and cleared by finish_task() such that it will be + * set before p is scheduled-in and cleared after p is scheduled-out, both + * under rq->lock. Non-zero indicates the task is running on its CPU. + * + * [ The astute reader will observe that it is possible for two tasks on one + * CPU to have ->on_cpu = 1 at the same time. ] + ``` + + + During a scheduling event in Linux, the process begins by calling `prepare_task` to set the `on_cpu` flag of the next task to 1. + After invoking the `switch_to` method to switch to the next task, + the next task receives a return value pointing to the previous task's pointer, + allowing it to clear the `on_cpu` flag of the previous task. + Basic workflow: + ```C + // on prev task + context_switch + prepare_task_switch(rq, prev, next); + prepare_task(next); + WRITE_ONCE(next->on_cpu, 1); + switch_to(prev, next, prev); + ((last) = __switch_to_asm((prev), (next))); + // On next task + finish_task_switch(prev); + finish_task(prev); + smp_store_release(&prev->on_cpu, 0); + ``` + The TTWU_QUEUE feature in Linux allows the use of IPI to wake up a remote CPU within the `try_to_wake_up` function, + instead of waiting for the task on the remote CPU to complete its scheduling process. + This reduces the overhead of spinlocks and locks. + + ```C + // kernel/sched/core.c + + int try_to_wake_up() { + // ... + + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. + */ + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) + break; + + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, wait until it's done referencing the task. + * + * Pairs with the smp_store_release() in finish_task(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. 
+ */ + smp_cond_load_acquire(&p->on_cpu, !VAL); + } + ``` From 091a054e464da59148fb0f1eba9052b45ba3505b Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 26 Sep 2024 15:25:20 +0800 Subject: [PATCH 08/51] [fix] unit test error related to doc comment --- modules/axtask/src/task.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 69957fa412..2da5adaf61 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -408,13 +408,13 @@ impl TaskInner { /// The pointer is provided by previous task running on this CPU through `set_prev_task_on_cpu_ptr()`. /// /// ## Note - /// This must be the very last reference to @_prev_task from this CPU. - /// After `on_cpu` is cleared, the task can be moved to a different CPU. - /// We must ensure this doesn't happen until the switch is completely finished. + /// This must be the very last reference to @_prev_task from this CPU. + /// After `on_cpu` is cleared, the task can be moved to a different CPU. + /// We must ensure this doesn't happen until the switch is completely finished. /// /// ## Safety - /// The caller must ensure that the pointer is valid and points to a boolean value, which is - /// done by the previous task running on this CPU through `set_prev_task_on_cpu_ptr()`. + /// The caller must ensure that the pointer is valid and points to a boolean value, which is + /// done by the previous task running on this CPU through `set_prev_task_on_cpu_ptr()`. pub unsafe fn clear_prev_task_on_cpu(&self) { AtomicBool::from_ptr(self.prev_task_on_cpu_ptr.load(Ordering::Acquire)) .store(false, Ordering::Release); @@ -424,8 +424,8 @@ impl TaskInner { /// /// It is used to protect the task from being moved to a different run queue /// while it has not finished its scheduling process. - /// The `on_cpu` field is set to `true` when the task is preparing to run on a CPU, - /// and it is set to `false` when the task has finished its scheduling process in `finish_switch`. + /// The `on_cpu field is set to `true` when the task is preparing to run on a CPU, + /// and it is set to `false` when the task has finished its scheduling process in `clear_prev_task_on_cpu()`. pub fn on_cpu(&self) -> bool { self.on_cpu.load(Ordering::Acquire) } From a86d819263f3a4561bf2c47a7d7425ac9dacffd5 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 26 Sep 2024 15:49:01 +0800 Subject: [PATCH 09/51] [feat] delete cancel_alarm from timers --- modules/axtask/src/timers.rs | 15 ++++++--------- modules/axtask/src/wait_queue.rs | 7 +++++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index 954b07f330..a4f73a3814 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -1,5 +1,3 @@ -use alloc::sync::Arc; - use kernel_guard::NoOp; use lazyinit::LazyInit; use timer_list::{TimeValue, TimerEvent, TimerList}; @@ -16,6 +14,12 @@ struct TaskWakeupEvent(AxTaskRef); impl TimerEvent for TaskWakeupEvent { fn callback(self, _now: TimeValue) { + // Ignore the timer event if the task is not in the timer list. + // timeout was set but not triggered (wake up by `WaitQueue::notify()`). 
+ if !self.0.in_timer_list() { + return; + } + self.0.set_in_timer_list(false); select_run_queue::( #[cfg(feature = "smp")] @@ -32,13 +36,6 @@ pub fn set_alarm_wakeup(deadline: TimeValue, task: AxTaskRef) { }) } -pub fn cancel_alarm(task: &AxTaskRef) { - TIMER_LIST.with_current(|timer_list| { - task.set_in_timer_list(false); - timer_list.cancel(|t| Arc::ptr_eq(&t.0, task)); - }) -} - pub fn check_events() { loop { let now = wall_time(); diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index 141d812db6..243f075854 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -1,7 +1,7 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; -use kernel_guard::{NoOp, NoPreemptIrqSave}; +use kernel_guard::NoPreemptIrqSave; use kspin::SpinNoIrq; use crate::{current_run_queue, select_run_queue, task::TaskState, AxTaskRef, CurrentTask}; @@ -59,7 +59,10 @@ impl WaitQueue { #[cfg(feature = "irq")] if curr.in_timer_list() { // timeout was set but not triggered (wake up by `WaitQueue::notify()`) - crate::timers::cancel_alarm(curr.as_task_ref()); + curr.set_in_timer_list(false); + // TODO: + // this task is still not remoted from timer list of target CPU, + // which may cause some redundant timer events. } } From c49bd2d7ab3aba17459e7d743e6681713146a29d Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 27 Sep 2024 21:54:55 +0800 Subject: [PATCH 10/51] Update percpu_rq --- Cargo.lock | 2 +- modules/axtask/src/run_queue.rs | 4 ++-- modules/axtask/src/task.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 98747e20d2..1e31dedf19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -700,7 +700,7 @@ checksum = "7f8f80099a98041a3d1622845c271458a2d73e688351bf3cb999266764b81d48" [[package]] name = "cpumask" version = "0.1.0" -source = "git+https://github.com/arceos-org/cpumask.git#b4eab5e7ae8543029bf248769fcccb137151acb3" +source = "git+https://github.com/arceos-org/cpumask.git#5b61f0136d140a529ba2ab609e47beb41dc4f1c4" dependencies = [ "bitmaps", ] diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 7507a51f31..a1ea2ccd43 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -181,7 +181,7 @@ impl AxRunQueue { pub fn new(cpu_id: usize) -> Self { let gc_task = TaskInner::new(gc_entry, "gc".into(), axconfig::TASK_STACK_SIZE).into_arc(); // gc task shoule be pinned to the current CPU. - gc_task.set_cpumask(CpuMask::from_usize(1 << cpu_id)); + gc_task.set_cpumask(CpuMask::one_shot(cpu_id)); let mut scheduler = Scheduler::new(); scheduler.add_task(gc_task); @@ -420,7 +420,7 @@ pub(crate) fn init() { const IDLE_TASK_STACK_SIZE: usize = 4096; let idle_task = TaskInner::new(|| crate::run_idle(), "idle".into(), IDLE_TASK_STACK_SIZE); // idle task should be pinned to the current CPU. 
- idle_task.set_cpumask(CpuMask::from_usize(1 << cpu_id)); + idle_task.set_cpumask(CpuMask::one_shot(cpu_id)); IDLE_TASK.with_current(|i| { i.init_once(idle_task.into_arc()); }); diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 2da5adaf61..5fa49d0432 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -225,7 +225,7 @@ impl TaskInner { pub(crate) fn new_init(name: String) -> Self { let mut t = Self::new_common(TaskId::new(), name); t.is_init = true; - t.set_cpumask(CpuMask::from_usize(1 << this_cpu_id())); + t.set_cpumask(CpuMask::one_shot(this_cpu_id())); if t.name == "idle" { t.is_idle = true; } From e2b1ce134dad2141af68b20f12d1a48553eb878e Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Sat, 28 Sep 2024 14:06:17 +0800 Subject: [PATCH 11/51] [fix] use timer ticket id to prevent potential bug introduced by timer event confusion --- modules/axtask/src/task.rs | 23 +++++++++++++++++++++++ modules/axtask/src/timers.rs | 28 ++++++++++++++++++++++------ modules/axtask/src/wait_queue.rs | 4 ++-- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 5fa49d0432..b033a6301d 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -49,9 +49,15 @@ pub struct TaskInner { /// CPU affinity mask. cpumask: SpinNoIrq>, + /// Mark whether the task is in the wait queue. in_wait_queue: AtomicBool, + /// Mark whether the task is in the timer list. #[cfg(feature = "irq")] in_timer_list: AtomicBool, + /// A ticket ID used to identify the timer event. + /// Incremented by 1 each time the timer event is triggered or expired. + #[cfg(feature = "irq")] + timer_ticket_id: AtomicUsize, /// Used to indicate whether the task is running on a CPU. on_cpu: AtomicBool, @@ -196,6 +202,8 @@ impl TaskInner { in_wait_queue: AtomicBool::new(false), #[cfg(feature = "irq")] in_timer_list: AtomicBool::new(false), + #[cfg(feature = "irq")] + timer_ticket_id: AtomicUsize::new(0), on_cpu: AtomicBool::new(false), prev_task_on_cpu_ptr: AtomicPtr::new(core::ptr::null_mut()), #[cfg(feature = "irq")] @@ -302,6 +310,21 @@ impl TaskInner { self.in_timer_list.store(in_timer_list, Ordering::Release); } + /// Returns current available timer ticket ID. + #[inline] + #[cfg(feature = "irq")] + pub(crate) fn timer_ticket(&self) -> usize { + self.timer_ticket_id.load(Ordering::Acquire) + } + + /// Expire one timer ticket ID, increment timer_ticket_id by 1, + /// which can be used to identify one timer event is triggered or expired. + #[inline] + #[cfg(feature = "irq")] + pub(crate) fn timer_ticket_expire_one(&self) { + self.timer_ticket_id.fetch_add(1, Ordering::Release); + } + pub(crate) fn unblock_locked(&self, mut run_queue_push: F) where F: FnMut(), diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index a4f73a3814..a3808ecf1b 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -10,29 +10,45 @@ percpu_static! { TIMER_LIST: LazyInit> = LazyInit::new(), } -struct TaskWakeupEvent(AxTaskRef); +struct TaskWakeupEvent { + ticket_id: usize, + task: AxTaskRef, +} impl TimerEvent for TaskWakeupEvent { fn callback(self, _now: TimeValue) { // Ignore the timer event if the task is not in the timer list. // timeout was set but not triggered (wake up by `WaitQueue::notify()`). - if !self.0.in_timer_list() { + // Judge if this timer event is still valid by checking the ticket ID. 
+ if !self.task.in_timer_list() || self.task.timer_ticket() != self.ticket_id { + // The task is not in the timer list or the ticket ID is not matched. + // Just ignore this timer event and return. return; } - self.0.set_in_timer_list(false); + // Timer ticket match. + // Mark the task as not in the timer list. + self.task.set_in_timer_list(false); + // Timer event is triggered, expire the ticket ID. + self.task.timer_ticket_expire_one(); select_run_queue::( #[cfg(feature = "smp")] - self.0.clone(), + self.task.clone(), ) - .unblock_task(self.0, true) + .unblock_task(self.task, true) } } pub fn set_alarm_wakeup(deadline: TimeValue, task: AxTaskRef) { TIMER_LIST.with_current(|timer_list| { task.set_in_timer_list(true); - timer_list.set(deadline, TaskWakeupEvent(task)); + timer_list.set( + deadline, + TaskWakeupEvent { + ticket_id: task.timer_ticket(), + task, + }, + ); }) } diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index 243f075854..9bc470d4bb 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -58,10 +58,10 @@ impl WaitQueue { } #[cfg(feature = "irq")] if curr.in_timer_list() { - // timeout was set but not triggered (wake up by `WaitQueue::notify()`) curr.set_in_timer_list(false); + curr.timer_ticket_expire_one(); // TODO: - // this task is still not remoted from timer list of target CPU, + // this task is still not removed from timer list of target CPU, // which may cause some redundant timer events. } } From 602fadddd77e39cf2967c63170f4a35b6cd524d5 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Sat, 28 Sep 2024 14:56:59 +0800 Subject: [PATCH 12/51] [feat] use type CpuSet to wrap cpumask::CpuMask --- Cargo.lock | 3 +- modules/axtask/Cargo.toml | 3 +- modules/axtask/src/api.rs | 9 ++-- modules/axtask/src/run_queue.rs | 80 ++++++++++++++++---------------- modules/axtask/src/task.rs | 12 ++--- modules/axtask/src/timers.rs | 6 +-- modules/axtask/src/wait_queue.rs | 6 +-- 7 files changed, 52 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1e31dedf19..fd04d1ea00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1323,9 +1323,8 @@ checksum = "e6e36312fb5ddc10d08ecdc65187402baba4ac34585cb9d1b78522ae2358d890" [[package]] name = "scheduler" version = "0.1.0" -source = "git+https://github.com/arceos-org/scheduler.git?branch=num_tasks#415a620347722cb734fa440d9103065690a5853b" +source = "git+https://github.com/arceos-org/scheduler.git?tag=v0.1.0#c8d25d9aed146dca28dc8987afd229b52c20361a" dependencies = [ - "kspin", "linked_list", ] diff --git a/modules/axtask/Cargo.toml b/modules/axtask/Cargo.toml index 133b8aa3bd..daa83d9d3f 100644 --- a/modules/axtask/Cargo.toml +++ b/modules/axtask/Cargo.toml @@ -47,8 +47,7 @@ memory_addr = { version = "0.3", optional = true } timer_list = { version = "0.1", optional = true } kernel_guard = { version = "0.1", optional = true } crate_interface = { version = "0.1", optional = true } -# scheduler = { git = "https://github.com/arceos-org/scheduler.git", tag = "v0.1.0", optional = true } -scheduler = { git = "https://github.com/arceos-org/scheduler.git", branch = "num_tasks", optional = true } +scheduler = { git = "https://github.com/arceos-org/scheduler.git", tag = "v0.1.0", optional = true } cpumask = { git = "https://github.com/arceos-org/cpumask.git", optional = true } [dev-dependencies] diff --git a/modules/axtask/src/api.rs b/modules/axtask/src/api.rs index b89369b552..0900993b6d 100644 --- a/modules/axtask/src/api.rs +++ b/modules/axtask/src/api.rs @@ 
-16,6 +16,8 @@ pub use crate::wait_queue::WaitQueue; /// The reference type of a task. pub type AxTaskRef = Arc; +pub type CpuSet = cpumask::CpuMask<{ axconfig::SMP }>; + cfg_if::cfg_if! { if #[cfg(feature = "sched_rr")] { const MAX_TIME_SLICE: usize = 5; @@ -96,12 +98,7 @@ pub fn on_timer_tick() { /// Adds the given task to the run queue, returns the task reference. pub fn spawn_task(task: TaskInner) -> AxTaskRef { let task_ref = task.into_arc(); - let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); - crate::select_run_queue::( - #[cfg(feature = "smp")] - task_ref.clone(), - ) - .add_task(task_ref.clone()); + crate::select_run_queue::(task_ref.clone()).add_task(task_ref.clone()); task_ref } diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index a1ea2ccd43..327b43fac0 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -1,16 +1,18 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; use core::mem::MaybeUninit; +use core::sync::atomic::{AtomicUsize, Ordering}; use cpumask::CpuMask; use kernel_guard::BaseGuard; +use kspin::SpinRaw; use lazyinit::LazyInit; use scheduler::BaseScheduler; use axhal::cpu::this_cpu_id; use crate::task::{CurrentTask, TaskState}; -use crate::{AxTaskRef, Scheduler, TaskInner, WaitQueue}; +use crate::{AxTaskRef, CpuSet, Scheduler, TaskInner, WaitQueue}; macro_rules! percpu_static { ($($name:ident: $ty:ty = $init:expr),* $(,)?) => { @@ -63,10 +65,10 @@ pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { } } -/// Selects the run queue index based on a CPU set bitmap, minimizing the number of tasks. +/// Selects the run queue index based on a CPU set bitmap and load balancing. /// /// This function filters the available run queues based on the provided `cpumask` and -/// selects the one with the fewest tasks. The selected run queue's index (cpu_id) is returned. +/// selects the run queue index for the next task. The selection is based on a round-robin algorithm. /// /// ## Arguments /// @@ -78,19 +80,22 @@ pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { /// /// ## Panics /// -/// This function will panic if there is no available run queue that matches the CPU set. +/// This function will panic if `cpu_mask` is empty, indicating that there are no available CPUs for task execution. /// #[cfg(feature = "smp")] #[inline] -fn select_run_queue_index(cpumask: CpuMask<{ axconfig::SMP }>) -> usize { - unsafe { - RUN_QUEUES - .iter() - .filter(|rq| cpumask.get(rq.assume_init_ref().cpu_id())) - .min_by_key(|rq| rq.assume_init_ref().num_tasks()) - .expect("No available run queue that matches the CPU set") - .assume_init_ref() - .cpu_id() +fn select_run_queue_index(cpumask: CpuSet) -> usize { + static RUN_QUEUE_INDEX: AtomicUsize = AtomicUsize::new(0); + + assert!(!cpumask.is_empty(), "No available CPU for task execution"); + + // Round-robin selection of the run queue index. + loop { + let index = RUN_QUEUE_INDEX.load(Ordering::SeqCst) % axconfig::SMP; + if cpumask.get(index) { + return index; + } + RUN_QUEUE_INDEX.fetch_add(1, Ordering::SeqCst); } } @@ -136,9 +141,7 @@ fn get_run_queue(index: usize) -> &'static mut AxRunQueue { /// 2. Use a more generic load balancing algorithm that can be customized or replaced. 
/// #[inline] -pub(crate) fn select_run_queue( - #[cfg(feature = "smp")] task: AxTaskRef, -) -> AxRunQueueRef<'static, G> { +pub(crate) fn select_run_queue(task: AxTaskRef) -> AxRunQueueRef<'static, G> { #[cfg(not(feature = "smp"))] { // When SMP is disabled, all tasks are scheduled on the same global run queue. @@ -162,7 +165,9 @@ pub(crate) struct AxRunQueue { /// The ID of the CPU this run queue is associated with. cpu_id: usize, /// The core scheduler of this run queue. - scheduler: Scheduler, + /// Since irq and preempt are preserved by the kernel guard hold by `AxRunQueueRef`, + /// we just use a simple raw spin lock here. + scheduler: SpinRaw, } pub(crate) struct AxRunQueueRef<'a, G: BaseGuard> { @@ -185,20 +190,10 @@ impl AxRunQueue { let mut scheduler = Scheduler::new(); scheduler.add_task(gc_task); - Self { cpu_id, scheduler } - } - - /// Returns the cpu id of current run queue, - /// which is also its index in `RUN_QUEUES`. - pub fn cpu_id(&self) -> usize { - self.cpu_id - } - - /// Returns the number of tasks in current run queue's scheduler, - /// which is used for load balance during scheduling. - #[cfg(feature = "smp")] - pub fn num_tasks(&self) -> usize { - self.scheduler.num_tasks() + Self { + cpu_id, + scheduler: SpinRaw::new(scheduler), + } } } @@ -207,20 +202,19 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { pub fn add_task(&mut self, task: AxTaskRef) { debug!("Add {} on run_queue {}", task.id_name(), self.inner.cpu_id); assert!(task.is_ready()); - self.inner.scheduler.add_task(task); + self.inner.scheduler.lock().add_task(task); } #[cfg(feature = "irq")] pub fn scheduler_timer_tick(&mut self) { let curr = crate::current(); - if !curr.is_idle() && self.inner.scheduler.task_tick(curr.as_task_ref()) { + if !curr.is_idle() && self.inner.scheduler.lock().task_tick(curr.as_task_ref()) { #[cfg(feature = "preempt")] curr.set_preempt_pending(true); } } pub fn yield_current(&mut self) { - let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); let curr = crate::current(); trace!("task yield: {}", curr.id_name()); assert!(curr.is_running()); @@ -230,6 +224,7 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { pub fn set_current_priority(&mut self, prio: isize) -> bool { self.inner .scheduler + .lock() .set_priority(crate::current().as_task_ref(), prio) } @@ -297,7 +292,7 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { let cpu_id = self.inner.cpu_id; debug!("task unblock: {} on run_queue {}", task.id_name(), cpu_id); task.set_state(TaskState::Ready); - self.inner.scheduler.add_task(task.clone()); // TODO: priority + self.inner.scheduler.lock().add_task(task.clone()); // TODO: priority // Note: when the task is unblocked on another CPU's run queue, // we just ingiore the `resched` flag. @@ -310,7 +305,6 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { #[cfg(feature = "irq")] pub fn sleep_until(&mut self, deadline: axhal::time::TimeValue) { - let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); let curr = crate::current(); debug!("task sleep: {}, deadline={:?}", curr.id_name(), deadline); assert!(curr.is_running()); @@ -333,14 +327,18 @@ impl AxRunQueue { if prev.is_running() { prev.set_state(TaskState::Ready); if !prev.is_idle() { - self.scheduler.put_prev_task(prev.clone(), preempt); + self.scheduler.lock().put_prev_task(prev.clone(), preempt); } } - let next = self.scheduler.pick_next_task().unwrap_or_else(|| unsafe { - // Safety: IRQs must be disabled at this time. 
- IDLE_TASK.current_ref_raw().get_unchecked().clone() - }); + let next = self + .scheduler + .lock() + .pick_next_task() + .unwrap_or_else(|| unsafe { + // Safety: IRQs must be disabled at this time. + IDLE_TASK.current_ref_raw().get_unchecked().clone() + }); assert!( next.is_ready(), "next {} is not ready: {:?}", diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index b033a6301d..4941494c16 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -15,7 +15,7 @@ use axhal::cpu::this_cpu_id; use axhal::tls::TlsArea; use crate::task_ext::AxTaskExt; -use crate::{AxTask, AxTaskRef, WaitQueue}; +use crate::{AxTask, AxTaskRef, CpuSet, WaitQueue}; /// A unique identifier for a thread. #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -47,7 +47,7 @@ pub struct TaskInner { state: AtomicU8, /// CPU affinity mask. - cpumask: SpinNoIrq>, + cpumask: SpinNoIrq, /// Mark whether the task is in the wait queue. in_wait_queue: AtomicBool, @@ -280,11 +280,11 @@ impl TaskInner { } #[inline] - pub(crate) fn cpumask(&self) -> CpuMask<{ axconfig::SMP }> { + pub(crate) fn cpumask(&self) -> CpuSet { *self.cpumask.lock() } - pub(crate) fn set_cpumask(&self, cpumask: CpuMask<{ axconfig::SMP }>) { + pub(crate) fn set_cpumask(&self, cpumask: CpuSet) { *self.cpumask.lock() = cpumask } @@ -385,9 +385,9 @@ impl TaskInner { fn current_check_preempt_pending() { let curr = crate::current(); if curr.need_resched.load(Ordering::Acquire) && curr.can_preempt(0) { - let _kernel_guard = kernel_guard::NoPreemptIrqSave::new(); + let mut rq = crate::current_run_queue::(); if curr.need_resched.load(Ordering::Acquire) { - crate::current_run_queue::().preempt_resched() + rq.preempt_resched() } } } diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index a3808ecf1b..6ab30201b7 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -31,11 +31,7 @@ impl TimerEvent for TaskWakeupEvent { self.task.set_in_timer_list(false); // Timer event is triggered, expire the ticket ID. self.task.timer_ticket_expire_one(); - select_run_queue::( - #[cfg(feature = "smp")] - self.task.clone(), - ) - .unblock_task(self.task, true) + select_run_queue::(self.task.clone()).unblock_task(self.task, true) } } diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index 9bc470d4bb..8ea84a764b 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -260,9 +260,5 @@ impl WaitQueue { pub(crate) fn unblock_one_task(task: AxTaskRef, resched: bool) { // Select run queue by the CPU set of the task. - select_run_queue::( - #[cfg(feature = "smp")] - task.clone(), - ) - .unblock_task(task, resched) + select_run_queue::(task.clone()).unblock_task(task, resched) } From 95c9344fb38491697e83696ff6c9e75d8126bff0 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Sat, 28 Sep 2024 15:02:08 +0800 Subject: [PATCH 13/51] [fix] use allow(clippy::modulo_one) to bypass modulo 1 error in select_run_queue_index --- modules/axtask/src/run_queue.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 327b43fac0..bfa4c1b79d 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -83,6 +83,8 @@ pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { /// This function will panic if `cpu_mask` is empty, indicating that there are no available CPUs for task execution. 
/// #[cfg(feature = "smp")] +// The modulo operation is safe here because `axconfig::SMP` is always greater than 1 with "smp" enabled. +#[allow(clippy::modulo_one)] #[inline] fn select_run_queue_index(cpumask: CpuSet) -> usize { static RUN_QUEUE_INDEX: AtomicUsize = AtomicUsize::new(0); From bad33552b595155da20cca15143b92d035e802ef Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Sat, 28 Sep 2024 15:08:58 +0800 Subject: [PATCH 14/51] [fix] miss doc for type alias CpuSet --- modules/axtask/src/api.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/axtask/src/api.rs b/modules/axtask/src/api.rs index 0900993b6d..191a3a2e59 100644 --- a/modules/axtask/src/api.rs +++ b/modules/axtask/src/api.rs @@ -16,6 +16,7 @@ pub use crate::wait_queue::WaitQueue; /// The reference type of a task. pub type AxTaskRef = Arc; +/// The wrapper type for cpumask::CpuMask with SMP configuration. pub type CpuSet = cpumask::CpuMask<{ axconfig::SMP }>; cfg_if::cfg_if! { From 73eb533cc40879487b42059fb2ba2db23edb219f Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 30 Sep 2024 11:56:20 +0800 Subject: [PATCH 15/51] [fix] little modification in axtask api.rs --- modules/axtask/src/api.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/axtask/src/api.rs b/modules/axtask/src/api.rs index 191a3a2e59..cbaad55c90 100644 --- a/modules/axtask/src/api.rs +++ b/modules/axtask/src/api.rs @@ -99,7 +99,7 @@ pub fn on_timer_tick() { /// Adds the given task to the run queue, returns the task reference. pub fn spawn_task(task: TaskInner) -> AxTaskRef { let task_ref = task.into_arc(); - crate::select_run_queue::(task_ref.clone()).add_task(task_ref.clone()); + select_run_queue::(task_ref.clone()).add_task(task_ref.clone()); task_ref } From 71909cfdcf668a5158d29d14e3e63b297c9a7ec9 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 30 Sep 2024 13:47:25 +0800 Subject: [PATCH 16/51] [fix] some review opinions --- Cargo.lock | 4 ++-- modules/axtask/src/api.rs | 4 +++- modules/axtask/src/run_queue.rs | 7 +++---- modules/axtask/src/task.rs | 15 +++++++-------- modules/axtask/src/timers.rs | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fd04d1ea00..6f71e698e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1031,9 +1031,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memory_addr" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ca25419c2b34080d526d6836a53dcab129767ab6a9904587c87265ae6d41aa9" +checksum = "6f769efcf10b9dfb4c913bebb409cda77b1a3f072b249bf5465e250bcb30eb49" [[package]] name = "minimal-lexical" diff --git a/modules/axtask/src/api.rs b/modules/axtask/src/api.rs index cbaad55c90..3166505962 100644 --- a/modules/axtask/src/api.rs +++ b/modules/axtask/src/api.rs @@ -17,7 +17,7 @@ pub use crate::wait_queue::WaitQueue; pub type AxTaskRef = Arc; /// The wrapper type for cpumask::CpuMask with SMP configuration. -pub type CpuSet = cpumask::CpuMask<{ axconfig::SMP }>; +pub type CpuMask = cpumask::CpuMask<{ axconfig::SMP }>; cfg_if::cfg_if! { if #[cfg(feature = "sched_rr")] { @@ -93,6 +93,8 @@ pub fn init_scheduler_secondary() { #[doc(cfg(feature = "irq"))] pub fn on_timer_tick() { crate::timers::check_events(); + // Since irq and preemption are both disabled here, + // we can get current run queue with the default `kernel_guard::NoOp`. 
current_run_queue::().scheduler_timer_tick(); } diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index bfa4c1b79d..65d4f43d40 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -3,7 +3,6 @@ use alloc::sync::Arc; use core::mem::MaybeUninit; use core::sync::atomic::{AtomicUsize, Ordering}; -use cpumask::CpuMask; use kernel_guard::BaseGuard; use kspin::SpinRaw; use lazyinit::LazyInit; @@ -12,7 +11,7 @@ use scheduler::BaseScheduler; use axhal::cpu::this_cpu_id; use crate::task::{CurrentTask, TaskState}; -use crate::{AxTaskRef, CpuSet, Scheduler, TaskInner, WaitQueue}; +use crate::{AxTaskRef, CpuMask, Scheduler, TaskInner, WaitQueue}; macro_rules! percpu_static { ($($name:ident: $ty:ty = $init:expr),* $(,)?) => { @@ -55,7 +54,7 @@ const ARRAY_REPEAT_VALUE: MaybeUninit<&'static mut AxRunQueue> = MaybeUninit::un /// ## Returns /// /// A static reference to the current run queue. -// #[inline(always)] +#[inline(always)] pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { let irq_state = G::acquire(); AxRunQueueRef { @@ -86,7 +85,7 @@ pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { // The modulo operation is safe here because `axconfig::SMP` is always greater than 1 with "smp" enabled. #[allow(clippy::modulo_one)] #[inline] -fn select_run_queue_index(cpumask: CpuSet) -> usize { +fn select_run_queue_index(cpumask: CpuMask) -> usize { static RUN_QUEUE_INDEX: AtomicUsize = AtomicUsize::new(0); assert!(!cpumask.is_empty(), "No available CPU for task execution"); diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 4941494c16..c487d015d8 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -5,7 +5,6 @@ use core::sync::atomic::AtomicUsize; use core::sync::atomic::{AtomicBool, AtomicI32, AtomicPtr, AtomicU64, AtomicU8, Ordering}; use core::{alloc::Layout, cell::UnsafeCell, fmt, ptr::NonNull}; -use cpumask::CpuMask; use kspin::{SpinNoIrq, SpinRaw}; use memory_addr::{align_up_4k, VirtAddr}; @@ -15,7 +14,7 @@ use axhal::cpu::this_cpu_id; use axhal::tls::TlsArea; use crate::task_ext::AxTaskExt; -use crate::{AxTask, AxTaskRef, CpuSet, WaitQueue}; +use crate::{AxTask, AxTaskRef, CpuMask, WaitQueue}; /// A unique identifier for a thread. #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -47,7 +46,7 @@ pub struct TaskInner { state: AtomicU8, /// CPU affinity mask. - cpumask: SpinNoIrq, + cpumask: SpinNoIrq, /// Mark whether the task is in the wait queue. in_wait_queue: AtomicBool, @@ -57,7 +56,7 @@ pub struct TaskInner { /// A ticket ID used to identify the timer event. /// Incremented by 1 each time the timer event is triggered or expired. #[cfg(feature = "irq")] - timer_ticket_id: AtomicUsize, + timer_ticket_id: AtomicU64, /// Used to indicate whether the task is running on a CPU. 
on_cpu: AtomicBool, @@ -203,7 +202,7 @@ impl TaskInner { #[cfg(feature = "irq")] in_timer_list: AtomicBool::new(false), #[cfg(feature = "irq")] - timer_ticket_id: AtomicUsize::new(0), + timer_ticket_id: AtomicU64::new(0), on_cpu: AtomicBool::new(false), prev_task_on_cpu_ptr: AtomicPtr::new(core::ptr::null_mut()), #[cfg(feature = "irq")] @@ -280,11 +279,11 @@ impl TaskInner { } #[inline] - pub(crate) fn cpumask(&self) -> CpuSet { + pub(crate) fn cpumask(&self) -> CpuMask { *self.cpumask.lock() } - pub(crate) fn set_cpumask(&self, cpumask: CpuSet) { + pub(crate) fn set_cpumask(&self, cpumask: CpuMask) { *self.cpumask.lock() = cpumask } @@ -313,7 +312,7 @@ impl TaskInner { /// Returns current available timer ticket ID. #[inline] #[cfg(feature = "irq")] - pub(crate) fn timer_ticket(&self) -> usize { + pub(crate) fn timer_ticket(&self) -> u64 { self.timer_ticket_id.load(Ordering::Acquire) } diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index 6ab30201b7..518998511a 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -11,7 +11,7 @@ percpu_static! { } struct TaskWakeupEvent { - ticket_id: usize, + ticket_id: u64, task: AxTaskRef, } From 75b7756f29c0f6387c264ec1a4f810678766ac4c Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 30 Sep 2024 13:54:50 +0800 Subject: [PATCH 17/51] [fix] delete if self.on_cpu() block in unblock_locked --- modules/axtask/src/task.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index c487d015d8..1999a36e1d 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -337,13 +337,11 @@ impl TaskInner { // // This ensures that tasks getting woken will be fully ordered against // their previous state and preserve Program Order. - if self.on_cpu() { - while self.on_cpu() { - // Wait for the task to finish its scheduling process. - core::hint::spin_loop(); - } - assert!(!self.on_cpu()) + while self.on_cpu() { + // Wait for the task to finish its scheduling process. + core::hint::spin_loop(); } + assert!(!self.on_cpu()); // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. #[cfg(feature = "irq")] From 347f3faceec541b21ded504537e8664459afdbc6 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 30 Sep 2024 21:10:24 +0800 Subject: [PATCH 18/51] [refactor] remove in_timer_list flag in axtask --- modules/axtask/src/task.rs | 17 ----------------- modules/axtask/src/timers.rs | 9 +++------ modules/axtask/src/wait_queue.rs | 20 +++++++++++++------- 3 files changed, 16 insertions(+), 30 deletions(-) diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 1999a36e1d..1ec5af753c 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -50,9 +50,6 @@ pub struct TaskInner { /// Mark whether the task is in the wait queue. in_wait_queue: AtomicBool, - /// Mark whether the task is in the timer list. - #[cfg(feature = "irq")] - in_timer_list: AtomicBool, /// A ticket ID used to identify the timer event. /// Incremented by 1 each time the timer event is triggered or expired. 
#[cfg(feature = "irq")] @@ -200,8 +197,6 @@ impl TaskInner { cpumask: SpinNoIrq::new(CpuMask::full()), in_wait_queue: AtomicBool::new(false), #[cfg(feature = "irq")] - in_timer_list: AtomicBool::new(false), - #[cfg(feature = "irq")] timer_ticket_id: AtomicU64::new(0), on_cpu: AtomicBool::new(false), prev_task_on_cpu_ptr: AtomicPtr::new(core::ptr::null_mut()), @@ -297,18 +292,6 @@ impl TaskInner { self.in_wait_queue.store(in_wait_queue, Ordering::Release); } - #[inline] - #[cfg(feature = "irq")] - pub(crate) fn in_timer_list(&self) -> bool { - self.in_timer_list.load(Ordering::Acquire) - } - - #[inline] - #[cfg(feature = "irq")] - pub(crate) fn set_in_timer_list(&self, in_timer_list: bool) { - self.in_timer_list.store(in_timer_list, Ordering::Release); - } - /// Returns current available timer ticket ID. #[inline] #[cfg(feature = "irq")] diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index 518998511a..5bd1cd28af 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -17,18 +17,16 @@ struct TaskWakeupEvent { impl TimerEvent for TaskWakeupEvent { fn callback(self, _now: TimeValue) { - // Ignore the timer event if the task is not in the timer list. - // timeout was set but not triggered (wake up by `WaitQueue::notify()`). + // Ignore the timer event if timeout was set but not triggered + // (wake up by `WaitQueue::notify()`). // Judge if this timer event is still valid by checking the ticket ID. - if !self.task.in_timer_list() || self.task.timer_ticket() != self.ticket_id { + if self.task.timer_ticket() != self.ticket_id { // The task is not in the timer list or the ticket ID is not matched. // Just ignore this timer event and return. return; } // Timer ticket match. - // Mark the task as not in the timer list. - self.task.set_in_timer_list(false); // Timer event is triggered, expire the ticket ID. self.task.timer_ticket_expire_one(); select_run_queue::(self.task.clone()).unblock_task(self.task, true) @@ -37,7 +35,6 @@ impl TimerEvent for TaskWakeupEvent { pub fn set_alarm_wakeup(deadline: TimeValue, task: AxTaskRef) { TIMER_LIST.with_current(|timer_list| { - task.set_in_timer_list(true); timer_list.set( deadline, TaskWakeupEvent { diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index 8ea84a764b..cee8802fd7 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -47,7 +47,9 @@ impl WaitQueue { } } - fn cancel_events(&self, curr: CurrentTask) { + /// Cancel events by removing the task from the wait queue. + /// If `from_timer_list` is true, the task should be removed from the timer list. + fn cancel_events(&self, curr: CurrentTask, from_timer_list: bool) { // A task can be wake up only one events (timer or `notify()`), remove // the event from another queue. if curr.in_wait_queue() { @@ -56,9 +58,10 @@ impl WaitQueue { wq_locked.retain(|t| !curr.ptr_eq(t)); curr.set_in_wait_queue(false); } + // Try to cancel a timer event from timer lists. + // Just mark task's current timer ticket ID as expired. 
#[cfg(feature = "irq")] - if curr.in_timer_list() { - curr.set_in_timer_list(false); + if from_timer_list { curr.timer_ticket_expire_one(); // TODO: // this task is still not removed from timer list of target CPU, @@ -91,7 +94,7 @@ impl WaitQueue { let mut rq = current_run_queue::(); self.push_to_wait_queue(); rq.blocked_resched(); - self.cancel_events(crate::current()); + self.cancel_events(crate::current(), false); } /// Blocks the current task and put it into the wait queue, until the given @@ -128,7 +131,7 @@ impl WaitQueue { rq.blocked_resched(); } - self.cancel_events(crate::current()); + self.cancel_events(crate::current(), false); } /// Blocks the current task and put it into the wait queue, until other tasks @@ -149,7 +152,10 @@ impl WaitQueue { rq.blocked_resched(); let timeout = curr.in_wait_queue(); // still in the wait queue, must have timed out - self.cancel_events(curr); + + // If `timeout` is true, the task is still in the wait queue, + // which means timer event is triggered and the task has been removed from timer list. + self.cancel_events(curr, !timeout); timeout } @@ -196,7 +202,7 @@ impl WaitQueue { rq.blocked_resched() } - self.cancel_events(curr); + self.cancel_events(curr, !timeout); timeout } From 1fd84b12e31682495151cd21d8b4c885701de2a9 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 30 Sep 2024 21:12:43 +0800 Subject: [PATCH 19/51] [fix] doc error in unblock_task --- modules/axtask/src/run_queue.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 65d4f43d40..b9c9f0f919 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -287,7 +287,9 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { } /// Unblock one task by inserting it into the run queue. - /// If task state is `BLOCKING`, it will enter a loop until the task is in `BLOCKED` state. + /// If task's `on_cpu` flag is true, + /// it will enter a loop until the task finishes its scheduling process, + /// see `unblock_locked` for details. pub fn unblock_task(&mut self, task: AxTaskRef, resched: bool) { task.clone().unblock_locked(|| { let cpu_id = self.inner.cpu_id; From 2782e4ebd020de45cb8f63f20c680ddbf8442139 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 30 Sep 2024 22:11:54 +0800 Subject: [PATCH 20/51] [refactor] timer ticket id --- modules/axtask/src/task.rs | 21 ++++++++++++++++----- modules/axtask/src/timers.rs | 18 +++++++++--------- modules/axtask/src/wait_queue.rs | 2 +- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 1ec5af753c..1e4c315328 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -292,19 +292,30 @@ impl TaskInner { self.in_wait_queue.store(in_wait_queue, Ordering::Release); } - /// Returns current available timer ticket ID. + /// Returns task's current timer ticket ID. #[inline] #[cfg(feature = "irq")] pub(crate) fn timer_ticket(&self) -> u64 { self.timer_ticket_id.load(Ordering::Acquire) } - /// Expire one timer ticket ID, increment timer_ticket_id by 1, - /// which can be used to identify one timer event is triggered or expired. + /// Set the timer ticket ID. 
#[inline] #[cfg(feature = "irq")] - pub(crate) fn timer_ticket_expire_one(&self) { - self.timer_ticket_id.fetch_add(1, Ordering::Release); + pub(crate) fn set_timer_ticket(&self, timer_ticket_id: u64) { + // CAN NOT set timer_ticket_id to 0, + // because 0 is used to indicate the timer event is expired. + assert!(timer_ticket_id != 0); + self.timer_ticket_id + .store(timer_ticket_id, Ordering::Release); + } + + /// Expire timer ticket ID by setting it to 0, + /// it can be used to identify one timer event is triggered or expired. + #[inline] + #[cfg(feature = "irq")] + pub(crate) fn timer_ticket_expired(&self) { + self.timer_ticket_id.store(0, Ordering::Release); } pub(crate) fn unblock_locked(&self, mut run_queue_push: F) diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index 5bd1cd28af..25e22d7c84 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -1,3 +1,5 @@ +use core::sync::atomic::{AtomicU64, Ordering}; + use kernel_guard::NoOp; use lazyinit::LazyInit; use timer_list::{TimeValue, TimerEvent, TimerList}; @@ -6,6 +8,8 @@ use axhal::time::wall_time; use crate::{select_run_queue, AxTaskRef}; +static TIMER_TICKET_ID: AtomicU64 = AtomicU64::new(1); + percpu_static! { TIMER_LIST: LazyInit> = LazyInit::new(), } @@ -21,27 +25,23 @@ impl TimerEvent for TaskWakeupEvent { // (wake up by `WaitQueue::notify()`). // Judge if this timer event is still valid by checking the ticket ID. if self.task.timer_ticket() != self.ticket_id { - // The task is not in the timer list or the ticket ID is not matched. + // Timer ticket ID is not matched. // Just ignore this timer event and return. return; } // Timer ticket match. // Timer event is triggered, expire the ticket ID. - self.task.timer_ticket_expire_one(); + self.task.timer_ticket_expired(); select_run_queue::(self.task.clone()).unblock_task(self.task, true) } } pub fn set_alarm_wakeup(deadline: TimeValue, task: AxTaskRef) { TIMER_LIST.with_current(|timer_list| { - timer_list.set( - deadline, - TaskWakeupEvent { - ticket_id: task.timer_ticket(), - task, - }, - ); + let ticket_id = TIMER_TICKET_ID.fetch_add(1, Ordering::AcqRel); + task.set_timer_ticket(ticket_id); + timer_list.set(deadline, TaskWakeupEvent { ticket_id, task }); }) } diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index cee8802fd7..e17639b14b 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -62,7 +62,7 @@ impl WaitQueue { // Just mark task's current timer ticket ID as expired. #[cfg(feature = "irq")] if from_timer_list { - curr.timer_ticket_expire_one(); + curr.timer_ticket_expired(); // TODO: // this task is still not removed from timer list of target CPU, // which may cause some redundant timer events. 
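The ticket scheme refactored in the patch above can be illustrated with a minimal, self-contained sketch. This is userspace Rust using `std`, not the axtask code itself; the names `Task`, `NEXT_TICKET`, `arm_timer`, `cancel_timer` and `timer_fires` are illustrative stand-ins for the per-task `timer_ticket_id`, the global `TIMER_TICKET_ID` counter, `set_timer_ticket`, `timer_ticket_expired` and the check in `TaskWakeupEvent::callback`. The idea is that a timer wakeup is honoured only while the ticket it carries still matches the one stored in the task, so a task that was already woken by `notify()` and had its ticket expired silently drops the stale timer event.

```Rust
use std::sync::atomic::{AtomicU64, Ordering};

// Stand-in for the per-task `timer_ticket_id` field.
struct Task {
    timer_ticket_id: AtomicU64,
}

// Global ticket allocator; starting at 1 leaves 0 free to mean "expired".
static NEXT_TICKET: AtomicU64 = AtomicU64::new(1);

impl Task {
    fn new() -> Self {
        Self { timer_ticket_id: AtomicU64::new(0) }
    }

    // Arm a timeout: allocate a fresh ticket and remember it in the task.
    fn arm_timer(&self) -> u64 {
        let ticket = NEXT_TICKET.fetch_add(1, Ordering::AcqRel);
        self.timer_ticket_id.store(ticket, Ordering::Release);
        ticket
    }

    // The task was woken by `notify()` first: expire its ticket so the
    // timer event that is still queued for it becomes stale.
    fn cancel_timer(&self) {
        self.timer_ticket_id.store(0, Ordering::Release);
    }

    // Called from the timer callback: only a matching ticket may wake the task.
    fn timer_fires(&self, ticket: u64) -> bool {
        self.timer_ticket_id.load(Ordering::Acquire) == ticket
    }
}

fn main() {
    let task = Task::new();
    let ticket = task.arm_timer();
    task.cancel_timer(); // woken by `notify()` before the deadline
    assert!(!task.timer_fires(ticket), "stale timer event must be ignored");
}
```

Comparing tickets this way also avoids keeping the removed `in_timer_list` flag in sync with the timer list: an out-of-date entry left in the list is harmless because its ticket can never match again.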
From 1f7ac39fe8cc429ff88698bd5b3ec5960adec0ae Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 30 Sep 2024 22:40:24 +0800 Subject: [PATCH 21/51] [fix] use put_prev_task in unblock_task --- modules/axtask/src/run_queue.rs | 5 ++++- modules/axtask/src/timers.rs | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index b9c9f0f919..aa1dc20e44 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -295,7 +295,10 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { let cpu_id = self.inner.cpu_id; debug!("task unblock: {} on run_queue {}", task.id_name(), cpu_id); task.set_state(TaskState::Ready); - self.inner.scheduler.lock().add_task(task.clone()); // TODO: priority + self.inner + .scheduler + .lock() + .put_prev_task(task.clone(), resched); // TODO: priority // Note: when the task is unblocked on another CPU's run queue, // we just ignore the `resched` flag. diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index 25e22d7c84..1afd61cbd5 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -31,8 +31,6 @@ impl TimerEvent for TaskWakeupEvent { } // Timer ticket match. - // Timer event is triggered, expire the ticket ID. - self.task.timer_ticket_expired(); select_run_queue::(self.task.clone()).unblock_task(self.task, true) } } From de504e1a5c0cd1ae2061c20d80433463f69e518e Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 4 Oct 2024 12:16:03 +0800 Subject: [PATCH 22/51] [fix] potential bug in unblock_locked, set on_cpu as true for init tasks --- modules/axtask/src/run_queue.rs | 2 ++ modules/axtask/src/task.rs | 25 ++++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index aa1dc20e44..437df22a8e 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -432,6 +432,7 @@ pub(crate) fn init() { // Put the subsequent execution into the `main` task. let main_task = TaskInner::new_init("main".into()).into_arc(); main_task.set_state(TaskState::Running); + main_task.set_on_cpu(true); unsafe { CurrentTask::init_current(main_task) } info!("Initialize RUN_QUEUES"); @@ -452,6 +453,7 @@ pub(crate) fn init_secondary() { IDLE_TASK.with_current(|i| { i.init_once(idle_task.clone()); }); + idle_task.set_on_cpu(true); unsafe { CurrentTask::init_current(idle_task) } RUN_QUEUE.with_current(|rq| { rq.init_once(AxRunQueue::new(cpu_id)); diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 1e4c315328..9c5e944a86 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -324,23 +324,22 @@ impl TaskInner { { debug!("{} unblocking", self.id_name()); - // If the owning (remote) CPU is still in the middle of schedule() with - // this task as prev, wait until it's done referencing the task. - // - // Pairs with the `clear_prev_task_on_cpu()`. - // - // This ensures that tasks getting woken will be fully ordered against - // their previous state and preserve Program Order. - while self.on_cpu() { - // Wait for the task to finish its scheduling process. - core::hint::spin_loop(); - } - assert!(!self.on_cpu()); - // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time.
#[cfg(feature = "irq")] let _lock = self.unblock_lock.lock(); if self.is_blocked() { + // If the owning (remote) CPU is still in the middle of schedule() with + // this task as prev, wait until it's done referencing the task. + // + // Pairs with the `clear_prev_task_on_cpu()`. + // + // This ensures that tasks getting woken will be fully ordered against + // their previous state and preserve Program Order. + while self.on_cpu() { + // Wait for the task to finish its scheduling process. + core::hint::spin_loop(); + } + assert!(!self.on_cpu()); run_queue_push(); } } From c9b9ecc6250e40b596975e81143dcea29703da67 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 4 Oct 2024 14:52:52 +0800 Subject: [PATCH 23/51] [feat] enrich code comments and scheduling-related docs --- doc/percpu_scheduler.md | 69 +++++++++++++++++++++++++++++--------- modules/axtask/src/task.rs | 5 +++ 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/doc/percpu_scheduler.md b/doc/percpu_scheduler.md index 507b016e3d..bd48447d6a 100644 --- a/doc/percpu_scheduler.md +++ b/doc/percpu_scheduler.md @@ -222,25 +222,22 @@ To prevent a task from being awakened by both methods simultaneously, we need to { debug!("{} unblocking", self.id_name()); - // If the owning (remote) CPU is still in the middle of schedule() with - // this task as prev, wait until it's done referencing the task. - // - // Pairs with the `clear_prev_task_on_cpu()`. - // - // This ensures that tasks getting woken will be fully ordered against - // their previous state and preserve Program Order. - if self.on_cpu() { - while self.on_cpu() { - // Wait for the task to finish its scheduling process. - core::hint::spin_loop(); - } - assert!(!self.on_cpu()) - } - // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. #[cfg(feature = "irq")] let _lock = self.unblock_lock.lock(); if self.is_blocked() { + // If the owning (remote) CPU is still in the middle of schedule() with + // this task as prev, wait until it's done referencing the task. + // + // Pairs with the `clear_prev_task_on_cpu()`. + // + // This ensures that tasks getting woken will be fully ordered against + // their previous state and preserve Program Order. + while self.on_cpu() { + // Wait for the task to finish its scheduling process. + core::hint::spin_loop(); + } + assert!(!self.on_cpu()); run_queue_push(); } } @@ -387,3 +384,45 @@ it is set as true for a task when yielding is about to happened, and cleared aft smp_cond_load_acquire(&p->on_cpu, !VAL); } ``` + +* `on_cpu` flag in axtask + + Inspired by Linux's `on_cpu` flag design, we adopted a simpler logic. + + The on_cpu flag of a task running on a CPU is set to `true`, and after a task yields the CPU, the next task clears the `on_cpu` flag of the previous task using the `clear_prev_task_on_cpu` method. + + This method requires the task structure to store a pointer to the `on_cpu` flag of the previous task: + ```Rust + next_task.set_prev_task_on_cpu_ptr(prev_task.on_cpu_mut_ptr()); + ``` + In the `unblock_task` method, if `the` on_cpu flag of the target task is found to be `true`, it indicates that the task has not completed its scheduling. + + We spin and wait for the task's `on_cpu` flag to become `false`, ensuring that the task being placed into the run queue has already vacated a CPU. + +## Potential Bugs in Per-CPU Scheduling and Solutions + +1. 
Task Stale State in Previous CPU + +* Bug: A task scheduled on a CPU may not have fully vacated the previous CPU, as it may not have completed saving its context (e.g., saving callee-saved registers). This can lead to a stale reference to the task on the previous CPU. If another core attempts to run this task before the context is fully saved, it may cause **read-after-write** consistency issues with the task's context. + +* Solution: Ensure the `on_cpu` flag is correctly used to track whether a task is still running on any CPU. A task must be fully removed from the previous CPU once its scheduling is finished. The `on_cpu` flag should be used to prevent prematurely scheduling the task on another CPU. The task should only be added to another CPU's run queue when its `on_cpu` flag has been set to `false`. + + Additionally, the next task on the CPU should clear the `on_cpu` flag of the previous task using methods like `clear_prev_task_on_cpu`, ensuring the previous task is no longer active before transitioning to the next task. This ensures that context consistency is maintained across CPU cores. + +2. Race Condition on Wait Queue and Timer Event Wakeups + +* Bug: If a task is blocked using methods like `wait_timeout` or `wait_timeout_until`, it is placed in both the wait queue and the timer list. +This leads to a potential issue where the task may be woken up by both a notification from another core and a timer event simultaneously. +Both attempts may call the `unblock_task` function, trying to insert the task into a run queue. +This can result in the task being added to the run queue multiple times. + +* Solution: Introduce an `unblock_lock` mechanism to prevent simultaneous wakeups from the wait queue and the timer event. +Ensure that only one wakeup path can succeed at a time. In the `unblock_task` function, call the `unblock_locked` method of axtask, which involves competing for the `unblock_lock`. Only the side that acquires the lock first can proceed to invoke the `run_queue_push` closure, setting the task's state to Ready and adding it to the designated run queue. The other side, which acquires the lock later, will find that the task's state is no longer Blocked and will return without taking any action. + +3. Race Condition on Timer Event Not Being Cleared in Time + +* Bug: When a task on core0 invokes methods like `wait_timeout` or `wait_timeout_until`, it is placed in both the wait queue and the timer list. If core1 notifies the wait queue and the task starts running on core2, the task should continue executing in the `wait_xxx` function on core2 and invoke `cancel_events` to remove itself from the wait queue and disable the timer event. However, if the task on core2 has not yet disabled the timer event, the timer event on core0 may still trigger and attempt to call `unblock_task`, causing an inconsistency. + +* Solution: In the `unblock_locked` method, if the task’s state is no longer `Blocked`, this unblock attempt will be ignored, preventing any erroneous state changes caused by the stale timer event. + +* Note: The polling of the `on_cpu` flag must be placed inside the `if self.is_blocked()` code block. Otherwise, the task's `on_cpu` flag could be set to true on another CPU, leading to inconsistent states or potential deadlocks. This ensures that only blocked tasks undergo the `on_cpu` check, preventing premature or redundant wakeups from other CPUs. 
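The "wake at most once" property described in items 2 and 3 above can be modeled in a few lines. The sketch below is a simplified, self-contained illustration (the state constants, field names and the `unblock` signature are stand-ins, not the actual axtask definitions): both the notify path and the timer path run the same lock-then-recheck sequence, so whichever waker loses the race finds the task no longer blocked and returns without enqueueing it a second time.

```Rust
use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
use std::sync::Mutex;

// Illustrative state encoding; not the real `TaskState` values.
const READY: u8 = 0;
const BLOCKED: u8 = 1;

// Stand-in for the task fields discussed above.
struct Task {
    state: AtomicU8,
    on_cpu: AtomicBool,
    unblock_lock: Mutex<()>,
}

impl Task {
    // Sketch of the single-wakeup path: take `unblock_lock`, re-check the
    // state, wait for the remote CPU to finish switching away from this
    // task, and only then hand it to a run queue.
    fn unblock(&self, push_to_run_queue: impl FnOnce()) {
        let _guard = self.unblock_lock.lock().unwrap();
        if self.state.load(Ordering::Acquire) == BLOCKED {
            // Wait until the previous CPU has finished schedule() with this
            // task as `prev` (pairs with clearing `on_cpu` after the switch).
            while self.on_cpu.load(Ordering::Acquire) {
                std::hint::spin_loop();
            }
            self.state.store(READY, Ordering::Release);
            push_to_run_queue();
        }
        // A later waker (timer vs. `notify()`) sees the task is no longer
        // BLOCKED and returns without enqueueing it again.
    }
}

fn main() {
    let task = Task {
        state: AtomicU8::new(BLOCKED),
        on_cpu: AtomicBool::new(false),
        unblock_lock: Mutex::new(()),
    };
    let mut wakeups = 0;
    task.unblock(|| wakeups += 1); // e.g. the `notify()` path wins the race
    task.unblock(|| wakeups += 1); // the stale timer path becomes a no-op
    assert_eq!(wakeups, 1);
}
```

Keeping the `on_cpu` poll inside the blocked-state branch mirrors the note above: only the waker that will actually enqueue the task needs to wait for the previous CPU to finish switching away from it.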
\ No newline at end of file diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 9c5e944a86..4e5df6ec70 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -335,6 +335,11 @@ impl TaskInner { // // This ensures that tasks getting woken will be fully ordered against // their previous state and preserve Program Order. + // + // Note: + // 1. This should be placed inside `if self.is_blocked() { ... }`, + // because the task may have been woken up by other cores. + // 2. This can be placed in the front of `switch_to()` while self.on_cpu() { // Wait for the task to finish its scheduling process. core::hint::spin_loop(); From 901575f9379dda1b40399709ef4e4e561de96b17 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Sun, 6 Oct 2024 10:56:32 +0800 Subject: [PATCH 24/51] [refactor] move percpu related docs to discussions --- README.md | 2 +- doc/README.md | 7 + doc/percpu_scheduler.md | 428 ---------------------------------------- 3 files changed, 8 insertions(+), 429 deletions(-) delete mode 100644 doc/percpu_scheduler.md diff --git a/README.md b/README.md index e0a1e52189..c936cb3ede 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ ArceOS was inspired a lot by [Unikraft](https://github.com/unikraft/unikraft). * [x] VirtIO net/blk/gpu drivers * [x] TCP/UDP net stack using [smoltcp](https://github.com/smoltcp-rs/smoltcp) * [x] Synchronization/Mutex -* [x] SMP scheduling with single run queue +* [x] SMP scheduling with [per-cpu run queue](https://github.com/arceos-org/arceos/discussions/181) * [x] File system * [ ] Compatible with Linux apps * [ ] Interrupt driven device I/O diff --git a/doc/README.md b/doc/README.md index b3f7d2d5b1..4029880b5a 100644 --- a/doc/README.md +++ b/doc/README.md @@ -38,3 +38,10 @@ See [arceos-apps](https://github.com/arceos-org/arceos-apps) for example applica Documentation of ArceOS [modules](../modules), [api](../api), and [ulib](../ulib) are generated by [`rustdoc`](https://doc.rust-lang.org/rustdoc/what-is-rustdoc.html) and hosted on GitHub Pages: * https://arceos-org.github.io/arceos/ + +## Discussions + +* [Rust std support](https://github.com/arceos-org/arceos/discussions/92) +* [ArceOS for ARM64](https://github.com/arceos-org/arceos/discussions/101) +* [ArceOS for RISCV Hardware](https://github.com/arceos-org/arceos/discussions/120) +* [Per-CPU scheduling](https://github.com/arceos-org/arceos/discussions/181) diff --git a/doc/percpu_scheduler.md b/doc/percpu_scheduler.md deleted file mode 100644 index bd48447d6a..0000000000 --- a/doc/percpu_scheduler.md +++ /dev/null @@ -1,428 +0,0 @@ -# About How to support percpu scheduler in ArceOS. - -## Background - -Orginally, ArceOS uses a rude global RunQueue, and scheduling operations like -task yielding, waiting on wait queue and notifying a blocked task are -all under the protection of a global SpinLockNoIrq hold by RunQueue. - -To support percpu scheduling, we must refactor the run queue structure, -as well as the locking mechanism in the current scheduling framework. 
- -## AxRunQueue and Scheduler crate - -For the design of the scheduler interface, we can reference the design used in Linux: - -```C -// kernel/sched/sched.h - -struct sched_class { - -#ifdef CONFIG_UCLAMP_TASK - int uclamp_enabled; -#endif - - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*yield_task) (struct rq *rq); - bool (*yield_to_task)(struct rq *rq, struct task_struct *p); - - void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); - - struct task_struct *(*pick_next_task)(struct rq *rq); - - void (*put_prev_task)(struct rq *rq, struct task_struct *p); - void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); - -#ifdef CONFIG_SMP - int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); - - struct task_struct * (*pick_task)(struct rq *rq); - - void (*migrate_task_rq)(struct task_struct *p, int new_cpu); - - void (*task_woken)(struct rq *this_rq, struct task_struct *task); - - void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); - - void (*rq_online)(struct rq *rq); - void (*rq_offline)(struct rq *rq); - - struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); -#endif - - void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); - void (*task_fork)(struct task_struct *p); - void (*task_dead)(struct task_struct *p); - - /* - * The switched_from() call is allowed to drop rq->lock, therefore we - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. - */ - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); - - unsigned int (*get_rr_interval)(struct rq *rq, - struct task_struct *task); - - void (*update_curr)(struct rq *rq); - -#ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_change_group)(struct task_struct *p); -#endif - -#ifdef CONFIG_SCHED_CORE - int (*task_is_throttled)(struct task_struct *p, int cpu); -#endif -}; -``` - -Current [`scheduler`](https://github.com/arceos-org/scheduler) crate used by ArceOS -provides a more fundamental scheduling method interface, which only includes -basic task operations and does not account for multiple run queues: - -```Rust -/// The base scheduler trait that all schedulers should implement. -/// -/// All tasks in the scheduler are considered runnable. If a task is go to -/// sleep, it should be removed from the scheduler. -pub trait BaseScheduler { - /// Type of scheduled entities. Often a task struct. - type SchedItem; - - /// Initializes the scheduler. - fn init(&mut self); - - /// Adds a task to the scheduler. - fn add_task(&mut self, task: Self::SchedItem); - - /// Removes a task by its reference from the scheduler. Returns the owned - /// removed task with ownership if it exists. - /// - /// # Safety - /// - /// The caller should ensure that the task is in the scheduler, otherwise - /// the behavior is undefined. - fn remove_task(&mut self, task: &Self::SchedItem) -> Option; - - /// Picks the next task to run, it will be removed from the scheduler. - /// Returns [`None`] if there is not runnable task. - fn pick_next_task(&mut self) -> Option; - - /// Puts the previous task back to the scheduler. 
The previous task is - /// usually placed at the end of the ready queue, making it less likely - /// to be re-scheduled. - /// - /// `preempt` indicates whether the previous task is preempted by the next - /// task. In this case, the previous task may be placed at the front of the - /// ready queue. - fn put_prev_task(&mut self, prev: Self::SchedItem, preempt: bool); - - /// Advances the scheduler state at each timer tick. Returns `true` if - /// re-scheduling is required. - /// - /// `current` is the current running task. - fn task_tick(&mut self, current: &Self::SchedItem) -> bool; - - /// Set priority for a task. - fn set_priority(&mut self, task: &Self::SchedItem, prio: isize) -> bool; -} -``` - -The current scheduler design focuses solely on the task states within its own ready queue. -The scheduler is held by AxRunQueue as a global static variable -and serves as a globally unique scheduler for all cores. - -```Rust -// modules/axtask/src/run_queue.rs - -// TODO: per-CPU -pub(crate) static RUN_QUEUE: LazyInit> = LazyInit::new(); - -pub(crate) struct AxRunQueue { - scheduler: Scheduler, -} -``` - -Referencing Linux's design, methods such as `select_task_rq` and those related to -load balancing should be provided by the scheduler itself. -However, to simplify the design and minimize modifications to the scheduler crate, -we continue to use ArceOS's original design, managing the scheduler with AxRunQueue. -We will change `AxRunQueue` to be a per-CPU variable instead of a globally unique instance -(as well as `EXITED_TASKS`, `WAIT_FOR_EXIT`, and `TIMER_LIST`). - -By doing this, the originally global unique SpinNoIrq of AxRunQueue needs to be distributed across each core. -We will refactor the locking mechanism and refine the granularity of the locks. - -## cpumask crate - -We introduce [cpumask](https://github.com/arceos-org/cpumask) crate for CPU affinity attribute for a task. - -## Lock, Irq and Preemption - -### AxRunQueue -The lock for AxRunQueue no longer exists. - -For the run queue, we have refined the locks to the ready queue within the scheduler, -meaning that only operations that require interaction with the ready queue, -such as picking the next task and pushing tasks, need to be locked. - -The current run queue for a core can be obtained through the `current_run_queue` method. -This process needs to be performed under the protection of `kernel_guard` to ensure the safety of preemption and interrupt states. - -```Rust -/// Returns a reference to the current run queue. -/// -/// ## Safety -/// -/// This function returns a static reference to the current run queue, which -/// is inherently unsafe. It assumes that the `RUN_QUEUE` has been properly -/// initialized and is not accessed concurrently in a way that could cause -/// data races or undefined behavior. -/// -/// ## Returns -/// -/// A static reference to the current run queue. -// #[inline(always)] -pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { - let irq_state = G::acquire(); - AxRunQueueRef { - inner: unsafe { RUN_QUEUE.current_ref_mut_raw() }, - state: irq_state, - _phantom: core::marker::PhantomData, - } -} -``` - -### WaitQueue -Operations on the wait queue are no longer protected by the large lock of AxRunQueue. - -We need to protect the wait queue using `SpinNoIrq` and distinguish it from operations related to the run queue: -* When waiting for a task, first insert it into the wait queue, then call the relevant methods of the run queue for task switching. 
-* When waking up a task, first remove it from the wait queue, then call the `select_run_queue` method to choose an appropriate run queue for insertion. - -### TimerList - -The TimerList is also designed to be per-CPU, used for recording and responding to specific clock times. -This allows us to eliminate the lock for TimerList itself. - -TimerList may be used in `wait_timeout_until`, where a task can simultaneously wait for either a notification or a timer event. -Therefore, a task may be placed in both TimerList and WaitQueue. - -To prevent a task from being awakened by both methods simultaneously, we need to apply an `unblock_lock` to the task, ensuring that the unblock operation for a task can **succeed only once**. - -```Rust - pub(crate) fn unblock_locked(&self, mut run_queue_push: F) - where - F: FnMut(), - { - debug!("{} unblocking", self.id_name()); - - // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. - #[cfg(feature = "irq")] - let _lock = self.unblock_lock.lock(); - if self.is_blocked() { - // If the owning (remote) CPU is still in the middle of schedule() with - // this task as prev, wait until it's done referencing the task. - // - // Pairs with the `clear_prev_task_on_cpu()`. - // - // This ensures that tasks getting woken will be fully ordered against - // their previous state and preserve Program Order. - while self.on_cpu() { - // Wait for the task to finish its scheduling process. - core::hint::spin_loop(); - } - assert!(!self.on_cpu()); - run_queue_push(); - } - } -``` - -## The `on_cpu` flag - - -When we reduce the lock granularity of the run queue and distinguish it from the wait queue locks, -we need to address a phenomenon: -when a task calls a `wait_xxx` method to wait on a specific wait queue, -it may not have been scheduled away from its current CPU before being woken up by another CPU's wait queue and running on that CPU. -The general flow may be as follows: - -``` -CPU 0 | CPU1 -wq.lock() -push A to wq -wq.unlock() - wq.lock() - pop A from wq - wq.unlock() -... ... -... save prev_ctx -------------------------------------------- -save prev_ctx(A) restore next_ctx(A) -------------------------------------------- -restore next_ctx -``` - -We have to use some stragety to ensure read-after-write consistency. - -* shenango and skyloft introduce a `stack_busy` flag in task struct to indicate whether the task has finishes switching stacks, -it is set as true for a task when yielding is about to happened, and cleared after its context has been saved to stack. - - ```ASM - .align 16 - .globl __context_switch - .type __context_switch, @function - __context_switch: - SAVE_CALLEE - SAVE_FXSTATE - - mov [rdi], rsp - - /* clear the stack busy flag */ - mov byte ptr [rdx], 0 - - mov rsp, rsi - - RESTORE_FXSTATE - RESTORE_CALLEE - #ifdef SKYLOFT_UINTR - /* enable preemption */ - stui - #endif - ret - ``` - - During scheduling process, when it tries to yield to a task with `stack_busy` true, it need to enter a spin loop: - - ```C - /* task must be scheduled atomically */ - if (unlikely(atomic_load_acq(&next->stack_busy))) { - /* wait until the scheduler finishes switching stacks */ - while (atomic_load_acq(&next->stack_busy)) cpu_relax(); - } - ``` - -* Linux use a `on_cpu` flag - - ```C - * p->on_cpu <- { 0, 1 }: - * - * is set by prepare_task() and cleared by finish_task() such that it will be - * set before p is scheduled-in and cleared after p is scheduled-out, both - * under rq->lock. 
Non-zero indicates the task is running on its CPU. - * - * [ The astute reader will observe that it is possible for two tasks on one - * CPU to have ->on_cpu = 1 at the same time. ] - ``` - - - During a scheduling event in Linux, the process begins by calling `prepare_task` to set the `on_cpu` flag of the next task to 1. - After invoking the `switch_to` method to switch to the next task, - the next task receives a return value pointing to the previous task's pointer, - allowing it to clear the `on_cpu` flag of the previous task. - Basic workflow: - ```C - // on prev task - context_switch - prepare_task_switch(rq, prev, next); - prepare_task(next); - WRITE_ONCE(next->on_cpu, 1); - switch_to(prev, next, prev); - ((last) = __switch_to_asm((prev), (next))); - // On next task - finish_task_switch(prev); - finish_task(prev); - smp_store_release(&prev->on_cpu, 0); - ``` - The TTWU_QUEUE feature in Linux allows the use of IPI to wake up a remote CPU within the `try_to_wake_up` function, - instead of waiting for the task on the remote CPU to complete its scheduling process. - This reduces the overhead of spinlocks and locks. - - ```C - // kernel/sched/core.c - - int try_to_wake_up() { - // ... - - /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, considering queueing p on the remote CPUs wake_list - * which potentially sends an IPI instead of spinning on p->on_cpu to - * let the waker make forward progress. This is safe because IRQs are - * disabled and the IPI will deliver after on_cpu is cleared. - * - * Ensure we load task_cpu(p) after p->on_cpu: - * - * set_task_cpu(p, cpu); - * STORE p->cpu = @cpu - * __schedule() (switch to task 'p') - * LOCK rq->lock - * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) - * STORE p->on_cpu = 1 LOAD p->cpu - * - * to ensure we observe the correct CPU on which the task is currently - * scheduling. - */ - if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) - break; - - /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, wait until it's done referencing the task. - * - * Pairs with the smp_store_release() in finish_task(). - * - * This ensures that tasks getting woken will be fully ordered against - * their previous state and preserve Program Order. - */ - smp_cond_load_acquire(&p->on_cpu, !VAL); - } - ``` - -* `on_cpu` flag in axtask - - Inspired by Linux's `on_cpu` flag design, we adopted a simpler logic. - - The on_cpu flag of a task running on a CPU is set to `true`, and after a task yields the CPU, the next task clears the `on_cpu` flag of the previous task using the `clear_prev_task_on_cpu` method. - - This method requires the task structure to store a pointer to the `on_cpu` flag of the previous task: - ```Rust - next_task.set_prev_task_on_cpu_ptr(prev_task.on_cpu_mut_ptr()); - ``` - In the `unblock_task` method, if `the` on_cpu flag of the target task is found to be `true`, it indicates that the task has not completed its scheduling. - - We spin and wait for the task's `on_cpu` flag to become `false`, ensuring that the task being placed into the run queue has already vacated a CPU. - -## Potential Bugs in Per-CPU Scheduling and Solutions - -1. Task Stale State in Previous CPU - -* Bug: A task scheduled on a CPU may not have fully vacated the previous CPU, as it may not have completed saving its context (e.g., saving callee-saved registers). This can lead to a stale reference to the task on the previous CPU. 
If another core attempts to run this task before the context is fully saved, it may cause **read-after-write** consistency issues with the task's context. - -* Solution: Ensure the `on_cpu` flag is correctly used to track whether a task is still running on any CPU. A task must be fully removed from the previous CPU once its scheduling is finished. The `on_cpu` flag should be used to prevent prematurely scheduling the task on another CPU. The task should only be added to another CPU's run queue when its `on_cpu` flag has been set to `false`. - - Additionally, the next task on the CPU should clear the `on_cpu` flag of the previous task using methods like `clear_prev_task_on_cpu`, ensuring the previous task is no longer active before transitioning to the next task. This ensures that context consistency is maintained across CPU cores. - -2. Race Condition on Wait Queue and Timer Event Wakeups - -* Bug: If a task is blocked using methods like `wait_timeout` or `wait_timeout_until`, it is placed in both the wait queue and the timer list. -This leads to a potential issue where the task may be woken up by both a notification from another core and a timer event simultaneously. -Both attempts may call the `unblock_task` function, trying to insert the task into a run queue. -This can result in the task being added to the run queue multiple times. - -* Solution: Introduce an `unblock_lock` mechanism to prevent simultaneous wakeups from the wait queue and the timer event. -Ensure that only one wakeup path can succeed at a time. In the `unblock_task` function, call the `unblock_locked` method of axtask, which involves competing for the `unblock_lock`. Only the side that acquires the lock first can proceed to invoke the `run_queue_push` closure, setting the task's state to Ready and adding it to the designated run queue. The other side, which acquires the lock later, will find that the task's state is no longer Blocked and will return without taking any action. - -3. Race Condition on Timer Event Not Being Cleared in Time - -* Bug: When a task on core0 invokes methods like `wait_timeout` or `wait_timeout_until`, it is placed in both the wait queue and the timer list. If core1 notifies the wait queue and the task starts running on core2, the task should continue executing in the `wait_xxx` function on core2 and invoke `cancel_events` to remove itself from the wait queue and disable the timer event. However, if the task on core2 has not yet disabled the timer event, the timer event on core0 may still trigger and attempt to call `unblock_task`, causing an inconsistency. - -* Solution: In the `unblock_locked` method, if the task’s state is no longer `Blocked`, this unblock attempt will be ignored, preventing any erroneous state changes caused by the stale timer event. - -* Note: The polling of the `on_cpu` flag must be placed inside the `if self.is_blocked()` code block. Otherwise, the task's `on_cpu` flag could be set to true on another CPU, leading to inconsistent states or potential deadlocks. This ensures that only blocked tasks undergo the `on_cpu` check, preventing premature or redundant wakeups from other CPUs. 
\ No newline at end of file From 0f69a5b34f67b89dcb4e99101b765b6bfe81f8e9 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 7 Oct 2024 01:07:36 +0800 Subject: [PATCH 25/51] [fix] first round fix of review suggestions --- modules/axtask/src/run_queue.rs | 49 +++++++++++++++++---------------- modules/axtask/src/task.rs | 6 ++++ 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 437df22a8e..65ce6dbd62 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -85,18 +85,17 @@ pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { // The modulo operation is safe here because `axconfig::SMP` is always greater than 1 with "smp" enabled. #[allow(clippy::modulo_one)] #[inline] -fn select_run_queue_index(cpumask: CpuMask) -> usize { +fn select_run_queue_index(cpumask: &CpuMask) -> usize { static RUN_QUEUE_INDEX: AtomicUsize = AtomicUsize::new(0); assert!(!cpumask.is_empty(), "No available CPU for task execution"); // Round-robin selection of the run queue index. loop { - let index = RUN_QUEUE_INDEX.load(Ordering::SeqCst) % axconfig::SMP; + let index = RUN_QUEUE_INDEX.fetch_add(1, Ordering::SeqCst) % axconfig::SMP; if cpumask.get(index) { return index; } - RUN_QUEUE_INDEX.fetch_add(1, Ordering::SeqCst); } } @@ -119,7 +118,6 @@ fn select_run_queue_index(cpumask: CpuMask) -> usize { /// #[inline] fn get_run_queue(index: usize) -> &'static mut AxRunQueue { - assert!(index < axconfig::SMP); unsafe { RUN_QUEUES[index].assume_init_mut() } } @@ -152,7 +150,7 @@ pub(crate) fn select_run_queue(task: AxTaskRef) -> AxRunQueueRef<' { let irq_state = G::acquire(); // When SMP is enabled, select the run queue based on the task's CPU affinity and load balance. - let index = select_run_queue_index(task.cpumask()); + let index = select_run_queue_index(&task.cpumask()); AxRunQueueRef { inner: get_run_queue(index), state: irq_state, @@ -161,7 +159,7 @@ pub(crate) fn select_run_queue(task: AxTaskRef) -> AxRunQueueRef<' } } -/// AxRunQueue represents a run queue for global system or a specific CPU. +/// `AxRunQueue` represents a run queue for global system or a specific CPU. pub(crate) struct AxRunQueue { /// The ID of the CPU this run queue is associated with. cpu_id: usize, @@ -183,21 +181,6 @@ impl<'a, G: BaseGuard> Drop for AxRunQueueRef<'a, G> { } } -impl AxRunQueue { - pub fn new(cpu_id: usize) -> Self { - let gc_task = TaskInner::new(gc_entry, "gc".into(), axconfig::TASK_STACK_SIZE).into_arc(); - // gc task shoule be pinned to the current CPU. - gc_task.set_cpumask(CpuMask::one_shot(cpu_id)); - - let mut scheduler = Scheduler::new(); - scheduler.add_task(gc_task); - Self { - cpu_id, - scheduler: SpinRaw::new(scheduler), - } - } -} - /// Core functions of run queue. impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { pub fn add_task(&mut self, task: AxTaskRef) { @@ -254,6 +237,8 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { } } + /// Exit the current task with the specified exit code. + /// This function will never return. pub fn exit_current(&mut self, exit_code: i32) -> ! { let curr = crate::current(); debug!("task exit: {}, exit_code={}", curr.id_name(), exit_code); @@ -326,6 +311,21 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { } impl AxRunQueue { + /// Create a new run queue for the specified CPU. + /// The run queue is initialized with a per-CPU gc task in its scheduler. 
+ pub fn new(cpu_id: usize) -> Self { + let gc_task = TaskInner::new(gc_entry, "gc".into(), axconfig::TASK_STACK_SIZE).into_arc(); + // gc task should be pinned to the current CPU. + gc_task.set_cpumask(CpuMask::one_shot(cpu_id)); + + let mut scheduler = Scheduler::new(); + scheduler.add_task(gc_task); + Self { + cpu_id, + scheduler: SpinRaw::new(scheduler), + } + } + /// Common reschedule subroutine. If `preempt`, keep current task's time /// slice, otherwise reset it. fn resched(&mut self, preempt: bool) { @@ -365,13 +365,14 @@ impl AxRunQueue { #[cfg(feature = "preempt")] next_task.set_preempt_pending(false); next_task.set_state(TaskState::Running); - // Claim the task as running, we do this before switching to it - // such that any running task will have this set. - next_task.set_on_cpu(true); if prev_task.ptr_eq(&next_task) { return; } + // Claim the task as running, we do this before switching to it + // such that any running task will have this set. + next_task.set_on_cpu(true); + unsafe { let prev_ctx_ptr = prev_task.ctx_mut_ptr(); let next_ctx_ptr = next_task.ctx_mut_ptr(); diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 4e5df6ec70..4771ffd751 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -325,6 +325,12 @@ impl TaskInner { debug!("{} unblocking", self.id_name()); // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. + // Note: + // Since a task can not exist in two wait queues at the same time, + // we do not need to worry about a task being unblocked from two different wait queues concurrently, + // This `unblock_lock` is ONLY used to protect the task from being unblocked from timer list and wait queue at the same time, + // because a task may exist in both timer list and wait queue due to `wait_timeout_xxx()` related functions, + // eventually, this lock is only need to be used with "irq" feature enabled. #[cfg(feature = "irq")] let _lock = self.unblock_lock.lock(); if self.is_blocked() { From 6c7f53351bb95d0fb9bb66e534ccbec887972f66 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 7 Oct 2024 11:19:15 +0800 Subject: [PATCH 26/51] [refactor] add block_current in run_queue.rs --- modules/axtask/src/run_queue.rs | 15 +++++++ modules/axtask/src/wait_queue.rs | 77 +++++++++++--------------------- 2 files changed, 40 insertions(+), 52 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 65ce6dbd62..c4549818f1 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -263,6 +263,21 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { unreachable!("task exited!"); } + /// Mark the state of current task as `Blocked`, set the `in_wait_queue` flag as true. + pub fn block_current(&mut self) { + let curr = crate::current(); + assert!(curr.is_running()); + assert!(!curr.is_idle()); + // we must not block current task with preemption disabled. + // Current expected preempt count is 2. + // 1 for `NoPreemptIrqSave`, 1 for wait queue's `SpinNoIrq`. 
+ #[cfg(feature = "preempt")] + assert!(curr.can_preempt(2)); + + curr.set_state(TaskState::Blocked); + curr.set_in_wait_queue(true); + } + pub fn blocked_resched(&mut self) { let curr = crate::current(); assert!(curr.is_blocked(), "task is not blocked, {:?}", curr.state()); diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index e17639b14b..a626ed3ac8 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -1,10 +1,10 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; -use kernel_guard::NoPreemptIrqSave; +use kernel_guard::{NoOp, NoPreemptIrqSave}; use kspin::SpinNoIrq; -use crate::{current_run_queue, select_run_queue, task::TaskState, AxTaskRef, CurrentTask}; +use crate::{current_run_queue, select_run_queue, AxTaskRef, CurrentTask}; /// A queue to store sleeping tasks. /// @@ -69,30 +69,17 @@ impl WaitQueue { } } - fn push_to_wait_queue(&self) { - let mut wq = self.queue.lock(); - let curr = crate::current(); - assert!(curr.is_running()); - assert!(!curr.is_idle()); - // we must not block current task with preemption disabled. - // Current expected preempt count is 2. - // 1 for `NoPreemptIrqSave`, 1 for wait queue's `SpinNoIrq`. - #[cfg(feature = "preempt")] - assert!(curr.can_preempt(2)); - - curr.set_state(TaskState::Blocked); - curr.set_in_wait_queue(true); - - debug!("{} push to wait queue", curr.id_name()); - - wq.push_back(curr.clone()); - } - /// Blocks the current task and put it into the wait queue, until other task /// notifies it. pub fn wait(&self) { let mut rq = current_run_queue::(); - self.push_to_wait_queue(); + let curr = crate::current(); + let mut wq = self.queue.lock(); + rq.block_current(); + wq.push_back(curr.clone()); + // Drop the lock of wq explictly. + drop(wq); + rq.blocked_resched(); self.cancel_events(crate::current(), false); } @@ -107,31 +94,20 @@ impl WaitQueue { F: Fn() -> bool, { let mut rq = current_run_queue::(); + let curr = crate::current(); loop { let mut wq = self.queue.lock(); if condition() { break; } - let curr = crate::current(); - assert!(curr.is_running()); - assert!(!curr.is_idle()); - - debug!("{} push to wait queue on wait_until", curr.id_name()); - - // we must not block current task with preemption disabled. - // Current expected preempt count is 2. - // 1 for `NoPreemptIrqSave`, 1 for wait queue's `SpinNoIrq`. - #[cfg(feature = "preempt")] - assert!(curr.can_preempt(2)); + rq.block_current(); wq.push_back(curr.clone()); - - curr.set_state(TaskState::Blocked); - curr.set_in_wait_queue(true); + // Drop the lock of wq explictly. drop(wq); rq.blocked_resched(); } - self.cancel_events(crate::current(), false); + self.cancel_events(curr, false); } /// Blocks the current task and put it into the wait queue, until other tasks @@ -148,7 +124,12 @@ impl WaitQueue { ); crate::timers::set_alarm_wakeup(deadline, curr.clone()); - self.push_to_wait_queue(); + let mut wq = self.queue.lock(); + rq.block_current(); + wq.push_back(curr.clone()); + // Drop the lock of wq explictly. + drop(wq); + rq.blocked_resched(); let timeout = curr.in_wait_queue(); // still in the wait queue, must have timed out @@ -186,18 +167,10 @@ impl WaitQueue { timeout = false; break; } - assert!(curr.is_running()); - assert!(!curr.is_idle()); - // we must not block current task with preemption disabled. - // Current expected preempt count is 2. - // 1 for `NoPreemptIrqSave`, 1 for wait queue's `SpinNoIrq`. 
- #[cfg(feature = "preempt")] - assert!(curr.can_preempt(2)); + rq.block_current(); wq.push_back(curr.clone()); - - curr.set_state(TaskState::Blocked); - curr.set_in_wait_queue(true); + // Drop the lock of wq explictly. drop(wq); rq.blocked_resched() @@ -215,7 +188,6 @@ impl WaitQueue { if let Some(task) = wq.pop_front() { task.set_in_wait_queue(false); unblock_one_task(task, resched); - drop(wq); true } else { false @@ -256,7 +228,6 @@ impl WaitQueue { // Mark task as not in wait queue. task.set_in_wait_queue(false); unblock_one_task(task, resched); - drop(wq); true } else { false @@ -264,7 +235,9 @@ impl WaitQueue { } } -pub(crate) fn unblock_one_task(task: AxTaskRef, resched: bool) { +fn unblock_one_task(task: AxTaskRef, resched: bool) { // Select run queue by the CPU set of the task. - select_run_queue::(task.clone()).unblock_task(task, resched) + // Use `NoOp` kernel guard here because the function is called with holding the + // lock of wait queue, where the irq and preemption are disabled. + select_run_queue::(task.clone()).unblock_task(task, resched) } From 868ac3e3621e35c2b3821fa91dda076bff5a6712 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 7 Oct 2024 15:10:23 +0800 Subject: [PATCH 27/51] [refactor] pass SpinNoIrqGuard to blocked_resched --- modules/axtask/src/run_queue.rs | 19 ++++++++++++----- modules/axtask/src/wait_queue.rs | 35 ++++++-------------------------- 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index c4549818f1..f1a7324c47 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -4,7 +4,7 @@ use core::mem::MaybeUninit; use core::sync::atomic::{AtomicUsize, Ordering}; use kernel_guard::BaseGuard; -use kspin::SpinRaw; +use kspin::{SpinNoIrqGuard, SpinRaw}; use lazyinit::LazyInit; use scheduler::BaseScheduler; @@ -263,8 +263,14 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { unreachable!("task exited!"); } + /// Block the current task, put current task into the wait queue and reschedule. /// Mark the state of current task as `Blocked`, set the `in_wait_queue` flag as true. - pub fn block_current(&mut self) { + /// Note: + /// 1. The caller must hold the lock of the wait queue. + /// 2. The caller must ensure that the current task is in the running state. + /// 3. The caller must ensure that the current task is not the idle task. + /// 4. The lock of the wait queue will be released explicitly after current task is pushed into it. + pub fn blocked_resched(&mut self, mut wq_locked: SpinNoIrqGuard>) { let curr = crate::current(); assert!(curr.is_running()); assert!(!curr.is_idle()); @@ -274,12 +280,15 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { #[cfg(feature = "preempt")] assert!(curr.can_preempt(2)); + // Mark the task as blocked, this has to be done before adding it to the wait queue + // while holding the lock of the wait queue. curr.set_state(TaskState::Blocked); curr.set_in_wait_queue(true); - } - pub fn blocked_resched(&mut self) { - let curr = crate::current(); + wq_locked.push_back(curr.clone()); + // Drop the lock of wait queue explictly. + drop(wq_locked); + assert!(curr.is_blocked(), "task is not blocked, {:?}", curr.state()); debug!("task block: {}", curr.id_name()); diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index a626ed3ac8..5e608abbf3 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -73,14 +73,7 @@ impl WaitQueue { /// notifies it. 
pub fn wait(&self) { let mut rq = current_run_queue::(); - let curr = crate::current(); - let mut wq = self.queue.lock(); - rq.block_current(); - wq.push_back(curr.clone()); - // Drop the lock of wq explictly. - drop(wq); - - rq.blocked_resched(); + rq.blocked_resched(self.queue.lock()); self.cancel_events(crate::current(), false); } @@ -96,16 +89,11 @@ impl WaitQueue { let mut rq = current_run_queue::(); let curr = crate::current(); loop { - let mut wq = self.queue.lock(); + let wq = self.queue.lock(); if condition() { break; } - rq.block_current(); - wq.push_back(curr.clone()); - // Drop the lock of wq explictly. - drop(wq); - - rq.blocked_resched(); + rq.blocked_resched(wq); } self.cancel_events(curr, false); } @@ -124,13 +112,7 @@ impl WaitQueue { ); crate::timers::set_alarm_wakeup(deadline, curr.clone()); - let mut wq = self.queue.lock(); - rq.block_current(); - wq.push_back(curr.clone()); - // Drop the lock of wq explictly. - drop(wq); - - rq.blocked_resched(); + rq.blocked_resched(self.queue.lock()); let timeout = curr.in_wait_queue(); // still in the wait queue, must have timed out @@ -162,18 +144,13 @@ impl WaitQueue { let mut timeout = true; while axhal::time::wall_time() < deadline { - let mut wq = self.queue.lock(); + let wq = self.queue.lock(); if condition() { timeout = false; break; } - rq.block_current(); - wq.push_back(curr.clone()); - // Drop the lock of wq explictly. - drop(wq); - - rq.blocked_resched() + rq.blocked_resched(wq); } self.cancel_events(curr, !timeout); timeout From d21f7800328112ddcb00a559aff0d8335b9a2cbc Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 7 Oct 2024 23:21:47 +0800 Subject: [PATCH 28/51] [feat] add WaitQueueGuard --- modules/axtask/src/run_queue.rs | 13 +++++++------ modules/axtask/src/wait_queue.rs | 4 +++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index f1a7324c47..e373448dda 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -4,13 +4,14 @@ use core::mem::MaybeUninit; use core::sync::atomic::{AtomicUsize, Ordering}; use kernel_guard::BaseGuard; -use kspin::{SpinNoIrqGuard, SpinRaw}; +use kspin::SpinRaw; use lazyinit::LazyInit; use scheduler::BaseScheduler; use axhal::cpu::this_cpu_id; use crate::task::{CurrentTask, TaskState}; +use crate::wait_queue::WaitQueueGuard; use crate::{AxTaskRef, CpuMask, Scheduler, TaskInner, WaitQueue}; macro_rules! percpu_static { @@ -85,7 +86,7 @@ pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { // The modulo operation is safe here because `axconfig::SMP` is always greater than 1 with "smp" enabled. #[allow(clippy::modulo_one)] #[inline] -fn select_run_queue_index(cpumask: &CpuMask) -> usize { +fn select_run_queue_index(cpumask: CpuMask) -> usize { static RUN_QUEUE_INDEX: AtomicUsize = AtomicUsize::new(0); assert!(!cpumask.is_empty(), "No available CPU for task execution"); @@ -150,7 +151,7 @@ pub(crate) fn select_run_queue(task: AxTaskRef) -> AxRunQueueRef<' { let irq_state = G::acquire(); // When SMP is enabled, select the run queue based on the task's CPU affinity and load balance. - let index = select_run_queue_index(&task.cpumask()); + let index = select_run_queue_index(task.cpumask()); AxRunQueueRef { inner: get_run_queue(index), state: irq_state, @@ -270,7 +271,7 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { /// 2. The caller must ensure that the current task is in the running state. /// 3. 
The caller must ensure that the current task is not the idle task. /// 4. The lock of the wait queue will be released explicitly after current task is pushed into it. - pub fn blocked_resched(&mut self, mut wq_locked: SpinNoIrqGuard>) { + pub fn blocked_resched(&mut self, mut wq_guard: WaitQueueGuard) { let curr = crate::current(); assert!(curr.is_running()); assert!(!curr.is_idle()); @@ -285,9 +286,9 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { curr.set_state(TaskState::Blocked); curr.set_in_wait_queue(true); - wq_locked.push_back(curr.clone()); + wq_guard.push_back(curr.clone()); // Drop the lock of wait queue explictly. - drop(wq_locked); + drop(wq_guard); assert!(curr.is_blocked(), "task is not blocked, {:?}", curr.state()); diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index 5e608abbf3..bf9b92bad8 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -2,7 +2,7 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; use kernel_guard::{NoOp, NoPreemptIrqSave}; -use kspin::SpinNoIrq; +use kspin::{SpinNoIrq, SpinNoIrqGuard}; use crate::{current_run_queue, select_run_queue, AxTaskRef, CurrentTask}; @@ -32,6 +32,8 @@ pub struct WaitQueue { queue: SpinNoIrq>, } +pub(crate) type WaitQueueGuard<'a> = SpinNoIrqGuard<'a, VecDeque>; + impl WaitQueue { /// Creates an empty wait queue. pub const fn new() -> Self { From 8b1fe9b0f59e2671b14b6a34a32bb67b27ac2372 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 7 Oct 2024 23:37:17 +0800 Subject: [PATCH 29/51] [refactor] merge unblock_task, add TaskUnblockGuard --- modules/axtask/src/run_queue.rs | 26 ++++++++++++++-- modules/axtask/src/task.rs | 53 +++++++++++---------------------- 2 files changed, 41 insertions(+), 38 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index e373448dda..ef38328307 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -301,7 +301,29 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { /// it will enter a loop until the task finishes its scheduling process, /// see `unblock_locked` for details. pub fn unblock_task(&mut self, task: AxTaskRef, resched: bool) { - task.clone().unblock_locked(|| { + // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. + #[cfg(feature = "irq")] + let _unblock_lock = task.get_unblock_lock(); + + if task.is_blocked() { + // If the owning (remote) CPU is still in the middle of schedule() with + // this task as prev, wait until it's done referencing the task. + // + // Pairs with the `clear_prev_task_on_cpu()`. + // + // This ensures that tasks getting woken will be fully ordered against + // their previous state and preserve Program Order. + // + // Note: + // 1. This should be placed inside `if self.is_blocked() { ... }`, + // because the task may have been woken up by other cores. + // 2. This can be placed in the front of `switch_to()` + while task.on_cpu() { + // Wait for the task to finish its scheduling process. 
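+                // `on_cpu` is cleared by the next task via `clear_prev_task_on_cpu()`
+                // once the context switch on the remote CPU completes.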
+ core::hint::spin_loop(); + } + assert!(!task.on_cpu()); + let cpu_id = self.inner.cpu_id; debug!("task unblock: {} on run_queue {}", task.id_name(), cpu_id); task.set_state(TaskState::Ready); @@ -316,7 +338,7 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { #[cfg(feature = "preempt")] crate::current().set_preempt_pending(true); } - }) + } } #[cfg(feature = "irq")] diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 4771ffd751..95de358882 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -5,7 +5,7 @@ use core::sync::atomic::AtomicUsize; use core::sync::atomic::{AtomicBool, AtomicI32, AtomicPtr, AtomicU64, AtomicU8, Ordering}; use core::{alloc::Layout, cell::UnsafeCell, fmt, ptr::NonNull}; -use kspin::{SpinNoIrq, SpinRaw}; +use kspin::{SpinNoIrq, SpinRaw, SpinRawGuard}; use memory_addr::{align_up_4k, VirtAddr}; use axhal::arch::TaskContext; @@ -20,6 +20,11 @@ use crate::{AxTask, AxTaskRef, CpuMask, WaitQueue}; #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub struct TaskId(u64); +/// The guard of task's unblock lock. +/// When irq is enabled, use a `SpinRaw<()>` called `unblock_lock` +/// to protect the task from being unblocked by timer and `notify()` at the same time. +pub(crate) type TaskUnblockGuard<'a> = SpinRawGuard<'a, ()>; + /// The possible states of a task. #[repr(u8)] #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -318,41 +323,17 @@ impl TaskInner { self.timer_ticket_id.store(0, Ordering::Release); } - pub(crate) fn unblock_locked(&self, mut run_queue_push: F) - where - F: FnMut(), - { - debug!("{} unblocking", self.id_name()); - - // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. - // Note: - // Since a task can not exist in two wait queues at the same time, - // we do not need to worry about a task being unblocked from two different wait queues concurrently, - // This `unblock_lock` is ONLY used to protect the task from being unblocked from timer list and wait queue at the same time, - // because a task may exist in both timer list and wait queue due to `wait_timeout_xxx()` related functions, - // eventually, this lock is only need to be used with "irq" feature enabled. - #[cfg(feature = "irq")] - let _lock = self.unblock_lock.lock(); - if self.is_blocked() { - // If the owning (remote) CPU is still in the middle of schedule() with - // this task as prev, wait until it's done referencing the task. - // - // Pairs with the `clear_prev_task_on_cpu()`. - // - // This ensures that tasks getting woken will be fully ordered against - // their previous state and preserve Program Order. - // - // Note: - // 1. This should be placed inside `if self.is_blocked() { ... }`, - // because the task may have been woken up by other cores. - // 2. This can be placed in the front of `switch_to()` - while self.on_cpu() { - // Wait for the task to finish its scheduling process. - core::hint::spin_loop(); - } - assert!(!self.on_cpu()); - run_queue_push(); - } + /// Get the guard of task's unblock lock. + /// When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. 
+ /// Note: + /// Since a task can not exist in two wait queues at the same time, + /// we do not need to worry about a task being unblocked from two different wait queues concurrently, + /// This `unblock_lock` is ONLY used to protect the task from being unblocked from timer list and wait queue at the same time, + /// because a task may exist in both timer list and wait queue due to `wait_timeout_xxx()` related functions, + /// eventually, this lock is only need to be used with "irq" feature enabled. + #[cfg(feature = "irq")] + pub(crate) fn get_unblock_lock(&self) -> TaskUnblockGuard { + self.unblock_lock.lock() } #[inline] From d04dfd6e26f48c4fd4baafadff9c132826493de9 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Tue, 8 Oct 2024 19:17:49 +0800 Subject: [PATCH 30/51] [fix] second round fix of review suggestions --- api/axfeat/Cargo.toml | 2 +- modules/axtask/src/api.rs | 2 +- modules/axtask/src/run_queue.rs | 2 +- modules/axtask/src/task.rs | 34 +++++++++++++++++--------------- modules/axtask/src/timers.rs | 2 +- modules/axtask/src/wait_queue.rs | 2 +- 6 files changed, 23 insertions(+), 21 deletions(-) diff --git a/api/axfeat/Cargo.toml b/api/axfeat/Cargo.toml index 17121025f8..5e9d983d34 100644 --- a/api/axfeat/Cargo.toml +++ b/api/axfeat/Cargo.toml @@ -13,7 +13,7 @@ documentation = "https://arceos-org.github.io/arceos/axfeat/index.html" default = [] # Multicore -smp = ["axhal/smp", "axruntime/smp", "axtask/smp", "kspin/smp"] +smp = ["axhal/smp", "axruntime/smp", "axtask?/smp"] # Floating point/SIMD fp_simd = ["axhal/fp_simd"] diff --git a/modules/axtask/src/api.rs b/modules/axtask/src/api.rs index 3166505962..0597ba8acf 100644 --- a/modules/axtask/src/api.rs +++ b/modules/axtask/src/api.rs @@ -101,7 +101,7 @@ pub fn on_timer_tick() { /// Adds the given task to the run queue, returns the task reference. pub fn spawn_task(task: TaskInner) -> AxTaskRef { let task_ref = task.into_arc(); - select_run_queue::(task_ref.clone()).add_task(task_ref.clone()); + select_run_queue::(&task_ref).add_task(task_ref.clone()); task_ref } diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index ef38328307..2115e995f4 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -141,7 +141,7 @@ fn get_run_queue(index: usize) -> &'static mut AxRunQueue { /// 2. Use a more generic load balancing algorithm that can be customized or replaced. /// #[inline] -pub(crate) fn select_run_queue(task: AxTaskRef) -> AxRunQueueRef<'static, G> { +pub(crate) fn select_run_queue(task: &AxTaskRef) -> AxRunQueueRef<'static, G> { #[cfg(not(feature = "smp"))] { // When SMP is disabled, all tasks are scheduled on the same global run queue. diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 95de358882..c89107c227 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -1,10 +1,11 @@ use alloc::{boxed::Box, string::String, sync::Arc}; use core::ops::Deref; -#[cfg(any(feature = "preempt", feature = "irq"))] -use core::sync::atomic::AtomicUsize; use core::sync::atomic::{AtomicBool, AtomicI32, AtomicPtr, AtomicU64, AtomicU8, Ordering}; use core::{alloc::Layout, cell::UnsafeCell, fmt, ptr::NonNull}; +#[cfg(feature = "preempt")] +use core::sync::atomic::AtomicUsize; + use kspin::{SpinNoIrq, SpinRaw, SpinRawGuard}; use memory_addr::{align_up_4k, VirtAddr}; @@ -55,15 +56,16 @@ pub struct TaskInner { /// Mark whether the task is in the wait queue. in_wait_queue: AtomicBool, - /// A ticket ID used to identify the timer event. 
- /// Incremented by 1 each time the timer event is triggered or expired. - #[cfg(feature = "irq")] - timer_ticket_id: AtomicU64, /// Used to indicate whether the task is running on a CPU. on_cpu: AtomicBool, prev_task_on_cpu_ptr: AtomicPtr, + /// A ticket ID used to identify the timer event. + /// Set by `set_timer_ticket()` when creating a timer event in `set_alarm_wakeup()`, + /// expired by setting it as zero in `timer_ticket_expired()`, which is called by `cancel_events()`. + #[cfg(feature = "irq")] + timer_ticket_id: AtomicU64, /// Used to protect the task from being unblocked by timer and `notify()` at the same time. /// It is used in `unblock_task()`, which is called by wait queue's `notify()` and timer's callback. /// Since preemption and irq are both disabled during `unblock_task()`, we can simply use a raw spin lock here. @@ -397,6 +399,16 @@ impl TaskInner { self.on_cpu.as_ptr() } + /// Returns whether the task is running on a CPU. + /// + /// It is used to protect the task from being moved to a different run queue + /// while it has not finished its scheduling process. + /// The `on_cpu field is set to `true` when the task is preparing to run on a CPU, + /// and it is set to `false` when the task has finished its scheduling process in `clear_prev_task_on_cpu()`. + pub fn on_cpu(&self) -> bool { + self.on_cpu.load(Ordering::Acquire) + } + /// Sets whether the task is running on a CPU. pub fn set_on_cpu(&self, on_cpu: bool) { self.on_cpu.store(on_cpu, Ordering::Release) @@ -425,16 +437,6 @@ impl TaskInner { .store(false, Ordering::Release); } - /// Returns whether the task is running on a CPU. - /// - /// It is used to protect the task from being moved to a different run queue - /// while it has not finished its scheduling process. - /// The `on_cpu field is set to `true` when the task is preparing to run on a CPU, - /// and it is set to `false` when the task has finished its scheduling process in `clear_prev_task_on_cpu()`. - pub fn on_cpu(&self) -> bool { - self.on_cpu.load(Ordering::Acquire) - } - /// Returns the top address of the kernel stack. #[inline] pub const fn kernel_stack_top(&self) -> Option { diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs index 1afd61cbd5..1a6173b9b9 100644 --- a/modules/axtask/src/timers.rs +++ b/modules/axtask/src/timers.rs @@ -31,7 +31,7 @@ impl TimerEvent for TaskWakeupEvent { } // Timer ticket match. - select_run_queue::(self.task.clone()).unblock_task(self.task, true) + select_run_queue::(&self.task).unblock_task(self.task, true) } } diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index bf9b92bad8..d79b5d2190 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -218,5 +218,5 @@ fn unblock_one_task(task: AxTaskRef, resched: bool) { // Select run queue by the CPU set of the task. // Use `NoOp` kernel guard here because the function is called with holding the // lock of wait queue, where the irq and preemption are disabled. 
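    // (The wait queue itself is a `SpinNoIrq` lock, so holding its guard already keeps IRQs and preemption off.)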
- select_run_queue::(task.clone()).unblock_task(task, resched) + select_run_queue::(&task).unblock_task(task, resched) } From 80a2c94b27e9228fd72847fce1c87625a9f9c073 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Tue, 8 Oct 2024 20:19:29 +0800 Subject: [PATCH 31/51] [fix] third round fix of review suggestions --- Makefile | 7 ++++--- api/axfeat/Cargo.toml | 2 +- modules/axtask/src/task.rs | 5 +++-- modules/axtask/src/wait_queue.rs | 23 +++++++++++------------ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index 3d0dd587b0..01f2652131 100644 --- a/Makefile +++ b/Makefile @@ -174,9 +174,10 @@ run: build justrun justrun: $(call run_qemu) -debug: build - $(call run_qemu_debug) & - sleep 1 +debug: + $(call run_qemu_debug) + +gdb: $(GDB) $(OUT_ELF) \ -ex 'target remote localhost:1234' \ -ex 'b rust_entry' \ diff --git a/api/axfeat/Cargo.toml b/api/axfeat/Cargo.toml index 5e9d983d34..412fdb8027 100644 --- a/api/axfeat/Cargo.toml +++ b/api/axfeat/Cargo.toml @@ -13,7 +13,7 @@ documentation = "https://arceos-org.github.io/arceos/axfeat/index.html" default = [] # Multicore -smp = ["axhal/smp", "axruntime/smp", "axtask?/smp"] +smp = ["axhal/smp", "axruntime/smp", "axtask?/smp", "kspin/smp"] # Floating point/SIMD fp_simd = ["axhal/fp_simd"] diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index c89107c227..97d5f4291e 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -10,7 +10,6 @@ use kspin::{SpinNoIrq, SpinRaw, SpinRawGuard}; use memory_addr::{align_up_4k, VirtAddr}; use axhal::arch::TaskContext; -use axhal::cpu::this_cpu_id; #[cfg(feature = "tls")] use axhal::tls::TlsArea; @@ -234,7 +233,8 @@ impl TaskInner { pub(crate) fn new_init(name: String) -> Self { let mut t = Self::new_common(TaskId::new(), name); t.is_init = true; - t.set_cpumask(CpuMask::one_shot(this_cpu_id())); + // Init task can be scheduled on all CPUs. + t.set_cpumask(CpuMask::full()); if t.name == "idle" { t.is_idle = true; } @@ -285,6 +285,7 @@ impl TaskInner { *self.cpumask.lock() } + #[inline] pub(crate) fn set_cpumask(&self, cpumask: CpuMask) { *self.cpumask.lock() = cpumask } diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index d79b5d2190..aa24c64c9c 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -1,7 +1,7 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; -use kernel_guard::{NoOp, NoPreemptIrqSave}; +use kernel_guard::{NoOp, NoPreempt, NoPreemptIrqSave}; use kspin::{SpinNoIrq, SpinNoIrqGuard}; use crate::{current_run_queue, select_run_queue, AxTaskRef, CurrentTask}; @@ -65,9 +65,11 @@ impl WaitQueue { #[cfg(feature = "irq")] if from_timer_list { curr.timer_ticket_expired(); - // TODO: - // this task is still not removed from timer list of target CPU, - // which may cause some redundant timer events. + // Note: + // this task is still not removed from timer list of target CPU, + // which may cause some redundant timer events because it still needs to + // go through the process of expiring an event from the timer list and invoking the callback. + // (it can be considered a lazy-removal strategy, it will be ignored when it is about to take effect.) 
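+            //   (when the stale event finally fires, its ticket ID no longer matches the task's current one,
+            //   so the timer callback in `timers.rs` simply ignores it.)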
} } @@ -88,7 +90,7 @@ impl WaitQueue { where F: Fn() -> bool, { - let mut rq = current_run_queue::(); + let mut rq = current_run_queue::(); let curr = crate::current(); loop { let wq = self.queue.lock(); @@ -165,7 +167,6 @@ impl WaitQueue { pub fn notify_one(&self, resched: bool) -> bool { let mut wq = self.queue.lock(); if let Some(task) = wq.pop_front() { - task.set_in_wait_queue(false); unblock_one_task(task, resched); true } else { @@ -181,7 +182,6 @@ impl WaitQueue { loop { let mut wq = self.queue.lock(); if let Some(task) = wq.pop_front() { - task.set_in_wait_queue(false); unblock_one_task(task, resched); } else { break; @@ -196,16 +196,13 @@ impl WaitQueue { /// preemption is enabled. pub fn notify_task(&mut self, resched: bool, task: &AxTaskRef) -> bool { let mut wq = self.queue.lock(); - let task_to_be_notify = { + if let Some(task) = { if let Some(index) = wq.iter().position(|t| Arc::ptr_eq(t, task)) { wq.remove(index) } else { None } - }; - if let Some(task) = task_to_be_notify { - // Mark task as not in wait queue. - task.set_in_wait_queue(false); + } { unblock_one_task(task, resched); true } else { @@ -215,6 +212,8 @@ impl WaitQueue { } fn unblock_one_task(task: AxTaskRef, resched: bool) { + // Mark task as not in wait queue. + task.set_in_wait_queue(false); // Select run queue by the CPU set of the task. // Use `NoOp` kernel guard here because the function is called with holding the // lock of wait queue, where the irq and preemption are disabled. From 0d629a41db100b0720acb871239c42c4f25e30da Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Tue, 8 Oct 2024 22:31:07 +0800 Subject: [PATCH 32/51] [refactor] use weak reference for prev task to manipulate on_cpu flag --- modules/axtask/src/run_queue.rs | 4 +-- modules/axtask/src/task.rs | 43 ++++++++++++++++++++------------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 2115e995f4..5a74bfc0c7 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -424,8 +424,8 @@ impl AxRunQueue { let prev_ctx_ptr = prev_task.ctx_mut_ptr(); let next_ctx_ptr = next_task.ctx_mut_ptr(); - // Store the pointer of `on_cpu` flag of **prev_task** in **next_task**'s struct. - next_task.set_prev_task_on_cpu_ptr(prev_task.on_cpu_mut_ptr()); + // Store the weak pointer of **prev_task** in **next_task**'s struct. + next_task.set_prev_task(prev_task.weak()); // The strong reference count of `prev_task` will be decremented by 1, // but won't be dropped until `gc_entry()` is called. diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 97d5f4291e..f31ccc7e8d 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -1,6 +1,7 @@ +use alloc::sync::Weak; use alloc::{boxed::Box, string::String, sync::Arc}; use core::ops::Deref; -use core::sync::atomic::{AtomicBool, AtomicI32, AtomicPtr, AtomicU64, AtomicU8, Ordering}; +use core::sync::atomic::{AtomicBool, AtomicI32, AtomicU64, AtomicU8, Ordering}; use core::{alloc::Layout, cell::UnsafeCell, fmt, ptr::NonNull}; #[cfg(feature = "preempt")] @@ -58,7 +59,8 @@ pub struct TaskInner { /// Used to indicate whether the task is running on a CPU. on_cpu: AtomicBool, - prev_task_on_cpu_ptr: AtomicPtr, + /// A weak reference to the previous task running on this CPU. + prev_task: UnsafeCell>, /// A ticket ID used to identify the timer event. 
/// Set by `set_timer_ticket()` when creating a timer event in `set_alarm_wakeup()`, @@ -205,7 +207,7 @@ impl TaskInner { #[cfg(feature = "irq")] timer_ticket_id: AtomicU64::new(0), on_cpu: AtomicBool::new(false), - prev_task_on_cpu_ptr: AtomicPtr::new(core::ptr::null_mut()), + prev_task: UnsafeCell::new(Weak::default()), #[cfg(feature = "irq")] unblock_lock: SpinRaw::new(()), #[cfg(feature = "preempt")] @@ -394,12 +396,6 @@ impl TaskInner { self.ctx.get_mut() } - /// Returns the raw pointer to the `on_cpu` field. - #[inline] - pub(crate) const fn on_cpu_mut_ptr(&self) -> *mut bool { - self.on_cpu.as_ptr() - } - /// Returns whether the task is running on a CPU. /// /// It is used to protect the task from being moved to a different run queue @@ -415,15 +411,19 @@ impl TaskInner { self.on_cpu.store(on_cpu, Ordering::Release) } - /// Sets `prev_task_on_cpu_ptr` to the given pointer provided by previous task running on this CPU. - pub fn set_prev_task_on_cpu_ptr(&self, prev_task_on_cpu_ptr: *mut bool) { - self.prev_task_on_cpu_ptr - .store(prev_task_on_cpu_ptr, Ordering::Release) + /// Stores a weak reference to the previous task running on this CPU. + /// + /// ## Safety + /// This function is only called by current task in `switch_to`. + pub unsafe fn set_prev_task(&self, prev_task: Weak) { + *self.prev_task.get() = prev_task; } /// Clears the `on_cpu` field of previous task running on this CPU. /// It is called by the current task before running. - /// The pointer is provided by previous task running on this CPU through `set_prev_task_on_cpu_ptr()`. + /// The weak reference of previous task running on this CPU is set through `set_prev_task()`. + /// + /// Panic if the pointer is invalid or the previous task is dropped. /// /// ## Note /// This must be the very last reference to @_prev_task from this CPU. @@ -431,11 +431,16 @@ impl TaskInner { /// We must ensure this doesn't happen until the switch is completely finished. /// /// ## Safety - /// The caller must ensure that the pointer is valid and points to a boolean value, which is + /// The caller must ensure that the weak reference to the prev task is valid, which is /// done by the previous task running on this CPU through `set_prev_task_on_cpu_ptr()`. pub unsafe fn clear_prev_task_on_cpu(&self) { - AtomicBool::from_ptr(self.prev_task_on_cpu_ptr.load(Ordering::Acquire)) - .store(false, Ordering::Release); + self.prev_task + .get() + .as_ref() + .expect("Invalid prev_task pointer") + .upgrade() + .expect("prev_task is dropped") + .set_on_cpu(false); } /// Returns the top address of the kernel stack. @@ -519,6 +524,10 @@ impl CurrentTask { self.0.deref().clone() } + pub(crate) fn weak(&self) -> Weak { + Arc::downgrade(&self.0) + } + pub(crate) fn ptr_eq(&self, other: &AxTaskRef) -> bool { Arc::ptr_eq(&self.0, other) } From c3e665c96cc9f096e15d89f65fd62ed4d32bd0e9 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Tue, 8 Oct 2024 22:40:46 +0800 Subject: [PATCH 33/51] [feat] enable on_cpu flag only with smp feature enabled --- modules/axtask/src/run_queue.rs | 2 ++ modules/axtask/src/task.rs | 6 ++++++ modules/axtask/src/wait_queue.rs | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 5a74bfc0c7..81890e85c8 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -318,10 +318,12 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { // 1. This should be placed inside `if self.is_blocked() { ... 
}`, // because the task may have been woken up by other cores. // 2. This can be placed in the front of `switch_to()` + #[cfg(feature = "smp")] while task.on_cpu() { // Wait for the task to finish its scheduling process. core::hint::spin_loop(); } + #[cfg(feature = "smp")] assert!(!task.on_cpu()); let cpu_id = self.inner.cpu_id; diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index f31ccc7e8d..503dda5a9a 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -58,8 +58,10 @@ pub struct TaskInner { in_wait_queue: AtomicBool, /// Used to indicate whether the task is running on a CPU. + #[cfg(feature = "smp")] on_cpu: AtomicBool, /// A weak reference to the previous task running on this CPU. + #[cfg(feature = "smp")] prev_task: UnsafeCell>, /// A ticket ID used to identify the timer event. @@ -402,11 +404,13 @@ impl TaskInner { /// while it has not finished its scheduling process. /// The `on_cpu field is set to `true` when the task is preparing to run on a CPU, /// and it is set to `false` when the task has finished its scheduling process in `clear_prev_task_on_cpu()`. + #[cfg(feature = "smp")] pub fn on_cpu(&self) -> bool { self.on_cpu.load(Ordering::Acquire) } /// Sets whether the task is running on a CPU. + #[cfg(feature = "smp")] pub fn set_on_cpu(&self, on_cpu: bool) { self.on_cpu.store(on_cpu, Ordering::Release) } @@ -415,6 +419,7 @@ impl TaskInner { /// /// ## Safety /// This function is only called by current task in `switch_to`. + #[cfg(feature = "smp")] pub unsafe fn set_prev_task(&self, prev_task: Weak) { *self.prev_task.get() = prev_task; } @@ -433,6 +438,7 @@ impl TaskInner { /// ## Safety /// The caller must ensure that the weak reference to the prev task is valid, which is /// done by the previous task running on this CPU through `set_prev_task_on_cpu_ptr()`. + #[cfg(feature = "smp")] pub unsafe fn clear_prev_task_on_cpu(&self) { self.prev_task .get() diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index aa24c64c9c..0cafcc2820 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -90,9 +90,9 @@ impl WaitQueue { where F: Fn() -> bool, { - let mut rq = current_run_queue::(); let curr = crate::current(); loop { + let mut rq = current_run_queue::(); let wq = self.queue.lock(); if condition() { break; From 02a8f9a815ac1f0cce1758304858402b69c1fcbb Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Tue, 8 Oct 2024 22:49:23 +0800 Subject: [PATCH 34/51] [fix] enable on_cpu flag only with smp feature enabled --- modules/axtask/src/run_queue.rs | 9 ++++++--- modules/axtask/src/task.rs | 16 +++++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 81890e85c8..c81c4ee140 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -1,6 +1,7 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; use core::mem::MaybeUninit; +#[cfg(feature = "smp")] use core::sync::atomic::{AtomicUsize, Ordering}; use kernel_guard::BaseGuard; @@ -117,6 +118,7 @@ fn select_run_queue_index(cpumask: CpuMask) -> usize { /// /// This function will panic if the index is out of bounds. 
/// +#[cfg(feature = "smp")] #[inline] fn get_run_queue(index: usize) -> &'static mut AxRunQueue { unsafe { RUN_QUEUES[index].assume_init_mut() } @@ -420,6 +422,7 @@ impl AxRunQueue { // Claim the task as running, we do this before switching to it // such that any running task will have this set. + #[cfg(feature = "smp")] next_task.set_on_cpu(true); unsafe { @@ -427,7 +430,8 @@ impl AxRunQueue { let next_ctx_ptr = next_task.ctx_mut_ptr(); // Store the weak pointer of **prev_task** in **next_task**'s struct. - next_task.set_prev_task(prev_task.weak()); + #[cfg(feature = "smp")] + next_task.set_prev_task(prev_task.clone()); // The strong reference count of `prev_task` will be decremented by 1, // but won't be dropped until `gc_entry()` is called. @@ -440,6 +444,7 @@ impl AxRunQueue { // Current it's **next_task** running on this CPU, clear the `prev_task`'s `on_cpu` field // to indicate that it has finished its scheduling process and no longer running on this CPU. + #[cfg(feature = "smp")] crate::current().clear_prev_task_on_cpu(); } } @@ -482,7 +487,6 @@ pub(crate) fn init() { // Put the subsequent execution into the `main` task. let main_task = TaskInner::new_init("main".into()).into_arc(); main_task.set_state(TaskState::Running); - main_task.set_on_cpu(true); unsafe { CurrentTask::init_current(main_task) } info!("Initialize RUN_QUEUES"); @@ -503,7 +507,6 @@ pub(crate) fn init_secondary() { IDLE_TASK.with_current(|i| { i.init_once(idle_task.clone()); }); - idle_task.set_on_cpu(true); unsafe { CurrentTask::init_current(idle_task) } RUN_QUEUE.with_current(|rq| { rq.init_once(AxRunQueue::new(cpu_id)); diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 503dda5a9a..444eccd769 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -1,9 +1,10 @@ -use alloc::sync::Weak; use alloc::{boxed::Box, string::String, sync::Arc}; use core::ops::Deref; use core::sync::atomic::{AtomicBool, AtomicI32, AtomicU64, AtomicU8, Ordering}; use core::{alloc::Layout, cell::UnsafeCell, fmt, ptr::NonNull}; +#[cfg(feature = "smp")] +use alloc::sync::Weak; #[cfg(feature = "preempt")] use core::sync::atomic::AtomicUsize; @@ -208,7 +209,9 @@ impl TaskInner { in_wait_queue: AtomicBool::new(false), #[cfg(feature = "irq")] timer_ticket_id: AtomicU64::new(0), + #[cfg(feature = "smp")] on_cpu: AtomicBool::new(false), + #[cfg(feature = "smp")] prev_task: UnsafeCell::new(Weak::default()), #[cfg(feature = "irq")] unblock_lock: SpinRaw::new(()), @@ -237,6 +240,8 @@ impl TaskInner { pub(crate) fn new_init(name: String) -> Self { let mut t = Self::new_common(TaskId::new(), name); t.is_init = true; + #[cfg(feature = "smp")] + t.set_on_cpu(true); // Init task can be scheduled on all CPUs. t.set_cpumask(CpuMask::full()); if t.name == "idle" { @@ -420,8 +425,8 @@ impl TaskInner { /// ## Safety /// This function is only called by current task in `switch_to`. #[cfg(feature = "smp")] - pub unsafe fn set_prev_task(&self, prev_task: Weak) { - *self.prev_task.get() = prev_task; + pub unsafe fn set_prev_task(&self, prev_task: Arc) { + *self.prev_task.get() = Arc::downgrade(&prev_task); } /// Clears the `on_cpu` field of previous task running on this CPU. @@ -530,10 +535,6 @@ impl CurrentTask { self.0.deref().clone() } - pub(crate) fn weak(&self) -> Weak { - Arc::downgrade(&self.0) - } - pub(crate) fn ptr_eq(&self, other: &AxTaskRef) -> bool { Arc::ptr_eq(&self.0, other) } @@ -562,6 +563,7 @@ impl Deref for CurrentTask { } extern "C" fn task_entry() -> ! 
{ + #[cfg(feature = "smp")] unsafe { // Clear the prev task on CPU before running the task entry function. crate::current().clear_prev_task_on_cpu(); From ee2fc14692e665babd506f46436400c6aef5f6a8 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Wed, 9 Oct 2024 15:56:01 +0800 Subject: [PATCH 35/51] [fix] change the pos of NoPreemptIrqSave guard hold by rq in wait_queue.rs --- modules/axtask/src/run_queue.rs | 17 ++++++++++------- modules/axtask/src/wait_queue.rs | 16 +++++----------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index c81c4ee140..7a2f4fd6c9 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -406,13 +406,16 @@ impl AxRunQueue { } fn switch_to(&mut self, prev_task: CurrentTask, next_task: AxTaskRef) { - if !prev_task.is_idle() || !next_task.is_idle() { - debug!( - "context switch: {} -> {}", - prev_task.id_name(), - next_task.id_name() - ); - } + // Make sure that IRQs are disabled by kernel guard or other means. + assert!( + !axhal::arch::irqs_enabled(), + "IRQs must be disabled during scheduling" + ); + trace!( + "context switch: {} -> {}", + prev_task.id_name(), + next_task.id_name() + ); #[cfg(feature = "preempt")] next_task.set_preempt_pending(false); next_task.set_state(TaskState::Running); diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index 0cafcc2820..bd94107d7d 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -1,7 +1,7 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; -use kernel_guard::{NoOp, NoPreempt, NoPreemptIrqSave}; +use kernel_guard::{NoOp, NoPreemptIrqSave}; use kspin::{SpinNoIrq, SpinNoIrqGuard}; use crate::{current_run_queue, select_run_queue, AxTaskRef, CurrentTask}; @@ -92,7 +92,7 @@ impl WaitQueue { { let curr = crate::current(); loop { - let mut rq = current_run_queue::(); + let mut rq = current_run_queue::(); let wq = self.queue.lock(); if condition() { break; @@ -136,7 +136,6 @@ impl WaitQueue { where F: Fn() -> bool, { - let mut rq = current_run_queue::(); let curr = crate::current(); let deadline = axhal::time::wall_time() + dur; debug!( @@ -148,6 +147,7 @@ impl WaitQueue { let mut timeout = true; while axhal::time::wall_time() < deadline { + let mut rq = current_run_queue::(); let wq = self.queue.lock(); if condition() { timeout = false; @@ -196,14 +196,8 @@ impl WaitQueue { /// preemption is enabled. 
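    /// Returns `true` if the given task was found in this wait queue and unblocked, otherwise `false`.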
pub fn notify_task(&mut self, resched: bool, task: &AxTaskRef) -> bool { let mut wq = self.queue.lock(); - if let Some(task) = { - if let Some(index) = wq.iter().position(|t| Arc::ptr_eq(t, task)) { - wq.remove(index) - } else { - None - } - } { - unblock_one_task(task, resched); + if let Some(index) = wq.iter().position(|t| Arc::ptr_eq(t, task)) { + unblock_one_task(wq.remove(index).unwrap(), resched); true } else { false From e64a60a16263728c0b53c7397532fc8a0756eaec Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Wed, 9 Oct 2024 16:01:41 +0800 Subject: [PATCH 36/51] [refactor] delete unblock_locked, use atomic compare_exchange in unblock_task --- modules/axtask/src/run_queue.rs | 13 ++++------ modules/axtask/src/task.rs | 42 +++++++++++++-------------------- 2 files changed, 21 insertions(+), 34 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 7a2f4fd6c9..568fa3409c 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -300,14 +300,12 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { /// Unblock one task by inserting it into the run queue. /// If task's `on_cpu` flag is true, - /// it will enter a loop until the task finishes its scheduling process, - /// see `unblock_locked` for details. + /// it will enter a loop until the task finishes its scheduling process. pub fn unblock_task(&mut self, task: AxTaskRef, resched: bool) { - // When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. - #[cfg(feature = "irq")] - let _unblock_lock = task.get_unblock_lock(); - - if task.is_blocked() { + // Try to change the state of the task from `Blocked` to `Ready`, + // if successful, put it into the run queue, + // otherwise, the task is already unblocked by other cores. + if task.transition_state(TaskState::Blocked, TaskState::Ready) { // If the owning (remote) CPU is still in the middle of schedule() with // this task as prev, wait until it's done referencing the task. // @@ -330,7 +328,6 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { let cpu_id = self.inner.cpu_id; debug!("task unblock: {} on run_queue {}", task.id_name(), cpu_id); - task.set_state(TaskState::Ready); self.inner .scheduler .lock() diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 444eccd769..33837398d9 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -8,7 +8,7 @@ use alloc::sync::Weak; #[cfg(feature = "preempt")] use core::sync::atomic::AtomicUsize; -use kspin::{SpinNoIrq, SpinRaw, SpinRawGuard}; +use kspin::SpinNoIrq; use memory_addr::{align_up_4k, VirtAddr}; use axhal::arch::TaskContext; @@ -22,11 +22,6 @@ use crate::{AxTask, AxTaskRef, CpuMask, WaitQueue}; #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub struct TaskId(u64); -/// The guard of task's unblock lock. -/// When irq is enabled, use a `SpinRaw<()>` called `unblock_lock` -/// to protect the task from being unblocked by timer and `notify()` at the same time. -pub(crate) type TaskUnblockGuard<'a> = SpinRawGuard<'a, ()>; - /// The possible states of a task. #[repr(u8)] #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -70,11 +65,6 @@ pub struct TaskInner { /// expired by setting it as zero in `timer_ticket_expired()`, which is called by `cancel_events()`. #[cfg(feature = "irq")] timer_ticket_id: AtomicU64, - /// Used to protect the task from being unblocked by timer and `notify()` at the same time. 
- /// It is used in `unblock_task()`, which is called by wait queue's `notify()` and timer's callback. - /// Since preemption and irq are both disabled during `unblock_task()`, we can simply use a raw spin lock here. - #[cfg(feature = "irq")] - unblock_lock: SpinRaw<()>, #[cfg(feature = "preempt")] need_resched: AtomicBool, @@ -213,8 +203,6 @@ impl TaskInner { on_cpu: AtomicBool::new(false), #[cfg(feature = "smp")] prev_task: UnsafeCell::new(Weak::default()), - #[cfg(feature = "irq")] - unblock_lock: SpinRaw::new(()), #[cfg(feature = "preempt")] need_resched: AtomicBool::new(false), #[cfg(feature = "preempt")] @@ -264,6 +252,21 @@ impl TaskInner { self.state.store(state as u8, Ordering::Release) } + /// Transition the task state from `current_state` to `new_state`, + /// Returns `true` if the current state is `current_state` and the state is successfully set to `new_state`, + /// otherwise returns `false`. + #[inline] + pub(crate) fn transition_state(&self, current_state: TaskState, new_state: TaskState) -> bool { + self.state + .compare_exchange( + current_state as u8, + new_state as u8, + Ordering::AcqRel, + Ordering::Acquire, + ) + .is_ok() + } + #[inline] pub(crate) fn is_running(&self) -> bool { matches!(self.state(), TaskState::Running) @@ -335,19 +338,6 @@ impl TaskInner { self.timer_ticket_id.store(0, Ordering::Release); } - /// Get the guard of task's unblock lock. - /// When irq is enabled, use `unblock_lock` to protect the task from being unblocked by timer and `notify()` at the same time. - /// Note: - /// Since a task can not exist in two wait queues at the same time, - /// we do not need to worry about a task being unblocked from two different wait queues concurrently, - /// This `unblock_lock` is ONLY used to protect the task from being unblocked from timer list and wait queue at the same time, - /// because a task may exist in both timer list and wait queue due to `wait_timeout_xxx()` related functions, - /// eventually, this lock is only need to be used with "irq" feature enabled. - #[cfg(feature = "irq")] - pub(crate) fn get_unblock_lock(&self) -> TaskUnblockGuard { - self.unblock_lock.lock() - } - #[inline] #[cfg(feature = "preempt")] pub(crate) fn set_preempt_pending(&self, pending: bool) { From 038e3e438ec3ea1b630f0707e8b28faea1ed68f5 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Wed, 9 Oct 2024 16:33:17 +0800 Subject: [PATCH 37/51] [fix] bug in wait_timeout, modify assert in blocked_resched and switch_to --- modules/axtask/src/run_queue.rs | 14 ++++++++++++-- modules/axtask/src/wait_queue.rs | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 568fa3409c..6d9ad0501f 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -292,7 +292,14 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { // Drop the lock of wait queue explictly. drop(wq_guard); - assert!(curr.is_blocked(), "task is not blocked, {:?}", curr.state()); + // Current task's state has been changed to `Blocked` and added to the wait queue. + // Note that the state may have been set as `Ready` in `unblock_task()`, + // see `unblock_task()` for details. 
+ assert!( + curr.is_blocked() | curr.is_ready(), + "current task is not blocked or ready, state: {:?}", + curr.state() + ); debug!("task block: {}", curr.id_name()); self.inner.resched(false); @@ -306,6 +313,8 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { // if successful, put it into the run queue, // otherwise, the task is already unblocked by other cores. if task.transition_state(TaskState::Blocked, TaskState::Ready) { + // Since now, the task to be unblocked is in the `Ready` state. + // If the owning (remote) CPU is still in the middle of schedule() with // this task as prev, wait until it's done referencing the task. // @@ -315,7 +324,7 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { // their previous state and preserve Program Order. // // Note: - // 1. This should be placed inside `if self.is_blocked() { ... }`, + // 1. This should be placed after the judgement of `TaskState::Blocked,`, // because the task may have been woken up by other cores. // 2. This can be placed in the front of `switch_to()` #[cfg(feature = "smp")] @@ -404,6 +413,7 @@ impl AxRunQueue { fn switch_to(&mut self, prev_task: CurrentTask, next_task: AxTaskRef) { // Make sure that IRQs are disabled by kernel guard or other means. + #[cfg(not(test))] assert!( !axhal::arch::irqs_enabled(), "IRQs must be disabled during scheduling" diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index bd94107d7d..ee21cf7f37 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -136,6 +136,7 @@ impl WaitQueue { where F: Fn() -> bool, { + let mut rq = current_run_queue::(); let curr = crate::current(); let deadline = axhal::time::wall_time() + dur; debug!( @@ -147,7 +148,6 @@ impl WaitQueue { let mut timeout = true; while axhal::time::wall_time() < deadline { - let mut rq = current_run_queue::(); let wq = self.queue.lock(); if condition() { timeout = false; From 0b3786b1ad6a1d86563cc57caf19b28188668a74 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 10 Oct 2024 13:05:05 +0800 Subject: [PATCH 38/51] [fix] modify assert about irqs_enabled in switch_to --- modules/axhal/src/arch/x86_64/mod.rs | 8 ++++++++ modules/axtask/src/run_queue.rs | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/modules/axhal/src/arch/x86_64/mod.rs b/modules/axhal/src/arch/x86_64/mod.rs index f19469b726..c9002a58d3 100644 --- a/modules/axhal/src/arch/x86_64/mod.rs +++ b/modules/axhal/src/arch/x86_64/mod.rs @@ -19,6 +19,10 @@ pub use x86_64::structures::tss::TaskStateSegment; /// Allows the current CPU to respond to interrupts. #[inline] pub fn enable_irqs() { + #[cfg(not(target_os = "none"))] + { + warn!("enable_irqs: not implemented"); + } #[cfg(target_os = "none")] interrupts::enable() } @@ -26,6 +30,10 @@ pub fn enable_irqs() { /// Makes the current CPU to ignore interrupts. #[inline] pub fn disable_irqs() { + #[cfg(not(target_os = "none"))] + { + warn!("disable_irqs: not implemented"); + } #[cfg(target_os = "none")] interrupts::disable() } diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index 6d9ad0501f..d320b2e283 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -413,7 +413,7 @@ impl AxRunQueue { fn switch_to(&mut self, prev_task: CurrentTask, next_task: AxTaskRef) { // Make sure that IRQs are disabled by kernel guard or other means. - #[cfg(not(test))] + #[cfg(all(not(test), feature = "irq"))] // Note: irq is faked under unit tests. 
assert!( !axhal::arch::irqs_enabled(), "IRQs must be disabled during scheduling" From 713f4ea59fe7f448cf534e02cc2cd46c12f8d432 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 10 Oct 2024 13:11:50 +0800 Subject: [PATCH 39/51] [refactor] keep the guard in the loop in wait_timeout_until --- Makefile | 7 +++---- modules/axtask/src/wait_queue.rs | 7 +++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 01f2652131..3d0dd587b0 100644 --- a/Makefile +++ b/Makefile @@ -174,10 +174,9 @@ run: build justrun justrun: $(call run_qemu) -debug: - $(call run_qemu_debug) - -gdb: +debug: build + $(call run_qemu_debug) & + sleep 1 $(GDB) $(OUT_ELF) \ -ex 'target remote localhost:1234' \ -ex 'b rust_entry' \ diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index ee21cf7f37..c61ef9af13 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -136,7 +136,6 @@ impl WaitQueue { where F: Fn() -> bool, { - let mut rq = current_run_queue::(); let curr = crate::current(); let deadline = axhal::time::wall_time() + dur; debug!( @@ -147,7 +146,11 @@ impl WaitQueue { crate::timers::set_alarm_wakeup(deadline, curr.clone()); let mut timeout = true; - while axhal::time::wall_time() < deadline { + loop { + let mut rq = current_run_queue::(); + if axhal::time::wall_time() >= deadline { + break; + } let wq = self.queue.lock(); if condition() { timeout = false; From d5db096acd92fa170ee533b7004c0cc940e5397c Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 10 Oct 2024 14:47:35 +0800 Subject: [PATCH 40/51] [fix] some compile warnings --- modules/axtask/src/run_queue.rs | 1 + modules/axtask/src/task.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index d320b2e283..c206dfa3cd 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -146,6 +146,7 @@ fn get_run_queue(index: usize) -> &'static mut AxRunQueue { pub(crate) fn select_run_queue(task: &AxTaskRef) -> AxRunQueueRef<'static, G> { #[cfg(not(feature = "smp"))] { + let _ = task; // When SMP is disabled, all tasks are scheduled on the same global run queue. 
current_run_queue() } diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 33837398d9..2f03dc5745 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -292,6 +292,7 @@ impl TaskInner { self.is_idle } + #[allow(unused)] #[inline] pub(crate) fn cpumask(&self) -> CpuMask { *self.cpumask.lock() From 5dfd043e3bf3df3a8b51998173d0b441c18625eb Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 10 Oct 2024 14:52:25 +0800 Subject: [PATCH 41/51] Update the commit hash for arceos-apps --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1eedadff69..a84ea5f796 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,7 @@ on: [push, pull_request] env: qemu-version: 8.2.0 rust-toolchain: nightly-2024-05-02 - arceos-apps: 'b25b7e2' + arceos-apps: '15af71e' jobs: unit-test: From b4c09aa756ff867798a8e1e5b9a95e24b429f9eb Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 11 Oct 2024 00:14:09 +0800 Subject: [PATCH 42/51] Update the commit hash for arceos-apps again --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a84ea5f796..963b891647 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,7 @@ on: [push, pull_request] env: qemu-version: 8.2.0 rust-toolchain: nightly-2024-05-02 - arceos-apps: '15af71e' + arceos-apps: '42d9bbd' jobs: unit-test: From 4690c10af08006c4f920110b6fbf6daa6b348049 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 11 Oct 2024 00:53:43 +0800 Subject: [PATCH 43/51] Update the commit hash for arceos-apps again and again --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 963b891647..59b7bbab21 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,7 @@ on: [push, pull_request] env: qemu-version: 8.2.0 rust-toolchain: nightly-2024-05-02 - arceos-apps: '42d9bbd' + arceos-apps: '57d495a' jobs: unit-test: From 6cef2ae17ce7571d6abfc28eee590b5b07202fa5 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 11 Oct 2024 11:41:39 +0800 Subject: [PATCH 44/51] [fix] bug in wait_timeout and wait_timeout_until --- modules/axtask/src/wait_queue.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index c61ef9af13..c05fd1c924 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -50,7 +50,7 @@ impl WaitQueue { } /// Cancel events by removing the task from the wait queue. - /// If `from_timer_list` is true, the task should be removed from the timer list. + /// If `from_timer_list` is true, try to remove the task from the timer list. fn cancel_events(&self, curr: CurrentTask, from_timer_list: bool) { // A task can be wake up only one events (timer or `notify()`), remove // the event from another queue. @@ -60,6 +60,7 @@ impl WaitQueue { wq_locked.retain(|t| !curr.ptr_eq(t)); curr.set_in_wait_queue(false); } + // Try to cancel a timer event from timer lists. // Just mark task's current timer ticket ID as expired. 
#[cfg(feature = "irq")] @@ -120,9 +121,8 @@ impl WaitQueue { let timeout = curr.in_wait_queue(); // still in the wait queue, must have timed out - // If `timeout` is true, the task is still in the wait queue, - // which means timer event is triggered and the task has been removed from timer list. - self.cancel_events(curr, !timeout); + // Always try to remove the task from the timer list. + self.cancel_events(curr, true); timeout } @@ -149,6 +149,7 @@ impl WaitQueue { loop { let mut rq = current_run_queue::(); if axhal::time::wall_time() >= deadline { + timeout = true; break; } let wq = self.queue.lock(); @@ -158,8 +159,11 @@ impl WaitQueue { } rq.blocked_resched(wq); + + timeout = curr.in_wait_queue(); // still in the wait queue, must have timed out } - self.cancel_events(curr, !timeout); + // Always try to remove the task from the timer list. + self.cancel_events(curr, true); timeout } From a8fa3daca95fa3e833fb999f5246437f1e29544d Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 11 Oct 2024 11:50:39 +0800 Subject: [PATCH 45/51] [CI] update dependencies for qemu build --- .github/workflows/actions/setup-musl/action.yml | 4 ++-- .github/workflows/actions/setup-qemu/action.yml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/actions/setup-musl/action.yml b/.github/workflows/actions/setup-musl/action.yml index b8aee3661a..67062e057a 100644 --- a/.github/workflows/actions/setup-musl/action.yml +++ b/.github/workflows/actions/setup-musl/action.yml @@ -11,7 +11,7 @@ runs: steps: - name: Cache musl id: cache-musl - uses: actions/cache/restore@v3 + uses: actions/cache/restore@v4 with: path: ${{ inputs.arch }}-linux-musl-cross key: ${{ inputs.arch }}-linux-musl-cross @@ -22,7 +22,7 @@ runs: MUSL_PATH=${{ inputs.arch }}-linux-musl-cross wget https://musl.cc/${MUSL_PATH}.tgz tar -xf ${MUSL_PATH}.tgz - - uses: actions/cache/save@v3 + - uses: actions/cache/save@v4 if: steps.cache-musl.outputs.cache-hit != 'true' with: path: ${{ inputs.arch }}-linux-musl-cross diff --git a/.github/workflows/actions/setup-qemu/action.yml b/.github/workflows/actions/setup-qemu/action.yml index 0f8bc4d1ca..819fb08797 100644 --- a/.github/workflows/actions/setup-qemu/action.yml +++ b/.github/workflows/actions/setup-qemu/action.yml @@ -11,7 +11,7 @@ runs: steps: - name: Cache QEMU id: cache-qemu - uses: actions/cache/restore@v3 + uses: actions/cache/restore@v4 with: path: qemu_build key: qemu-${{ inputs.qemu-version }}-slirp-1 @@ -22,13 +22,13 @@ runs: PREFIX: ${{ github.workspace }}/qemu_build shell: bash run: | - sudo apt-get update && sudo apt-get install -y ninja-build libslirp-dev + sudo apt-get update && sudo apt-get install -y ninja-build libslirp-dev libglib2.0-dev wget https://download.qemu.org/$QEMU_PATH.tar.xz && tar -xJf $QEMU_PATH.tar.xz cd $QEMU_PATH \ && ./configure --prefix=$PREFIX --target-list=x86_64-softmmu,riscv64-softmmu,aarch64-softmmu --enable-slirp \ && make -j > /dev/null 2>&1 \ && make install - - uses: actions/cache/save@v3 + - uses: actions/cache/save@v4 if: steps.cache-qemu.outputs.cache-hit != 'true' with: path: qemu_build From c09cb39e39b3ed36b8fe13676d011e365ea68b12 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 11 Oct 2024 14:54:47 +0800 Subject: [PATCH 46/51] [fix] fourth round fix of review suggestions --- modules/axtask/src/run_queue.rs | 23 +++++++++++++++-------- modules/axtask/src/task.rs | 19 ++++++++----------- modules/axtask/src/wait_queue.rs | 10 ++-------- 3 files changed, 25 insertions(+), 27 
deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index c206dfa3cd..b0fe09a4a2 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -1,8 +1,6 @@ use alloc::collections::VecDeque; use alloc::sync::Arc; use core::mem::MaybeUninit; -#[cfg(feature = "smp")] -use core::sync::atomic::{AtomicUsize, Ordering}; use kernel_guard::BaseGuard; use kspin::SpinRaw; @@ -88,6 +86,7 @@ pub(crate) fn current_run_queue() -> AxRunQueueRef<'static, G> { #[allow(clippy::modulo_one)] #[inline] fn select_run_queue_index(cpumask: CpuMask) -> usize { + use core::sync::atomic::{AtomicUsize, Ordering}; static RUN_QUEUE_INDEX: AtomicUsize = AtomicUsize::new(0); assert!(!cpumask.is_empty(), "No available CPU for task execution"); @@ -188,7 +187,11 @@ impl<'a, G: BaseGuard> Drop for AxRunQueueRef<'a, G> { /// Core functions of run queue. impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { pub fn add_task(&mut self, task: AxTaskRef) { - debug!("Add {} on run_queue {}", task.id_name(), self.inner.cpu_id); + debug!( + "task add: {} on run_queue {}", + task.id_name(), + self.inner.cpu_id + ); assert!(task.is_ready()); self.inner.scheduler.lock().add_task(task); } @@ -249,7 +252,11 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { assert!(curr.is_running(), "task is not running: {:?}", curr.state()); assert!(!curr.is_idle()); if curr.is_init() { - EXITED_TASKS.with_current(|exited_tasks| exited_tasks.clear()); + // Safety: it is called from `current_run_queue::().exit_current(exit_code)`, + // which disabled IRQs and preemption. + unsafe { + EXITED_TASKS.current_ref_mut_raw().clear(); + } axhal::misc::terminate(); } else { curr.set_state(TaskState::Exited); @@ -371,7 +378,7 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { impl AxRunQueue { /// Create a new run queue for the specified CPU. /// The run queue is initialized with a per-CPU gc task in its scheduler. - pub fn new(cpu_id: usize) -> Self { + fn new(cpu_id: usize) -> Self { let gc_task = TaskInner::new(gc_entry, "gc".into(), axconfig::TASK_STACK_SIZE).into_arc(); // gc task should be pinned to the current CPU. gc_task.set_cpumask(CpuMask::one_shot(cpu_id)); @@ -442,7 +449,7 @@ impl AxRunQueue { // Store the weak pointer of **prev_task** in **next_task**'s struct. #[cfg(feature = "smp")] - next_task.set_prev_task(prev_task.clone()); + next_task.set_prev_task(prev_task.as_task_ref()); // The strong reference count of `prev_task` will be decremented by 1, // but won't be dropped until `gc_entry()` is called. @@ -479,7 +486,7 @@ fn gc_entry() { } } } - unsafe { WAIT_FOR_EXIT.current_ref_raw() }.wait(); + WAIT_FOR_EXIT.with_current(|wq| wq.wait()); } } @@ -500,7 +507,6 @@ pub(crate) fn init() { main_task.set_state(TaskState::Running); unsafe { CurrentTask::init_current(main_task) } - info!("Initialize RUN_QUEUES"); RUN_QUEUE.with_current(|rq| { rq.init_once(AxRunQueue::new(cpu_id)); }); @@ -519,6 +525,7 @@ pub(crate) fn init_secondary() { i.init_once(idle_task.clone()); }); unsafe { CurrentTask::init_current(idle_task) } + RUN_QUEUE.with_current(|rq| { rq.init_once(AxRunQueue::new(cpu_id)); }); diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs index 2f03dc5745..50e256b828 100644 --- a/modules/axtask/src/task.rs +++ b/modules/axtask/src/task.rs @@ -230,8 +230,6 @@ impl TaskInner { t.is_init = true; #[cfg(feature = "smp")] t.set_on_cpu(true); - // Init task can be scheduled on all CPUs. 
- t.set_cpumask(CpuMask::full()); if t.name == "idle" { t.is_idle = true; } @@ -401,13 +399,13 @@ impl TaskInner { /// The `on_cpu field is set to `true` when the task is preparing to run on a CPU, /// and it is set to `false` when the task has finished its scheduling process in `clear_prev_task_on_cpu()`. #[cfg(feature = "smp")] - pub fn on_cpu(&self) -> bool { + pub(crate) fn on_cpu(&self) -> bool { self.on_cpu.load(Ordering::Acquire) } /// Sets whether the task is running on a CPU. #[cfg(feature = "smp")] - pub fn set_on_cpu(&self, on_cpu: bool) { + pub(crate) fn set_on_cpu(&self, on_cpu: bool) { self.on_cpu.store(on_cpu, Ordering::Release) } @@ -416,8 +414,8 @@ impl TaskInner { /// ## Safety /// This function is only called by current task in `switch_to`. #[cfg(feature = "smp")] - pub unsafe fn set_prev_task(&self, prev_task: Arc) { - *self.prev_task.get() = Arc::downgrade(&prev_task); + pub(crate) unsafe fn set_prev_task(&self, prev_task: &AxTaskRef) { + *self.prev_task.get() = Arc::downgrade(prev_task); } /// Clears the `on_cpu` field of previous task running on this CPU. @@ -433,15 +431,14 @@ impl TaskInner { /// /// ## Safety /// The caller must ensure that the weak reference to the prev task is valid, which is - /// done by the previous task running on this CPU through `set_prev_task_on_cpu_ptr()`. + /// done by the previous task running on this CPU through `set_prev_task()`. #[cfg(feature = "smp")] - pub unsafe fn clear_prev_task_on_cpu(&self) { + pub(crate) unsafe fn clear_prev_task_on_cpu(&self) { self.prev_task .get() .as_ref() - .expect("Invalid prev_task pointer") - .upgrade() - .expect("prev_task is dropped") + .and_then(|weak| weak.upgrade()) + .expect("Invalid prev_task pointer or prev_task has been dropped") .set_on_cpu(false); } diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs index c05fd1c924..eaca42f5f0 100644 --- a/modules/axtask/src/wait_queue.rs +++ b/modules/axtask/src/wait_queue.rs @@ -149,7 +149,6 @@ impl WaitQueue { loop { let mut rq = current_run_queue::(); if axhal::time::wall_time() >= deadline { - timeout = true; break; } let wq = self.queue.lock(); @@ -159,8 +158,6 @@ impl WaitQueue { } rq.blocked_resched(wq); - - timeout = curr.in_wait_queue(); // still in the wait queue, must have timed out } // Always try to remove the task from the timer list. self.cancel_events(curr, true); @@ -187,13 +184,10 @@ impl WaitQueue { /// preemption is enabled. pub fn notify_all(&self, resched: bool) { loop { - let mut wq = self.queue.lock(); - if let Some(task) = wq.pop_front() { - unblock_one_task(task, resched); - } else { + if self.queue.lock().is_empty() { break; } - drop(wq); + self.notify_one(resched); } } From e41100a5469efce3f719e28b63f3fcef5f1686ce Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 11 Oct 2024 15:43:24 +0800 Subject: [PATCH 47/51] [fix] problems related to WAIT_FOR_EXIT in gc_entry --- modules/axtask/src/run_queue.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs index b0fe09a4a2..1f62357349 100644 --- a/modules/axtask/src/run_queue.rs +++ b/modules/axtask/src/run_queue.rs @@ -264,10 +264,15 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> { // Notify the joiner task. curr.notify_exit(exit_code); - // Push current task to the `EXITED_TASKS` list, which will be consumed by the GC task. 
From e41100a5469efce3f719e28b63f3fcef5f1686ce Mon Sep 17 00:00:00 2001
From: hky1999 <976929993@qq.com>
Date: Fri, 11 Oct 2024 15:43:24 +0800
Subject: [PATCH 47/51] [fix] problems related to WAIT_FOR_EXIT in gc_entry

---
 modules/axtask/src/run_queue.rs | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs
index b0fe09a4a2..1f62357349 100644
--- a/modules/axtask/src/run_queue.rs
+++ b/modules/axtask/src/run_queue.rs
@@ -264,10 +264,15 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> {
             // Notify the joiner task.
             curr.notify_exit(exit_code);
 
-            // Push current task to the `EXITED_TASKS` list, which will be consumed by the GC task.
-            EXITED_TASKS.with_current(|exited_tasks| exited_tasks.push_back(curr.clone()));
-            // Wake up the GC task to drop the exited tasks.
-            WAIT_FOR_EXIT.with_current(|wq| wq.notify_one(false));
+            // Safety: it is called from `current_run_queue::<NoPreemptIrqSave>().exit_current(exit_code)`,
+            // which disables IRQs and preemption.
+            unsafe {
+                // Push current task to the `EXITED_TASKS` list, which will be consumed by the GC task.
+                EXITED_TASKS.current_ref_mut_raw().push_back(curr.clone());
+                // Wake up the GC task to drop the exited tasks.
+                WAIT_FOR_EXIT.current_ref_mut_raw().notify_one(false);
+            }
+
             // Schedule to next task.
             self.inner.resched(false);
         }
@@ -486,7 +491,10 @@ fn gc_entry() {
                 }
             }
         }
-        WAIT_FOR_EXIT.with_current(|wq| wq.wait());
+        // Note: we cannot block the current task with preemption disabled, so
+        // use `current_ref_raw` to get a reference to `WAIT_FOR_EXIT` here and avoid the `NoPreemptGuard`.
+        // Since the gc task is pinned to the current CPU, it does not matter if it is preempted during this process.
+        unsafe { WAIT_FOR_EXIT.current_ref_raw() }.wait();
     }
 }

From 7a34fc55de8e31c5ba6fa1ff257376403f57f722 Mon Sep 17 00:00:00 2001
From: hky1999 <976929993@qq.com>
Date: Fri, 11 Oct 2024 16:16:10 +0800
Subject: [PATCH 48/51] [refactor] use current_ref_mut_raw to get TIMER_LIST in check_events()

---
 modules/axtask/src/timers.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modules/axtask/src/timers.rs b/modules/axtask/src/timers.rs
index 1a6173b9b9..4dcc03c41b 100644
--- a/modules/axtask/src/timers.rs
+++ b/modules/axtask/src/timers.rs
@@ -46,7 +46,11 @@ pub fn set_alarm_wakeup(deadline: TimeValue, task: AxTaskRef) {
 pub fn check_events() {
     loop {
         let now = wall_time();
-        let event = TIMER_LIST.with_current(|timer_list| timer_list.expire_one(now));
+        let event = unsafe {
+            // Safety: IRQs are disabled at this time.
+            TIMER_LIST.current_ref_mut_raw()
+        }
+        .expire_one(now);
         if let Some((_deadline, event)) = event {
             event.callback(now);
         } else {

From ec64c8e88490b1f9e5ec4883ef30c4f32a3b50fd Mon Sep 17 00:00:00 2001
From: hky1999 <976929993@qq.com>
Date: Fri, 11 Oct 2024 22:46:26 +0800
Subject: [PATCH 49/51] [refactor] improve notify_all in wait_queue.rs

---
 modules/axtask/src/wait_queue.rs | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs
index eaca42f5f0..a4fda3785b 100644
--- a/modules/axtask/src/wait_queue.rs
+++ b/modules/axtask/src/wait_queue.rs
@@ -183,11 +183,8 @@ impl WaitQueue {
     /// If `resched` is true, the current task will be preempted when the
     /// preemption is enabled.
     pub fn notify_all(&self, resched: bool) {
-        loop {
-            if self.queue.lock().is_empty() {
-                break;
-            }
-            self.notify_one(resched);
+        while self.notify_one(resched) {
+            // loop until the wait queue is empty
         }
     }
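The rewritten `notify_all` above relies on `notify_one` reporting whether a waiter was actually woken, so the loop stops exactly when the queue drains and the separate lock-then-check-empty step of the old version is no longer needed. A minimal sketch of that contract, assuming a plain `Mutex<VecDeque<_>>`, with `DummyTask` and the `println!` wake-up as stand-ins for the real unblock path:

// Sketch of the notify contract used by the refactor above.
use std::collections::VecDeque;
use std::sync::Mutex;

struct DummyTask(u32);

struct SimpleWaitQueue {
    queue: Mutex<VecDeque<DummyTask>>,
}

impl SimpleWaitQueue {
    /// Wake one waiter; return whether a task was actually taken off the queue.
    fn notify_one(&self) -> bool {
        // Pop under the lock; the wake-up itself happens after the lock is released.
        let task = self.queue.lock().unwrap().pop_front();
        match task {
            Some(t) => {
                println!("wake task {}", t.0); // stand-in for unblocking the task
                true
            }
            None => false,
        }
    }

    fn notify_all(&self) {
        while self.notify_one() {
            // loop until the wait queue is empty
        }
    }
}

fn main() {
    let wq = SimpleWaitQueue {
        queue: Mutex::new((0..3u32).map(DummyTask).collect()),
    };
    wq.notify_all();
}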
From 836a79da817d87023a3702bb330a8339946697be Mon Sep 17 00:00:00 2001
From: hky1999 <976929993@qq.com>
Date: Fri, 11 Oct 2024 22:50:54 +0800
Subject: [PATCH 50/51] [refactor] delete redundant assertion in block_resched()

---
 modules/axtask/src/run_queue.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/modules/axtask/src/run_queue.rs b/modules/axtask/src/run_queue.rs
index 1f62357349..9eb53ed72e 100644
--- a/modules/axtask/src/run_queue.rs
+++ b/modules/axtask/src/run_queue.rs
@@ -308,11 +308,6 @@ impl<'a, G: BaseGuard> AxRunQueueRef<'a, G> {
         // Current task's state has been changed to `Blocked` and added to the wait queue.
         // Note that the state may have been set as `Ready` in `unblock_task()`,
         // see `unblock_task()` for details.
-        assert!(
-            curr.is_blocked() | curr.is_ready(),
-            "current task is not blocked or ready, state: {:?}",
-            curr.state()
-        );
 
         debug!("task block: {}", curr.id_name());
         self.inner.resched(false);

From 63e978bf428edf52d05177aa22a19d101a569ea8 Mon Sep 17 00:00:00 2001
From: hky1999 <976929993@qq.com>
Date: Sat, 12 Oct 2024 00:22:59 +0800
Subject: [PATCH 51/51] [fix] some compile warnings

---
 modules/axtask/src/api.rs        | 3 ++-
 modules/axtask/src/task.rs       | 5 -----
 modules/axtask/src/wait_queue.rs | 4 ++--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/modules/axtask/src/api.rs b/modules/axtask/src/api.rs
index 0597ba8acf..b3278b9595 100644
--- a/modules/axtask/src/api.rs
+++ b/modules/axtask/src/api.rs
@@ -2,7 +2,7 @@
 
 use alloc::{string::String, sync::Arc};
 
-use kernel_guard::{NoOp, NoPreemptIrqSave};
+use kernel_guard::NoPreemptIrqSave;
 
 pub(crate) use crate::run_queue::{current_run_queue, select_run_queue};
 
@@ -92,6 +92,7 @@ pub fn init_scheduler_secondary() {
 #[cfg(feature = "irq")]
 #[doc(cfg(feature = "irq"))]
 pub fn on_timer_tick() {
+    use kernel_guard::NoOp;
     crate::timers::check_events();
     // Since irq and preemption are both disabled here,
     // we can get current run queue with the default `kernel_guard::NoOp`.

diff --git a/modules/axtask/src/task.rs b/modules/axtask/src/task.rs
index 50e256b828..017b764171 100644
--- a/modules/axtask/src/task.rs
+++ b/modules/axtask/src/task.rs
@@ -275,11 +275,6 @@ impl TaskInner {
         matches!(self.state(), TaskState::Ready)
     }
 
-    #[inline]
-    pub(crate) fn is_blocked(&self) -> bool {
-        matches!(self.state(), TaskState::Blocked)
-    }
-
     #[inline]
     pub(crate) const fn is_init(&self) -> bool {
         self.is_init

diff --git a/modules/axtask/src/wait_queue.rs b/modules/axtask/src/wait_queue.rs
index a4fda3785b..d6e546f347 100644
--- a/modules/axtask/src/wait_queue.rs
+++ b/modules/axtask/src/wait_queue.rs
@@ -51,7 +51,7 @@ impl WaitQueue {
 
     /// Cancel events by removing the task from the wait queue.
     /// If `from_timer_list` is true, try to remove the task from the timer list.
-    fn cancel_events(&self, curr: CurrentTask, from_timer_list: bool) {
+    fn cancel_events(&self, curr: CurrentTask, _from_timer_list: bool) {
         // A task can be wake up only one events (timer or `notify()`), remove
         // the event from another queue.
         if curr.in_wait_queue() {
@@ -64,7 +64,7 @@ impl WaitQueue {
         // Try to cancel a timer event from timer lists.
         // Just mark task's current timer ticket ID as expired.
         #[cfg(feature = "irq")]
-        if from_timer_list {
+        if _from_timer_list {
             curr.timer_ticket_expired();
             // Note:
             // this task is still not removed from timer list of target CPU,