Skip to content

Commit d5048d1

Browse files
committed
Merge tag 'timers-core-2025-03-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer core updates from Thomas Gleixner:

 - Fix a memory ordering issue in posix-timers

   Posix-timer lookup is lockless and reevaluates the timer validity under the timer lock, but the update which validates the timer is not protected by the timer lock. That allows the store to be reordered against the initialization stores, so that the lookup side can observe a partially initialized timer. That's mostly a theoretical problem, but incorrect nevertheless.

 - Fix a long-standing inconsistency of the coarse time getters

   The coarse time getters read the base time of the current update cycle without reading the actual hardware clock. NTP frequency adjustment can set the base time backwards. The fine grained interfaces compensate for this by reading the clock and applying the new conversion factor, but the coarse grained time getters use the base time directly. That allows the user to observe time going backwards. Cure it by always forwarding base time, when NTP changes the frequency with an immediate step.

 - Rework of posix-timer hashing

   The posix-timer hash is not scalable and due to the CRIU timer restore mechanism prone to massive contention on the global hash bucket lock. Replace the global hash lock with a fine grained per bucket locking scheme to address that.

 - Rework the /proc/$PID/timers interface.

   /proc/$PID/timers is provided for CRIU to be able to restore a timer. The printout happens with sighand lock held and interrupts disabled. That's not required as this can be done with RCU protection as well.

 - Provide a sane mechanism for CRIU to restore a timer ID

   CRIU restores timers by creating and deleting them until the kernel internal per process ID counter reaches the requested ID. That's horribly slow for sparse timer IDs. Provide a prctl() which allows CRIU to restore a timer with a given ID. When enabled the ID pointer is used as input pointer to read the requested ID from user space. When disabled, the normal allocation scheme (next ID) is active as before. This is backwards compatible for both kernel and user space.

 - Make hrtimer_update_function() less expensive.

   The sanity checks are valuable, but expensive for high frequency usage in io_uring. Make the debug checks conditional and enable them only when lockdep is enabled.

 - Small updates, cleanups and improvements

* tag 'timers-core-2025-03-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (27 commits)
  selftests/timers: Improve skew_consistency by testing with other clockids
  timekeeping: Fix possible inconsistencies in _COARSE clockids
  posix-timers: Drop redundant memset() invocation
  selftests/timers/posix-timers: Add a test for exact allocation mode
  posix-timers: Provide a mechanism to allocate a given timer ID
  posix-timers: Dont iterate /proc/$PID/timers with sighand::siglock held
  posix-timers: Make per process list RCU safe
  posix-timers: Avoid false cacheline sharing
  posix-timers: Switch to jhash32()
  posix-timers: Improve hash table performance
  posix-timers: Make signal_struct::next_posix_timer_id an atomic_t
  posix-timers: Make lock_timer() use guard()
  posix-timers: Rework timer removal
  posix-timers: Simplify lock/unlock_timer()
  posix-timers: Use guards in a few places
  posix-timers: Remove SLAB_PANIC from kmem cache
  posix-timers: Remove a few paranoid warnings
  posix-timers: Cleanup includes
  posix-timers: Add cond_resched() to posix_timer_add() search loop
  posix-timers: Initialise timer before adding it to the hash table
  ...
2 parents 0ae2062 + e40d370 commit d5048d1

File tree

16 files changed

+524
-379
lines changed

16 files changed

+524
-379
lines changed

fs/proc/base.c

+20-28
Original file line numberDiff line numberDiff line change
@@ -2494,11 +2494,9 @@ static const struct file_operations proc_map_files_operations = {
24942494

24952495
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
24962496
struct timers_private {
2497-
struct pid *pid;
2498-
struct task_struct *task;
2499-
struct sighand_struct *sighand;
2500-
struct pid_namespace *ns;
2501-
unsigned long flags;
2497+
struct pid *pid;
2498+
struct task_struct *task;
2499+
struct pid_namespace *ns;
25022500
};
25032501

25042502
static void *timers_start(struct seq_file *m, loff_t *pos)
@@ -2509,54 +2507,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
25092507
if (!tp->task)
25102508
return ERR_PTR(-ESRCH);
25112509

2512-
tp->sighand = lock_task_sighand(tp->task, &tp->flags);
2513-
if (!tp->sighand)
2514-
return ERR_PTR(-ESRCH);
2515-
2516-
return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
2510+
rcu_read_lock();
2511+
return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos);
25172512
}
25182513

25192514
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
25202515
{
25212516
struct timers_private *tp = m->private;
2522-
return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
2517+
2518+
return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos);
25232519
}
25242520

25252521
static void timers_stop(struct seq_file *m, void *v)
25262522
{
25272523
struct timers_private *tp = m->private;
25282524

2529-
if (tp->sighand) {
2530-
unlock_task_sighand(tp->task, &tp->flags);
2531-
tp->sighand = NULL;
2532-
}
2533-
25342525
if (tp->task) {
25352526
put_task_struct(tp->task);
25362527
tp->task = NULL;
2528+
rcu_read_unlock();
25372529
}
25382530
}
25392531

25402532
static int show_timer(struct seq_file *m, void *v)
25412533
{
2542-
struct k_itimer *timer;
2543-
struct timers_private *tp = m->private;
2544-
int notify;
25452534
static const char * const nstr[] = {
2546-
[SIGEV_SIGNAL] = "signal",
2547-
[SIGEV_NONE] = "none",
2548-
[SIGEV_THREAD] = "thread",
2535+
[SIGEV_SIGNAL] = "signal",
2536+
[SIGEV_NONE] = "none",
2537+
[SIGEV_THREAD] = "thread",
25492538
};
25502539

2551-
timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
2552-
notify = timer->it_sigev_notify;
2540+
struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
2541+
struct timers_private *tp = m->private;
2542+
int notify = timer->it_sigev_notify;
2543+
2544+
guard(spinlock_irq)(&timer->it_lock);
2545+
if (!posixtimer_valid(timer))
2546+
return 0;
25532547

25542548
seq_printf(m, "ID: %d\n", timer->it_id);
2555-
seq_printf(m, "signal: %d/%px\n",
2556-
timer->sigq.info.si_signo,
2549+
seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo,
25572550
timer->sigq.info.si_value.sival_ptr);
2558-
seq_printf(m, "notify: %s/%s.%d\n",
2559-
nstr[notify & ~SIGEV_THREAD_ID],
2551+
seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID],
25602552
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
25612553
pid_nr_ns(timer->it_pid, tp->ns));
25622554
seq_printf(m, "ClockID: %d\n", timer->it_clock);

include/linux/cleanup.h

+14-8
Original file line numberDiff line numberDiff line change
@@ -308,11 +308,21 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
308308
#define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \
309309
static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
310310

311-
#define DEFINE_GUARD(_name, _type, _lock, _unlock) \
311+
#define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \
312+
static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
313+
{ return (void *)(__force unsigned long)*(_exp); }
314+
315+
#define DEFINE_CLASS_IS_GUARD(_name) \
312316
__DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
317+
__DEFINE_GUARD_LOCK_PTR(_name, _T)
318+
319+
#define DEFINE_CLASS_IS_COND_GUARD(_name) \
320+
__DEFINE_CLASS_IS_CONDITIONAL(_name, true); \
321+
__DEFINE_GUARD_LOCK_PTR(_name, _T)
322+
323+
#define DEFINE_GUARD(_name, _type, _lock, _unlock) \
313324
DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \
314-
static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
315-
{ return (void *)(__force unsigned long)*_T; }
325+
DEFINE_CLASS_IS_GUARD(_name)
316326

317327
#define DEFINE_GUARD_COND(_name, _ext, _condlock) \
318328
__DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \
@@ -392,11 +402,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \
392402
if (_T->lock) { _unlock; } \
393403
} \
394404
\
395-
static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
396-
{ \
397-
return (void *)(__force unsigned long)_T->lock; \
398-
}
399-
405+
__DEFINE_GUARD_LOCK_PTR(_name, &_T->lock)
400406

401407
#define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \
402408
static inline class_##_name##_t class_##_name##_constructor(_type *l) \

include/linux/hrtimer.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -333,14 +333,15 @@ static inline int hrtimer_callback_running(struct hrtimer *timer)
333333
static inline void hrtimer_update_function(struct hrtimer *timer,
334334
enum hrtimer_restart (*function)(struct hrtimer *))
335335
{
336+
#ifdef CONFIG_PROVE_LOCKING
336337
guard(raw_spinlock_irqsave)(&timer->base->cpu_base->lock);
337338

338339
if (WARN_ON_ONCE(hrtimer_is_queued(timer)))
339340
return;
340341

341342
if (WARN_ON_ONCE(!function))
342343
return;
343-
344+
#endif
344345
timer->function = function;
345346
}
346347

include/linux/posix-timers.h

+21-9
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sigqueue *q);
114114
void posixtimer_send_sigqueue(struct k_itimer *tmr);
115115
bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
116116
void posixtimer_free_timer(struct k_itimer *timer);
117+
long posixtimer_create_prctl(unsigned long ctrl);
117118

118119
/* Init task static initializer */
119120
#define INIT_CPU_TIMERBASE(b) { \
@@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
140141
static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
141142
struct sigqueue *timer_sigq) { return false; }
142143
static inline void posixtimer_free_timer(struct k_itimer *timer) { }
144+
static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; }
143145
#endif
144146

145147
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
@@ -177,23 +179,26 @@ static inline void posix_cputimers_init_work(void) { }
177179
* @rcu: RCU head for freeing the timer.
178180
*/
179181
struct k_itimer {
180-
struct hlist_node list;
181-
struct hlist_node ignored_list;
182+
/* 1st cacheline contains read-mostly fields */
182183
struct hlist_node t_hash;
183-
spinlock_t it_lock;
184-
const struct k_clock *kclock;
185-
clockid_t it_clock;
184+
struct hlist_node list;
186185
timer_t it_id;
186+
clockid_t it_clock;
187+
int it_sigev_notify;
188+
enum pid_type it_pid_type;
189+
struct signal_struct *it_signal;
190+
const struct k_clock *kclock;
191+
192+
/* 2nd cacheline and above contain fields which are modified regularly */
193+
spinlock_t it_lock;
187194
int it_status;
188195
bool it_sig_periodic;
189196
s64 it_overrun;
190197
s64 it_overrun_last;
191198
unsigned int it_signal_seq;
192199
unsigned int it_sigqueue_seq;
193-
int it_sigev_notify;
194-
enum pid_type it_pid_type;
195200
ktime_t it_interval;
196-
struct signal_struct *it_signal;
201+
struct hlist_node ignored_list;
197202
union {
198203
struct pid *it_pid;
199204
struct task_struct *it_process;
@@ -210,7 +215,7 @@ struct k_itimer {
210215
} alarm;
211216
} it;
212217
struct rcu_head rcu;
213-
};
218+
} ____cacheline_aligned_in_smp;
214219

215220
void run_posix_cpu_timers(void);
216221
void posix_cpu_timers_exit(struct task_struct *task);
@@ -240,6 +245,13 @@ static inline void posixtimer_sigqueue_putref(struct sigqueue *q)
240245

241246
posixtimer_putref(tmr);
242247
}
248+
249+
static inline bool posixtimer_valid(const struct k_itimer *timer)
250+
{
251+
unsigned long val = (unsigned long)timer->it_signal;
252+
253+
return !(val & 0x1UL);
254+
}
243255
#else /* CONFIG_POSIX_TIMERS */
244256
static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { }
245257
static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { }

include/linux/sched/signal.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,8 @@ struct signal_struct {
136136
#ifdef CONFIG_POSIX_TIMERS
137137

138138
/* POSIX.1b Interval Timers */
139-
unsigned int next_posix_timer_id;
139+
unsigned int timer_create_restore_ids:1;
140+
atomic_t next_posix_timer_id;
140141
struct hlist_head posix_timers;
141142
struct hlist_head ignored_posix_timers;
142143

include/uapi/linux/prctl.h

+11
Original file line numberDiff line numberDiff line change
@@ -353,4 +353,15 @@ struct prctl_mm_map {
353353
*/
354354
#define PR_LOCK_SHADOW_STACK_STATUS 76
355355

356+
/*
357+
* Controls the mode of timer_create() for CRIU restore operations.
358+
* Enabling this allows CRIU to restore timers with explicit IDs.
359+
*
360+
* Don't use for normal operations as the result might be undefined.
361+
*/
362+
#define PR_TIMER_CREATE_RESTORE_IDS 77
363+
# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
364+
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
365+
# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
366+
356367
#endif /* _LINUX_PRCTL_H */

kernel/signal.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -2092,7 +2092,7 @@ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueu
20922092
* from a non-periodic timer, then just drop the reference
20932093
* count. Otherwise queue it on the ignored list.
20942094
*/
2095-
if (tmr->it_signal && tmr->it_sig_periodic)
2095+
if (posixtimer_valid(tmr) && tmr->it_sig_periodic)
20962096
hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers);
20972097
else
20982098
posixtimer_putref(tmr);

kernel/sys.c

+5
Original file line numberDiff line numberDiff line change
@@ -2815,6 +2815,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
28152815
return -EINVAL;
28162816
error = arch_lock_shadow_stack_status(me, arg2);
28172817
break;
2818+
case PR_TIMER_CREATE_RESTORE_IDS:
2819+
if (arg3 || arg4 || arg5)
2820+
return -EINVAL;
2821+
error = posixtimer_create_prctl(arg2);
2822+
break;
28182823
default:
28192824
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
28202825
error = -EINVAL;

kernel/time/clocksource.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1510,7 +1510,7 @@ static int __init boot_override_clocksource(char* str)
15101510
{
15111511
mutex_lock(&clocksource_mutex);
15121512
if (str)
1513-
strscpy(override_name, str, sizeof(override_name));
1513+
strscpy(override_name, str);
15141514
mutex_unlock(&clocksource_mutex);
15151515
return 1;
15161516
}

kernel/time/hrtimer.c

+12-17
Original file line numberDiff line numberDiff line change
@@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
117117
.csd = CSD_INIT(retrigger_next_event, NULL)
118118
};
119119

120-
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
121-
/* Make sure we catch unsupported clockids */
122-
[0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
123-
124-
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
125-
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
126-
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
127-
[CLOCK_TAI] = HRTIMER_BASE_TAI,
128-
};
129-
130120
static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
131121
{
132122
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
@@ -1587,14 +1577,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
15871577

15881578
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
15891579
{
1590-
if (likely(clock_id < MAX_CLOCKS)) {
1591-
int base = hrtimer_clock_to_base_table[clock_id];
1592-
1593-
if (likely(base != HRTIMER_MAX_CLOCK_BASES))
1594-
return base;
1580+
switch (clock_id) {
1581+
case CLOCK_REALTIME:
1582+
return HRTIMER_BASE_REALTIME;
1583+
case CLOCK_MONOTONIC:
1584+
return HRTIMER_BASE_MONOTONIC;
1585+
case CLOCK_BOOTTIME:
1586+
return HRTIMER_BASE_BOOTTIME;
1587+
case CLOCK_TAI:
1588+
return HRTIMER_BASE_TAI;
1589+
default:
1590+
WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
1591+
return HRTIMER_BASE_MONOTONIC;
15951592
}
1596-
WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
1597-
return HRTIMER_BASE_MONOTONIC;
15981593
}
15991594

16001595
static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused)

kernel/time/posix-clock.c

+1-23
Original file line numberDiff line numberDiff line change
@@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp,
9090
return err;
9191
}
9292

93-
#ifdef CONFIG_COMPAT
94-
static long posix_clock_compat_ioctl(struct file *fp,
95-
unsigned int cmd, unsigned long arg)
96-
{
97-
struct posix_clock_context *pccontext = fp->private_data;
98-
struct posix_clock *clk = get_posix_clock(fp);
99-
int err = -ENOTTY;
100-
101-
if (!clk)
102-
return -ENODEV;
103-
104-
if (clk->ops.ioctl)
105-
err = clk->ops.ioctl(pccontext, cmd, arg);
106-
107-
put_posix_clock(clk);
108-
109-
return err;
110-
}
111-
#endif
112-
11393
static int posix_clock_open(struct inode *inode, struct file *fp)
11494
{
11595
int err;
@@ -171,11 +151,9 @@ static const struct file_operations posix_clock_file_operations = {
171151
.read = posix_clock_read,
172152
.poll = posix_clock_poll,
173153
.unlocked_ioctl = posix_clock_ioctl,
154+
.compat_ioctl = posix_clock_ioctl,
174155
.open = posix_clock_open,
175156
.release = posix_clock_release,
176-
#ifdef CONFIG_COMPAT
177-
.compat_ioctl = posix_clock_compat_ioctl,
178-
#endif
179157
};
180158

181159
int posix_clock_register(struct posix_clock *clk, struct device *dev)

0 commit comments

Comments
 (0)