From 434c79d144727cd182a3c076e96498a43cd1120a Mon Sep 17 00:00:00 2001
From: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx>
Date: Sun, 5 Dec 2021 01:03:36 -0600
Subject: [PATCH 01/30] ARM64: dts: Optimize HBM on/off commands to remove
 flickering while using FOD

* Also slightly reduce the greenish tint

Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx>
---
 .../qcom/dsi-panel-j11-38-08-0a-fhd-cmd.dtsi | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/boot/dts/vendor/qcom/dsi-panel-j11-38-08-0a-fhd-cmd.dtsi b/arch/arm64/boot/dts/vendor/qcom/dsi-panel-j11-38-08-0a-fhd-cmd.dtsi
index 12bbecba5a54..e49a8a6dfdb9 100755
--- a/arch/arm64/boot/dts/vendor/qcom/dsi-panel-j11-38-08-0a-fhd-cmd.dtsi
+++ b/arch/arm64/boot/dts/vendor/qcom/dsi-panel-j11-38-08-0a-fhd-cmd.dtsi
@@ -208,20 +208,17 @@
 		qcom,mdss-dsi-dispparam-hbm-fod-on-command = [
 			39 00 00 00 00 00 03 F0 5A 5A
-			/* elvss dimming off */
-			39 00 00 00 00 00 02 B0 07
-			39 00 00 00 00 00 02 B7 51
-			39 01 00 00 00 00 03 F0 A5 A5
-			/* HBM on */
-			39 01 00 00 01 00 02 53 E0];
+			39 00 00 00 00 00 02 B0 03
+			39 00 00 00 00 00 02 B7 C9
+			39 00 00 00 00 00 03 F0 A5 A5
+			39 01 00 00 10 00 02 53 E0];
 		qcom,mdss-dsi-dispparam-hbm-fod-on-command-state = "dsi_hs_mode";
 		qcom,mdss-dsi-dispparam-hbm-fod-off-command = [
-			15 01 00 00 10 00 02 53 20
 			39 00 00 00 00 00 03 F0 5A 5A
-			/* elvss dimming on */
-			39 00 00 00 00 00 02 B0 07
-			39 00 00 00 00 00 02 B7 D1
-			39 01 00 00 00 00 03 F0 A5 A5];
+			39 00 00 00 00 00 02 B0 03
+			39 00 00 00 00 00 02 B7 C9
+			39 00 00 00 00 00 03 F0 A5 A5
+			39 01 00 00 10 00 02 53 20];
 		qcom,mdss-dsi-dispparam-hbm-fod-off-command-state = "dsi_hs_mode";
 		mi,mdss-dsi-hbm-off-command = [

From 5c640ddee1b2c0fb9521ea5b10b03cd39d58770a Mon Sep 17 00:00:00 2001
From: Chris Lew
Date: Fri, 11 Sep 2020 20:42:55 -0700
Subject: [PATCH 02/30] soc: qcom: smp2p: Add proper retrigger detection

Currently, smp2p relies on the hwirq resend feature to retrigger irqs
that were missed because the irq was disabled at the time of receiving
it. The hwirq resend feature will retrigger the parent smp2p interrupt.
In order to keep track of which children need to be retriggered, the
pending bitmap was added.

After calling handle_nested_irq, smp2p checks if the interrupt is
enabled and sets the pending bit if the interrupt is not enabled. There
is a small window where a client can enable the interrupt between
calling handle_nested_irq and checking if the interrupt is enabled. If
this happens, the child handler is never called when the parent smp2p
interrupt is retriggered.

Add the irq_retrigger callback so smp2p can know which child interrupts
need to be retriggered, and set the pending bits accordingly.
Change-Id: I774b6ef91e22edbd55ddfffbbb3ae6062d48a560 Signed-off-by: Chris Lew Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- drivers/soc/qcom/smp2p.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c index 700161c3ef43..b01bea9c0296 100644 --- a/drivers/soc/qcom/smp2p.c +++ b/drivers/soc/qcom/smp2p.c @@ -289,11 +289,7 @@ static void qcom_smp2p_notify_in(struct qcom_smp2p *smp2p) (!(val & BIT(i)) && test_bit(i, entry->irq_falling))) { irq_pin = irq_find_mapping(entry->domain, i); handle_nested_irq(irq_pin); - - if (test_bit(i, entry->irq_enabled)) - clear_bit(i, entry->irq_pending); - else - set_bit(i, entry->irq_pending); + clear_bit(i, entry->irq_pending); } } } @@ -392,11 +388,23 @@ static int smp2p_set_irq_type(struct irq_data *irqd, unsigned int type) return 0; } +static int smp2p_retrigger_irq(struct irq_data *irqd) +{ + struct smp2p_entry *entry = irq_data_get_irq_chip_data(irqd); + irq_hw_number_t irq = irqd_to_hwirq(irqd); + + SMP2P_INFO("%d: %s: %lu\n", entry->smp2p->remote_pid, entry->name, irq); + set_bit(irq, entry->irq_pending); + + return 0; +} + static struct irq_chip smp2p_irq_chip = { .name = "smp2p", .irq_mask = smp2p_mask_irq, .irq_unmask = smp2p_unmask_irq, .irq_set_type = smp2p_set_irq_type, + .irq_retrigger = smp2p_retrigger_irq, }; static int smp2p_irq_map(struct irq_domain *d, From 86a167c482838bb0f9aae9a1f45ba6a454b9c886 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 15 Oct 2019 10:17:39 -0600 Subject: [PATCH 03/30] soc: qcom: smp2p: Don't check for NULL before ipc_log_string() The ipc_log_string() function quietly checks for a NULL context so an explicit check is not needed. Change-Id: Ic0dedbada086a4daf011875cf3996c5749317307 Signed-off-by: Jordan Crouse Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- drivers/soc/qcom/smp2p.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c index b01bea9c0296..e256f5860120 100644 --- a/drivers/soc/qcom/smp2p.c +++ b/drivers/soc/qcom/smp2p.c @@ -173,10 +173,7 @@ struct qcom_smp2p { static void *ilc; #define SMP2P_LOG_PAGE_CNT 2 #define SMP2P_INFO(x, ...) \ -do { \ - if (ilc) \ - ipc_log_string(ilc, "[%s]: "x, __func__, ##__VA_ARGS__); \ -} while (0) + ipc_log_string(ilc, "[%s]: "x, __func__, ##__VA_ARGS__) static void qcom_smp2p_kick(struct qcom_smp2p *smp2p) { From 92b28408e5d235314423db5ad6db685dc2e85f00 Mon Sep 17 00:00:00 2001 From: Chris Lew Date: Fri, 6 Nov 2020 14:37:02 -0800 Subject: [PATCH 04/30] soc: qcom: smp2p: Add memory barrier for irq_pending There is a very tight race where the irq_retrigger function is run on one cpu and the actual retrigger softirq is running on a second cpu. When this happens, there may be a chance that the second cpu will not see the updated irq_pending value from first cpu. Add a memory barrier to ensure that irq_pending is read correctly. 
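As a rough sketch of the pairing this relies on (a plain C11 userspace model, not the smp2p driver code; pending and last_value merely stand in for the entry fields), the retrigger side publishes the pending bit before a full fence, and the notify side issues a matching fence before folding the bits into its status word:

#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong pending;    /* stand-in for entry->irq_pending */
static atomic_ulong last_value; /* stand-in for entry->last_value  */

/* Retrigger path: record the hwirq, then make the bit globally visible. */
static void retrigger(unsigned int bit)
{
	atomic_fetch_or_explicit(&pending, 1UL << bit, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst); /* pairs with notify_in() */
}

/* Notify path: fence first, then fold the pending bits into the status. */
static unsigned long notify_in(void)
{
	unsigned long status = atomic_load_explicit(&last_value, memory_order_relaxed);

	atomic_thread_fence(memory_order_seq_cst); /* pairs with retrigger() */
	status |= atomic_exchange_explicit(&pending, 0, memory_order_relaxed);
	return status;
}

int main(void)
{
	retrigger(3);
	printf("status: %#lx\n", notify_in());
	return 0;
}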
Change-Id: I3dd185decc4f050bd57c0b6558f417ead2a3aa5a
Signed-off-by: Chris Lew
Signed-off-by: Panchajanya1999
Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx>
---
 drivers/soc/qcom/smp2p.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c
index e256f5860120..ed71f00b85d3 100644
--- a/drivers/soc/qcom/smp2p.c
+++ b/drivers/soc/qcom/smp2p.c
@@ -272,6 +272,9 @@ static void qcom_smp2p_notify_in(struct qcom_smp2p *smp2p)
 		status = val ^ entry->last_value;
 		entry->last_value = val;
+
+		/* Ensure irq_pending is read correctly */
+		mb();
 		status |= *entry->irq_pending;
 
 		/* No changes of this entry? */
@@ -393,6 +396,11 @@ static int smp2p_retrigger_irq(struct irq_data *irqd)
 	SMP2P_INFO("%d: %s: %lu\n", entry->smp2p->remote_pid, entry->name, irq);
 	set_bit(irq, entry->irq_pending);
 
+	/* Ensure irq_pending is visible to all cpus that retried interrupt
+	 * can run on
+	 */
+	mb();
+
 	return 0;
 }

From 8c12a13b97b937bc2601c70ede2ec8834676998c Mon Sep 17 00:00:00 2001
From: Panchajanya1999
Date: Wed, 24 Nov 2021 14:21:46 +0530
Subject: [PATCH 05/30] binder: Return EFAULT on failing copy_from_user()

A failing copy_from_user()/copy_to_user() should always return -EFAULT,
since user memory is segmented (a pointer by itself doesn't reference a
unique location in memory, only a location in a memory segment), so the
failure indicates a bad user address rather than an invalid argument.

Change-Id: I014bcd9e6030d8b8b483ee828d65d861e0ea6b99
Signed-off-by: Panchajanya1999
---
 drivers/android/binder.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 2f16437bdc27..b8d59b8c1eae 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -5034,7 +5034,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (copy_from_user(&max_threads, ubuf, sizeof(max_threads))) {
-			ret = -EINVAL;
+			ret = -EFAULT;
 			goto err;
 		}
 		binder_inner_proc_lock(proc);
@@ -5046,7 +5046,7 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		struct flat_binder_object fbo;
 
 		if (copy_from_user(&fbo, ubuf, sizeof(fbo))) {
-			ret = -EINVAL;
+			ret = -EFAULT;
 			goto err;
 		}
 		ret = binder_ioctl_set_ctx_mgr(filp, &fbo);

From 750400b17699b4848df13b503f69940614f76a89 Mon Sep 17 00:00:00 2001
From: Greg Thelen
Date: Wed, 10 Nov 2021 18:18:14 -0800
Subject: [PATCH 06/30] BACKPORT: perf/core: Avoid put_page() when GUP fails

PEBS PERF_SAMPLE_PHYS_ADDR events use perf_virt_to_phys() to convert
PMU sampled virtual addresses to physical using get_user_page_fast_only()
and page_to_phys().

Some get_user_page_fast_only() error cases return false, indicating no
page reference, but still initialize the output page pointer with an
unreferenced page. In these error cases perf_virt_to_phys() calls
put_page(). This causes page reference count underflow, which can lead
to unintentional page sharing.

Fix perf_virt_to_phys() to only put_page() if get_user_page_fast_only()
returns a referenced page.
Fixes: fc7ce9c ("perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR") Signed-off-by: Greg Thelen Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211111021814.757086-1-gthelen@google.com [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- kernel/events/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index ccbba551a505..7352731ac236 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6588,7 +6588,6 @@ void perf_output_sample(struct perf_output_handle *handle, static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; - struct page *p = NULL; if (!virt) return 0; @@ -6607,14 +6606,14 @@ static u64 perf_virt_to_phys(u64 virt) * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { + struct page *p; pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (__get_user_pages_fast(virt, 1, 0, &p) == 1) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + put_page(p); + } pagefault_enable(); } - - if (p) - put_page(p); } return phys_addr; From 931a50f75bd9a051c7f76219bff38e68a27321ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:30 -0800 Subject: [PATCH 07/30] BACKPORT: tcp: minor optimization in tcp_add_backlog() If packet is going to be coalesced, sk_sndbuf/sk_rcvbuf values are not used. Defer their access to the point we need them. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f9795e791c97..58511678725a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1627,13 +1627,13 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { - u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; + u32 limit; /* Only socket owner can try to collapse/prune rx queues * to reduce memory overhead, so add a little headroom here. * Few sockets backlog are possibly concurrently non empty. */ - limit += 64*1024; + limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. From ff81b96379146bf501de4b5067d5bfb416b5fc63 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:31 -0800 Subject: [PATCH 08/30] BACKPORT: tcp: remove dead code in __tcp_v6_send_check() For some reason, I forgot to change __tcp_v6_send_check() at the same time I removed (ip_summed == CHECKSUM_PARTIAL) check in __tcp_v4_send_check() Fixes: 0c63791fd180 ("tcp: remove dead code after CHECKSUM_PARTIAL adoption") Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- include/net/ip6_checksum.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h index cca840584c88..6c92dc23545e 100644 --- a/include/net/ip6_checksum.h +++ b/include/net/ip6_checksum.h @@ -69,15 +69,9 @@ static inline void __tcp_v6_send_check(struct sk_buff *skb, { struct tcphdr *th = tcp_hdr(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - } else { - th->check = tcp_v6_check(skb->len, saddr, daddr, - csum_partial(th, th->doff << 2, - skb->csum)); - } + th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); } #if IS_ENABLED(CONFIG_IPV6) From 8913dd7e9b4360b61ccba104d64b8aefeafea89d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:40 -0800 Subject: [PATCH 09/30] BACKPORT: tcp: small optimization in tcp recvmsg() When reading large chunks of data, incoming packets might be added to the backlog from BH. tcp recvmsg() detects the backlog queue is not empty, and uses a release_sock()/lock_sock() pair to process this backlog. We now have __sk_flush_backlog() to perform this a bit faster. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- net/ipv4/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5fca70def1c5..95dede1f1c18 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2093,8 +2093,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (copied >= target) { /* Do not sleep, just process backlog. */ - release_sock(sk); - lock_sock(sk); + __sk_flush_backlog(sk); } else { sk_wait_data(sk, &timeo, last); } From 6e70ed36cc48d90711aac9c962ef3633e8ce360b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:42 -0800 Subject: [PATCH 10/30] BACKPORT: tcp: annotate data-races on tp->segs_in and tp->data_segs_in tcp_segs_in() can be called from BH, while socket spinlock is held but socket owned by user, eventually reading these fields from tcp_get_info() Found by code inspection, no need to backport this patch to older kernels. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- include/net/tcp.h | 8 ++++++-- net/ipv4/tcp.c | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 7f21e900b613..36aebe546505 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2060,9 +2060,13 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - tp->segs_in += segs_in; + + /* We update these fields while other threads might + * read them from tcp_get_info() + */ + WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in); if (skb->len > tcp_hdrlen(skb)) - tp->data_segs_in += segs_in; + WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in); } /* diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 95dede1f1c18..e405fb2b2dc7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3237,10 +3237,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) tcp_get_info_chrono_stats(tp, info); info->tcpi_segs_out = tp->segs_out; - info->tcpi_segs_in = tp->segs_in; + + /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */ + info->tcpi_segs_in = READ_ONCE(tp->segs_in); + info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in); info->tcpi_min_rtt = tcp_min_rtt(tp); - info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; From dec70545abe2ee5cc0d1a61059db229d8c5a2346 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:43 -0800 Subject: [PATCH 11/30] BACKPORT: tcp: annotate races around tp->urg_data tcp_poll() and tcp_ioctl() are reading tp->urg_data without socket lock owned. Also, it is faster to first check tp->urg_data in tcp_poll(), then tp->urg_seq == tp->copied_seq, because tp->urg_seq is located in a different/cold cache line. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- net/ipv4/tcp.c | 15 ++++++++------- net/ipv4/tcp_input.c | 6 +++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e405fb2b2dc7..b3d8fc7a23f5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -571,10 +571,11 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (state != TCP_SYN_SENT && (state != TCP_SYN_RECV || tp->fastopen_rsk)) { int target = sock_rcvlowat(sk, 0, INT_MAX); + u16 urg_data = READ_ONCE(tp->urg_data); - if (tp->urg_seq == tp->copied_seq && - !sock_flag(sk, SOCK_URGINLINE) && - tp->urg_data) + if (urg_data && + READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && + !sock_flag(sk, SOCK_URGINLINE)) target++; if (tcp_stream_is_readable(tp, target, sk)) @@ -600,7 +601,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) mask |= EPOLLOUT | EPOLLWRNORM; if (tp->urg_data & TCP_URG_VALID) - mask |= EPOLLPRI; + mask |= POLLPRI; } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { /* Active TCP fastopen socket with defer_connect * Return EPOLLOUT so application can call write() @@ -633,7 +634,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) unlock_sock_fast(sk, slow); break; case SIOCATMARK: - answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + answ = READ_ONCE(tp->urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) @@ -1493,7 +1494,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) char c = tp->urg_data; if (!(flags & MSG_PEEK)) - tp->urg_data = TCP_URG_READ; + WRITE_ONCE(tp->urg_data, TCP_URG_READ); /* Read urgent data. */ msg->msg_flags |= MSG_OOB; @@ -2149,7 +2150,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, skip_copy: if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { - tp->urg_data = 0; + WRITE_ONCE(tp->urg_data, 0); tcp_fast_path_check(sk); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 166ff0e48c61..959e982e4f2a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5324,8 +5324,8 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) } } - tp->urg_data = TCP_URG_NOTYET; - tp->urg_seq = ptr; + WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET); + WRITE_ONCE(tp->urg_seq, ptr); /* Disable header prediction. */ tp->pred_flags = 0; @@ -5350,7 +5350,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t u8 tmp; if (skb_copy_bits(skb, ptr, &tmp, 1)) BUG(); - tp->urg_data = TCP_URG_VALID | tmp; + WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); } From 57785ef724249f7a9bb05ddf765e21f18410aacc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:44 -0800 Subject: [PATCH 12/30] BACKPORT: tcp: tp->urg_data is unlikely to be set Use some unlikely() hints in the fast path. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller
[cyberknight777: backport to 4.14]
Signed-off-by: Cyber Knight
Signed-off-by: Panchajanya1999
Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx>
---
 net/ipv4/tcp.c       | 10 +++++-----
 net/ipv4/tcp_input.c |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b3d8fc7a23f5..6d83a7ca5a2a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -573,7 +573,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		int target = sock_rcvlowat(sk, 0, INT_MAX);
 		u16 urg_data = READ_ONCE(tp->urg_data);
 
-		if (urg_data &&
+		if (unlikely(urg_data) &&
 		    READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
 		    !sock_flag(sk, SOCK_URGINLINE))
 			target++;
@@ -1666,7 +1666,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			len = skb->len - offset;
 			/* Stop reading if we hit a patch of urgent data */
-			if (tp->urg_data) {
+			if (unlikely(tp->urg_data)) {
 				u32 urg_offset = tp->urg_seq - seq;
 				if (urg_offset < len)
 					len = urg_offset;
@@ -2010,7 +2010,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		u32 offset;
 
 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
-		if (tp->urg_data && tp->urg_seq == *seq) {
+		if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
 			if (copied)
 				break;
 			if (signal_pending(current)) {
@@ -2115,7 +2115,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		used = len;
 
 		/* Do we have urgent data here? */
-		if (tp->urg_data) {
+		if (unlikely(tp->urg_data)) {
 			u32 urg_offset = tp->urg_seq - *seq;
 			if (urg_offset < used) {
 				if (!urg_offset) {
@@ -2149,7 +2149,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		tcp_rcv_space_adjust(sk);
 
 skip_copy:
-		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+		if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
 			WRITE_ONCE(tp->urg_data, 0);
 			tcp_fast_path_check(sk);
 		}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 959e982e4f2a..f5c531cb1b4d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5337,11 +5337,11 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* Check if we get a new urgent pointer - normally not. */
-	if (th->urg)
+	if (unlikely(th->urg))
 		tcp_check_urg(sk, th);
 
 	/* Do we wait for any urgent data? - normally not... */
-	if (tp->urg_data == TCP_URG_NOTYET) {
+	if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
 		u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
 			  th->syn;

From 25e936b52afbf3d6f890860080b901d378abc137 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Mon, 15 Nov 2021 11:02:45 -0800
Subject: [PATCH 13/30] BACKPORT: tcp: avoid indirect calls to sock_rfree

TCP uses sk_eat_skb() when skbs can be removed from the receive queue.
However, the call to skb_orphan() from __kfree_skb() incurs an indirect
call to sock_rfree(), which is more expensive than a direct call,
especially for CONFIG_RETPOLINE=y.

Add a tcp_eat_recv_skb() function to make the call before __kfree_skb().

Signed-off-by: Eric Dumazet
Signed-off-by: David S.
Miller [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- net/ipv4/tcp.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6d83a7ca5a2a..1062f4c5d657 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1613,6 +1613,16 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } +static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) +{ + if (likely(skb->destructor == sock_rfree)) { + sock_rfree(skb); + skb->destructor = NULL; + skb->sk = NULL; + } + sk_eat_skb(sk, skb); +} + static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1632,7 +1642,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); } return NULL; } @@ -1698,11 +1708,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); if (!desc->count) break; tp->copied_seq = seq; @@ -2166,14 +2176,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + tcp_eat_recv_skb(sk, skb); break; } while (len > 0); From df21983f504967b2c3f85d7a79c3f38206548fe2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:46 -0800 Subject: [PATCH 14/30] BACKPORT: tcp: defer skb freeing after socket lock is released tcp recvmsg() (or rx zerocopy) spends a fair amount of time freeing skbs after their payload has been consumed. A typical ~64KB GRO packet has to release ~45 page references, eventually going to page allocator for each of them. Currently, this freeing is performed while socket lock is held, meaning that there is a high chance that BH handler has to queue incoming packets to tcp socket backlog. This can cause additional latencies, because the user thread has to process the backlog at release_sock() time, and while doing so, additional frames can be added by BH handler. This patch adds logic to defer these frees after socket lock is released, or directly from BH handler if possible. Being able to free these skbs from BH handler helps a lot, because this avoids the usual alloc/free assymetry, when BH handler and user thread do not run on same cpu or NUMA node. One cpu can now be fully utilized for the kernel->user copy, and another cpu is handling BH processing and skb/page allocs/frees (assuming RFS is not forcing use of a single CPU) Tested: 100Gbit NIC Max throughput for one TCP_STREAM flow, over 10 runs MTU : 1500 Before: 55 Gbit After: 66 Gbit MTU : 4096+(headers) Before: 82 Gbit After: 95 Gbit Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- include/linux/skbuff.h | 2 ++ include/net/sock.h | 2 ++ include/net/tcp.h | 10 ++++++++++ net/ipv4/tcp.c | 26 +++++++++++++++++++++++++- net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 6 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 184ffd40cb82..d46e3e43c877 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -39,6 +39,7 @@ #include #include #include +#include #include /* The interface for checksum offload between the stack and networking drivers @@ -679,6 +680,7 @@ struct sk_buff { }; struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */ struct list_head list; + struct llist_node ll_node; }; union { diff --git a/include/net/sock.h b/include/net/sock.h index 7100472cec0f..997ee99a367f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -67,6 +67,7 @@ #include #include +#include #include #include #include @@ -384,6 +385,7 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; + struct llist_head defer_list; #define sk_rmem_alloc sk_backlog.rmem_alloc int sk_forward_alloc; diff --git a/include/net/tcp.h b/include/net/tcp.h index 36aebe546505..5ed37b563f15 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1322,6 +1322,16 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); + +void __sk_defer_free_flush(struct sock *sk); + +static inline void sk_defer_free_flush(struct sock *sk) +{ + if (llist_empty(&sk->defer_list)) + return; + __sk_defer_free_flush(sk); +} + int tcp_filter(struct sock *sk, struct sk_buff *skb); #undef STATE_TRACE diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1062f4c5d657..c0fd3ae353fa 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1613,14 +1613,34 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } +void __sk_defer_free_flush(struct sock *sk) +{ + struct llist_node *head; + struct sk_buff *skb, *n; + + head = llist_del_all(&sk->defer_list); + llist_for_each_entry_safe(skb, n, head, ll_node) { + prefetch(n); + skb_mark_not_on_list(skb); + __kfree_skb(skb); + } +} +EXPORT_SYMBOL(__sk_defer_free_flush); + static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { + __skb_unlink(skb, &sk->sk_receive_queue); if (likely(skb->destructor == sock_rfree)) { sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; + if (!skb_queue_empty(&sk->sk_receive_queue) || + !llist_empty(&sk->defer_list)) { + llist_add(&skb->ll_node, &sk->defer_list); + return; + } } - sk_eat_skb(sk, skb); + __kfree_skb(skb); } static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) @@ -1981,6 +2001,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, sk_busy_loop(sk, nonblock); lock_sock(sk); + sk_defer_free_flush(sk); err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) @@ -2106,6 +2127,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, /* Do not sleep, just process backlog. 
*/ __sk_flush_backlog(sk); } else { + sk_defer_free_flush(sk); sk_wait_data(sk, &timeo, last); } @@ -2664,6 +2686,7 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_frag.offset = 0; } + sk_defer_free_flush(sk); sk->sk_error_report(sk); return 0; } @@ -2783,6 +2806,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); + sk_defer_free_flush(sk); err = tcp_set_congestion_control(sk, name, true, true, ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 58511678725a..dce3a5d26965 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1822,6 +1822,7 @@ int tcp_v4_rcv(struct sk_buff *skb) sk_incoming_cpu_update(sk); + sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d56f2373753a..3cdadb17e579 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1548,6 +1548,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) sk_incoming_cpu_update(sk); + sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; From 0b1cd3cc57b3d3656d86999950246f4f317a281f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:47 -0800 Subject: [PATCH 15/30] BACKPORT: tcp: check local var (timeo) before socket fields in one test Testing timeo before sk_err/sk_state/sk_shutdown makes more sense. Modern applications use non-blocking IO, while a socket is terminated only once during its life time. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c0fd3ae353fa..10f37c8a5878 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2084,10 +2084,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; if (copied) { - if (sk->sk_err || + if (!timeo || + sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || - !timeo || signal_pending(current)) break; } else { From 9e61ae3933fb92b96d3d1b8c9e275597f30fac56 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 11:02:48 -0800 Subject: [PATCH 16/30] BACKPORT: tcp: do not call tcp_cleanup_rbuf() if we have a backlog Under pressure, tcp recvmsg() has logic to process the socket backlog, but calls tcp_cleanup_rbuf() right before. Avoiding sending ACK right before processing new segments makes a lot of sense, as this decrease the number of ACK packets, with no impact on effective ACK clocking. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller [cyberknight777: backport to 4.14] Signed-off-by: Cyber Knight Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- net/ipv4/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 10f37c8a5878..55f1edf32608 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2121,12 +2121,11 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, } } - tcp_cleanup_rbuf(sk, copied); - if (copied >= target) { /* Do not sleep, just process backlog. 
 		 */
 			__sk_flush_backlog(sk);
 		} else {
+			tcp_cleanup_rbuf(sk, copied);
 			sk_defer_free_flush(sk);
 			sk_wait_data(sk, &timeo, last);
 		}

From 66f7aec81d929e7bde9343eac2f20e74e88e750d Mon Sep 17 00:00:00 2001
From: Qais Yousef
Date: Wed, 17 Nov 2021 13:38:35 +0000
Subject: [PATCH 17/30] BACKPORT: sched/uclamp: Fix wrong implementation of
 cpu.uclamp.min

[ Upstream commit 0c18f2ecfcc274a4bcc1d122f79ebd4001c3b445 ]

cpu.uclamp.min is a protection as described in the cgroup-v2 Resource
Distribution Model, Documentation/admin-guide/cgroup-v2.rst, which means
we try our best to preserve the minimum performance point of tasks in
this group. See the full description of cpu.uclamp.min in cgroup-v2.rst.

But the current implementation makes it a limit, which is not what was
intended.

For example:

	tg->cpu.uclamp.min = 20%

	p0->uclamp[UCLAMP_MIN] = 0
	p1->uclamp[UCLAMP_MIN] = 50%

	Previous Behavior (limit):

	p0->effective_uclamp = 0
	p1->effective_uclamp = 20%

	New Behavior (Protection):

	p0->effective_uclamp = 20%
	p1->effective_uclamp = 50%

This is in line with how protections should work.

With this change the cgroup and per-task behaviors are the same, as
expected.

Additionally, we remove the confusing relationship between cgroup and
the !user_defined flag. We don't want, for example, RT tasks that are
boosted by default to max to change their boost value when they attach
to a cgroup. If a cgroup wants to limit the max performance point of
tasks attached to it, then cpu.uclamp.max must be set accordingly. Or
if they want to set a different boost value based on cgroup, then
sysctl_sched_util_clamp_min_rt_default must be used to NOT boost to max
and the right cpu.uclamp.min must be set for each group to let the RT
tasks obtain the desired boost value when attached to that group.

As it stands the dependency on the !user_defined flag adds an extra
layer of complexity that is not required now that cpu.uclamp.min
behaves properly as a protection.

The propagation model of effective cpu.uclamp.min in child cgroups as
implemented by cpu_util_update_eff() is still correct. The parent
protection sets an upper limit of what the child cgroups will
effectively get.
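Read as code, the intended semantics can be sketched as follows (a standalone illustration using the percentages from the example above; effective_min()/effective_max() are illustrative helpers, not the kernel's uclamp_tg_restrict()):

#include <stdio.h>

/* cpu.uclamp.min is a protection: the group value acts as a floor. */
static unsigned int effective_min(unsigned int task_min, unsigned int tg_min)
{
	return task_min < tg_min ? tg_min : task_min;
}

/* cpu.uclamp.max is a limit: the group value acts as a ceiling. */
static unsigned int effective_max(unsigned int task_max, unsigned int tg_max)
{
	return task_max > tg_max ? tg_max : task_max;
}

int main(void)
{
	/* tg->cpu.uclamp.min = 20%, p0 requests 0%, p1 requests 50% */
	printf("p0: %u%%\n", effective_min(0, 20));  /* 20% - protected  */
	printf("p1: %u%%\n", effective_min(50, 20)); /* 50% - unaffected */
	return 0;
}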
Change-Id: I2c47d2ed1bcf8969d2bace3d7136f3a28851241a Fixes: 3eac870 (sched/uclamp: Use TG's clamps to restrict TASK's clamps) Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210510145032.1934078-2-qais.yousef@arm.com Signed-off-by: Sasha Levin Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- kernel/sched/core.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 094748d878e1..82d467104a6d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -836,7 +836,6 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP - struct uclamp_se uc_max; /* * Tasks in autogroups or root task group will be @@ -847,9 +846,23 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) if (task_group(p) == &root_task_group) return uc_req; - uc_max = task_group(p)->uclamp[clamp_id]; - if (uc_req.value > uc_max.value || !uc_req.user_defined) - return uc_max; + switch (clamp_id) { + case UCLAMP_MIN: { + struct uclamp_se uc_min = task_group(p)->uclamp[clamp_id]; + if (uc_req.value < uc_min.value) + return uc_min; + break; + } + case UCLAMP_MAX: { + struct uclamp_se uc_max = task_group(p)->uclamp[clamp_id]; + if (uc_req.value > uc_max.value) + return uc_max; + break; + } + default: + WARN_ON_ONCE(1); + break; + } #endif return uc_req; From 223c0cf11d106e7f23c678b294e2a1dcbf4afe69 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 25 Jun 2019 10:39:13 -0400 Subject: [PATCH 18/30] BACKPORT: locking/rwsem: Make handoff writer optimistically spin on owner When the handoff bit is set by a writer, no other tasks other than the setting writer itself is allowed to acquire the lock. If the to-be-handoff'ed writer goes to sleep, there will be a wakeup latency period where the lock is free, but no one can acquire it. That is less than ideal. To reduce that latency, the handoff writer will now optimistically spin on the owner if it happens to be a on-cpu writer. It will spin until it releases the lock and the to-be-handoff'ed writer can then acquire the lock immediately without any delay. Of course, if the owner is not a on-cpu writer, the to-be-handoff'ed writer will have to sleep anyway. The optimistic spinning code is also modified to not stop spinning when the handoff bit is set. This will prevent an occasional setting of handoff bit from causing a bunch of optimistic spinners from entering into the wait queue causing significant reduction in throughput. On a 1-socket 22-core 44-thread Skylake system, the AIM7 shared_memory workload was run with 7000 users. The throughput (jobs/min) of the following kernels were as follows: 1) 5.2-rc6 - 8,092,486 2) 5.2-rc6 + tip's rwsem patches - 7,567,568 3) 5.2-rc6 + tip's rwsem patches + this patch - 7,954,545 Using perf-record(1), the %cpu time used by rwsem_down_write_slowpath(), rwsem_down_write_failed() and their callees for the 3 kernels were 1.70%, 5.46% and 2.08% respectively. Change-Id: Iad8e7ec4be151ad4dbe8ad1c1054bbd94b0504ea Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: x86@kernel.org Cc: Ingo Molnar Cc: Will Deacon Cc: huang ying Cc: Tim Chen Cc: Linus Torvalds Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Davidlohr Bueso Cc: "H. 
Peter Anvin" Link: https://lkml.kernel.org/r/20190625143913.24154-1-longman@redhat.com Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- kernel/locking/rwsem.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index fe27d123e376..c85d23233aa9 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -799,11 +799,12 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) rcu_read_lock(); for (;;) { - if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { - state = OWNER_NONSPINNABLE; - break; - } - + /* + * When a waiting writer set the handoff flag, it may spin + * on the owner as well. Once that writer acquires the lock, + * we can spin on it. So we don't need to quit even when the + * handoff bit is set. + */ new = rwsem_owner_flags(sem, &new_flags); if ((new != owner) || (new_flags != flags)) { state = rwsem_owner_state(new, new_flags, nonspinnable); @@ -1046,6 +1047,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, { return false; } + +static inline int +rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +{ + return 0; +} +#define OWNER_NULL 1 #endif /* @@ -1280,6 +1288,18 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) raw_spin_unlock_irq(&sem->wait_lock); + /* + * After setting the handoff bit and failing to acquire + * the lock, attempt to spin on owner to accelerate lock + * transfer. If the previous owner is a on-cpu writer and it + * has just released the lock, OWNER_NULL will be returned. + * In this case, we attempt to acquire the lock again + * without sleeping. + */ + if ((wstate == WRITER_HANDOFF) && + (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) + goto trylock_again; + /* Block until there are no active lockers. */ for (;;) { if (signal_pending_state(state, current)) @@ -1312,7 +1332,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) break; } } - +trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); From b42d0ae67a55a1929c3c2bfdb3d42c5486290440 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 15 Jan 2020 10:43:36 -0500 Subject: [PATCH 19/30] BACKPORT: locking/rwsem: Fix kernel crash when spinning on RWSEM_OWNER_UNKNOWN commit 39e7234f00bc93613c086ae42d852d5f4147120a upstream. The commit 91d2a812dfb9 ("locking/rwsem: Make handoff writer optimistically spin on owner") will allow a recently woken up waiting writer to spin on the owner. Unfortunately, if the owner happens to be RWSEM_OWNER_UNKNOWN, the code will incorrectly spin on it leading to a kernel crash. This is fixed by passing the proper non-spinnable bits to rwsem_spin_on_owner() so that RWSEM_OWNER_UNKNOWN will be treated as a non-spinnable target. 
Fixes: 91d2a812dfb9 ("locking/rwsem: Make handoff writer optimistically spin on owner") Reported-by: Christoph Hellwig Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Tested-by: Christoph Hellwig Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200115154336.8679-1-longman@redhat.com Signed-off-by: Greg Kroah-Hartman Change-Id: I07590a1a41f941873e03241dccbb75a8b4df079b Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- kernel/locking/rwsem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c85d23233aa9..3d34db174e1c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1296,8 +1296,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) * In this case, we attempt to acquire the lock again * without sleeping. */ - if ((wstate == WRITER_HANDOFF) && - (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) + if (wstate == WRITER_HANDOFF && + rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL) goto trylock_again; /* Block until there are no active lockers. */ From b84ad5a24c05abd84533eb30f0c4e5927f9af945 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:10:32 -0600 Subject: [PATCH 20/30] BACKPORT: rwsem: Implement down_read_killable_nested In preparation for converting exec_update_mutex to a rwsem so that multiple readers can execute in parallel and not deadlock, add down_read_killable_nested. This is needed so that kcmp_lock can be converted from working on a mutexes to working on rw_semaphores. Change-Id: I07772b8cdbf121d5d09e22d2a5fb40566abc3136 Signed-off-by: Eric W. Biederman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87o8jabqh3.fsf@x220.int.ebiederm.org [panchajanya1999: adapted to k4.14] Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- include/linux/rwsem.h | 2 ++ kernel/locking/rwsem.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index df7d6eda00b6..8098cea2e63f 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -173,6 +173,7 @@ extern void downgrade_write(struct rw_semaphore *sem); * See Documentation/locking/lockdep-design.txt for more details.) 
 */
 extern void down_read_nested(struct rw_semaphore *sem, int subclass);
+extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass);
 extern void down_write_nested(struct rw_semaphore *sem, int subclass);
 extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
 extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);
@@ -193,6 +194,7 @@ extern void down_read_non_owner(struct rw_semaphore *sem);
 extern void up_read_non_owner(struct rw_semaphore *sem);
 #else
 # define down_read_nested(sem, subclass) down_read(sem)
+# define down_read_killable_nested(sem, subclass) down_read_killable(sem)
 # define down_write_nest_lock(sem, nest_lock) down_write(sem)
 # define down_write_nested(sem, subclass) down_write(sem)
 # define down_write_killable_nested(sem, subclass) down_write_killable(sem)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 3d34db174e1c..1fcc88556ffb 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1664,6 +1664,20 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
 }
 EXPORT_SYMBOL(down_read_nested);
 
+int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+	might_sleep();
+	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+		return -EINTR;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(down_read_killable_nested);
+
 void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
 {
 	might_sleep();

From 19af7ab47764f2dbb7e29330a00d99da63109ea7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Thu, 3 Dec 2020 14:11:13 -0600
Subject: [PATCH 21/30] BACKPORT: rwsem: Implement down_read_interruptible

In preparation for converting exec_update_mutex to a rwsem so that
multiple readers can execute in parallel and not deadlock, add
down_read_interruptible. This is needed for perf_event_open to be
converted (with no semantic changes) from working on a mutex to working
on a rwsem.

Change-Id: Ie81a42abc864a1af208e7e4f1a9e16d6013c3213
Signed-off-by: Eric W.
Biederman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87k0tybqfy.fsf@x220.int.ebiederm.org [panchajanya1999: adapt it to k4.14] Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- include/linux/rwsem.h | 1 + kernel/locking/rwsem.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8098cea2e63f..ca4dfb8fd640 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -125,6 +125,7 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) * lock for reading */ extern void down_read(struct rw_semaphore *sem); +extern int __must_check down_read_interruptible(struct rw_semaphore *sem); extern int __must_check down_read_killable(struct rw_semaphore *sem); /* diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 1fcc88556ffb..52da1ec5bde1 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1414,6 +1414,18 @@ inline void __down_read(struct rw_semaphore *sem) } } +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + if (!rwsem_read_trylock(sem)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_INTERRUPTIBLE))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + static inline int __down_read_killable(struct rw_semaphore *sem) { if (!rwsem_read_trylock(sem)) { @@ -1554,6 +1566,20 @@ void __sched down_read(struct rw_semaphore *sem) } EXPORT_SYMBOL(down_read); +int __sched down_read_interruptible(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_interruptible); + int __sched down_read_killable(struct rw_semaphore *sem) { might_sleep(); From 09e15b5729fe8c7ea389187babecdc72c496868b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Dec 2020 10:22:16 +0100 Subject: [PATCH 22/30] BACKPORT: locking/rwsem: Better collate rwsem_read_trylock() All users of rwsem_read_trylock() do rwsem_set_reader_owned(sem) on success, move it into rwsem_read_trylock() proper. 
Change-Id: I0d15f46024d9a535c150c65cfffdc27fc0aff87c Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201207090243.GE3040@hirez.programming.kicks-ass.net Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- kernel/locking/rwsem.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 52da1ec5bde1..eaf1aee38a68 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -367,9 +367,16 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) static inline bool rwsem_read_trylock(struct rw_semaphore *sem) { long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + if (WARN_ON_ONCE(cnt < 0)) rwsem_set_nonspinnable(sem); - return !(cnt & RWSEM_READ_FAILED_MASK); + + if (!(cnt & RWSEM_READ_FAILED_MASK)) { + rwsem_set_reader_owned(sem); + return true; + } + + return false; } /* @@ -1409,8 +1416,6 @@ inline void __down_read(struct rw_semaphore *sem) if (!rwsem_read_trylock(sem)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); } } @@ -1420,8 +1425,6 @@ static inline int __down_read_interruptible(struct rw_semaphore *sem) if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_INTERRUPTIBLE))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); } return 0; } @@ -1432,8 +1435,6 @@ static inline int __down_read_killable(struct rw_semaphore *sem) if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); } return 0; } From 1ab2ebd60c04ba74555372defd83d6e428a9372a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Dec 2020 10:25:06 +0100 Subject: [PATCH 23/30] BACKPORT: locking/rwsem: Introduce rwsem_write_trylock() One copy of this logic is better than three. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201207090243.GE3040@hirez.programming.kicks-ass.net Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- kernel/locking/rwsem.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index eaf1aee38a68..f8e36a63d191 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -379,6 +379,18 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem) return false; } +static inline bool rwsem_write_trylock(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + return true; + } + + return false; +} + /* * Return just the real task structure pointer of the owner */ @@ -1461,39 +1473,23 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; - - if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED))) + if (unlikely(!rwsem_write_trylock(sem))) rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); - else - rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; - - if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED))) { + if (unlikely(!rwsem_write_trylock(sem))) { if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) return -EINTR; - } else { - rwsem_set_owner(sem); } + return 0; } static inline int __down_write_trylock(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; - - if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED)) { - rwsem_set_owner(sem); - return true; - } - return false; + return rwsem_write_trylock(sem); } /* From 223bcd67efcc1258c0101a78130a9306cda300c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Dec 2020 10:27:41 +0100 Subject: [PATCH 24/30] BACKPORT: locking/rwsem: Fold __down_{read,write}*() There's a lot needless duplication in __down_{read,write}*(), cure that with a helper. 
Change-Id: I9c3e573b89aa5922590ea90bc650678d948a81a7 Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201207090243.GE3040@hirez.programming.kicks-ass.net [panchajanya1999: Adapt to k4.14] Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- kernel/locking/rwsem.c | 45 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f8e36a63d191..473df84e3ce7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1423,32 +1423,29 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -inline void __down_read(struct rw_semaphore *sem) +static inline int __down_read_common(struct rw_semaphore *sem, int state) { if (!rwsem_read_trylock(sem)) { - rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); + if (IS_ERR(rwsem_down_read_slowpath(sem, state))) + return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } + return 0; +} + +inline void __down_read(struct rw_semaphore *sem) +{ + __down_read_common(sem, TASK_UNINTERRUPTIBLE); } static inline int __down_read_interruptible(struct rw_semaphore *sem) { - if (!rwsem_read_trylock(sem)) { - if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_INTERRUPTIBLE))) - return -EINTR; - DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } - return 0; + return __down_read_common(sem, TASK_INTERRUPTIBLE); } static inline int __down_read_killable(struct rw_semaphore *sem) { - if (!rwsem_read_trylock(sem)) { - if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) - return -EINTR; - DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } - return 0; + return __down_read_common(sem, TASK_KILLABLE); } static inline int __down_read_trylock(struct rw_semaphore *sem) @@ -1471,22 +1468,26 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) -{ - if (unlikely(!rwsem_write_trylock(sem))) - rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) +static inline int __down_write_common(struct rw_semaphore *sem, int state) { if (unlikely(!rwsem_write_trylock(sem))) { - if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) + if (IS_ERR(rwsem_down_write_slowpath(sem, state))) return -EINTR; } return 0; } +static inline void __down_write(struct rw_semaphore *sem) +{ + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); +} + static inline int __down_write_trylock(struct rw_semaphore *sem) { return rwsem_write_trylock(sem); From 4beba6ed1eee4148428fd97ba692622a1f52661c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 20 Nov 2020 23:14:12 -0500 Subject: [PATCH 25/30] BACKPORT: locking/rwsem: Pass the current atomic count to rwsem_down_read_slowpath() The atomic count value right after reader count increment can be useful to determine the rwsem state at trylock time. So the count value is passed down to rwsem_down_read_slowpath() to be used when appropriate. 
Change-Id: I44cfd5a3dbf4154c7c0b8bc832fb70c532c13ce8 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Link: https://lkml.kernel.org/r/20201121041416.12285-2-longman@redhat.com Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- kernel/locking/rwsem.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 473df84e3ce7..41a6893cd3ae 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -364,14 +364,14 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) owner | RWSEM_NONSPINNABLE)); } -static inline bool rwsem_read_trylock(struct rw_semaphore *sem) +static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) { - long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); - if (WARN_ON_ONCE(cnt < 0)) + if (WARN_ON_ONCE(*cntp < 0)) rwsem_set_nonspinnable(sem); - if (!(cnt & RWSEM_READ_FAILED_MASK)) { + if (!(*cntp & RWSEM_READ_FAILED_MASK)) { rwsem_set_reader_owned(sem); return true; } @@ -1079,9 +1079,9 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) * Wait for the read lock to be granted */ static struct rw_semaphore __sched * -rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) +rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) { - long count, adjustment = -RWSEM_READER_BIAS; + long adjustment = -RWSEM_READER_BIAS; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); bool is_first_waiter = false; @@ -1425,8 +1425,10 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) */ static inline int __down_read_common(struct rw_semaphore *sem, int state) { - if (!rwsem_read_trylock(sem)) { - if (IS_ERR(rwsem_down_read_slowpath(sem, state))) + long count; + + if (!rwsem_read_trylock(sem, &count)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } From 172d445a016f8720a8ca1442784f14630407404f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 20 Nov 2020 23:14:13 -0500 Subject: [PATCH 26/30] BACKPORT: locking/rwsem: Prevent potential lock starvation The lock handoff bit is added in commit 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation") to avoid lock starvation. However, allowing readers to do optimistic spinning does introduce an unlikely scenario where lock starvation can happen. The lock handoff bit may only be set when a waiter is being woken up. In the case of reader unlock, wakeup happens only when the reader count reaches 0. If there is a continuous stream of incoming readers acquiring read lock via optimistic spinning, it is possible that the reader count may never reach 0 and so the handoff bit will never be asserted. One way to prevent this scenario from happening is to disallow optimistic spinning if the rwsem is currently owned by readers. If the previous or current owner is a writer, optimistic spinning will be allowed. If the previous owner is a reader but the reader count has reached 0 before, a wakeup should have been issued. So the handoff mechanism will be kicked in to prevent lock starvation. As a result, it should be OK to do optimistic spinning in this case. 
This patch may have some impact on reader performance as it reduces reader optimistic spinning especially if the lock critical sections are short and the number of contending readers is small. Change-Id: Icfaea5ade20c08744029040946c7e73e5b13d506 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Link: https://lkml.kernel.org/r/20201121041416.12285-3-longman@redhat.com Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- kernel/locking/rwsem.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 41a6893cd3ae..485a872c961e 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1081,17 +1081,28 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) static struct rw_semaphore __sched * rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) { - long adjustment = -RWSEM_READER_BIAS; + long owner, adjustment = -RWSEM_READER_BIAS; + long rcnt = (count >> RWSEM_READER_SHIFT); struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); bool is_first_waiter = false; bool wake = false; + /* + * To prevent a constant stream of readers from starving a sleeping + * waiter, don't attempt optimistic spinning if the lock is currently + * owned by readers. + */ + owner = atomic_long_read(&sem->owner); + if ((owner & RWSEM_READER_OWNED) && (rcnt > 1) && + !(count & RWSEM_WRITER_LOCKED)) + goto queue; + /* * Save the current read-owner of rwsem, if available, and the * reader nonspinnable bit. */ - waiter.last_rowner = atomic_long_read(&sem->owner); + waiter.last_rowner = owner; if (!(waiter.last_rowner & RWSEM_READER_OWNED)) waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; From ea04ac906f8a946fba48e77df71e1061f0374532 Mon Sep 17 00:00:00 2001 From: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> Date: Sun, 5 Dec 2021 04:32:31 -0600 Subject: [PATCH 27/30] drivers: input: fpc_tee: Implement fb notifier Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- .../input/fingerprint/fpc_tee/fpc1020_tee.c | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/drivers/input/fingerprint/fpc_tee/fpc1020_tee.c b/drivers/input/fingerprint/fpc_tee/fpc1020_tee.c index 9c4a585fd5f7..4da732842d15 100644 --- a/drivers/input/fingerprint/fpc_tee/fpc1020_tee.c +++ b/drivers/input/fingerprint/fpc_tee/fpc1020_tee.c @@ -41,7 +41,9 @@ #include #include #include +#include #include +#include #define FPC_GPIO_NO_DEFAULT -1 #define FPC_GPIO_NO_DEFINED -2 @@ -113,6 +115,8 @@ struct fpc1020_data { atomic_t wakeup_enabled; /* Used both in ISR and non-ISR */ int irqf; + struct notifier_block fb_notifier; + bool fb_black; }; static int reset_gpio_res(struct fpc1020_data *fpc1020); @@ -847,6 +851,44 @@ static int fpc1020_request_named_gpio(struct fpc1020_data *fpc1020, return 0; } +static int fpc_fb_notif_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct fpc1020_data *fpc1020 = container_of(nb, struct fpc1020_data, + fb_notifier); + struct fb_event *evdata = data; + unsigned int blank; + + if (!fpc1020) + return 0; + + if (val != MI_DRM_EVENT_BLANK) + return 0; + + printk("hml [info] %s value = %d\n", __func__, (int)val); + + if (evdata && evdata->data && val == MI_DRM_EVENT_BLANK) { + blank = *(int *)(evdata->data); + switch (blank) { + case MI_DRM_BLANK_POWERDOWN: + fpc1020->fb_black = true; + break; + case MI_DRM_BLANK_UNBLANK: + fpc1020->fb_black = false;
+ break; + default: + printk("%s default\n", __func__); + break; + } + } + return NOTIFY_OK; +} + + +static struct notifier_block fpc_notif_block = { + .notifier_call = fpc_fb_notif_callback, +}; + static int fpc1020_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; From 848bebe9f71315fcc782d5c2f94e0bfeaf53cc0e Mon Sep 17 00:00:00 2001 From: Sultanxda Date: Sun, 5 Dec 2021 04:38:18 -0600 Subject: [PATCH 28/30] fpc1020_tee: Set fingerprintd priority to max when screen is off Give fingerprintd the highest priority (MIN_NICE) when the screen is off to speed up fingerprint processing, and then reset its priority back to normal when the screen is on. There are now two fingerprint-related processes: - android.hardware.biometrics.fingerprint@2.1-service -> comm:fingerprint@2.1 - com.qualcomm.qti.biometrics.fingerprint.service -> comm:erprint.service For the match to work properly, use strstr() with "erprint". Change-Id: Ice4a384f99ae4201b2e5e942b0c8cc16c1190f52 Co-authored-by: Fiqri Ardyansyah Signed-off-by: Panchajanya1999 Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- drivers/input/fingerprint/fpc_tee/fpc1020_tee.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/input/fingerprint/fpc_tee/fpc1020_tee.c b/drivers/input/fingerprint/fpc_tee/fpc1020_tee.c index 4da732842d15..8a8cf09b1ebd 100644 --- a/drivers/input/fingerprint/fpc_tee/fpc1020_tee.c +++ b/drivers/input/fingerprint/fpc_tee/fpc1020_tee.c @@ -851,6 +851,18 @@ static int fpc1020_request_named_gpio(struct fpc1020_data *fpc1020, return 0; } +static void set_fingerprintd_nice(int nice) +{ + struct task_struct *p; + + read_lock(&tasklist_lock); + for_each_process(p) { + if (strstr(p->comm, "erprint")) + set_user_nice(p, nice); + } + read_unlock(&tasklist_lock); +} + static int fpc_fb_notif_callback(struct notifier_block *nb, unsigned long val, void *data) { @@ -871,9 +883,11 @@ static int fpc_fb_notif_callback(struct notifier_block *nb, blank = *(int *)(evdata->data); switch (blank) { case MI_DRM_BLANK_POWERDOWN: + set_fingerprintd_nice(MIN_NICE); fpc1020->fb_black = true; break; case MI_DRM_BLANK_UNBLANK: + set_fingerprintd_nice(0); fpc1020->fb_black = false; break; default: From 284f4e2f2ffba9c68d4b80b71ee20a8d3aafe3b6 Mon Sep 17 00:00:00 2001 From: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> Date: Sun, 5 Dec 2021 04:43:27 -0600 Subject: [PATCH 29/30] drivers: input: goodix_fod_lmi: fix fb notifier for goodix * The notifier was using the generic DRM notifier; switch it to the actual XiaoMi notifier, as sketched below.
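For reference, the registration and callback pattern after the switch looks roughly like this (a simplified sketch built only from the names already used in this driver; the bookkeeping done in each case is summarised in comments):

	/* gf_probe(): register with the Xiaomi DRM notifier chain */
	gf_dev->notifier = goodix_noti_block;
	mi_drm_register_client(&gf_dev->notifier);

	/* goodix_fb_state_chg_callback(): only Xiaomi event codes are handled */
	if (val != MI_DRM_EVENT_BLANK)
		return 0;

	blank = *(int *)(evdata->data);
	switch (blank) {
	case MI_DRM_BLANK_POWERDOWN:
		/* panel blanked: fb_black is set and userspace is notified */
		break;
	case MI_DRM_BLANK_UNBLANK:
		/* panel unblanked: fb_black is cleared again */
		break;
	}

	/* gf_remove(): unregister from the same chain */
	mi_drm_unregister_client(&gf_dev->notifier);

The unregister call in gf_remove() mirrors the registration, so driver removal keeps working against the Xiaomi chain.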
Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- drivers/input/fingerprint/goodix_fod_lmi/gf_spi.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/input/fingerprint/goodix_fod_lmi/gf_spi.c b/drivers/input/fingerprint/goodix_fod_lmi/gf_spi.c index 8735d26be214..7890c4574778 100644 --- a/drivers/input/fingerprint/goodix_fod_lmi/gf_spi.c +++ b/drivers/input/fingerprint/goodix_fod_lmi/gf_spi.c @@ -47,7 +47,7 @@ #include #include #ifndef GOODIX_DRM_INTERFACE_WA -#include +#include #endif #include "gf_spi.h" @@ -730,7 +730,7 @@ static int goodix_fb_state_chg_callback(struct notifier_block *nb, unsigned int blank; char temp[4] = { 0x0 }; - if (val != DRM_EVENT_BLANK) { + if (val != MI_DRM_EVENT_BLANK) { return 0; } @@ -739,11 +739,11 @@ static int goodix_fb_state_chg_callback(struct notifier_block *nb, __func__, (int)val); gf_dev = container_of(nb, struct gf_dev, notifier); - if (evdata && evdata->data && val == DRM_EVENT_BLANK && gf_dev) { + if (evdata && evdata->data && val == MI_DRM_EVENT_BLANK && gf_dev) { blank = *(int *)(evdata->data); switch (blank) { - case DRM_BLANK_POWERDOWN: + case MI_DRM_BLANK_POWERDOWN: if (gf_dev->device_available == 1) { gf_dev->fb_black = 1; gf_dev->wait_finger_down = true; @@ -760,7 +760,7 @@ static int goodix_fb_state_chg_callback(struct notifier_block *nb, } break; - case DRM_BLANK_UNBLANK: + case MI_DRM_BLANK_UNBLANK: if (gf_dev->device_available == 1) { gf_dev->fb_black = 0; #if defined(GF_NETLINK_ENABLE) @@ -891,7 +891,7 @@ static int gf_probe(struct platform_device *pdev) #endif #ifndef GOODIX_DRM_INTERFACE_WA gf_dev->notifier = goodix_noti_block; - drm_register_client(&gf_dev->notifier); + mi_drm_register_client(&gf_dev->notifier); #endif gf_dev->irq = gf_irq_num(gf_dev); fp_wakelock = wakeup_source_register(&gf_dev->spi->dev, "fp_wakelock"); @@ -955,7 +955,7 @@ static int gf_remove(struct platform_device *pdev) gf_cleanup(gf_dev); } #ifndef GOODIX_DRM_INTERFACE_WA - drm_unregister_client(&gf_dev->notifier); + mi_drm_unregister_client(&gf_dev->notifier); #endif mutex_unlock(&device_list_lock); return 0; From ab202aa5a79827e41d1f5cce9c6f8bf9dc005f26 Mon Sep 17 00:00:00 2001 From: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> Date: Sun, 5 Dec 2021 04:50:25 -0600 Subject: [PATCH 30/30] defconfig: bump localversion to Re:Volution Signed-off-by: Carlos Ayrton Lopez Arroyo <15030201@itcelaya.edu.mx> --- arch/arm64/configs/vendor/alioth_defconfig | 2 +- arch/arm64/configs/vendor/apollo_defconfig | 2 +- arch/arm64/configs/vendor/cmi_defconfig | 2 +- arch/arm64/configs/vendor/lmi_defconfig | 2 +- arch/arm64/configs/vendor/thyme_defconfig | 2 +- arch/arm64/configs/vendor/umi_defconfig | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/configs/vendor/alioth_defconfig b/arch/arm64/configs/vendor/alioth_defconfig index 125a94b09355..d56635cd8bc3 100644 --- a/arch/arm64/configs/vendor/alioth_defconfig +++ b/arch/arm64/configs/vendor/alioth_defconfig @@ -1,4 +1,4 @@ -CONFIG_LOCALVERSION="~Quantic" +CONFIG_LOCALVERSION="~Quantic-Re:Volution" # CONFIG_AUDIT is not set CONFIG_NO_HZ=y CONFIG_HZ_100=y diff --git a/arch/arm64/configs/vendor/apollo_defconfig b/arch/arm64/configs/vendor/apollo_defconfig index c46eeb031273..e3d02c686a7e 100644 --- a/arch/arm64/configs/vendor/apollo_defconfig +++ b/arch/arm64/configs/vendor/apollo_defconfig @@ -1,4 +1,4 @@ -CONFIG_LOCALVERSION="~Quantic" +CONFIG_LOCALVERSION="~Quantic-Re:Volution" # CONFIG_AUDIT is not set CONFIG_NO_HZ=y 
CONFIG_HZ_100=y diff --git a/arch/arm64/configs/vendor/cmi_defconfig b/arch/arm64/configs/vendor/cmi_defconfig index 0553073f6830..6ab09db1fe59 100644 --- a/arch/arm64/configs/vendor/cmi_defconfig +++ b/arch/arm64/configs/vendor/cmi_defconfig @@ -1,4 +1,4 @@ -CONFIG_LOCALVERSION="~Quantic" +CONFIG_LOCALVERSION="~Quantic-Re:Volution" # CONFIG_AUDIT is not set CONFIG_NO_HZ=y CONFIG_HZ_100=y diff --git a/arch/arm64/configs/vendor/lmi_defconfig b/arch/arm64/configs/vendor/lmi_defconfig index 36e484818a10..7c0ec9414cb5 100644 --- a/arch/arm64/configs/vendor/lmi_defconfig +++ b/arch/arm64/configs/vendor/lmi_defconfig @@ -1,4 +1,4 @@ -CONFIG_LOCALVERSION="~Quantic" +CONFIG_LOCALVERSION="~Quantic-Re:Volution" # CONFIG_AUDIT is not set CONFIG_NO_HZ=y CONFIG_HZ_100=y diff --git a/arch/arm64/configs/vendor/thyme_defconfig b/arch/arm64/configs/vendor/thyme_defconfig index 2ce8943f91d9..a78ddeec993e 100644 --- a/arch/arm64/configs/vendor/thyme_defconfig +++ b/arch/arm64/configs/vendor/thyme_defconfig @@ -1,4 +1,4 @@ -CONFIG_LOCALVERSION="~Quantic" +CONFIG_LOCALVERSION="~Quantic-Re:Volution" # CONFIG_AUDIT is not set CONFIG_NO_HZ=y CONFIG_HZ_100=y diff --git a/arch/arm64/configs/vendor/umi_defconfig b/arch/arm64/configs/vendor/umi_defconfig index db5719ab842c..f5d9249725e8 100644 --- a/arch/arm64/configs/vendor/umi_defconfig +++ b/arch/arm64/configs/vendor/umi_defconfig @@ -1,4 +1,4 @@ -CONFIG_LOCALVERSION="~Quantic" +CONFIG_LOCALVERSION="~Quantic-Re:Volution" # CONFIG_AUDIT is not set CONFIG_NO_HZ=y CONFIG_HZ_100=y